blob: 54eee542ed52c331992de42fc2486cd5d4cd485a [file] [log] [blame]
// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
//
// Auto-generated file. Do not edit!
//   Specification: test/f32-igemm.yaml
//   Generator: tools/generate-gemm-test.py
12
13
XNNPACK Teamb455b122019-09-27 18:10:33 -070014#include <gtest/gtest.h>
15
Marat Dukhan1dadbf72019-10-01 10:46:20 -070016#include <xnnpack/common.h>
17#include <xnnpack/isa-checks.h>
18
XNNPACK Teamb455b122019-09-27 18:10:33 -070019#include <xnnpack/gemm.h>
20#include <xnnpack/igemm.h>
21#include <xnnpack/ppmm.h>
XNNPACK Teamb455b122019-09-27 18:10:33 -070022#include "gemm-microkernel-tester.h"
23
24
Frank Barchard7e955972019-10-11 10:34:25 -070025#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
Frank Barchard21be34f2019-10-09 19:32:19 -070026 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8) {
27 TEST_REQUIRES_ARM_NEON_FMA;
28 GemmMicrokernelTester()
29 .mr(1)
30 .nr(8)
31 .kr(1)
32 .sr(1)
33 .m(1)
34 .n(8)
35 .k(8)
36 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
37 }
38
39 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, strided_cn) {
40 TEST_REQUIRES_ARM_NEON_FMA;
41 GemmMicrokernelTester()
42 .mr(1)
43 .nr(8)
44 .kr(1)
45 .sr(1)
46 .m(1)
47 .n(8)
48 .k(8)
49 .cn_stride(11)
50 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
51 }
52
53 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_subtile) {
54 TEST_REQUIRES_ARM_NEON_FMA;
55 for (uint32_t m = 1; m <= 1; m++) {
56 for (uint32_t n = 1; n <= 8; n++) {
57 GemmMicrokernelTester()
58 .mr(1)
59 .nr(8)
60 .kr(1)
61 .sr(1)
62 .m(m)
63 .n(n)
64 .k(8)
65 .iterations(1)
66 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
67 }
68 }
69 }
70
71 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_subtile_m) {
72 TEST_REQUIRES_ARM_NEON_FMA;
73 for (uint32_t m = 1; m <= 1; m++) {
74 GemmMicrokernelTester()
75 .mr(1)
76 .nr(8)
77 .kr(1)
78 .sr(1)
79 .m(m)
80 .n(8)
81 .k(8)
82 .iterations(1)
83 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
84 }
85 }
86
87 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_subtile_n) {
88 TEST_REQUIRES_ARM_NEON_FMA;
89 for (uint32_t n = 1; n <= 8; n++) {
90 GemmMicrokernelTester()
91 .mr(1)
92 .nr(8)
93 .kr(1)
94 .sr(1)
95 .m(1)
96 .n(n)
97 .k(8)
98 .iterations(1)
99 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
100 }
101 }
102
103 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_16) {
104 TEST_REQUIRES_ARM_NEON_FMA;
105 GemmMicrokernelTester()
106 .mr(1)
107 .nr(8)
108 .kr(1)
109 .sr(1)
110 .m(1)
111 .n(8)
112 .k(16)
113 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
114 }
115
116 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_16_subtile) {
117 TEST_REQUIRES_ARM_NEON_FMA;
118 for (uint32_t m = 1; m <= 1; m++) {
119 for (uint32_t n = 1; n <= 8; n++) {
120 GemmMicrokernelTester()
121 .mr(1)
122 .nr(8)
123 .kr(1)
124 .sr(1)
125 .m(m)
126 .n(n)
127 .k(16)
128 .iterations(1)
129 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
130 }
131 }
132 }
133
134 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_16) {
135 TEST_REQUIRES_ARM_NEON_FMA;
136 for (size_t k = 1; k < 16; k++) {
137 GemmMicrokernelTester()
138 .mr(1)
139 .nr(8)
140 .kr(1)
141 .sr(1)
142 .m(1)
143 .n(8)
144 .k(k)
145 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
146 }
147 }
148
149 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_16_subtile) {
150 TEST_REQUIRES_ARM_NEON_FMA;
151 for (size_t k = 1; k < 16; k++) {
152 for (uint32_t m = 1; m <= 1; m++) {
153 for (uint32_t n = 1; n <= 8; n++) {
154 GemmMicrokernelTester()
155 .mr(1)
156 .nr(8)
157 .kr(1)
158 .sr(1)
159 .m(m)
160 .n(n)
161 .k(k)
162 .iterations(1)
163 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
164 }
165 }
166 }
167 }
168
169 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_16) {
170 TEST_REQUIRES_ARM_NEON_FMA;
171 for (size_t k = 17; k < 16; k++) {
172 GemmMicrokernelTester()
173 .mr(1)
174 .nr(8)
175 .kr(1)
176 .sr(1)
177 .m(1)
178 .n(8)
179 .k(k)
180 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
181 }
182 }
183
184 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_8_subtile) {
185 TEST_REQUIRES_ARM_NEON_FMA;
186 for (size_t k = 17; k < 16; k++) {
187 for (uint32_t m = 1; m <= 1; m++) {
188 for (uint32_t n = 1; n <= 8; n++) {
189 GemmMicrokernelTester()
190 .mr(1)
191 .nr(8)
192 .kr(1)
193 .sr(1)
194 .m(m)
195 .n(n)
196 .k(k)
197 .iterations(1)
198 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
199 }
200 }
201 }
202 }
203
204 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, k_div_8) {
205 TEST_REQUIRES_ARM_NEON_FMA;
206 for (size_t k = 24; k <= 80; k += 8) {
207 GemmMicrokernelTester()
208 .mr(1)
209 .nr(8)
210 .kr(1)
211 .sr(1)
212 .m(1)
213 .n(8)
214 .k(k)
215 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
216 }
217 }
218
219 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, k_div_8_subtile) {
220 TEST_REQUIRES_ARM_NEON_FMA;
221 for (size_t k = 24; k <= 80; k += 8) {
222 for (uint32_t m = 1; m <= 1; m++) {
223 for (uint32_t n = 1; n <= 8; n++) {
224 GemmMicrokernelTester()
225 .mr(1)
226 .nr(8)
227 .kr(1)
228 .sr(1)
229 .m(m)
230 .n(n)
231 .k(k)
232 .iterations(1)
233 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
234 }
235 }
236 }
237 }
238
239 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8) {
240 TEST_REQUIRES_ARM_NEON_FMA;
241 for (uint32_t n = 9; n < 16; n++) {
242 for (size_t k = 1; k <= 40; k += 9) {
243 GemmMicrokernelTester()
244 .mr(1)
245 .nr(8)
246 .kr(1)
247 .sr(1)
248 .m(1)
249 .n(8)
250 .k(k)
251 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
252 }
253 }
254 }
255
256 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_strided_cn) {
257 TEST_REQUIRES_ARM_NEON_FMA;
258 for (uint32_t n = 9; n < 16; n++) {
259 for (size_t k = 1; k <= 40; k += 9) {
260 GemmMicrokernelTester()
261 .mr(1)
262 .nr(8)
263 .kr(1)
264 .sr(1)
265 .m(1)
266 .n(8)
267 .k(k)
268 .cn_stride(11)
269 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
270 }
271 }
272 }
273
274 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_subtile) {
275 TEST_REQUIRES_ARM_NEON_FMA;
276 for (uint32_t n = 9; n < 16; n++) {
277 for (size_t k = 1; k <= 40; k += 9) {
278 for (uint32_t m = 1; m <= 1; m++) {
279 GemmMicrokernelTester()
280 .mr(1)
281 .nr(8)
282 .kr(1)
283 .sr(1)
284 .m(m)
285 .n(n)
286 .k(k)
287 .iterations(1)
288 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
289 }
290 }
291 }
292 }
293
294 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8) {
295 TEST_REQUIRES_ARM_NEON_FMA;
296 for (uint32_t n = 16; n <= 24; n += 8) {
297 for (size_t k = 1; k <= 40; k += 9) {
298 GemmMicrokernelTester()
299 .mr(1)
300 .nr(8)
301 .kr(1)
302 .sr(1)
303 .m(1)
304 .n(8)
305 .k(k)
306 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
307 }
308 }
309 }
310
311 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_strided_cn) {
312 TEST_REQUIRES_ARM_NEON_FMA;
313 for (uint32_t n = 16; n <= 24; n += 8) {
314 for (size_t k = 1; k <= 40; k += 9) {
315 GemmMicrokernelTester()
316 .mr(1)
317 .nr(8)
318 .kr(1)
319 .sr(1)
320 .m(1)
321 .n(n)
322 .k(k)
323 .cn_stride(11)
324 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
325 }
326 }
327 }
328
329 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_subtile) {
330 TEST_REQUIRES_ARM_NEON_FMA;
331 for (uint32_t n = 16; n <= 24; n += 8) {
332 for (size_t k = 1; k <= 40; k += 9) {
333 for (uint32_t m = 1; m <= 1; m++) {
334 GemmMicrokernelTester()
335 .mr(1)
336 .nr(8)
337 .kr(1)
338 .sr(1)
339 .m(m)
340 .n(n)
341 .k(k)
342 .iterations(1)
343 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
344 }
345 }
346 }
347 }
348
349 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, small_kernel) {
350 TEST_REQUIRES_ARM_NEON_FMA;
351 for (size_t k = 1; k <= 40; k += 9) {
352 GemmMicrokernelTester()
353 .mr(1)
354 .nr(8)
355 .kr(1)
356 .sr(1)
357 .m(1)
358 .n(8)
359 .k(k)
360 .ks(3)
361 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
362 }
363 }
364
365 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, small_kernel_subtile) {
366 TEST_REQUIRES_ARM_NEON_FMA;
367 for (size_t k = 1; k <= 40; k += 9) {
368 for (uint32_t m = 1; m <= 1; m++) {
369 for (uint32_t n = 1; n <= 8; n++) {
370 GemmMicrokernelTester()
371 .mr(1)
372 .nr(8)
373 .kr(1)
374 .sr(1)
375 .m(m)
376 .n(n)
377 .k(k)
378 .ks(3)
379 .iterations(1)
380 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
381 }
382 }
383 }
384 }
385
386 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_small_kernel) {
387 TEST_REQUIRES_ARM_NEON_FMA;
388 for (uint32_t n = 9; n < 16; n++) {
389 for (size_t k = 1; k <= 40; k += 9) {
390 GemmMicrokernelTester()
391 .mr(1)
392 .nr(8)
393 .kr(1)
394 .sr(1)
395 .m(1)
396 .n(8)
397 .k(k)
398 .ks(3)
399 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
400 }
401 }
402 }
403
404 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_small_kernel) {
405 TEST_REQUIRES_ARM_NEON_FMA;
406 for (uint32_t n = 16; n <= 24; n += 8) {
407 for (size_t k = 1; k <= 40; k += 9) {
408 GemmMicrokernelTester()
409 .mr(1)
410 .nr(8)
411 .kr(1)
412 .sr(1)
413 .m(1)
414 .n(8)
415 .k(k)
416 .ks(3)
417 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
418 }
419 }
420 }
421
422 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, strided_cm_subtile) {
423 TEST_REQUIRES_ARM_NEON_FMA;
424 for (size_t k = 1; k <= 40; k += 9) {
425 for (uint32_t m = 1; m <= 1; m++) {
426 for (uint32_t n = 1; n <= 8; n++) {
427 GemmMicrokernelTester()
428 .mr(1)
429 .nr(8)
430 .kr(1)
431 .sr(1)
432 .m(m)
433 .n(n)
434 .k(k)
435 .cm_stride(11)
436 .iterations(1)
437 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
438 }
439 }
440 }
441 }
442
443 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, a_offset) {
444 TEST_REQUIRES_ARM_NEON_FMA;
445 for (size_t k = 1; k <= 40; k += 9) {
446 GemmMicrokernelTester()
447 .mr(1)
448 .nr(8)
449 .kr(1)
450 .sr(1)
451 .m(1)
452 .n(8)
453 .k(k)
454 .ks(3)
455 .a_offset(43)
456 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
457 }
458 }
459
460 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, zero) {
461 TEST_REQUIRES_ARM_NEON_FMA;
462 for (uint32_t mz = 0; mz < 1; mz++) {
463 for (size_t k = 1; k <= 40; k += 9) {
464 GemmMicrokernelTester()
465 .mr(1)
466 .nr(8)
467 .kr(1)
468 .sr(1)
469 .m(1)
470 .n(8)
471 .k(k)
472 .ks(3)
473 .a_offset(43)
474 .zero_index(mz)
475 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
476 }
477 }
478 }
479
480 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, qmin) {
481 TEST_REQUIRES_ARM_NEON_FMA;
482 GemmMicrokernelTester()
483 .mr(1)
484 .nr(8)
485 .kr(1)
486 .sr(1)
487 .m(1)
488 .n(8)
489 .k(8)
490 .qmin(128)
491 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
492 }
493
494 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, qmax) {
495 TEST_REQUIRES_ARM_NEON_FMA;
496 GemmMicrokernelTester()
497 .mr(1)
498 .nr(8)
499 .kr(1)
500 .sr(1)
501 .m(1)
502 .n(8)
503 .k(8)
504 .qmax(128)
505 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
506 }
507
508 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, strided_cm) {
509 TEST_REQUIRES_ARM_NEON_FMA;
510 GemmMicrokernelTester()
511 .mr(1)
512 .nr(8)
513 .kr(1)
514 .sr(1)
515 .m(1)
516 .n(8)
517 .k(8)
518 .cm_stride(11)
519 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
520 }
Frank Barchard7e955972019-10-11 10:34:25 -0700521#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
Frank Barchard21be34f2019-10-09 19:32:19 -0700522
523
Frank Barchard7e955972019-10-11 10:34:25 -0700524#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -0700525 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8) {
526 TEST_REQUIRES_ARM_NEON_FMA;
527 GemmMicrokernelTester()
528 .mr(1)
529 .nr(8)
530 .kr(1)
531 .sr(1)
532 .m(1)
533 .n(8)
534 .k(8)
535 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
536 }
537
538 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, strided_cn) {
539 TEST_REQUIRES_ARM_NEON_FMA;
540 GemmMicrokernelTester()
541 .mr(1)
542 .nr(8)
543 .kr(1)
544 .sr(1)
545 .m(1)
546 .n(8)
547 .k(8)
548 .cn_stride(11)
549 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
550 }
551
552 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_subtile) {
553 TEST_REQUIRES_ARM_NEON_FMA;
554 for (uint32_t m = 1; m <= 1; m++) {
555 for (uint32_t n = 1; n <= 8; n++) {
556 GemmMicrokernelTester()
557 .mr(1)
558 .nr(8)
559 .kr(1)
560 .sr(1)
561 .m(m)
562 .n(n)
563 .k(8)
564 .iterations(1)
565 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
566 }
567 }
568 }
569
570 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_subtile_m) {
571 TEST_REQUIRES_ARM_NEON_FMA;
572 for (uint32_t m = 1; m <= 1; m++) {
573 GemmMicrokernelTester()
574 .mr(1)
575 .nr(8)
576 .kr(1)
577 .sr(1)
578 .m(m)
579 .n(8)
580 .k(8)
581 .iterations(1)
582 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
583 }
584 }
585
586 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_subtile_n) {
587 TEST_REQUIRES_ARM_NEON_FMA;
588 for (uint32_t n = 1; n <= 8; n++) {
589 GemmMicrokernelTester()
590 .mr(1)
591 .nr(8)
592 .kr(1)
593 .sr(1)
594 .m(1)
595 .n(n)
596 .k(8)
597 .iterations(1)
598 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
599 }
600 }
601
602 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_16) {
603 TEST_REQUIRES_ARM_NEON_FMA;
604 GemmMicrokernelTester()
605 .mr(1)
606 .nr(8)
607 .kr(1)
608 .sr(1)
609 .m(1)
610 .n(8)
611 .k(16)
612 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
613 }
614
615 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_16_subtile) {
616 TEST_REQUIRES_ARM_NEON_FMA;
617 for (uint32_t m = 1; m <= 1; m++) {
618 for (uint32_t n = 1; n <= 8; n++) {
619 GemmMicrokernelTester()
620 .mr(1)
621 .nr(8)
622 .kr(1)
623 .sr(1)
624 .m(m)
625 .n(n)
626 .k(16)
627 .iterations(1)
628 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
629 }
630 }
631 }
632
633 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, k_lt_16) {
634 TEST_REQUIRES_ARM_NEON_FMA;
635 for (size_t k = 1; k < 16; k++) {
636 GemmMicrokernelTester()
637 .mr(1)
638 .nr(8)
639 .kr(1)
640 .sr(1)
641 .m(1)
642 .n(8)
643 .k(k)
644 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
645 }
646 }
647
648 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, k_lt_16_subtile) {
649 TEST_REQUIRES_ARM_NEON_FMA;
650 for (size_t k = 1; k < 16; k++) {
651 for (uint32_t m = 1; m <= 1; m++) {
652 for (uint32_t n = 1; n <= 8; n++) {
653 GemmMicrokernelTester()
654 .mr(1)
655 .nr(8)
656 .kr(1)
657 .sr(1)
658 .m(m)
659 .n(n)
660 .k(k)
661 .iterations(1)
662 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
663 }
664 }
665 }
666 }
667
668 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, k_gt_16) {
669 TEST_REQUIRES_ARM_NEON_FMA;
670 for (size_t k = 17; k < 16; k++) {
671 GemmMicrokernelTester()
672 .mr(1)
673 .nr(8)
674 .kr(1)
675 .sr(1)
676 .m(1)
677 .n(8)
678 .k(k)
679 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
680 }
681 }
682
683 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, k_gt_8_subtile) {
684 TEST_REQUIRES_ARM_NEON_FMA;
685 for (size_t k = 17; k < 16; k++) {
686 for (uint32_t m = 1; m <= 1; m++) {
687 for (uint32_t n = 1; n <= 8; n++) {
688 GemmMicrokernelTester()
689 .mr(1)
690 .nr(8)
691 .kr(1)
692 .sr(1)
693 .m(m)
694 .n(n)
695 .k(k)
696 .iterations(1)
697 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
698 }
699 }
700 }
701 }
702
703 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, k_div_8) {
704 TEST_REQUIRES_ARM_NEON_FMA;
705 for (size_t k = 24; k <= 80; k += 8) {
706 GemmMicrokernelTester()
707 .mr(1)
708 .nr(8)
709 .kr(1)
710 .sr(1)
711 .m(1)
712 .n(8)
713 .k(k)
714 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
715 }
716 }
717
718 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, k_div_8_subtile) {
719 TEST_REQUIRES_ARM_NEON_FMA;
720 for (size_t k = 24; k <= 80; k += 8) {
721 for (uint32_t m = 1; m <= 1; m++) {
722 for (uint32_t n = 1; n <= 8; n++) {
723 GemmMicrokernelTester()
724 .mr(1)
725 .nr(8)
726 .kr(1)
727 .sr(1)
728 .m(m)
729 .n(n)
730 .k(k)
731 .iterations(1)
732 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
733 }
734 }
735 }
736 }
737
738 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8) {
739 TEST_REQUIRES_ARM_NEON_FMA;
740 for (uint32_t n = 9; n < 16; n++) {
741 for (size_t k = 1; k <= 40; k += 9) {
742 GemmMicrokernelTester()
743 .mr(1)
744 .nr(8)
745 .kr(1)
746 .sr(1)
747 .m(1)
748 .n(8)
749 .k(k)
750 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
751 }
752 }
753 }
754
755 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_strided_cn) {
756 TEST_REQUIRES_ARM_NEON_FMA;
757 for (uint32_t n = 9; n < 16; n++) {
758 for (size_t k = 1; k <= 40; k += 9) {
759 GemmMicrokernelTester()
760 .mr(1)
761 .nr(8)
762 .kr(1)
763 .sr(1)
764 .m(1)
765 .n(8)
766 .k(k)
767 .cn_stride(11)
768 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
769 }
770 }
771 }
772
773 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_subtile) {
774 TEST_REQUIRES_ARM_NEON_FMA;
775 for (uint32_t n = 9; n < 16; n++) {
776 for (size_t k = 1; k <= 40; k += 9) {
777 for (uint32_t m = 1; m <= 1; m++) {
778 GemmMicrokernelTester()
779 .mr(1)
780 .nr(8)
781 .kr(1)
782 .sr(1)
783 .m(m)
784 .n(n)
785 .k(k)
786 .iterations(1)
787 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
788 }
789 }
790 }
791 }
792
793 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8) {
794 TEST_REQUIRES_ARM_NEON_FMA;
795 for (uint32_t n = 16; n <= 24; n += 8) {
796 for (size_t k = 1; k <= 40; k += 9) {
797 GemmMicrokernelTester()
798 .mr(1)
799 .nr(8)
800 .kr(1)
801 .sr(1)
802 .m(1)
803 .n(8)
804 .k(k)
805 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
806 }
807 }
808 }
809
810 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_strided_cn) {
811 TEST_REQUIRES_ARM_NEON_FMA;
812 for (uint32_t n = 16; n <= 24; n += 8) {
813 for (size_t k = 1; k <= 40; k += 9) {
814 GemmMicrokernelTester()
815 .mr(1)
816 .nr(8)
817 .kr(1)
818 .sr(1)
819 .m(1)
820 .n(n)
821 .k(k)
822 .cn_stride(11)
823 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
824 }
825 }
826 }
827
828 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_subtile) {
829 TEST_REQUIRES_ARM_NEON_FMA;
830 for (uint32_t n = 16; n <= 24; n += 8) {
831 for (size_t k = 1; k <= 40; k += 9) {
832 for (uint32_t m = 1; m <= 1; m++) {
833 GemmMicrokernelTester()
834 .mr(1)
835 .nr(8)
836 .kr(1)
837 .sr(1)
838 .m(m)
839 .n(n)
840 .k(k)
841 .iterations(1)
842 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
843 }
844 }
845 }
846 }
847
848 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, small_kernel) {
849 TEST_REQUIRES_ARM_NEON_FMA;
850 for (size_t k = 1; k <= 40; k += 9) {
851 GemmMicrokernelTester()
852 .mr(1)
853 .nr(8)
854 .kr(1)
855 .sr(1)
856 .m(1)
857 .n(8)
858 .k(k)
859 .ks(3)
860 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
861 }
862 }
863
864 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, small_kernel_subtile) {
865 TEST_REQUIRES_ARM_NEON_FMA;
866 for (size_t k = 1; k <= 40; k += 9) {
867 for (uint32_t m = 1; m <= 1; m++) {
868 for (uint32_t n = 1; n <= 8; n++) {
869 GemmMicrokernelTester()
870 .mr(1)
871 .nr(8)
872 .kr(1)
873 .sr(1)
874 .m(m)
875 .n(n)
876 .k(k)
877 .ks(3)
878 .iterations(1)
879 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
880 }
881 }
882 }
883 }
884
885 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_small_kernel) {
886 TEST_REQUIRES_ARM_NEON_FMA;
887 for (uint32_t n = 9; n < 16; n++) {
888 for (size_t k = 1; k <= 40; k += 9) {
889 GemmMicrokernelTester()
890 .mr(1)
891 .nr(8)
892 .kr(1)
893 .sr(1)
894 .m(1)
895 .n(8)
896 .k(k)
897 .ks(3)
898 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
899 }
900 }
901 }
902
903 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_small_kernel) {
904 TEST_REQUIRES_ARM_NEON_FMA;
905 for (uint32_t n = 16; n <= 24; n += 8) {
906 for (size_t k = 1; k <= 40; k += 9) {
907 GemmMicrokernelTester()
908 .mr(1)
909 .nr(8)
910 .kr(1)
911 .sr(1)
912 .m(1)
913 .n(8)
914 .k(k)
915 .ks(3)
916 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
917 }
918 }
919 }
920
921 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, strided_cm_subtile) {
922 TEST_REQUIRES_ARM_NEON_FMA;
923 for (size_t k = 1; k <= 40; k += 9) {
924 for (uint32_t m = 1; m <= 1; m++) {
925 for (uint32_t n = 1; n <= 8; n++) {
926 GemmMicrokernelTester()
927 .mr(1)
928 .nr(8)
929 .kr(1)
930 .sr(1)
931 .m(m)
932 .n(n)
933 .k(k)
934 .cm_stride(11)
935 .iterations(1)
936 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
937 }
938 }
939 }
940 }
941
942 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, a_offset) {
943 TEST_REQUIRES_ARM_NEON_FMA;
944 for (size_t k = 1; k <= 40; k += 9) {
945 GemmMicrokernelTester()
946 .mr(1)
947 .nr(8)
948 .kr(1)
949 .sr(1)
950 .m(1)
951 .n(8)
952 .k(k)
953 .ks(3)
954 .a_offset(43)
955 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
956 }
957 }
958
959 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, zero) {
960 TEST_REQUIRES_ARM_NEON_FMA;
961 for (uint32_t mz = 0; mz < 1; mz++) {
962 for (size_t k = 1; k <= 40; k += 9) {
963 GemmMicrokernelTester()
964 .mr(1)
965 .nr(8)
966 .kr(1)
967 .sr(1)
968 .m(1)
969 .n(8)
970 .k(k)
971 .ks(3)
972 .a_offset(43)
973 .zero_index(mz)
974 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
975 }
976 }
977 }
978
979 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, qmin) {
980 TEST_REQUIRES_ARM_NEON_FMA;
981 GemmMicrokernelTester()
982 .mr(1)
983 .nr(8)
984 .kr(1)
985 .sr(1)
986 .m(1)
987 .n(8)
988 .k(8)
989 .qmin(128)
990 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
991 }
992
993 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, qmax) {
994 TEST_REQUIRES_ARM_NEON_FMA;
995 GemmMicrokernelTester()
996 .mr(1)
997 .nr(8)
998 .kr(1)
999 .sr(1)
1000 .m(1)
1001 .n(8)
1002 .k(8)
1003 .qmax(128)
1004 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
1005 }
1006
1007 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, strided_cm) {
1008 TEST_REQUIRES_ARM_NEON_FMA;
1009 GemmMicrokernelTester()
1010 .mr(1)
1011 .nr(8)
1012 .kr(1)
1013 .sr(1)
1014 .m(1)
1015 .n(8)
1016 .k(8)
1017 .cm_stride(11)
1018 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
1019 }
Frank Barchard7e955972019-10-11 10:34:25 -07001020#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07001021
1022
Frank Barchard7e955972019-10-11 10:34:25 -07001023#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07001024 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8) {
1025 TEST_REQUIRES_ARM_NEON_FMA;
1026 GemmMicrokernelTester()
1027 .mr(1)
1028 .nr(8)
1029 .kr(1)
1030 .sr(1)
1031 .m(1)
1032 .n(8)
1033 .k(8)
1034 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1035 }
1036
1037 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, strided_cn) {
1038 TEST_REQUIRES_ARM_NEON_FMA;
1039 GemmMicrokernelTester()
1040 .mr(1)
1041 .nr(8)
1042 .kr(1)
1043 .sr(1)
1044 .m(1)
1045 .n(8)
1046 .k(8)
1047 .cn_stride(11)
1048 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1049 }
1050
1051 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile) {
1052 TEST_REQUIRES_ARM_NEON_FMA;
1053 for (uint32_t m = 1; m <= 1; m++) {
1054 for (uint32_t n = 1; n <= 8; n++) {
1055 GemmMicrokernelTester()
1056 .mr(1)
1057 .nr(8)
1058 .kr(1)
1059 .sr(1)
1060 .m(m)
1061 .n(n)
1062 .k(8)
1063 .iterations(1)
1064 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1065 }
1066 }
1067 }
1068
1069 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile_m) {
1070 TEST_REQUIRES_ARM_NEON_FMA;
1071 for (uint32_t m = 1; m <= 1; m++) {
1072 GemmMicrokernelTester()
1073 .mr(1)
1074 .nr(8)
1075 .kr(1)
1076 .sr(1)
1077 .m(m)
1078 .n(8)
1079 .k(8)
1080 .iterations(1)
1081 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1082 }
1083 }
1084
1085 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile_n) {
1086 TEST_REQUIRES_ARM_NEON_FMA;
1087 for (uint32_t n = 1; n <= 8; n++) {
1088 GemmMicrokernelTester()
1089 .mr(1)
1090 .nr(8)
1091 .kr(1)
1092 .sr(1)
1093 .m(1)
1094 .n(n)
1095 .k(8)
1096 .iterations(1)
1097 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1098 }
1099 }
1100
1101 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16) {
1102 TEST_REQUIRES_ARM_NEON_FMA;
1103 GemmMicrokernelTester()
1104 .mr(1)
1105 .nr(8)
1106 .kr(1)
1107 .sr(1)
1108 .m(1)
1109 .n(8)
1110 .k(16)
1111 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1112 }
1113
1114 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16_subtile) {
1115 TEST_REQUIRES_ARM_NEON_FMA;
1116 for (uint32_t m = 1; m <= 1; m++) {
1117 for (uint32_t n = 1; n <= 8; n++) {
1118 GemmMicrokernelTester()
1119 .mr(1)
1120 .nr(8)
1121 .kr(1)
1122 .sr(1)
1123 .m(m)
1124 .n(n)
1125 .k(16)
1126 .iterations(1)
1127 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1128 }
1129 }
1130 }
1131
1132 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16) {
1133 TEST_REQUIRES_ARM_NEON_FMA;
1134 for (size_t k = 1; k < 16; k++) {
1135 GemmMicrokernelTester()
1136 .mr(1)
1137 .nr(8)
1138 .kr(1)
1139 .sr(1)
1140 .m(1)
1141 .n(8)
1142 .k(k)
1143 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1144 }
1145 }
1146
1147 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16_subtile) {
1148 TEST_REQUIRES_ARM_NEON_FMA;
1149 for (size_t k = 1; k < 16; k++) {
1150 for (uint32_t m = 1; m <= 1; m++) {
1151 for (uint32_t n = 1; n <= 8; n++) {
1152 GemmMicrokernelTester()
1153 .mr(1)
1154 .nr(8)
1155 .kr(1)
1156 .sr(1)
1157 .m(m)
1158 .n(n)
1159 .k(k)
1160 .iterations(1)
1161 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1162 }
1163 }
1164 }
1165 }
1166
1167 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_16) {
1168 TEST_REQUIRES_ARM_NEON_FMA;
1169 for (size_t k = 17; k < 16; k++) {
1170 GemmMicrokernelTester()
1171 .mr(1)
1172 .nr(8)
1173 .kr(1)
1174 .sr(1)
1175 .m(1)
1176 .n(8)
1177 .k(k)
1178 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1179 }
1180 }
1181
1182 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_8_subtile) {
1183 TEST_REQUIRES_ARM_NEON_FMA;
1184 for (size_t k = 17; k < 16; k++) {
1185 for (uint32_t m = 1; m <= 1; m++) {
1186 for (uint32_t n = 1; n <= 8; n++) {
1187 GemmMicrokernelTester()
1188 .mr(1)
1189 .nr(8)
1190 .kr(1)
1191 .sr(1)
1192 .m(m)
1193 .n(n)
1194 .k(k)
1195 .iterations(1)
1196 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1197 }
1198 }
1199 }
1200 }
1201
1202 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8) {
1203 TEST_REQUIRES_ARM_NEON_FMA;
1204 for (size_t k = 24; k <= 80; k += 8) {
1205 GemmMicrokernelTester()
1206 .mr(1)
1207 .nr(8)
1208 .kr(1)
1209 .sr(1)
1210 .m(1)
1211 .n(8)
1212 .k(k)
1213 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1214 }
1215 }
1216
1217 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8_subtile) {
1218 TEST_REQUIRES_ARM_NEON_FMA;
1219 for (size_t k = 24; k <= 80; k += 8) {
1220 for (uint32_t m = 1; m <= 1; m++) {
1221 for (uint32_t n = 1; n <= 8; n++) {
1222 GemmMicrokernelTester()
1223 .mr(1)
1224 .nr(8)
1225 .kr(1)
1226 .sr(1)
1227 .m(m)
1228 .n(n)
1229 .k(k)
1230 .iterations(1)
1231 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1232 }
1233 }
1234 }
1235 }
1236
1237 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8) {
1238 TEST_REQUIRES_ARM_NEON_FMA;
1239 for (uint32_t n = 9; n < 16; n++) {
1240 for (size_t k = 1; k <= 40; k += 9) {
1241 GemmMicrokernelTester()
1242 .mr(1)
1243 .nr(8)
1244 .kr(1)
1245 .sr(1)
1246 .m(1)
1247 .n(8)
1248 .k(k)
1249 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1250 }
1251 }
1252 }
1253
1254 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_strided_cn) {
1255 TEST_REQUIRES_ARM_NEON_FMA;
1256 for (uint32_t n = 9; n < 16; n++) {
1257 for (size_t k = 1; k <= 40; k += 9) {
1258 GemmMicrokernelTester()
1259 .mr(1)
1260 .nr(8)
1261 .kr(1)
1262 .sr(1)
1263 .m(1)
1264 .n(8)
1265 .k(k)
1266 .cn_stride(11)
1267 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1268 }
1269 }
1270 }
1271
1272 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_subtile) {
1273 TEST_REQUIRES_ARM_NEON_FMA;
1274 for (uint32_t n = 9; n < 16; n++) {
1275 for (size_t k = 1; k <= 40; k += 9) {
1276 for (uint32_t m = 1; m <= 1; m++) {
1277 GemmMicrokernelTester()
1278 .mr(1)
1279 .nr(8)
1280 .kr(1)
1281 .sr(1)
1282 .m(m)
1283 .n(n)
1284 .k(k)
1285 .iterations(1)
1286 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1287 }
1288 }
1289 }
1290 }
1291
1292 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8) {
1293 TEST_REQUIRES_ARM_NEON_FMA;
1294 for (uint32_t n = 16; n <= 24; n += 8) {
1295 for (size_t k = 1; k <= 40; k += 9) {
1296 GemmMicrokernelTester()
1297 .mr(1)
1298 .nr(8)
1299 .kr(1)
1300 .sr(1)
1301 .m(1)
1302 .n(8)
1303 .k(k)
1304 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1305 }
1306 }
1307 }
1308
1309 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_strided_cn) {
1310 TEST_REQUIRES_ARM_NEON_FMA;
1311 for (uint32_t n = 16; n <= 24; n += 8) {
1312 for (size_t k = 1; k <= 40; k += 9) {
1313 GemmMicrokernelTester()
1314 .mr(1)
1315 .nr(8)
1316 .kr(1)
1317 .sr(1)
1318 .m(1)
1319 .n(n)
1320 .k(k)
1321 .cn_stride(11)
1322 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1323 }
1324 }
1325 }
1326
1327 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_subtile) {
1328 TEST_REQUIRES_ARM_NEON_FMA;
1329 for (uint32_t n = 16; n <= 24; n += 8) {
1330 for (size_t k = 1; k <= 40; k += 9) {
1331 for (uint32_t m = 1; m <= 1; m++) {
1332 GemmMicrokernelTester()
1333 .mr(1)
1334 .nr(8)
1335 .kr(1)
1336 .sr(1)
1337 .m(m)
1338 .n(n)
1339 .k(k)
1340 .iterations(1)
1341 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1342 }
1343 }
1344 }
1345 }
1346
1347 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, small_kernel) {
1348 TEST_REQUIRES_ARM_NEON_FMA;
1349 for (size_t k = 1; k <= 40; k += 9) {
1350 GemmMicrokernelTester()
1351 .mr(1)
1352 .nr(8)
1353 .kr(1)
1354 .sr(1)
1355 .m(1)
1356 .n(8)
1357 .k(k)
1358 .ks(3)
1359 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1360 }
1361 }
1362
1363 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, small_kernel_subtile) {
1364 TEST_REQUIRES_ARM_NEON_FMA;
1365 for (size_t k = 1; k <= 40; k += 9) {
1366 for (uint32_t m = 1; m <= 1; m++) {
1367 for (uint32_t n = 1; n <= 8; n++) {
1368 GemmMicrokernelTester()
1369 .mr(1)
1370 .nr(8)
1371 .kr(1)
1372 .sr(1)
1373 .m(m)
1374 .n(n)
1375 .k(k)
1376 .ks(3)
1377 .iterations(1)
1378 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1379 }
1380 }
1381 }
1382 }
1383
1384 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_small_kernel) {
1385 TEST_REQUIRES_ARM_NEON_FMA;
1386 for (uint32_t n = 9; n < 16; n++) {
1387 for (size_t k = 1; k <= 40; k += 9) {
1388 GemmMicrokernelTester()
1389 .mr(1)
1390 .nr(8)
1391 .kr(1)
1392 .sr(1)
1393 .m(1)
1394 .n(8)
1395 .k(k)
1396 .ks(3)
1397 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1398 }
1399 }
1400 }
1401
1402 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_small_kernel) {
1403 TEST_REQUIRES_ARM_NEON_FMA;
1404 for (uint32_t n = 16; n <= 24; n += 8) {
1405 for (size_t k = 1; k <= 40; k += 9) {
1406 GemmMicrokernelTester()
1407 .mr(1)
1408 .nr(8)
1409 .kr(1)
1410 .sr(1)
1411 .m(1)
1412 .n(8)
1413 .k(k)
1414 .ks(3)
1415 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1416 }
1417 }
1418 }
1419
1420 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, strided_cm_subtile) {
1421 TEST_REQUIRES_ARM_NEON_FMA;
1422 for (size_t k = 1; k <= 40; k += 9) {
1423 for (uint32_t m = 1; m <= 1; m++) {
1424 for (uint32_t n = 1; n <= 8; n++) {
1425 GemmMicrokernelTester()
1426 .mr(1)
1427 .nr(8)
1428 .kr(1)
1429 .sr(1)
1430 .m(m)
1431 .n(n)
1432 .k(k)
1433 .cm_stride(11)
1434 .iterations(1)
1435 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1436 }
1437 }
1438 }
1439 }
1440
1441 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, a_offset) {
1442 TEST_REQUIRES_ARM_NEON_FMA;
1443 for (size_t k = 1; k <= 40; k += 9) {
1444 GemmMicrokernelTester()
1445 .mr(1)
1446 .nr(8)
1447 .kr(1)
1448 .sr(1)
1449 .m(1)
1450 .n(8)
1451 .k(k)
1452 .ks(3)
1453 .a_offset(43)
1454 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1455 }
1456 }
1457
1458 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, zero) {
1459 TEST_REQUIRES_ARM_NEON_FMA;
1460 for (uint32_t mz = 0; mz < 1; mz++) {
1461 for (size_t k = 1; k <= 40; k += 9) {
1462 GemmMicrokernelTester()
1463 .mr(1)
1464 .nr(8)
1465 .kr(1)
1466 .sr(1)
1467 .m(1)
1468 .n(8)
1469 .k(k)
1470 .ks(3)
1471 .a_offset(43)
1472 .zero_index(mz)
1473 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1474 }
1475 }
1476 }
1477
1478 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, qmin) {
1479 TEST_REQUIRES_ARM_NEON_FMA;
1480 GemmMicrokernelTester()
1481 .mr(1)
1482 .nr(8)
1483 .kr(1)
1484 .sr(1)
1485 .m(1)
1486 .n(8)
1487 .k(8)
1488 .qmin(128)
1489 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1490 }
1491
1492 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, qmax) {
1493 TEST_REQUIRES_ARM_NEON_FMA;
1494 GemmMicrokernelTester()
1495 .mr(1)
1496 .nr(8)
1497 .kr(1)
1498 .sr(1)
1499 .m(1)
1500 .n(8)
1501 .k(8)
1502 .qmax(128)
1503 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1504 }
1505
1506 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, strided_cm) {
1507 TEST_REQUIRES_ARM_NEON_FMA;
1508 GemmMicrokernelTester()
1509 .mr(1)
1510 .nr(8)
1511 .kr(1)
1512 .sr(1)
1513 .m(1)
1514 .n(8)
1515 .k(8)
1516 .cm_stride(11)
1517 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1518 }
Frank Barchard7e955972019-10-11 10:34:25 -07001519#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07001520
1521
Frank Barchard7e955972019-10-11 10:34:25 -07001522#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
Frank Barchard7c8e0c72019-11-17 00:02:36 -08001523 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001524 TEST_REQUIRES_ARM_NEON_FMA;
1525 GemmMicrokernelTester()
1526 .mr(4)
1527 .nr(8)
1528 .kr(1)
1529 .sr(1)
1530 .m(4)
1531 .n(8)
Frank Barchard7c8e0c72019-11-17 00:02:36 -08001532 .k(4)
Frank Barchard46fb8072019-10-25 12:54:22 -07001533 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1534 }
1535
1536 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, strided_cn) {
1537 TEST_REQUIRES_ARM_NEON_FMA;
1538 GemmMicrokernelTester()
1539 .mr(4)
1540 .nr(8)
1541 .kr(1)
1542 .sr(1)
1543 .m(4)
1544 .n(8)
Frank Barchard7c8e0c72019-11-17 00:02:36 -08001545 .k(4)
Frank Barchard46fb8072019-10-25 12:54:22 -07001546 .cn_stride(11)
1547 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1548 }
1549
Frank Barchard7c8e0c72019-11-17 00:02:36 -08001550 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001551 TEST_REQUIRES_ARM_NEON_FMA;
1552 for (uint32_t m = 1; m <= 4; m++) {
1553 for (uint32_t n = 1; n <= 8; n++) {
1554 GemmMicrokernelTester()
1555 .mr(4)
1556 .nr(8)
1557 .kr(1)
1558 .sr(1)
1559 .m(m)
1560 .n(n)
Frank Barchard7c8e0c72019-11-17 00:02:36 -08001561 .k(4)
Frank Barchard46fb8072019-10-25 12:54:22 -07001562 .iterations(1)
1563 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1564 }
1565 }
1566 }
1567
Frank Barchard7c8e0c72019-11-17 00:02:36 -08001568 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile_m) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001569 TEST_REQUIRES_ARM_NEON_FMA;
1570 for (uint32_t m = 1; m <= 4; m++) {
1571 GemmMicrokernelTester()
1572 .mr(4)
1573 .nr(8)
1574 .kr(1)
1575 .sr(1)
1576 .m(m)
1577 .n(8)
Frank Barchard7c8e0c72019-11-17 00:02:36 -08001578 .k(4)
Frank Barchard46fb8072019-10-25 12:54:22 -07001579 .iterations(1)
1580 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1581 }
1582 }
1583
Frank Barchard7c8e0c72019-11-17 00:02:36 -08001584 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile_n) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001585 TEST_REQUIRES_ARM_NEON_FMA;
1586 for (uint32_t n = 1; n <= 8; n++) {
1587 GemmMicrokernelTester()
1588 .mr(4)
1589 .nr(8)
1590 .kr(1)
1591 .sr(1)
1592 .m(4)
1593 .n(n)
Frank Barchard7c8e0c72019-11-17 00:02:36 -08001594 .k(4)
Frank Barchard46fb8072019-10-25 12:54:22 -07001595 .iterations(1)
1596 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1597 }
1598 }
1599
Frank Barchard7c8e0c72019-11-17 00:02:36 -08001600 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001601 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barchard7c8e0c72019-11-17 00:02:36 -08001602 GemmMicrokernelTester()
1603 .mr(4)
1604 .nr(8)
1605 .kr(1)
1606 .sr(1)
1607 .m(4)
1608 .n(8)
1609 .k(8)
1610 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1611 }
1612
1613 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_subtile) {
1614 TEST_REQUIRES_ARM_NEON_FMA;
1615 for (uint32_t m = 1; m <= 4; m++) {
1616 for (uint32_t n = 1; n <= 8; n++) {
1617 GemmMicrokernelTester()
1618 .mr(4)
1619 .nr(8)
1620 .kr(1)
1621 .sr(1)
1622 .m(m)
1623 .n(n)
1624 .k(8)
1625 .iterations(1)
1626 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1627 }
1628 }
1629 }
1630
1631 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_8) {
1632 TEST_REQUIRES_ARM_NEON_FMA;
1633 for (size_t k = 1; k < 8; k++) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001634 GemmMicrokernelTester()
1635 .mr(4)
1636 .nr(8)
1637 .kr(1)
1638 .sr(1)
1639 .m(4)
1640 .n(8)
1641 .k(k)
1642 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1643 }
1644 }
1645
Frank Barchard7c8e0c72019-11-17 00:02:36 -08001646 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_8_subtile) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001647 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barchard7c8e0c72019-11-17 00:02:36 -08001648 for (size_t k = 1; k < 8; k++) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001649 for (uint32_t m = 1; m <= 4; m++) {
1650 for (uint32_t n = 1; n <= 8; n++) {
1651 GemmMicrokernelTester()
1652 .mr(4)
1653 .nr(8)
1654 .kr(1)
1655 .sr(1)
1656 .m(m)
1657 .n(n)
1658 .k(k)
1659 .iterations(1)
1660 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1661 }
1662 }
1663 }
1664 }
1665
Frank Barchard7c8e0c72019-11-17 00:02:36 -08001666 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_8) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001667 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barchard7c8e0c72019-11-17 00:02:36 -08001668 for (size_t k = 9; k < 8; k++) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001669 GemmMicrokernelTester()
1670 .mr(4)
1671 .nr(8)
1672 .kr(1)
1673 .sr(1)
1674 .m(4)
1675 .n(8)
1676 .k(k)
1677 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1678 }
1679 }
1680
Frank Barchard7c8e0c72019-11-17 00:02:36 -08001681 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_4_subtile) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001682 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barchard7c8e0c72019-11-17 00:02:36 -08001683 for (size_t k = 9; k < 8; k++) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001684 for (uint32_t m = 1; m <= 4; m++) {
1685 for (uint32_t n = 1; n <= 8; n++) {
1686 GemmMicrokernelTester()
1687 .mr(4)
1688 .nr(8)
1689 .kr(1)
1690 .sr(1)
1691 .m(m)
1692 .n(n)
1693 .k(k)
1694 .iterations(1)
1695 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1696 }
1697 }
1698 }
1699 }
1700
Frank Barchard7c8e0c72019-11-17 00:02:36 -08001701 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_div_4) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001702 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barchard7c8e0c72019-11-17 00:02:36 -08001703 for (size_t k = 12; k <= 40; k += 4) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001704 GemmMicrokernelTester()
1705 .mr(4)
1706 .nr(8)
1707 .kr(1)
1708 .sr(1)
1709 .m(4)
1710 .n(8)
1711 .k(k)
1712 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1713 }
1714 }
1715
Frank Barchard7c8e0c72019-11-17 00:02:36 -08001716 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_div_4_subtile) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001717 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barchard7c8e0c72019-11-17 00:02:36 -08001718 for (size_t k = 12; k <= 40; k += 4) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001719 for (uint32_t m = 1; m <= 4; m++) {
1720 for (uint32_t n = 1; n <= 8; n++) {
1721 GemmMicrokernelTester()
1722 .mr(4)
1723 .nr(8)
1724 .kr(1)
1725 .sr(1)
1726 .m(m)
1727 .n(n)
1728 .k(k)
1729 .iterations(1)
1730 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1731 }
1732 }
1733 }
1734 }
1735
1736 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8) {
1737 TEST_REQUIRES_ARM_NEON_FMA;
1738 for (uint32_t n = 9; n < 16; n++) {
Frank Barchard7c8e0c72019-11-17 00:02:36 -08001739 for (size_t k = 1; k <= 20; k += 5) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001740 GemmMicrokernelTester()
1741 .mr(4)
1742 .nr(8)
1743 .kr(1)
1744 .sr(1)
1745 .m(4)
1746 .n(8)
1747 .k(k)
1748 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1749 }
1750 }
1751 }
1752
1753 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_strided_cn) {
1754 TEST_REQUIRES_ARM_NEON_FMA;
1755 for (uint32_t n = 9; n < 16; n++) {
Frank Barchard7c8e0c72019-11-17 00:02:36 -08001756 for (size_t k = 1; k <= 20; k += 5) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001757 GemmMicrokernelTester()
1758 .mr(4)
1759 .nr(8)
1760 .kr(1)
1761 .sr(1)
1762 .m(4)
1763 .n(8)
1764 .k(k)
1765 .cn_stride(11)
1766 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1767 }
1768 }
1769 }
1770
1771 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_subtile) {
1772 TEST_REQUIRES_ARM_NEON_FMA;
1773 for (uint32_t n = 9; n < 16; n++) {
Frank Barchard7c8e0c72019-11-17 00:02:36 -08001774 for (size_t k = 1; k <= 20; k += 5) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001775 for (uint32_t m = 1; m <= 4; m++) {
1776 GemmMicrokernelTester()
1777 .mr(4)
1778 .nr(8)
1779 .kr(1)
1780 .sr(1)
1781 .m(m)
1782 .n(n)
1783 .k(k)
1784 .iterations(1)
1785 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1786 }
1787 }
1788 }
1789 }
1790
1791 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8) {
1792 TEST_REQUIRES_ARM_NEON_FMA;
1793 for (uint32_t n = 16; n <= 24; n += 8) {
Frank Barchard7c8e0c72019-11-17 00:02:36 -08001794 for (size_t k = 1; k <= 20; k += 5) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001795 GemmMicrokernelTester()
1796 .mr(4)
1797 .nr(8)
1798 .kr(1)
1799 .sr(1)
1800 .m(4)
1801 .n(8)
1802 .k(k)
1803 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1804 }
1805 }
1806 }
1807
1808 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_strided_cn) {
1809 TEST_REQUIRES_ARM_NEON_FMA;
1810 for (uint32_t n = 16; n <= 24; n += 8) {
Frank Barchard7c8e0c72019-11-17 00:02:36 -08001811 for (size_t k = 1; k <= 20; k += 5) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001812 GemmMicrokernelTester()
1813 .mr(4)
1814 .nr(8)
1815 .kr(1)
1816 .sr(1)
1817 .m(4)
1818 .n(n)
1819 .k(k)
1820 .cn_stride(11)
1821 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1822 }
1823 }
1824 }
1825
1826 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_subtile) {
1827 TEST_REQUIRES_ARM_NEON_FMA;
1828 for (uint32_t n = 16; n <= 24; n += 8) {
Frank Barchard7c8e0c72019-11-17 00:02:36 -08001829 for (size_t k = 1; k <= 20; k += 5) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001830 for (uint32_t m = 1; m <= 4; m++) {
1831 GemmMicrokernelTester()
1832 .mr(4)
1833 .nr(8)
1834 .kr(1)
1835 .sr(1)
1836 .m(m)
1837 .n(n)
1838 .k(k)
1839 .iterations(1)
1840 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1841 }
1842 }
1843 }
1844 }
1845
1846 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, small_kernel) {
1847 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barchard7c8e0c72019-11-17 00:02:36 -08001848 for (size_t k = 1; k <= 20; k += 5) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001849 GemmMicrokernelTester()
1850 .mr(4)
1851 .nr(8)
1852 .kr(1)
1853 .sr(1)
1854 .m(4)
1855 .n(8)
1856 .k(k)
1857 .ks(3)
1858 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1859 }
1860 }
1861
1862 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, small_kernel_subtile) {
1863 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barchard7c8e0c72019-11-17 00:02:36 -08001864 for (size_t k = 1; k <= 20; k += 5) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001865 for (uint32_t m = 1; m <= 4; m++) {
1866 for (uint32_t n = 1; n <= 8; n++) {
1867 GemmMicrokernelTester()
1868 .mr(4)
1869 .nr(8)
1870 .kr(1)
1871 .sr(1)
1872 .m(m)
1873 .n(n)
1874 .k(k)
1875 .ks(3)
1876 .iterations(1)
1877 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1878 }
1879 }
1880 }
1881 }
1882
1883 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_small_kernel) {
1884 TEST_REQUIRES_ARM_NEON_FMA;
1885 for (uint32_t n = 9; n < 16; n++) {
Frank Barchard7c8e0c72019-11-17 00:02:36 -08001886 for (size_t k = 1; k <= 20; k += 5) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001887 GemmMicrokernelTester()
1888 .mr(4)
1889 .nr(8)
1890 .kr(1)
1891 .sr(1)
1892 .m(4)
1893 .n(8)
1894 .k(k)
1895 .ks(3)
1896 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1897 }
1898 }
1899 }
1900
1901 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_small_kernel) {
1902 TEST_REQUIRES_ARM_NEON_FMA;
1903 for (uint32_t n = 16; n <= 24; n += 8) {
Frank Barchard7c8e0c72019-11-17 00:02:36 -08001904 for (size_t k = 1; k <= 20; k += 5) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001905 GemmMicrokernelTester()
1906 .mr(4)
1907 .nr(8)
1908 .kr(1)
1909 .sr(1)
1910 .m(4)
1911 .n(8)
1912 .k(k)
1913 .ks(3)
1914 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1915 }
1916 }
1917 }
1918
1919 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, strided_cm_subtile) {
1920 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barchard7c8e0c72019-11-17 00:02:36 -08001921 for (size_t k = 1; k <= 20; k += 5) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001922 for (uint32_t m = 1; m <= 4; m++) {
1923 for (uint32_t n = 1; n <= 8; n++) {
1924 GemmMicrokernelTester()
1925 .mr(4)
1926 .nr(8)
1927 .kr(1)
1928 .sr(1)
1929 .m(m)
1930 .n(n)
1931 .k(k)
1932 .cm_stride(11)
1933 .iterations(1)
1934 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1935 }
1936 }
1937 }
1938 }
1939
1940 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, a_offset) {
1941 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barchard7c8e0c72019-11-17 00:02:36 -08001942 for (size_t k = 1; k <= 20; k += 5) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001943 GemmMicrokernelTester()
1944 .mr(4)
1945 .nr(8)
1946 .kr(1)
1947 .sr(1)
1948 .m(4)
1949 .n(8)
1950 .k(k)
1951 .ks(3)
Frank Barchard7c8e0c72019-11-17 00:02:36 -08001952 .a_offset(83)
Frank Barchard46fb8072019-10-25 12:54:22 -07001953 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1954 }
1955 }
1956
1957 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, zero) {
1958 TEST_REQUIRES_ARM_NEON_FMA;
1959 for (uint32_t mz = 0; mz < 4; mz++) {
Frank Barchard7c8e0c72019-11-17 00:02:36 -08001960 for (size_t k = 1; k <= 20; k += 5) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001961 GemmMicrokernelTester()
1962 .mr(4)
1963 .nr(8)
1964 .kr(1)
1965 .sr(1)
1966 .m(4)
1967 .n(8)
1968 .k(k)
1969 .ks(3)
Frank Barchard7c8e0c72019-11-17 00:02:36 -08001970 .a_offset(83)
Frank Barchard46fb8072019-10-25 12:54:22 -07001971 .zero_index(mz)
1972 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1973 }
1974 }
1975 }
1976
1977 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, qmin) {
1978 TEST_REQUIRES_ARM_NEON_FMA;
1979 GemmMicrokernelTester()
1980 .mr(4)
1981 .nr(8)
1982 .kr(1)
1983 .sr(1)
1984 .m(4)
1985 .n(8)
Frank Barchard7c8e0c72019-11-17 00:02:36 -08001986 .k(4)
Frank Barchard46fb8072019-10-25 12:54:22 -07001987 .qmin(128)
1988 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1989 }
1990
1991 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, qmax) {
1992 TEST_REQUIRES_ARM_NEON_FMA;
1993 GemmMicrokernelTester()
1994 .mr(4)
1995 .nr(8)
1996 .kr(1)
1997 .sr(1)
1998 .m(4)
1999 .n(8)
Frank Barchard7c8e0c72019-11-17 00:02:36 -08002000 .k(4)
Frank Barchard46fb8072019-10-25 12:54:22 -07002001 .qmax(128)
2002 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
2003 }
2004
2005 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, strided_cm) {
2006 TEST_REQUIRES_ARM_NEON_FMA;
2007 GemmMicrokernelTester()
2008 .mr(4)
2009 .nr(8)
2010 .kr(1)
2011 .sr(1)
2012 .m(4)
2013 .n(8)
Frank Barchard7c8e0c72019-11-17 00:02:36 -08002014 .k(4)
Frank Barchard46fb8072019-10-25 12:54:22 -07002015 .cm_stride(11)
2016 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
2017 }
2018#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
2019
2020
2021#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
Frank Barchard387c2d12019-12-16 19:14:07 -08002022 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8) {
2023 TEST_REQUIRES_ARM_NEON_FMA;
2024 GemmMicrokernelTester()
2025 .mr(4)
2026 .nr(8)
2027 .kr(1)
2028 .sr(1)
2029 .m(4)
2030 .n(8)
2031 .k(8)
2032 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2033 }
2034
2035 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, strided_cn) {
2036 TEST_REQUIRES_ARM_NEON_FMA;
2037 GemmMicrokernelTester()
2038 .mr(4)
2039 .nr(8)
2040 .kr(1)
2041 .sr(1)
2042 .m(4)
2043 .n(8)
2044 .k(8)
2045 .cn_stride(11)
2046 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2047 }
2048
2049 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_subtile) {
2050 TEST_REQUIRES_ARM_NEON_FMA;
2051 for (uint32_t m = 1; m <= 4; m++) {
2052 for (uint32_t n = 1; n <= 8; n++) {
2053 GemmMicrokernelTester()
2054 .mr(4)
2055 .nr(8)
2056 .kr(1)
2057 .sr(1)
2058 .m(m)
2059 .n(n)
2060 .k(8)
2061 .iterations(1)
2062 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2063 }
2064 }
2065 }
2066
2067 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_subtile_m) {
2068 TEST_REQUIRES_ARM_NEON_FMA;
2069 for (uint32_t m = 1; m <= 4; m++) {
2070 GemmMicrokernelTester()
2071 .mr(4)
2072 .nr(8)
2073 .kr(1)
2074 .sr(1)
2075 .m(m)
2076 .n(8)
2077 .k(8)
2078 .iterations(1)
2079 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2080 }
2081 }
2082
2083 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_subtile_n) {
2084 TEST_REQUIRES_ARM_NEON_FMA;
2085 for (uint32_t n = 1; n <= 8; n++) {
2086 GemmMicrokernelTester()
2087 .mr(4)
2088 .nr(8)
2089 .kr(1)
2090 .sr(1)
2091 .m(4)
2092 .n(n)
2093 .k(8)
2094 .iterations(1)
2095 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2096 }
2097 }
2098
2099 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_16) {
2100 TEST_REQUIRES_ARM_NEON_FMA;
2101 GemmMicrokernelTester()
2102 .mr(4)
2103 .nr(8)
2104 .kr(1)
2105 .sr(1)
2106 .m(4)
2107 .n(8)
2108 .k(16)
2109 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2110 }
2111
2112 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_16_subtile) {
2113 TEST_REQUIRES_ARM_NEON_FMA;
2114 for (uint32_t m = 1; m <= 4; m++) {
2115 for (uint32_t n = 1; n <= 8; n++) {
2116 GemmMicrokernelTester()
2117 .mr(4)
2118 .nr(8)
2119 .kr(1)
2120 .sr(1)
2121 .m(m)
2122 .n(n)
2123 .k(16)
2124 .iterations(1)
2125 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2126 }
2127 }
2128 }
2129
2130 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, k_lt_16) {
2131 TEST_REQUIRES_ARM_NEON_FMA;
2132 for (size_t k = 1; k < 16; k++) {
2133 GemmMicrokernelTester()
2134 .mr(4)
2135 .nr(8)
2136 .kr(1)
2137 .sr(1)
2138 .m(4)
2139 .n(8)
2140 .k(k)
2141 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2142 }
2143 }
2144
2145 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, k_lt_16_subtile) {
2146 TEST_REQUIRES_ARM_NEON_FMA;
2147 for (size_t k = 1; k < 16; k++) {
2148 for (uint32_t m = 1; m <= 4; m++) {
2149 for (uint32_t n = 1; n <= 8; n++) {
2150 GemmMicrokernelTester()
2151 .mr(4)
2152 .nr(8)
2153 .kr(1)
2154 .sr(1)
2155 .m(m)
2156 .n(n)
2157 .k(k)
2158 .iterations(1)
2159 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2160 }
2161 }
2162 }
2163 }
2164
2165 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, k_gt_16) {
2166 TEST_REQUIRES_ARM_NEON_FMA;
2167 for (size_t k = 17; k < 16; k++) {
2168 GemmMicrokernelTester()
2169 .mr(4)
2170 .nr(8)
2171 .kr(1)
2172 .sr(1)
2173 .m(4)
2174 .n(8)
2175 .k(k)
2176 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2177 }
2178 }
2179
2180 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, k_gt_8_subtile) {
2181 TEST_REQUIRES_ARM_NEON_FMA;
2182 for (size_t k = 17; k < 16; k++) {
2183 for (uint32_t m = 1; m <= 4; m++) {
2184 for (uint32_t n = 1; n <= 8; n++) {
2185 GemmMicrokernelTester()
2186 .mr(4)
2187 .nr(8)
2188 .kr(1)
2189 .sr(1)
2190 .m(m)
2191 .n(n)
2192 .k(k)
2193 .iterations(1)
2194 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2195 }
2196 }
2197 }
2198 }
2199
2200 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, k_div_8) {
2201 TEST_REQUIRES_ARM_NEON_FMA;
2202 for (size_t k = 24; k <= 80; k += 8) {
2203 GemmMicrokernelTester()
2204 .mr(4)
2205 .nr(8)
2206 .kr(1)
2207 .sr(1)
2208 .m(4)
2209 .n(8)
2210 .k(k)
2211 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2212 }
2213 }
2214
2215 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, k_div_8_subtile) {
2216 TEST_REQUIRES_ARM_NEON_FMA;
2217 for (size_t k = 24; k <= 80; k += 8) {
2218 for (uint32_t m = 1; m <= 4; m++) {
2219 for (uint32_t n = 1; n <= 8; n++) {
2220 GemmMicrokernelTester()
2221 .mr(4)
2222 .nr(8)
2223 .kr(1)
2224 .sr(1)
2225 .m(m)
2226 .n(n)
2227 .k(k)
2228 .iterations(1)
2229 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2230 }
2231 }
2232 }
2233 }
2234
2235 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8) {
2236 TEST_REQUIRES_ARM_NEON_FMA;
2237 for (uint32_t n = 9; n < 16; n++) {
2238 for (size_t k = 1; k <= 40; k += 9) {
2239 GemmMicrokernelTester()
2240 .mr(4)
2241 .nr(8)
2242 .kr(1)
2243 .sr(1)
2244 .m(4)
2245 .n(8)
2246 .k(k)
2247 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2248 }
2249 }
2250 }
2251
2252 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_strided_cn) {
2253 TEST_REQUIRES_ARM_NEON_FMA;
2254 for (uint32_t n = 9; n < 16; n++) {
2255 for (size_t k = 1; k <= 40; k += 9) {
2256 GemmMicrokernelTester()
2257 .mr(4)
2258 .nr(8)
2259 .kr(1)
2260 .sr(1)
2261 .m(4)
2262 .n(8)
2263 .k(k)
2264 .cn_stride(11)
2265 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2266 }
2267 }
2268 }
2269
2270 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_subtile) {
2271 TEST_REQUIRES_ARM_NEON_FMA;
2272 for (uint32_t n = 9; n < 16; n++) {
2273 for (size_t k = 1; k <= 40; k += 9) {
2274 for (uint32_t m = 1; m <= 4; m++) {
2275 GemmMicrokernelTester()
2276 .mr(4)
2277 .nr(8)
2278 .kr(1)
2279 .sr(1)
2280 .m(m)
2281 .n(n)
2282 .k(k)
2283 .iterations(1)
2284 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2285 }
2286 }
2287 }
2288 }
2289
2290 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8) {
2291 TEST_REQUIRES_ARM_NEON_FMA;
2292 for (uint32_t n = 16; n <= 24; n += 8) {
2293 for (size_t k = 1; k <= 40; k += 9) {
2294 GemmMicrokernelTester()
2295 .mr(4)
2296 .nr(8)
2297 .kr(1)
2298 .sr(1)
2299 .m(4)
2300 .n(8)
2301 .k(k)
2302 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2303 }
2304 }
2305 }
2306
2307 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_strided_cn) {
2308 TEST_REQUIRES_ARM_NEON_FMA;
2309 for (uint32_t n = 16; n <= 24; n += 8) {
2310 for (size_t k = 1; k <= 40; k += 9) {
2311 GemmMicrokernelTester()
2312 .mr(4)
2313 .nr(8)
2314 .kr(1)
2315 .sr(1)
2316 .m(4)
2317 .n(n)
2318 .k(k)
2319 .cn_stride(11)
2320 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2321 }
2322 }
2323 }
2324
2325 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_subtile) {
2326 TEST_REQUIRES_ARM_NEON_FMA;
2327 for (uint32_t n = 16; n <= 24; n += 8) {
2328 for (size_t k = 1; k <= 40; k += 9) {
2329 for (uint32_t m = 1; m <= 4; m++) {
2330 GemmMicrokernelTester()
2331 .mr(4)
2332 .nr(8)
2333 .kr(1)
2334 .sr(1)
2335 .m(m)
2336 .n(n)
2337 .k(k)
2338 .iterations(1)
2339 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2340 }
2341 }
2342 }
2343 }
2344
2345 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, small_kernel) {
2346 TEST_REQUIRES_ARM_NEON_FMA;
2347 for (size_t k = 1; k <= 40; k += 9) {
2348 GemmMicrokernelTester()
2349 .mr(4)
2350 .nr(8)
2351 .kr(1)
2352 .sr(1)
2353 .m(4)
2354 .n(8)
2355 .k(k)
2356 .ks(3)
2357 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2358 }
2359 }
2360
2361 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, small_kernel_subtile) {
2362 TEST_REQUIRES_ARM_NEON_FMA;
2363 for (size_t k = 1; k <= 40; k += 9) {
2364 for (uint32_t m = 1; m <= 4; m++) {
2365 for (uint32_t n = 1; n <= 8; n++) {
2366 GemmMicrokernelTester()
2367 .mr(4)
2368 .nr(8)
2369 .kr(1)
2370 .sr(1)
2371 .m(m)
2372 .n(n)
2373 .k(k)
2374 .ks(3)
2375 .iterations(1)
2376 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2377 }
2378 }
2379 }
2380 }
2381
2382 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_small_kernel) {
2383 TEST_REQUIRES_ARM_NEON_FMA;
2384 for (uint32_t n = 9; n < 16; n++) {
2385 for (size_t k = 1; k <= 40; k += 9) {
2386 GemmMicrokernelTester()
2387 .mr(4)
2388 .nr(8)
2389 .kr(1)
2390 .sr(1)
2391 .m(4)
2392 .n(8)
2393 .k(k)
2394 .ks(3)
2395 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2396 }
2397 }
2398 }
2399
2400 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_small_kernel) {
2401 TEST_REQUIRES_ARM_NEON_FMA;
2402 for (uint32_t n = 16; n <= 24; n += 8) {
2403 for (size_t k = 1; k <= 40; k += 9) {
2404 GemmMicrokernelTester()
2405 .mr(4)
2406 .nr(8)
2407 .kr(1)
2408 .sr(1)
2409 .m(4)
2410 .n(8)
2411 .k(k)
2412 .ks(3)
2413 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2414 }
2415 }
2416 }
2417
2418 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, strided_cm_subtile) {
2419 TEST_REQUIRES_ARM_NEON_FMA;
2420 for (size_t k = 1; k <= 40; k += 9) {
2421 for (uint32_t m = 1; m <= 4; m++) {
2422 for (uint32_t n = 1; n <= 8; n++) {
2423 GemmMicrokernelTester()
2424 .mr(4)
2425 .nr(8)
2426 .kr(1)
2427 .sr(1)
2428 .m(m)
2429 .n(n)
2430 .k(k)
2431 .cm_stride(11)
2432 .iterations(1)
2433 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2434 }
2435 }
2436 }
2437 }
2438
2439 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, a_offset) {
2440 TEST_REQUIRES_ARM_NEON_FMA;
2441 for (size_t k = 1; k <= 40; k += 9) {
2442 GemmMicrokernelTester()
2443 .mr(4)
2444 .nr(8)
2445 .kr(1)
2446 .sr(1)
2447 .m(4)
2448 .n(8)
2449 .k(k)
2450 .ks(3)
2451 .a_offset(163)
2452 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2453 }
2454 }
2455
2456 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, zero) {
2457 TEST_REQUIRES_ARM_NEON_FMA;
2458 for (uint32_t mz = 0; mz < 4; mz++) {
2459 for (size_t k = 1; k <= 40; k += 9) {
2460 GemmMicrokernelTester()
2461 .mr(4)
2462 .nr(8)
2463 .kr(1)
2464 .sr(1)
2465 .m(4)
2466 .n(8)
2467 .k(k)
2468 .ks(3)
2469 .a_offset(163)
2470 .zero_index(mz)
2471 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2472 }
2473 }
2474 }
2475
2476 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, qmin) {
2477 TEST_REQUIRES_ARM_NEON_FMA;
2478 GemmMicrokernelTester()
2479 .mr(4)
2480 .nr(8)
2481 .kr(1)
2482 .sr(1)
2483 .m(4)
2484 .n(8)
2485 .k(8)
2486 .qmin(128)
2487 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2488 }
2489
2490 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, qmax) {
2491 TEST_REQUIRES_ARM_NEON_FMA;
2492 GemmMicrokernelTester()
2493 .mr(4)
2494 .nr(8)
2495 .kr(1)
2496 .sr(1)
2497 .m(4)
2498 .n(8)
2499 .k(8)
2500 .qmax(128)
2501 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2502 }
2503
2504 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, strided_cm) {
2505 TEST_REQUIRES_ARM_NEON_FMA;
2506 GemmMicrokernelTester()
2507 .mr(4)
2508 .nr(8)
2509 .kr(1)
2510 .sr(1)
2511 .m(4)
2512 .n(8)
2513 .k(8)
2514 .cm_stride(11)
2515 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2516 }
2517#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
2518
2519
2520#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07002521 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8) {
2522 TEST_REQUIRES_ARM_NEON_FMA;
2523 GemmMicrokernelTester()
2524 .mr(4)
2525 .nr(8)
2526 .kr(1)
2527 .sr(1)
2528 .m(4)
2529 .n(8)
2530 .k(8)
2531 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
2532 }
2533
2534 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, strided_cn) {
2535 TEST_REQUIRES_ARM_NEON_FMA;
2536 GemmMicrokernelTester()
2537 .mr(4)
2538 .nr(8)
2539 .kr(1)
2540 .sr(1)
2541 .m(4)
2542 .n(8)
2543 .k(8)
2544 .cn_stride(11)
2545 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
2546 }
2547
2548 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile) {
2549 TEST_REQUIRES_ARM_NEON_FMA;
2550 for (uint32_t m = 1; m <= 4; m++) {
2551 for (uint32_t n = 1; n <= 8; n++) {
2552 GemmMicrokernelTester()
2553 .mr(4)
2554 .nr(8)
2555 .kr(1)
2556 .sr(1)
2557 .m(m)
2558 .n(n)
2559 .k(8)
2560 .iterations(1)
2561 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
2562 }
2563 }
2564 }
2565
2566 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile_m) {
2567 TEST_REQUIRES_ARM_NEON_FMA;
2568 for (uint32_t m = 1; m <= 4; m++) {
2569 GemmMicrokernelTester()
2570 .mr(4)
2571 .nr(8)
2572 .kr(1)
2573 .sr(1)
2574 .m(m)
2575 .n(8)
2576 .k(8)
2577 .iterations(1)
2578 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
2579 }
2580 }
2581
2582 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile_n) {
2583 TEST_REQUIRES_ARM_NEON_FMA;
2584 for (uint32_t n = 1; n <= 8; n++) {
2585 GemmMicrokernelTester()
2586 .mr(4)
2587 .nr(8)
2588 .kr(1)
2589 .sr(1)
2590 .m(4)
2591 .n(n)
2592 .k(8)
2593 .iterations(1)
2594 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
2595 }
2596 }
2597
2598 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16) {
2599 TEST_REQUIRES_ARM_NEON_FMA;
2600 GemmMicrokernelTester()
2601 .mr(4)
2602 .nr(8)
2603 .kr(1)
2604 .sr(1)
2605 .m(4)
2606 .n(8)
2607 .k(16)
2608 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
2609 }
2610
2611 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16_subtile) {
2612 TEST_REQUIRES_ARM_NEON_FMA;
2613 for (uint32_t m = 1; m <= 4; m++) {
2614 for (uint32_t n = 1; n <= 8; n++) {
2615 GemmMicrokernelTester()
2616 .mr(4)
2617 .nr(8)
2618 .kr(1)
2619 .sr(1)
2620 .m(m)
2621 .n(n)
2622 .k(16)
2623 .iterations(1)
2624 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
2625 }
2626 }
2627 }
2628
2629 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16) {
2630 TEST_REQUIRES_ARM_NEON_FMA;
2631 for (size_t k = 1; k < 16; k++) {
2632 GemmMicrokernelTester()
2633 .mr(4)
2634 .nr(8)
2635 .kr(1)
2636 .sr(1)
2637 .m(4)
2638 .n(8)
2639 .k(k)
2640 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
2641 }
2642 }
2643
2644 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16_subtile) {
2645 TEST_REQUIRES_ARM_NEON_FMA;
2646 for (size_t k = 1; k < 16; k++) {
2647 for (uint32_t m = 1; m <= 4; m++) {
2648 for (uint32_t n = 1; n <= 8; n++) {
2649 GemmMicrokernelTester()
2650 .mr(4)
2651 .nr(8)
2652 .kr(1)
2653 .sr(1)
2654 .m(m)
2655 .n(n)
2656 .k(k)
2657 .iterations(1)
2658 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
2659 }
2660 }
2661 }
2662 }
2663
2664 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_16) {
2665 TEST_REQUIRES_ARM_NEON_FMA;
2666 for (size_t k = 17; k < 16; k++) {
2667 GemmMicrokernelTester()
2668 .mr(4)
2669 .nr(8)
2670 .kr(1)
2671 .sr(1)
2672 .m(4)
2673 .n(8)
2674 .k(k)
2675 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
2676 }
2677 }
2678
2679 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_8_subtile) {
2680 TEST_REQUIRES_ARM_NEON_FMA;
2681 for (size_t k = 17; k < 16; k++) {
2682 for (uint32_t m = 1; m <= 4; m++) {
2683 for (uint32_t n = 1; n <= 8; n++) {
2684 GemmMicrokernelTester()
2685 .mr(4)
2686 .nr(8)
2687 .kr(1)
2688 .sr(1)
2689 .m(m)
2690 .n(n)
2691 .k(k)
2692 .iterations(1)
2693 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
2694 }
2695 }
2696 }
2697 }
2698
2699 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8) {
2700 TEST_REQUIRES_ARM_NEON_FMA;
2701 for (size_t k = 24; k <= 80; k += 8) {
2702 GemmMicrokernelTester()
2703 .mr(4)
2704 .nr(8)
2705 .kr(1)
2706 .sr(1)
2707 .m(4)
2708 .n(8)
2709 .k(k)
2710 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
2711 }
2712 }
2713
2714 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8_subtile) {
2715 TEST_REQUIRES_ARM_NEON_FMA;
2716 for (size_t k = 24; k <= 80; k += 8) {
2717 for (uint32_t m = 1; m <= 4; m++) {
2718 for (uint32_t n = 1; n <= 8; n++) {
2719 GemmMicrokernelTester()
2720 .mr(4)
2721 .nr(8)
2722 .kr(1)
2723 .sr(1)
2724 .m(m)
2725 .n(n)
2726 .k(k)
2727 .iterations(1)
2728 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
2729 }
2730 }
2731 }
2732 }
2733
2734 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8) {
2735 TEST_REQUIRES_ARM_NEON_FMA;
2736 for (uint32_t n = 9; n < 16; n++) {
2737 for (size_t k = 1; k <= 40; k += 9) {
2738 GemmMicrokernelTester()
2739 .mr(4)
2740 .nr(8)
2741 .kr(1)
2742 .sr(1)
2743 .m(4)
2744 .n(8)
2745 .k(k)
2746 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
2747 }
2748 }
2749 }
2750
2751 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_strided_cn) {
2752 TEST_REQUIRES_ARM_NEON_FMA;
2753 for (uint32_t n = 9; n < 16; n++) {
2754 for (size_t k = 1; k <= 40; k += 9) {
2755 GemmMicrokernelTester()
2756 .mr(4)
2757 .nr(8)
2758 .kr(1)
2759 .sr(1)
2760 .m(4)
2761 .n(8)
2762 .k(k)
2763 .cn_stride(11)
2764 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
2765 }
2766 }
2767 }
2768
2769 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_subtile) {
2770 TEST_REQUIRES_ARM_NEON_FMA;
2771 for (uint32_t n = 9; n < 16; n++) {
2772 for (size_t k = 1; k <= 40; k += 9) {
2773 for (uint32_t m = 1; m <= 4; m++) {
2774 GemmMicrokernelTester()
2775 .mr(4)
2776 .nr(8)
2777 .kr(1)
2778 .sr(1)
2779 .m(m)
2780 .n(n)
2781 .k(k)
2782 .iterations(1)
2783 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
2784 }
2785 }
2786 }
2787 }
2788
2789 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8) {
2790 TEST_REQUIRES_ARM_NEON_FMA;
2791 for (uint32_t n = 16; n <= 24; n += 8) {
2792 for (size_t k = 1; k <= 40; k += 9) {
2793 GemmMicrokernelTester()
2794 .mr(4)
2795 .nr(8)
2796 .kr(1)
2797 .sr(1)
2798 .m(4)
2799 .n(8)
2800 .k(k)
2801 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
2802 }
2803 }
2804 }
2805
2806 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_strided_cn) {
2807 TEST_REQUIRES_ARM_NEON_FMA;
2808 for (uint32_t n = 16; n <= 24; n += 8) {
2809 for (size_t k = 1; k <= 40; k += 9) {
2810 GemmMicrokernelTester()
2811 .mr(4)
2812 .nr(8)
2813 .kr(1)
2814 .sr(1)
2815 .m(4)
2816 .n(n)
2817 .k(k)
2818 .cn_stride(11)
2819 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
2820 }
2821 }
2822 }
2823
2824 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_subtile) {
2825 TEST_REQUIRES_ARM_NEON_FMA;
2826 for (uint32_t n = 16; n <= 24; n += 8) {
2827 for (size_t k = 1; k <= 40; k += 9) {
2828 for (uint32_t m = 1; m <= 4; m++) {
2829 GemmMicrokernelTester()
2830 .mr(4)
2831 .nr(8)
2832 .kr(1)
2833 .sr(1)
2834 .m(m)
2835 .n(n)
2836 .k(k)
2837 .iterations(1)
2838 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
2839 }
2840 }
2841 }
2842 }
2843
2844 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, small_kernel) {
2845 TEST_REQUIRES_ARM_NEON_FMA;
2846 for (size_t k = 1; k <= 40; k += 9) {
2847 GemmMicrokernelTester()
2848 .mr(4)
2849 .nr(8)
2850 .kr(1)
2851 .sr(1)
2852 .m(4)
2853 .n(8)
2854 .k(k)
2855 .ks(3)
2856 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
2857 }
2858 }
2859
2860 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, small_kernel_subtile) {
2861 TEST_REQUIRES_ARM_NEON_FMA;
2862 for (size_t k = 1; k <= 40; k += 9) {
2863 for (uint32_t m = 1; m <= 4; m++) {
2864 for (uint32_t n = 1; n <= 8; n++) {
2865 GemmMicrokernelTester()
2866 .mr(4)
2867 .nr(8)
2868 .kr(1)
2869 .sr(1)
2870 .m(m)
2871 .n(n)
2872 .k(k)
2873 .ks(3)
2874 .iterations(1)
2875 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
2876 }
2877 }
2878 }
2879 }
2880
2881 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_small_kernel) {
2882 TEST_REQUIRES_ARM_NEON_FMA;
2883 for (uint32_t n = 9; n < 16; n++) {
2884 for (size_t k = 1; k <= 40; k += 9) {
2885 GemmMicrokernelTester()
2886 .mr(4)
2887 .nr(8)
2888 .kr(1)
2889 .sr(1)
2890 .m(4)
2891 .n(8)
2892 .k(k)
2893 .ks(3)
2894 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
2895 }
2896 }
2897 }
2898
2899 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_small_kernel) {
2900 TEST_REQUIRES_ARM_NEON_FMA;
2901 for (uint32_t n = 16; n <= 24; n += 8) {
2902 for (size_t k = 1; k <= 40; k += 9) {
2903 GemmMicrokernelTester()
2904 .mr(4)
2905 .nr(8)
2906 .kr(1)
2907 .sr(1)
2908 .m(4)
2909 .n(8)
2910 .k(k)
2911 .ks(3)
2912 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
2913 }
2914 }
2915 }
2916
2917 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, strided_cm_subtile) {
2918 TEST_REQUIRES_ARM_NEON_FMA;
2919 for (size_t k = 1; k <= 40; k += 9) {
2920 for (uint32_t m = 1; m <= 4; m++) {
2921 for (uint32_t n = 1; n <= 8; n++) {
2922 GemmMicrokernelTester()
2923 .mr(4)
2924 .nr(8)
2925 .kr(1)
2926 .sr(1)
2927 .m(m)
2928 .n(n)
2929 .k(k)
2930 .cm_stride(11)
2931 .iterations(1)
2932 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
2933 }
2934 }
2935 }
2936 }
2937
2938 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, a_offset) {
2939 TEST_REQUIRES_ARM_NEON_FMA;
2940 for (size_t k = 1; k <= 40; k += 9) {
2941 GemmMicrokernelTester()
2942 .mr(4)
2943 .nr(8)
2944 .kr(1)
2945 .sr(1)
2946 .m(4)
2947 .n(8)
2948 .k(k)
2949 .ks(3)
2950 .a_offset(163)
2951 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
2952 }
2953 }
2954
2955 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, zero) {
2956 TEST_REQUIRES_ARM_NEON_FMA;
2957 for (uint32_t mz = 0; mz < 4; mz++) {
2958 for (size_t k = 1; k <= 40; k += 9) {
2959 GemmMicrokernelTester()
2960 .mr(4)
2961 .nr(8)
2962 .kr(1)
2963 .sr(1)
2964 .m(4)
2965 .n(8)
2966 .k(k)
2967 .ks(3)
2968 .a_offset(163)
2969 .zero_index(mz)
2970 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
2971 }
2972 }
2973 }
2974
2975 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, qmin) {
2976 TEST_REQUIRES_ARM_NEON_FMA;
2977 GemmMicrokernelTester()
2978 .mr(4)
2979 .nr(8)
2980 .kr(1)
2981 .sr(1)
2982 .m(4)
2983 .n(8)
2984 .k(8)
2985 .qmin(128)
2986 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
2987 }
2988
2989 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, qmax) {
2990 TEST_REQUIRES_ARM_NEON_FMA;
2991 GemmMicrokernelTester()
2992 .mr(4)
2993 .nr(8)
2994 .kr(1)
2995 .sr(1)
2996 .m(4)
2997 .n(8)
2998 .k(8)
2999 .qmax(128)
3000 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3001 }
3002
3003 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, strided_cm) {
3004 TEST_REQUIRES_ARM_NEON_FMA;
3005 GemmMicrokernelTester()
3006 .mr(4)
3007 .nr(8)
3008 .kr(1)
3009 .sr(1)
3010 .m(4)
3011 .n(8)
3012 .k(8)
3013 .cm_stride(11)
3014 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3015 }
Frank Barchard7e955972019-10-11 10:34:25 -07003016#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07003017
3018
Miao Wang3fa1f012020-02-17 22:45:06 +00003019#if XNN_ARCH_ARM
3020 TEST(F32_IGEMM_4X8__AARCH32_NEON_LD64, k_eq_2) {
3021 TEST_REQUIRES_ARM_NEON;
3022 GemmMicrokernelTester()
3023 .mr(4)
3024 .nr(8)
3025 .kr(1)
3026 .sr(1)
3027 .m(4)
3028 .n(8)
3029 .k(2)
3030 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_ld64);
3031 }
3032
3033 TEST(F32_IGEMM_4X8__AARCH32_NEON_LD64, strided_cn) {
3034 TEST_REQUIRES_ARM_NEON;
3035 GemmMicrokernelTester()
3036 .mr(4)
3037 .nr(8)
3038 .kr(1)
3039 .sr(1)
3040 .m(4)
3041 .n(8)
3042 .k(2)
3043 .cn_stride(11)
3044 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_ld64);
3045 }
3046
3047 TEST(F32_IGEMM_4X8__AARCH32_NEON_LD64, k_eq_2_subtile) {
3048 TEST_REQUIRES_ARM_NEON;
3049 for (uint32_t m = 1; m <= 4; m++) {
3050 for (uint32_t n = 1; n <= 8; n++) {
3051 GemmMicrokernelTester()
3052 .mr(4)
3053 .nr(8)
3054 .kr(1)
3055 .sr(1)
3056 .m(m)
3057 .n(n)
3058 .k(2)
3059 .iterations(1)
3060 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_ld64);
3061 }
3062 }
3063 }
3064
3065 TEST(F32_IGEMM_4X8__AARCH32_NEON_LD64, k_eq_2_subtile_m) {
3066 TEST_REQUIRES_ARM_NEON;
3067 for (uint32_t m = 1; m <= 4; m++) {
3068 GemmMicrokernelTester()
3069 .mr(4)
3070 .nr(8)
3071 .kr(1)
3072 .sr(1)
3073 .m(m)
3074 .n(8)
3075 .k(2)
3076 .iterations(1)
3077 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_ld64);
3078 }
3079 }
3080
3081 TEST(F32_IGEMM_4X8__AARCH32_NEON_LD64, k_eq_2_subtile_n) {
3082 TEST_REQUIRES_ARM_NEON;
3083 for (uint32_t n = 1; n <= 8; n++) {
3084 GemmMicrokernelTester()
3085 .mr(4)
3086 .nr(8)
3087 .kr(1)
3088 .sr(1)
3089 .m(4)
3090 .n(n)
3091 .k(2)
3092 .iterations(1)
3093 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_ld64);
3094 }
3095 }
3096
3097 TEST(F32_IGEMM_4X8__AARCH32_NEON_LD64, k_lt_2) {
3098 TEST_REQUIRES_ARM_NEON;
3099 for (size_t k = 1; k < 2; k++) {
3100 GemmMicrokernelTester()
3101 .mr(4)
3102 .nr(8)
3103 .kr(1)
3104 .sr(1)
3105 .m(4)
3106 .n(8)
3107 .k(k)
3108 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_ld64);
3109 }
3110 }
3111
3112 TEST(F32_IGEMM_4X8__AARCH32_NEON_LD64, k_lt_2_subtile) {
3113 TEST_REQUIRES_ARM_NEON;
3114 for (size_t k = 1; k < 2; k++) {
3115 for (uint32_t m = 1; m <= 4; m++) {
3116 for (uint32_t n = 1; n <= 8; n++) {
3117 GemmMicrokernelTester()
3118 .mr(4)
3119 .nr(8)
3120 .kr(1)
3121 .sr(1)
3122 .m(m)
3123 .n(n)
3124 .k(k)
3125 .iterations(1)
3126 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_ld64);
3127 }
3128 }
3129 }
3130 }
3131
3132 TEST(F32_IGEMM_4X8__AARCH32_NEON_LD64, k_gt_2) {
3133 TEST_REQUIRES_ARM_NEON;
3134 for (size_t k = 3; k < 4; k++) {
3135 GemmMicrokernelTester()
3136 .mr(4)
3137 .nr(8)
3138 .kr(1)
3139 .sr(1)
3140 .m(4)
3141 .n(8)
3142 .k(k)
3143 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_ld64);
3144 }
3145 }
3146
3147 TEST(F32_IGEMM_4X8__AARCH32_NEON_LD64, k_gt_2_subtile) {
3148 TEST_REQUIRES_ARM_NEON;
3149 for (size_t k = 3; k < 4; k++) {
3150 for (uint32_t m = 1; m <= 4; m++) {
3151 for (uint32_t n = 1; n <= 8; n++) {
3152 GemmMicrokernelTester()
3153 .mr(4)
3154 .nr(8)
3155 .kr(1)
3156 .sr(1)
3157 .m(m)
3158 .n(n)
3159 .k(k)
3160 .iterations(1)
3161 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_ld64);
3162 }
3163 }
3164 }
3165 }
3166
3167 TEST(F32_IGEMM_4X8__AARCH32_NEON_LD64, k_div_2) {
3168 TEST_REQUIRES_ARM_NEON;
3169 for (size_t k = 4; k <= 20; k += 2) {
3170 GemmMicrokernelTester()
3171 .mr(4)
3172 .nr(8)
3173 .kr(1)
3174 .sr(1)
3175 .m(4)
3176 .n(8)
3177 .k(k)
3178 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_ld64);
3179 }
3180 }
3181
3182 TEST(F32_IGEMM_4X8__AARCH32_NEON_LD64, k_div_2_subtile) {
3183 TEST_REQUIRES_ARM_NEON;
3184 for (size_t k = 4; k <= 20; k += 2) {
3185 for (uint32_t m = 1; m <= 4; m++) {
3186 for (uint32_t n = 1; n <= 8; n++) {
3187 GemmMicrokernelTester()
3188 .mr(4)
3189 .nr(8)
3190 .kr(1)
3191 .sr(1)
3192 .m(m)
3193 .n(n)
3194 .k(k)
3195 .iterations(1)
3196 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_ld64);
3197 }
3198 }
3199 }
3200 }
3201
3202 TEST(F32_IGEMM_4X8__AARCH32_NEON_LD64, n_gt_8) {
3203 TEST_REQUIRES_ARM_NEON;
3204 for (uint32_t n = 9; n < 16; n++) {
3205 for (size_t k = 1; k <= 10; k += 3) {
3206 GemmMicrokernelTester()
3207 .mr(4)
3208 .nr(8)
3209 .kr(1)
3210 .sr(1)
3211 .m(4)
3212 .n(8)
3213 .k(k)
3214 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_ld64);
3215 }
3216 }
3217 }
3218
3219 TEST(F32_IGEMM_4X8__AARCH32_NEON_LD64, n_gt_8_strided_cn) {
3220 TEST_REQUIRES_ARM_NEON;
3221 for (uint32_t n = 9; n < 16; n++) {
3222 for (size_t k = 1; k <= 10; k += 3) {
3223 GemmMicrokernelTester()
3224 .mr(4)
3225 .nr(8)
3226 .kr(1)
3227 .sr(1)
3228 .m(4)
3229 .n(8)
3230 .k(k)
3231 .cn_stride(11)
3232 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_ld64);
3233 }
3234 }
3235 }
3236
3237 TEST(F32_IGEMM_4X8__AARCH32_NEON_LD64, n_gt_8_subtile) {
3238 TEST_REQUIRES_ARM_NEON;
3239 for (uint32_t n = 9; n < 16; n++) {
3240 for (size_t k = 1; k <= 10; k += 3) {
3241 for (uint32_t m = 1; m <= 4; m++) {
3242 GemmMicrokernelTester()
3243 .mr(4)
3244 .nr(8)
3245 .kr(1)
3246 .sr(1)
3247 .m(m)
3248 .n(n)
3249 .k(k)
3250 .iterations(1)
3251 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_ld64);
3252 }
3253 }
3254 }
3255 }
3256
3257 TEST(F32_IGEMM_4X8__AARCH32_NEON_LD64, n_div_8) {
3258 TEST_REQUIRES_ARM_NEON;
3259 for (uint32_t n = 16; n <= 24; n += 8) {
3260 for (size_t k = 1; k <= 10; k += 3) {
3261 GemmMicrokernelTester()
3262 .mr(4)
3263 .nr(8)
3264 .kr(1)
3265 .sr(1)
3266 .m(4)
3267 .n(8)
3268 .k(k)
3269 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_ld64);
3270 }
3271 }
3272 }
3273
3274 TEST(F32_IGEMM_4X8__AARCH32_NEON_LD64, n_div_8_strided_cn) {
3275 TEST_REQUIRES_ARM_NEON;
3276 for (uint32_t n = 16; n <= 24; n += 8) {
3277 for (size_t k = 1; k <= 10; k += 3) {
3278 GemmMicrokernelTester()
3279 .mr(4)
3280 .nr(8)
3281 .kr(1)
3282 .sr(1)
3283 .m(4)
3284 .n(n)
3285 .k(k)
3286 .cn_stride(11)
3287 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_ld64);
3288 }
3289 }
3290 }
3291
3292 TEST(F32_IGEMM_4X8__AARCH32_NEON_LD64, n_div_8_subtile) {
3293 TEST_REQUIRES_ARM_NEON;
3294 for (uint32_t n = 16; n <= 24; n += 8) {
3295 for (size_t k = 1; k <= 10; k += 3) {
3296 for (uint32_t m = 1; m <= 4; m++) {
3297 GemmMicrokernelTester()
3298 .mr(4)
3299 .nr(8)
3300 .kr(1)
3301 .sr(1)
3302 .m(m)
3303 .n(n)
3304 .k(k)
3305 .iterations(1)
3306 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_ld64);
3307 }
3308 }
3309 }
3310 }
3311
3312 TEST(F32_IGEMM_4X8__AARCH32_NEON_LD64, small_kernel) {
3313 TEST_REQUIRES_ARM_NEON;
3314 for (size_t k = 1; k <= 10; k += 3) {
3315 GemmMicrokernelTester()
3316 .mr(4)
3317 .nr(8)
3318 .kr(1)
3319 .sr(1)
3320 .m(4)
3321 .n(8)
3322 .k(k)
3323 .ks(3)
3324 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_ld64);
3325 }
3326 }
3327
3328 TEST(F32_IGEMM_4X8__AARCH32_NEON_LD64, small_kernel_subtile) {
3329 TEST_REQUIRES_ARM_NEON;
3330 for (size_t k = 1; k <= 10; k += 3) {
3331 for (uint32_t m = 1; m <= 4; m++) {
3332 for (uint32_t n = 1; n <= 8; n++) {
3333 GemmMicrokernelTester()
3334 .mr(4)
3335 .nr(8)
3336 .kr(1)
3337 .sr(1)
3338 .m(m)
3339 .n(n)
3340 .k(k)
3341 .ks(3)
3342 .iterations(1)
3343 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_ld64);
3344 }
3345 }
3346 }
3347 }
3348
3349 TEST(F32_IGEMM_4X8__AARCH32_NEON_LD64, n_gt_8_small_kernel) {
3350 TEST_REQUIRES_ARM_NEON;
3351 for (uint32_t n = 9; n < 16; n++) {
3352 for (size_t k = 1; k <= 10; k += 3) {
3353 GemmMicrokernelTester()
3354 .mr(4)
3355 .nr(8)
3356 .kr(1)
3357 .sr(1)
3358 .m(4)
3359 .n(8)
3360 .k(k)
3361 .ks(3)
3362 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_ld64);
3363 }
3364 }
3365 }
3366
3367 TEST(F32_IGEMM_4X8__AARCH32_NEON_LD64, n_div_8_small_kernel) {
3368 TEST_REQUIRES_ARM_NEON;
3369 for (uint32_t n = 16; n <= 24; n += 8) {
3370 for (size_t k = 1; k <= 10; k += 3) {
3371 GemmMicrokernelTester()
3372 .mr(4)
3373 .nr(8)
3374 .kr(1)
3375 .sr(1)
3376 .m(4)
3377 .n(8)
3378 .k(k)
3379 .ks(3)
3380 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_ld64);
3381 }
3382 }
3383 }
3384
3385 TEST(F32_IGEMM_4X8__AARCH32_NEON_LD64, strided_cm_subtile) {
3386 TEST_REQUIRES_ARM_NEON;
3387 for (size_t k = 1; k <= 10; k += 3) {
3388 for (uint32_t m = 1; m <= 4; m++) {
3389 for (uint32_t n = 1; n <= 8; n++) {
3390 GemmMicrokernelTester()
3391 .mr(4)
3392 .nr(8)
3393 .kr(1)
3394 .sr(1)
3395 .m(m)
3396 .n(n)
3397 .k(k)
3398 .cm_stride(11)
3399 .iterations(1)
3400 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_ld64);
3401 }
3402 }
3403 }
3404 }
3405
3406 TEST(F32_IGEMM_4X8__AARCH32_NEON_LD64, a_offset) {
3407 TEST_REQUIRES_ARM_NEON;
3408 for (size_t k = 1; k <= 10; k += 3) {
3409 GemmMicrokernelTester()
3410 .mr(4)
3411 .nr(8)
3412 .kr(1)
3413 .sr(1)
3414 .m(4)
3415 .n(8)
3416 .k(k)
3417 .ks(3)
3418 .a_offset(43)
3419 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_ld64);
3420 }
3421 }
3422
3423 TEST(F32_IGEMM_4X8__AARCH32_NEON_LD64, zero) {
3424 TEST_REQUIRES_ARM_NEON;
3425 for (uint32_t mz = 0; mz < 4; mz++) {
3426 for (size_t k = 1; k <= 10; k += 3) {
3427 GemmMicrokernelTester()
3428 .mr(4)
3429 .nr(8)
3430 .kr(1)
3431 .sr(1)
3432 .m(4)
3433 .n(8)
3434 .k(k)
3435 .ks(3)
3436 .a_offset(43)
3437 .zero_index(mz)
3438 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_ld64);
3439 }
3440 }
3441 }
3442
3443 TEST(F32_IGEMM_4X8__AARCH32_NEON_LD64, qmin) {
3444 TEST_REQUIRES_ARM_NEON;
3445 GemmMicrokernelTester()
3446 .mr(4)
3447 .nr(8)
3448 .kr(1)
3449 .sr(1)
3450 .m(4)
3451 .n(8)
3452 .k(2)
3453 .qmin(128)
3454 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_ld64);
3455 }
3456
3457 TEST(F32_IGEMM_4X8__AARCH32_NEON_LD64, qmax) {
3458 TEST_REQUIRES_ARM_NEON;
3459 GemmMicrokernelTester()
3460 .mr(4)
3461 .nr(8)
3462 .kr(1)
3463 .sr(1)
3464 .m(4)
3465 .n(8)
3466 .k(2)
3467 .qmax(128)
3468 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_ld64);
3469 }
3470
3471 TEST(F32_IGEMM_4X8__AARCH32_NEON_LD64, strided_cm) {
3472 TEST_REQUIRES_ARM_NEON;
3473 GemmMicrokernelTester()
3474 .mr(4)
3475 .nr(8)
3476 .kr(1)
3477 .sr(1)
3478 .m(4)
3479 .n(8)
3480 .k(2)
3481 .cm_stride(11)
3482 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_ld64);
3483 }
3484#endif // XNN_ARCH_ARM
3485
3486
3487#if XNN_ARCH_ARM
3488 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A75, k_eq_4) {
3489 TEST_REQUIRES_ARM_NEON;
3490 GemmMicrokernelTester()
3491 .mr(4)
3492 .nr(8)
3493 .kr(1)
3494 .sr(1)
3495 .m(4)
3496 .n(8)
3497 .k(4)
3498 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
3499 }
3500
3501 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A75, strided_cn) {
3502 TEST_REQUIRES_ARM_NEON;
3503 GemmMicrokernelTester()
3504 .mr(4)
3505 .nr(8)
3506 .kr(1)
3507 .sr(1)
3508 .m(4)
3509 .n(8)
3510 .k(4)
3511 .cn_stride(11)
3512 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
3513 }
3514
3515 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A75, k_eq_4_subtile) {
3516 TEST_REQUIRES_ARM_NEON;
3517 for (uint32_t m = 1; m <= 4; m++) {
3518 for (uint32_t n = 1; n <= 8; n++) {
3519 GemmMicrokernelTester()
3520 .mr(4)
3521 .nr(8)
3522 .kr(1)
3523 .sr(1)
3524 .m(m)
3525 .n(n)
3526 .k(4)
3527 .iterations(1)
3528 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
3529 }
3530 }
3531 }
3532
3533 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A75, k_eq_4_subtile_m) {
3534 TEST_REQUIRES_ARM_NEON;
3535 for (uint32_t m = 1; m <= 4; m++) {
3536 GemmMicrokernelTester()
3537 .mr(4)
3538 .nr(8)
3539 .kr(1)
3540 .sr(1)
3541 .m(m)
3542 .n(8)
3543 .k(4)
3544 .iterations(1)
3545 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
3546 }
3547 }
3548
3549 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A75, k_eq_4_subtile_n) {
3550 TEST_REQUIRES_ARM_NEON;
3551 for (uint32_t n = 1; n <= 8; n++) {
3552 GemmMicrokernelTester()
3553 .mr(4)
3554 .nr(8)
3555 .kr(1)
3556 .sr(1)
3557 .m(4)
3558 .n(n)
3559 .k(4)
3560 .iterations(1)
3561 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
3562 }
3563 }
3564
3565 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A75, k_eq_8) {
3566 TEST_REQUIRES_ARM_NEON;
3567 GemmMicrokernelTester()
3568 .mr(4)
3569 .nr(8)
3570 .kr(1)
3571 .sr(1)
3572 .m(4)
3573 .n(8)
3574 .k(8)
3575 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
3576 }
3577
3578 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A75, k_eq_8_subtile) {
3579 TEST_REQUIRES_ARM_NEON;
3580 for (uint32_t m = 1; m <= 4; m++) {
3581 for (uint32_t n = 1; n <= 8; n++) {
3582 GemmMicrokernelTester()
3583 .mr(4)
3584 .nr(8)
3585 .kr(1)
3586 .sr(1)
3587 .m(m)
3588 .n(n)
3589 .k(8)
3590 .iterations(1)
3591 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
3592 }
3593 }
3594 }
3595
3596 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A75, k_lt_8) {
3597 TEST_REQUIRES_ARM_NEON;
3598 for (size_t k = 1; k < 8; k++) {
3599 GemmMicrokernelTester()
3600 .mr(4)
3601 .nr(8)
3602 .kr(1)
3603 .sr(1)
3604 .m(4)
3605 .n(8)
3606 .k(k)
3607 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
3608 }
3609 }
3610
3611 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A75, k_lt_8_subtile) {
3612 TEST_REQUIRES_ARM_NEON;
3613 for (size_t k = 1; k < 8; k++) {
3614 for (uint32_t m = 1; m <= 4; m++) {
3615 for (uint32_t n = 1; n <= 8; n++) {
3616 GemmMicrokernelTester()
3617 .mr(4)
3618 .nr(8)
3619 .kr(1)
3620 .sr(1)
3621 .m(m)
3622 .n(n)
3623 .k(k)
3624 .iterations(1)
3625 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
3626 }
3627 }
3628 }
3629 }
3630
3631 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A75, k_gt_8) {
3632 TEST_REQUIRES_ARM_NEON;
3633 for (size_t k = 9; k < 8; k++) {
3634 GemmMicrokernelTester()
3635 .mr(4)
3636 .nr(8)
3637 .kr(1)
3638 .sr(1)
3639 .m(4)
3640 .n(8)
3641 .k(k)
3642 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
3643 }
3644 }
3645
3646 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A75, k_gt_4_subtile) {
3647 TEST_REQUIRES_ARM_NEON;
3648 for (size_t k = 9; k < 8; k++) {
3649 for (uint32_t m = 1; m <= 4; m++) {
3650 for (uint32_t n = 1; n <= 8; n++) {
3651 GemmMicrokernelTester()
3652 .mr(4)
3653 .nr(8)
3654 .kr(1)
3655 .sr(1)
3656 .m(m)
3657 .n(n)
3658 .k(k)
3659 .iterations(1)
3660 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
3661 }
3662 }
3663 }
3664 }
3665
3666 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A75, k_div_4) {
3667 TEST_REQUIRES_ARM_NEON;
3668 for (size_t k = 12; k <= 40; k += 4) {
3669 GemmMicrokernelTester()
3670 .mr(4)
3671 .nr(8)
3672 .kr(1)
3673 .sr(1)
3674 .m(4)
3675 .n(8)
3676 .k(k)
3677 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
3678 }
3679 }
3680
3681 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A75, k_div_4_subtile) {
3682 TEST_REQUIRES_ARM_NEON;
3683 for (size_t k = 12; k <= 40; k += 4) {
3684 for (uint32_t m = 1; m <= 4; m++) {
3685 for (uint32_t n = 1; n <= 8; n++) {
3686 GemmMicrokernelTester()
3687 .mr(4)
3688 .nr(8)
3689 .kr(1)
3690 .sr(1)
3691 .m(m)
3692 .n(n)
3693 .k(k)
3694 .iterations(1)
3695 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
3696 }
3697 }
3698 }
3699 }
3700
3701 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A75, n_gt_8) {
3702 TEST_REQUIRES_ARM_NEON;
3703 for (uint32_t n = 9; n < 16; n++) {
3704 for (size_t k = 1; k <= 20; k += 5) {
3705 GemmMicrokernelTester()
3706 .mr(4)
3707 .nr(8)
3708 .kr(1)
3709 .sr(1)
3710 .m(4)
3711 .n(8)
3712 .k(k)
3713 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
3714 }
3715 }
3716 }
3717
3718 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A75, n_gt_8_strided_cn) {
3719 TEST_REQUIRES_ARM_NEON;
3720 for (uint32_t n = 9; n < 16; n++) {
3721 for (size_t k = 1; k <= 20; k += 5) {
3722 GemmMicrokernelTester()
3723 .mr(4)
3724 .nr(8)
3725 .kr(1)
3726 .sr(1)
3727 .m(4)
3728 .n(8)
3729 .k(k)
3730 .cn_stride(11)
3731 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
3732 }
3733 }
3734 }
3735
3736 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A75, n_gt_8_subtile) {
3737 TEST_REQUIRES_ARM_NEON;
3738 for (uint32_t n = 9; n < 16; n++) {
3739 for (size_t k = 1; k <= 20; k += 5) {
3740 for (uint32_t m = 1; m <= 4; m++) {
3741 GemmMicrokernelTester()
3742 .mr(4)
3743 .nr(8)
3744 .kr(1)
3745 .sr(1)
3746 .m(m)
3747 .n(n)
3748 .k(k)
3749 .iterations(1)
3750 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
3751 }
3752 }
3753 }
3754 }
3755
3756 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A75, n_div_8) {
3757 TEST_REQUIRES_ARM_NEON;
3758 for (uint32_t n = 16; n <= 24; n += 8) {
3759 for (size_t k = 1; k <= 20; k += 5) {
3760 GemmMicrokernelTester()
3761 .mr(4)
3762 .nr(8)
3763 .kr(1)
3764 .sr(1)
3765 .m(4)
3766 .n(8)
3767 .k(k)
3768 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
3769 }
3770 }
3771 }
3772
3773 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A75, n_div_8_strided_cn) {
3774 TEST_REQUIRES_ARM_NEON;
3775 for (uint32_t n = 16; n <= 24; n += 8) {
3776 for (size_t k = 1; k <= 20; k += 5) {
3777 GemmMicrokernelTester()
3778 .mr(4)
3779 .nr(8)
3780 .kr(1)
3781 .sr(1)
3782 .m(4)
3783 .n(n)
3784 .k(k)
3785 .cn_stride(11)
3786 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
3787 }
3788 }
3789 }
3790
3791 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A75, n_div_8_subtile) {
3792 TEST_REQUIRES_ARM_NEON;
3793 for (uint32_t n = 16; n <= 24; n += 8) {
3794 for (size_t k = 1; k <= 20; k += 5) {
3795 for (uint32_t m = 1; m <= 4; m++) {
3796 GemmMicrokernelTester()
3797 .mr(4)
3798 .nr(8)
3799 .kr(1)
3800 .sr(1)
3801 .m(m)
3802 .n(n)
3803 .k(k)
3804 .iterations(1)
3805 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
3806 }
3807 }
3808 }
3809 }
3810
3811 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A75, small_kernel) {
3812 TEST_REQUIRES_ARM_NEON;
3813 for (size_t k = 1; k <= 20; k += 5) {
3814 GemmMicrokernelTester()
3815 .mr(4)
3816 .nr(8)
3817 .kr(1)
3818 .sr(1)
3819 .m(4)
3820 .n(8)
3821 .k(k)
3822 .ks(3)
3823 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
3824 }
3825 }
3826
3827 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A75, small_kernel_subtile) {
3828 TEST_REQUIRES_ARM_NEON;
3829 for (size_t k = 1; k <= 20; k += 5) {
3830 for (uint32_t m = 1; m <= 4; m++) {
3831 for (uint32_t n = 1; n <= 8; n++) {
3832 GemmMicrokernelTester()
3833 .mr(4)
3834 .nr(8)
3835 .kr(1)
3836 .sr(1)
3837 .m(m)
3838 .n(n)
3839 .k(k)
3840 .ks(3)
3841 .iterations(1)
3842 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
3843 }
3844 }
3845 }
3846 }
3847
3848 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A75, n_gt_8_small_kernel) {
3849 TEST_REQUIRES_ARM_NEON;
3850 for (uint32_t n = 9; n < 16; n++) {
3851 for (size_t k = 1; k <= 20; k += 5) {
3852 GemmMicrokernelTester()
3853 .mr(4)
3854 .nr(8)
3855 .kr(1)
3856 .sr(1)
3857 .m(4)
3858 .n(8)
3859 .k(k)
3860 .ks(3)
3861 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
3862 }
3863 }
3864 }
3865
3866 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A75, n_div_8_small_kernel) {
3867 TEST_REQUIRES_ARM_NEON;
3868 for (uint32_t n = 16; n <= 24; n += 8) {
3869 for (size_t k = 1; k <= 20; k += 5) {
3870 GemmMicrokernelTester()
3871 .mr(4)
3872 .nr(8)
3873 .kr(1)
3874 .sr(1)
3875 .m(4)
3876 .n(8)
3877 .k(k)
3878 .ks(3)
3879 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
3880 }
3881 }
3882 }
3883
3884 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A75, strided_cm_subtile) {
3885 TEST_REQUIRES_ARM_NEON;
3886 for (size_t k = 1; k <= 20; k += 5) {
3887 for (uint32_t m = 1; m <= 4; m++) {
3888 for (uint32_t n = 1; n <= 8; n++) {
3889 GemmMicrokernelTester()
3890 .mr(4)
3891 .nr(8)
3892 .kr(1)
3893 .sr(1)
3894 .m(m)
3895 .n(n)
3896 .k(k)
3897 .cm_stride(11)
3898 .iterations(1)
3899 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
3900 }
3901 }
3902 }
3903 }
3904
3905 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A75, a_offset) {
3906 TEST_REQUIRES_ARM_NEON;
3907 for (size_t k = 1; k <= 20; k += 5) {
3908 GemmMicrokernelTester()
3909 .mr(4)
3910 .nr(8)
3911 .kr(1)
3912 .sr(1)
3913 .m(4)
3914 .n(8)
3915 .k(k)
3916 .ks(3)
3917 .a_offset(83)
3918 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
3919 }
3920 }
3921
3922 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A75, zero) {
3923 TEST_REQUIRES_ARM_NEON;
3924 for (uint32_t mz = 0; mz < 4; mz++) {
3925 for (size_t k = 1; k <= 20; k += 5) {
3926 GemmMicrokernelTester()
3927 .mr(4)
3928 .nr(8)
3929 .kr(1)
3930 .sr(1)
3931 .m(4)
3932 .n(8)
3933 .k(k)
3934 .ks(3)
3935 .a_offset(83)
3936 .zero_index(mz)
3937 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
3938 }
3939 }
3940 }
3941
3942 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A75, qmin) {
3943 TEST_REQUIRES_ARM_NEON;
3944 GemmMicrokernelTester()
3945 .mr(4)
3946 .nr(8)
3947 .kr(1)
3948 .sr(1)
3949 .m(4)
3950 .n(8)
3951 .k(4)
3952 .qmin(128)
3953 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
3954 }
3955
3956 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A75, qmax) {
3957 TEST_REQUIRES_ARM_NEON;
3958 GemmMicrokernelTester()
3959 .mr(4)
3960 .nr(8)
3961 .kr(1)
3962 .sr(1)
3963 .m(4)
3964 .n(8)
3965 .k(4)
3966 .qmax(128)
3967 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
3968 }
3969
3970 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A75, strided_cm) {
3971 TEST_REQUIRES_ARM_NEON;
3972 GemmMicrokernelTester()
3973 .mr(4)
3974 .nr(8)
3975 .kr(1)
3976 .sr(1)
3977 .m(4)
3978 .n(8)
3979 .k(4)
3980 .cm_stride(11)
3981 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
3982 }
3983#endif // XNN_ARCH_ARM
3984
3985
3986#if XNN_ARCH_ARM
3987 TEST(F32_IGEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, k_eq_4) {
3988 TEST_REQUIRES_ARM_NEON;
3989 GemmMicrokernelTester()
3990 .mr(4)
3991 .nr(8)
3992 .kr(1)
3993 .sr(1)
3994 .m(4)
3995 .n(8)
3996 .k(4)
3997 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
3998 }
3999
4000 TEST(F32_IGEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, strided_cn) {
4001 TEST_REQUIRES_ARM_NEON;
4002 GemmMicrokernelTester()
4003 .mr(4)
4004 .nr(8)
4005 .kr(1)
4006 .sr(1)
4007 .m(4)
4008 .n(8)
4009 .k(4)
4010 .cn_stride(11)
4011 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
4012 }
4013
4014 TEST(F32_IGEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, k_eq_4_subtile) {
4015 TEST_REQUIRES_ARM_NEON;
4016 for (uint32_t m = 1; m <= 4; m++) {
4017 for (uint32_t n = 1; n <= 8; n++) {
4018 GemmMicrokernelTester()
4019 .mr(4)
4020 .nr(8)
4021 .kr(1)
4022 .sr(1)
4023 .m(m)
4024 .n(n)
4025 .k(4)
4026 .iterations(1)
4027 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
4028 }
4029 }
4030 }
4031
4032 TEST(F32_IGEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, k_eq_4_subtile_m) {
4033 TEST_REQUIRES_ARM_NEON;
4034 for (uint32_t m = 1; m <= 4; m++) {
4035 GemmMicrokernelTester()
4036 .mr(4)
4037 .nr(8)
4038 .kr(1)
4039 .sr(1)
4040 .m(m)
4041 .n(8)
4042 .k(4)
4043 .iterations(1)
4044 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
4045 }
4046 }
4047
4048 TEST(F32_IGEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, k_eq_4_subtile_n) {
4049 TEST_REQUIRES_ARM_NEON;
4050 for (uint32_t n = 1; n <= 8; n++) {
4051 GemmMicrokernelTester()
4052 .mr(4)
4053 .nr(8)
4054 .kr(1)
4055 .sr(1)
4056 .m(4)
4057 .n(n)
4058 .k(4)
4059 .iterations(1)
4060 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
4061 }
4062 }
4063
4064 TEST(F32_IGEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, k_eq_8) {
4065 TEST_REQUIRES_ARM_NEON;
4066 GemmMicrokernelTester()
4067 .mr(4)
4068 .nr(8)
4069 .kr(1)
4070 .sr(1)
4071 .m(4)
4072 .n(8)
4073 .k(8)
4074 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
4075 }
4076
4077 TEST(F32_IGEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, k_eq_8_subtile) {
4078 TEST_REQUIRES_ARM_NEON;
4079 for (uint32_t m = 1; m <= 4; m++) {
4080 for (uint32_t n = 1; n <= 8; n++) {
4081 GemmMicrokernelTester()
4082 .mr(4)
4083 .nr(8)
4084 .kr(1)
4085 .sr(1)
4086 .m(m)
4087 .n(n)
4088 .k(8)
4089 .iterations(1)
4090 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
4091 }
4092 }
4093 }
4094
4095 TEST(F32_IGEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, k_lt_8) {
4096 TEST_REQUIRES_ARM_NEON;
4097 for (size_t k = 1; k < 8; k++) {
4098 GemmMicrokernelTester()
4099 .mr(4)
4100 .nr(8)
4101 .kr(1)
4102 .sr(1)
4103 .m(4)
4104 .n(8)
4105 .k(k)
4106 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
4107 }
4108 }
4109
4110 TEST(F32_IGEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, k_lt_8_subtile) {
4111 TEST_REQUIRES_ARM_NEON;
4112 for (size_t k = 1; k < 8; k++) {
4113 for (uint32_t m = 1; m <= 4; m++) {
4114 for (uint32_t n = 1; n <= 8; n++) {
4115 GemmMicrokernelTester()
4116 .mr(4)
4117 .nr(8)
4118 .kr(1)
4119 .sr(1)
4120 .m(m)
4121 .n(n)
4122 .k(k)
4123 .iterations(1)
4124 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
4125 }
4126 }
4127 }
4128 }
4129
4130 TEST(F32_IGEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, k_gt_8) {
4131 TEST_REQUIRES_ARM_NEON;
4132 for (size_t k = 9; k < 8; k++) {
4133 GemmMicrokernelTester()
4134 .mr(4)
4135 .nr(8)
4136 .kr(1)
4137 .sr(1)
4138 .m(4)
4139 .n(8)
4140 .k(k)
4141 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
4142 }
4143 }
4144
4145 TEST(F32_IGEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, k_gt_4_subtile) {
4146 TEST_REQUIRES_ARM_NEON;
4147 for (size_t k = 9; k < 8; k++) {
4148 for (uint32_t m = 1; m <= 4; m++) {
4149 for (uint32_t n = 1; n <= 8; n++) {
4150 GemmMicrokernelTester()
4151 .mr(4)
4152 .nr(8)
4153 .kr(1)
4154 .sr(1)
4155 .m(m)
4156 .n(n)
4157 .k(k)
4158 .iterations(1)
4159 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
4160 }
4161 }
4162 }
4163 }
4164
4165 TEST(F32_IGEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, k_div_4) {
4166 TEST_REQUIRES_ARM_NEON;
4167 for (size_t k = 12; k <= 40; k += 4) {
4168 GemmMicrokernelTester()
4169 .mr(4)
4170 .nr(8)
4171 .kr(1)
4172 .sr(1)
4173 .m(4)
4174 .n(8)
4175 .k(k)
4176 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
4177 }
4178 }
4179
4180 TEST(F32_IGEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, k_div_4_subtile) {
4181 TEST_REQUIRES_ARM_NEON;
4182 for (size_t k = 12; k <= 40; k += 4) {
4183 for (uint32_t m = 1; m <= 4; m++) {
4184 for (uint32_t n = 1; n <= 8; n++) {
4185 GemmMicrokernelTester()
4186 .mr(4)
4187 .nr(8)
4188 .kr(1)
4189 .sr(1)
4190 .m(m)
4191 .n(n)
4192 .k(k)
4193 .iterations(1)
4194 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
4195 }
4196 }
4197 }
4198 }
4199
4200 TEST(F32_IGEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, n_gt_8) {
4201 TEST_REQUIRES_ARM_NEON;
4202 for (uint32_t n = 9; n < 16; n++) {
4203 for (size_t k = 1; k <= 20; k += 5) {
4204 GemmMicrokernelTester()
4205 .mr(4)
4206 .nr(8)
4207 .kr(1)
4208 .sr(1)
4209 .m(4)
4210 .n(8)
4211 .k(k)
4212 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
4213 }
4214 }
4215 }
4216
4217 TEST(F32_IGEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, n_gt_8_strided_cn) {
4218 TEST_REQUIRES_ARM_NEON;
4219 for (uint32_t n = 9; n < 16; n++) {
4220 for (size_t k = 1; k <= 20; k += 5) {
4221 GemmMicrokernelTester()
4222 .mr(4)
4223 .nr(8)
4224 .kr(1)
4225 .sr(1)
4226 .m(4)
4227 .n(8)
4228 .k(k)
4229 .cn_stride(11)
4230 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
4231 }
4232 }
4233 }
4234
4235 TEST(F32_IGEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, n_gt_8_subtile) {
4236 TEST_REQUIRES_ARM_NEON;
4237 for (uint32_t n = 9; n < 16; n++) {
4238 for (size_t k = 1; k <= 20; k += 5) {
4239 for (uint32_t m = 1; m <= 4; m++) {
4240 GemmMicrokernelTester()
4241 .mr(4)
4242 .nr(8)
4243 .kr(1)
4244 .sr(1)
4245 .m(m)
4246 .n(n)
4247 .k(k)
4248 .iterations(1)
4249 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
4250 }
4251 }
4252 }
4253 }
4254
4255 TEST(F32_IGEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, n_div_8) {
4256 TEST_REQUIRES_ARM_NEON;
4257 for (uint32_t n = 16; n <= 24; n += 8) {
4258 for (size_t k = 1; k <= 20; k += 5) {
4259 GemmMicrokernelTester()
4260 .mr(4)
4261 .nr(8)
4262 .kr(1)
4263 .sr(1)
4264 .m(4)
4265 .n(8)
4266 .k(k)
4267 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
4268 }
4269 }
4270 }
4271
4272 TEST(F32_IGEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, n_div_8_strided_cn) {
4273 TEST_REQUIRES_ARM_NEON;
4274 for (uint32_t n = 16; n <= 24; n += 8) {
4275 for (size_t k = 1; k <= 20; k += 5) {
4276 GemmMicrokernelTester()
4277 .mr(4)
4278 .nr(8)
4279 .kr(1)
4280 .sr(1)
4281 .m(4)
4282 .n(n)
4283 .k(k)
4284 .cn_stride(11)
4285 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
4286 }
4287 }
4288 }
4289
4290 TEST(F32_IGEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, n_div_8_subtile) {
4291 TEST_REQUIRES_ARM_NEON;
4292 for (uint32_t n = 16; n <= 24; n += 8) {
4293 for (size_t k = 1; k <= 20; k += 5) {
4294 for (uint32_t m = 1; m <= 4; m++) {
4295 GemmMicrokernelTester()
4296 .mr(4)
4297 .nr(8)
4298 .kr(1)
4299 .sr(1)
4300 .m(m)
4301 .n(n)
4302 .k(k)
4303 .iterations(1)
4304 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
4305 }
4306 }
4307 }
4308 }
4309
4310 TEST(F32_IGEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, small_kernel) {
4311 TEST_REQUIRES_ARM_NEON;
4312 for (size_t k = 1; k <= 20; k += 5) {
4313 GemmMicrokernelTester()
4314 .mr(4)
4315 .nr(8)
4316 .kr(1)
4317 .sr(1)
4318 .m(4)
4319 .n(8)
4320 .k(k)
4321 .ks(3)
4322 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
4323 }
4324 }
4325
4326 TEST(F32_IGEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, small_kernel_subtile) {
4327 TEST_REQUIRES_ARM_NEON;
4328 for (size_t k = 1; k <= 20; k += 5) {
4329 for (uint32_t m = 1; m <= 4; m++) {
4330 for (uint32_t n = 1; n <= 8; n++) {
4331 GemmMicrokernelTester()
4332 .mr(4)
4333 .nr(8)
4334 .kr(1)
4335 .sr(1)
4336 .m(m)
4337 .n(n)
4338 .k(k)
4339 .ks(3)
4340 .iterations(1)
4341 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
4342 }
4343 }
4344 }
4345 }
4346
4347 TEST(F32_IGEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, n_gt_8_small_kernel) {
4348 TEST_REQUIRES_ARM_NEON;
4349 for (uint32_t n = 9; n < 16; n++) {
4350 for (size_t k = 1; k <= 20; k += 5) {
4351 GemmMicrokernelTester()
4352 .mr(4)
4353 .nr(8)
4354 .kr(1)
4355 .sr(1)
4356 .m(4)
4357 .n(8)
4358 .k(k)
4359 .ks(3)
4360 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
4361 }
4362 }
4363 }
4364
4365 TEST(F32_IGEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, n_div_8_small_kernel) {
4366 TEST_REQUIRES_ARM_NEON;
4367 for (uint32_t n = 16; n <= 24; n += 8) {
4368 for (size_t k = 1; k <= 20; k += 5) {
4369 GemmMicrokernelTester()
4370 .mr(4)
4371 .nr(8)
4372 .kr(1)
4373 .sr(1)
4374 .m(4)
4375 .n(8)
4376 .k(k)
4377 .ks(3)
4378 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
4379 }
4380 }
4381 }
4382
4383 TEST(F32_IGEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, strided_cm_subtile) {
4384 TEST_REQUIRES_ARM_NEON;
4385 for (size_t k = 1; k <= 20; k += 5) {
4386 for (uint32_t m = 1; m <= 4; m++) {
4387 for (uint32_t n = 1; n <= 8; n++) {
4388 GemmMicrokernelTester()
4389 .mr(4)
4390 .nr(8)
4391 .kr(1)
4392 .sr(1)
4393 .m(m)
4394 .n(n)
4395 .k(k)
4396 .cm_stride(11)
4397 .iterations(1)
4398 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
4399 }
4400 }
4401 }
4402 }
4403
4404 TEST(F32_IGEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, a_offset) {
4405 TEST_REQUIRES_ARM_NEON;
4406 for (size_t k = 1; k <= 20; k += 5) {
4407 GemmMicrokernelTester()
4408 .mr(4)
4409 .nr(8)
4410 .kr(1)
4411 .sr(1)
4412 .m(4)
4413 .n(8)
4414 .k(k)
4415 .ks(3)
4416 .a_offset(83)
4417 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
4418 }
4419 }
4420
4421 TEST(F32_IGEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, zero) {
4422 TEST_REQUIRES_ARM_NEON;
4423 for (uint32_t mz = 0; mz < 4; mz++) {
4424 for (size_t k = 1; k <= 20; k += 5) {
4425 GemmMicrokernelTester()
4426 .mr(4)
4427 .nr(8)
4428 .kr(1)
4429 .sr(1)
4430 .m(4)
4431 .n(8)
4432 .k(k)
4433 .ks(3)
4434 .a_offset(83)
4435 .zero_index(mz)
4436 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
4437 }
4438 }
4439 }
4440
4441 TEST(F32_IGEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, qmin) {
4442 TEST_REQUIRES_ARM_NEON;
4443 GemmMicrokernelTester()
4444 .mr(4)
4445 .nr(8)
4446 .kr(1)
4447 .sr(1)
4448 .m(4)
4449 .n(8)
4450 .k(4)
4451 .qmin(128)
4452 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
4453 }
4454
4455 TEST(F32_IGEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, qmax) {
4456 TEST_REQUIRES_ARM_NEON;
4457 GemmMicrokernelTester()
4458 .mr(4)
4459 .nr(8)
4460 .kr(1)
4461 .sr(1)
4462 .m(4)
4463 .n(8)
4464 .k(4)
4465 .qmax(128)
4466 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
4467 }
4468
4469 TEST(F32_IGEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, strided_cm) {
4470 TEST_REQUIRES_ARM_NEON;
4471 GemmMicrokernelTester()
4472 .mr(4)
4473 .nr(8)
4474 .kr(1)
4475 .sr(1)
4476 .m(4)
4477 .n(8)
4478 .k(4)
4479 .cm_stride(11)
4480 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
4481 }
4482#endif // XNN_ARCH_ARM
4483
4484
Frank Barchard7e955972019-10-11 10:34:25 -07004485#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
Frank Barchard387c2d12019-12-16 19:14:07 -08004486 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8) {
4487 TEST_REQUIRES_ARM_NEON_FMA;
4488 GemmMicrokernelTester()
4489 .mr(5)
4490 .nr(8)
4491 .kr(1)
4492 .sr(1)
4493 .m(5)
4494 .n(8)
4495 .k(8)
4496 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
4497 }
4498
4499 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, strided_cn) {
4500 TEST_REQUIRES_ARM_NEON_FMA;
4501 GemmMicrokernelTester()
4502 .mr(5)
4503 .nr(8)
4504 .kr(1)
4505 .sr(1)
4506 .m(5)
4507 .n(8)
4508 .k(8)
4509 .cn_stride(11)
4510 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
4511 }
4512
4513 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_subtile) {
4514 TEST_REQUIRES_ARM_NEON_FMA;
4515 for (uint32_t m = 1; m <= 5; m++) {
4516 for (uint32_t n = 1; n <= 8; n++) {
4517 GemmMicrokernelTester()
4518 .mr(5)
4519 .nr(8)
4520 .kr(1)
4521 .sr(1)
4522 .m(m)
4523 .n(n)
4524 .k(8)
4525 .iterations(1)
4526 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
4527 }
4528 }
4529 }
4530
4531 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_subtile_m) {
4532 TEST_REQUIRES_ARM_NEON_FMA;
4533 for (uint32_t m = 1; m <= 5; m++) {
4534 GemmMicrokernelTester()
4535 .mr(5)
4536 .nr(8)
4537 .kr(1)
4538 .sr(1)
4539 .m(m)
4540 .n(8)
4541 .k(8)
4542 .iterations(1)
4543 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
4544 }
4545 }
4546
4547 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_subtile_n) {
4548 TEST_REQUIRES_ARM_NEON_FMA;
4549 for (uint32_t n = 1; n <= 8; n++) {
4550 GemmMicrokernelTester()
4551 .mr(5)
4552 .nr(8)
4553 .kr(1)
4554 .sr(1)
4555 .m(5)
4556 .n(n)
4557 .k(8)
4558 .iterations(1)
4559 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
4560 }
4561 }
4562
4563 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_16) {
4564 TEST_REQUIRES_ARM_NEON_FMA;
4565 GemmMicrokernelTester()
4566 .mr(5)
4567 .nr(8)
4568 .kr(1)
4569 .sr(1)
4570 .m(5)
4571 .n(8)
4572 .k(16)
4573 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
4574 }
4575
4576 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_16_subtile) {
4577 TEST_REQUIRES_ARM_NEON_FMA;
4578 for (uint32_t m = 1; m <= 5; m++) {
4579 for (uint32_t n = 1; n <= 8; n++) {
4580 GemmMicrokernelTester()
4581 .mr(5)
4582 .nr(8)
4583 .kr(1)
4584 .sr(1)
4585 .m(m)
4586 .n(n)
4587 .k(16)
4588 .iterations(1)
4589 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
4590 }
4591 }
4592 }
4593
4594 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, k_lt_16) {
4595 TEST_REQUIRES_ARM_NEON_FMA;
4596 for (size_t k = 1; k < 16; k++) {
4597 GemmMicrokernelTester()
4598 .mr(5)
4599 .nr(8)
4600 .kr(1)
4601 .sr(1)
4602 .m(5)
4603 .n(8)
4604 .k(k)
4605 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
4606 }
4607 }
4608
4609 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, k_lt_16_subtile) {
4610 TEST_REQUIRES_ARM_NEON_FMA;
4611 for (size_t k = 1; k < 16; k++) {
4612 for (uint32_t m = 1; m <= 5; m++) {
4613 for (uint32_t n = 1; n <= 8; n++) {
4614 GemmMicrokernelTester()
4615 .mr(5)
4616 .nr(8)
4617 .kr(1)
4618 .sr(1)
4619 .m(m)
4620 .n(n)
4621 .k(k)
4622 .iterations(1)
4623 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
4624 }
4625 }
4626 }
4627 }
4628
4629 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, k_gt_16) {
4630 TEST_REQUIRES_ARM_NEON_FMA;
4631 for (size_t k = 17; k < 16; k++) {
4632 GemmMicrokernelTester()
4633 .mr(5)
4634 .nr(8)
4635 .kr(1)
4636 .sr(1)
4637 .m(5)
4638 .n(8)
4639 .k(k)
4640 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
4641 }
4642 }
4643
4644 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, k_gt_8_subtile) {
4645 TEST_REQUIRES_ARM_NEON_FMA;
4646 for (size_t k = 17; k < 16; k++) {
4647 for (uint32_t m = 1; m <= 5; m++) {
4648 for (uint32_t n = 1; n <= 8; n++) {
4649 GemmMicrokernelTester()
4650 .mr(5)
4651 .nr(8)
4652 .kr(1)
4653 .sr(1)
4654 .m(m)
4655 .n(n)
4656 .k(k)
4657 .iterations(1)
4658 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
4659 }
4660 }
4661 }
4662 }
4663
4664 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, k_div_8) {
4665 TEST_REQUIRES_ARM_NEON_FMA;
4666 for (size_t k = 24; k <= 80; k += 8) {
4667 GemmMicrokernelTester()
4668 .mr(5)
4669 .nr(8)
4670 .kr(1)
4671 .sr(1)
4672 .m(5)
4673 .n(8)
4674 .k(k)
4675 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
4676 }
4677 }
4678
4679 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, k_div_8_subtile) {
4680 TEST_REQUIRES_ARM_NEON_FMA;
4681 for (size_t k = 24; k <= 80; k += 8) {
4682 for (uint32_t m = 1; m <= 5; m++) {
4683 for (uint32_t n = 1; n <= 8; n++) {
4684 GemmMicrokernelTester()
4685 .mr(5)
4686 .nr(8)
4687 .kr(1)
4688 .sr(1)
4689 .m(m)
4690 .n(n)
4691 .k(k)
4692 .iterations(1)
4693 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
4694 }
4695 }
4696 }
4697 }
4698
4699 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8) {
4700 TEST_REQUIRES_ARM_NEON_FMA;
4701 for (uint32_t n = 9; n < 16; n++) {
4702 for (size_t k = 1; k <= 40; k += 9) {
4703 GemmMicrokernelTester()
4704 .mr(5)
4705 .nr(8)
4706 .kr(1)
4707 .sr(1)
4708 .m(5)
4709 .n(8)
4710 .k(k)
4711 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
4712 }
4713 }
4714 }
4715
4716 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_strided_cn) {
4717 TEST_REQUIRES_ARM_NEON_FMA;
4718 for (uint32_t n = 9; n < 16; n++) {
4719 for (size_t k = 1; k <= 40; k += 9) {
4720 GemmMicrokernelTester()
4721 .mr(5)
4722 .nr(8)
4723 .kr(1)
4724 .sr(1)
4725 .m(5)
4726 .n(8)
4727 .k(k)
4728 .cn_stride(11)
4729 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
4730 }
4731 }
4732 }
4733
4734 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_subtile) {
4735 TEST_REQUIRES_ARM_NEON_FMA;
4736 for (uint32_t n = 9; n < 16; n++) {
4737 for (size_t k = 1; k <= 40; k += 9) {
4738 for (uint32_t m = 1; m <= 5; m++) {
4739 GemmMicrokernelTester()
4740 .mr(5)
4741 .nr(8)
4742 .kr(1)
4743 .sr(1)
4744 .m(m)
4745 .n(n)
4746 .k(k)
4747 .iterations(1)
4748 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
4749 }
4750 }
4751 }
4752 }
4753
4754 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8) {
4755 TEST_REQUIRES_ARM_NEON_FMA;
4756 for (uint32_t n = 16; n <= 24; n += 8) {
4757 for (size_t k = 1; k <= 40; k += 9) {
4758 GemmMicrokernelTester()
4759 .mr(5)
4760 .nr(8)
4761 .kr(1)
4762 .sr(1)
4763 .m(5)
4764 .n(8)
4765 .k(k)
4766 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
4767 }
4768 }
4769 }
4770
4771 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_strided_cn) {
4772 TEST_REQUIRES_ARM_NEON_FMA;
4773 for (uint32_t n = 16; n <= 24; n += 8) {
4774 for (size_t k = 1; k <= 40; k += 9) {
4775 GemmMicrokernelTester()
4776 .mr(5)
4777 .nr(8)
4778 .kr(1)
4779 .sr(1)
4780 .m(5)
4781 .n(n)
4782 .k(k)
4783 .cn_stride(11)
4784 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
4785 }
4786 }
4787 }
4788
4789 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_subtile) {
4790 TEST_REQUIRES_ARM_NEON_FMA;
4791 for (uint32_t n = 16; n <= 24; n += 8) {
4792 for (size_t k = 1; k <= 40; k += 9) {
4793 for (uint32_t m = 1; m <= 5; m++) {
4794 GemmMicrokernelTester()
4795 .mr(5)
4796 .nr(8)
4797 .kr(1)
4798 .sr(1)
4799 .m(m)
4800 .n(n)
4801 .k(k)
4802 .iterations(1)
4803 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
4804 }
4805 }
4806 }
4807 }
4808
4809 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, small_kernel) {
4810 TEST_REQUIRES_ARM_NEON_FMA;
4811 for (size_t k = 1; k <= 40; k += 9) {
4812 GemmMicrokernelTester()
4813 .mr(5)
4814 .nr(8)
4815 .kr(1)
4816 .sr(1)
4817 .m(5)
4818 .n(8)
4819 .k(k)
4820 .ks(3)
4821 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
4822 }
4823 }
4824
4825 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, small_kernel_subtile) {
4826 TEST_REQUIRES_ARM_NEON_FMA;
4827 for (size_t k = 1; k <= 40; k += 9) {
4828 for (uint32_t m = 1; m <= 5; m++) {
4829 for (uint32_t n = 1; n <= 8; n++) {
4830 GemmMicrokernelTester()
4831 .mr(5)
4832 .nr(8)
4833 .kr(1)
4834 .sr(1)
4835 .m(m)
4836 .n(n)
4837 .k(k)
4838 .ks(3)
4839 .iterations(1)
4840 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
4841 }
4842 }
4843 }
4844 }
4845
4846 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_small_kernel) {
4847 TEST_REQUIRES_ARM_NEON_FMA;
4848 for (uint32_t n = 9; n < 16; n++) {
4849 for (size_t k = 1; k <= 40; k += 9) {
4850 GemmMicrokernelTester()
4851 .mr(5)
4852 .nr(8)
4853 .kr(1)
4854 .sr(1)
4855 .m(5)
4856 .n(8)
4857 .k(k)
4858 .ks(3)
4859 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
4860 }
4861 }
4862 }
4863
4864 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_small_kernel) {
4865 TEST_REQUIRES_ARM_NEON_FMA;
4866 for (uint32_t n = 16; n <= 24; n += 8) {
4867 for (size_t k = 1; k <= 40; k += 9) {
4868 GemmMicrokernelTester()
4869 .mr(5)
4870 .nr(8)
4871 .kr(1)
4872 .sr(1)
4873 .m(5)
4874 .n(8)
4875 .k(k)
4876 .ks(3)
4877 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
4878 }
4879 }
4880 }
4881
4882 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, strided_cm_subtile) {
4883 TEST_REQUIRES_ARM_NEON_FMA;
4884 for (size_t k = 1; k <= 40; k += 9) {
4885 for (uint32_t m = 1; m <= 5; m++) {
4886 for (uint32_t n = 1; n <= 8; n++) {
4887 GemmMicrokernelTester()
4888 .mr(5)
4889 .nr(8)
4890 .kr(1)
4891 .sr(1)
4892 .m(m)
4893 .n(n)
4894 .k(k)
4895 .cm_stride(11)
4896 .iterations(1)
4897 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
4898 }
4899 }
4900 }
4901 }
4902
4903 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, a_offset) {
4904 TEST_REQUIRES_ARM_NEON_FMA;
4905 for (size_t k = 1; k <= 40; k += 9) {
4906 GemmMicrokernelTester()
4907 .mr(5)
4908 .nr(8)
4909 .kr(1)
4910 .sr(1)
4911 .m(5)
4912 .n(8)
4913 .k(k)
4914 .ks(3)
4915 .a_offset(211)
4916 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
4917 }
4918 }
4919
4920 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, zero) {
4921 TEST_REQUIRES_ARM_NEON_FMA;
4922 for (uint32_t mz = 0; mz < 5; mz++) {
4923 for (size_t k = 1; k <= 40; k += 9) {
4924 GemmMicrokernelTester()
4925 .mr(5)
4926 .nr(8)
4927 .kr(1)
4928 .sr(1)
4929 .m(5)
4930 .n(8)
4931 .k(k)
4932 .ks(3)
4933 .a_offset(211)
4934 .zero_index(mz)
4935 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
4936 }
4937 }
4938 }
4939
4940 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, qmin) {
4941 TEST_REQUIRES_ARM_NEON_FMA;
4942 GemmMicrokernelTester()
4943 .mr(5)
4944 .nr(8)
4945 .kr(1)
4946 .sr(1)
4947 .m(5)
4948 .n(8)
4949 .k(8)
4950 .qmin(128)
4951 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
4952 }
4953
4954 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, qmax) {
4955 TEST_REQUIRES_ARM_NEON_FMA;
4956 GemmMicrokernelTester()
4957 .mr(5)
4958 .nr(8)
4959 .kr(1)
4960 .sr(1)
4961 .m(5)
4962 .n(8)
4963 .k(8)
4964 .qmax(128)
4965 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
4966 }
4967
4968 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, strided_cm) {
4969 TEST_REQUIRES_ARM_NEON_FMA;
4970 GemmMicrokernelTester()
4971 .mr(5)
4972 .nr(8)
4973 .kr(1)
4974 .sr(1)
4975 .m(5)
4976 .n(8)
4977 .k(8)
4978 .cm_stride(11)
4979 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
4980 }
4981#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
4982
4983
4984#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07004985 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8) {
4986 TEST_REQUIRES_ARM_NEON_FMA;
4987 GemmMicrokernelTester()
4988 .mr(5)
4989 .nr(8)
4990 .kr(1)
4991 .sr(1)
4992 .m(5)
4993 .n(8)
4994 .k(8)
4995 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
4996 }
4997
4998 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, strided_cn) {
4999 TEST_REQUIRES_ARM_NEON_FMA;
5000 GemmMicrokernelTester()
5001 .mr(5)
5002 .nr(8)
5003 .kr(1)
5004 .sr(1)
5005 .m(5)
5006 .n(8)
5007 .k(8)
5008 .cn_stride(11)
5009 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
5010 }
5011
5012 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile) {
5013 TEST_REQUIRES_ARM_NEON_FMA;
5014 for (uint32_t m = 1; m <= 5; m++) {
5015 for (uint32_t n = 1; n <= 8; n++) {
5016 GemmMicrokernelTester()
5017 .mr(5)
5018 .nr(8)
5019 .kr(1)
5020 .sr(1)
5021 .m(m)
5022 .n(n)
5023 .k(8)
5024 .iterations(1)
5025 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
5026 }
5027 }
5028 }
5029
5030 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile_m) {
5031 TEST_REQUIRES_ARM_NEON_FMA;
5032 for (uint32_t m = 1; m <= 5; m++) {
5033 GemmMicrokernelTester()
5034 .mr(5)
5035 .nr(8)
5036 .kr(1)
5037 .sr(1)
5038 .m(m)
5039 .n(8)
5040 .k(8)
5041 .iterations(1)
5042 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
5043 }
5044 }
5045
5046 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile_n) {
5047 TEST_REQUIRES_ARM_NEON_FMA;
5048 for (uint32_t n = 1; n <= 8; n++) {
5049 GemmMicrokernelTester()
5050 .mr(5)
5051 .nr(8)
5052 .kr(1)
5053 .sr(1)
5054 .m(5)
5055 .n(n)
5056 .k(8)
5057 .iterations(1)
5058 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
5059 }
5060 }
5061
5062 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16) {
5063 TEST_REQUIRES_ARM_NEON_FMA;
5064 GemmMicrokernelTester()
5065 .mr(5)
5066 .nr(8)
5067 .kr(1)
5068 .sr(1)
5069 .m(5)
5070 .n(8)
5071 .k(16)
5072 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
5073 }
5074
5075 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16_subtile) {
5076 TEST_REQUIRES_ARM_NEON_FMA;
5077 for (uint32_t m = 1; m <= 5; m++) {
5078 for (uint32_t n = 1; n <= 8; n++) {
5079 GemmMicrokernelTester()
5080 .mr(5)
5081 .nr(8)
5082 .kr(1)
5083 .sr(1)
5084 .m(m)
5085 .n(n)
5086 .k(16)
5087 .iterations(1)
5088 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
5089 }
5090 }
5091 }
5092
5093 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16) {
5094 TEST_REQUIRES_ARM_NEON_FMA;
5095 for (size_t k = 1; k < 16; k++) {
5096 GemmMicrokernelTester()
5097 .mr(5)
5098 .nr(8)
5099 .kr(1)
5100 .sr(1)
5101 .m(5)
5102 .n(8)
5103 .k(k)
5104 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
5105 }
5106 }
5107
5108 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16_subtile) {
5109 TEST_REQUIRES_ARM_NEON_FMA;
5110 for (size_t k = 1; k < 16; k++) {
5111 for (uint32_t m = 1; m <= 5; m++) {
5112 for (uint32_t n = 1; n <= 8; n++) {
5113 GemmMicrokernelTester()
5114 .mr(5)
5115 .nr(8)
5116 .kr(1)
5117 .sr(1)
5118 .m(m)
5119 .n(n)
5120 .k(k)
5121 .iterations(1)
5122 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
5123 }
5124 }
5125 }
5126 }
5127
5128 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_16) {
5129 TEST_REQUIRES_ARM_NEON_FMA;
5130 for (size_t k = 17; k < 16; k++) {
5131 GemmMicrokernelTester()
5132 .mr(5)
5133 .nr(8)
5134 .kr(1)
5135 .sr(1)
5136 .m(5)
5137 .n(8)
5138 .k(k)
5139 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
5140 }
5141 }
5142
5143 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_8_subtile) {
5144 TEST_REQUIRES_ARM_NEON_FMA;
5145 for (size_t k = 17; k < 16; k++) {
5146 for (uint32_t m = 1; m <= 5; m++) {
5147 for (uint32_t n = 1; n <= 8; n++) {
5148 GemmMicrokernelTester()
5149 .mr(5)
5150 .nr(8)
5151 .kr(1)
5152 .sr(1)
5153 .m(m)
5154 .n(n)
5155 .k(k)
5156 .iterations(1)
5157 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
5158 }
5159 }
5160 }
5161 }
5162
5163 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8) {
5164 TEST_REQUIRES_ARM_NEON_FMA;
5165 for (size_t k = 24; k <= 80; k += 8) {
5166 GemmMicrokernelTester()
5167 .mr(5)
5168 .nr(8)
5169 .kr(1)
5170 .sr(1)
5171 .m(5)
5172 .n(8)
5173 .k(k)
5174 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
5175 }
5176 }
5177
5178 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8_subtile) {
5179 TEST_REQUIRES_ARM_NEON_FMA;
5180 for (size_t k = 24; k <= 80; k += 8) {
5181 for (uint32_t m = 1; m <= 5; m++) {
5182 for (uint32_t n = 1; n <= 8; n++) {
5183 GemmMicrokernelTester()
5184 .mr(5)
5185 .nr(8)
5186 .kr(1)
5187 .sr(1)
5188 .m(m)
5189 .n(n)
5190 .k(k)
5191 .iterations(1)
5192 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
5193 }
5194 }
5195 }
5196 }
5197
5198 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8) {
5199 TEST_REQUIRES_ARM_NEON_FMA;
5200 for (uint32_t n = 9; n < 16; n++) {
5201 for (size_t k = 1; k <= 40; k += 9) {
5202 GemmMicrokernelTester()
5203 .mr(5)
5204 .nr(8)
5205 .kr(1)
5206 .sr(1)
5207 .m(5)
5208 .n(8)
5209 .k(k)
5210 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
5211 }
5212 }
5213 }
5214
5215 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_strided_cn) {
5216 TEST_REQUIRES_ARM_NEON_FMA;
5217 for (uint32_t n = 9; n < 16; n++) {
5218 for (size_t k = 1; k <= 40; k += 9) {
5219 GemmMicrokernelTester()
5220 .mr(5)
5221 .nr(8)
5222 .kr(1)
5223 .sr(1)
5224 .m(5)
5225 .n(8)
5226 .k(k)
5227 .cn_stride(11)
5228 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
5229 }
5230 }
5231 }
5232
5233 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_subtile) {
5234 TEST_REQUIRES_ARM_NEON_FMA;
5235 for (uint32_t n = 9; n < 16; n++) {
5236 for (size_t k = 1; k <= 40; k += 9) {
5237 for (uint32_t m = 1; m <= 5; m++) {
5238 GemmMicrokernelTester()
5239 .mr(5)
5240 .nr(8)
5241 .kr(1)
5242 .sr(1)
5243 .m(m)
5244 .n(n)
5245 .k(k)
5246 .iterations(1)
5247 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
5248 }
5249 }
5250 }
5251 }
5252
5253 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8) {
5254 TEST_REQUIRES_ARM_NEON_FMA;
5255 for (uint32_t n = 16; n <= 24; n += 8) {
5256 for (size_t k = 1; k <= 40; k += 9) {
5257 GemmMicrokernelTester()
5258 .mr(5)
5259 .nr(8)
5260 .kr(1)
5261 .sr(1)
5262 .m(5)
5263 .n(8)
5264 .k(k)
5265 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
5266 }
5267 }
5268 }
5269
5270 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_strided_cn) {
5271 TEST_REQUIRES_ARM_NEON_FMA;
5272 for (uint32_t n = 16; n <= 24; n += 8) {
5273 for (size_t k = 1; k <= 40; k += 9) {
5274 GemmMicrokernelTester()
5275 .mr(5)
5276 .nr(8)
5277 .kr(1)
5278 .sr(1)
5279 .m(5)
5280 .n(n)
5281 .k(k)
5282 .cn_stride(11)
5283 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
5284 }
5285 }
5286 }
5287
5288 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_subtile) {
5289 TEST_REQUIRES_ARM_NEON_FMA;
5290 for (uint32_t n = 16; n <= 24; n += 8) {
5291 for (size_t k = 1; k <= 40; k += 9) {
5292 for (uint32_t m = 1; m <= 5; m++) {
5293 GemmMicrokernelTester()
5294 .mr(5)
5295 .nr(8)
5296 .kr(1)
5297 .sr(1)
5298 .m(m)
5299 .n(n)
5300 .k(k)
5301 .iterations(1)
5302 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
5303 }
5304 }
5305 }
5306 }
5307
5308 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, small_kernel) {
5309 TEST_REQUIRES_ARM_NEON_FMA;
5310 for (size_t k = 1; k <= 40; k += 9) {
5311 GemmMicrokernelTester()
5312 .mr(5)
5313 .nr(8)
5314 .kr(1)
5315 .sr(1)
5316 .m(5)
5317 .n(8)
5318 .k(k)
5319 .ks(3)
5320 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
5321 }
5322 }
5323
5324 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, small_kernel_subtile) {
5325 TEST_REQUIRES_ARM_NEON_FMA;
5326 for (size_t k = 1; k <= 40; k += 9) {
5327 for (uint32_t m = 1; m <= 5; m++) {
5328 for (uint32_t n = 1; n <= 8; n++) {
5329 GemmMicrokernelTester()
5330 .mr(5)
5331 .nr(8)
5332 .kr(1)
5333 .sr(1)
5334 .m(m)
5335 .n(n)
5336 .k(k)
5337 .ks(3)
5338 .iterations(1)
5339 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
5340 }
5341 }
5342 }
5343 }
5344
5345 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_small_kernel) {
5346 TEST_REQUIRES_ARM_NEON_FMA;
5347 for (uint32_t n = 9; n < 16; n++) {
5348 for (size_t k = 1; k <= 40; k += 9) {
5349 GemmMicrokernelTester()
5350 .mr(5)
5351 .nr(8)
5352 .kr(1)
5353 .sr(1)
5354 .m(5)
5355 .n(8)
5356 .k(k)
5357 .ks(3)
5358 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
5359 }
5360 }
5361 }
5362
5363 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_small_kernel) {
5364 TEST_REQUIRES_ARM_NEON_FMA;
5365 for (uint32_t n = 16; n <= 24; n += 8) {
5366 for (size_t k = 1; k <= 40; k += 9) {
5367 GemmMicrokernelTester()
5368 .mr(5)
5369 .nr(8)
5370 .kr(1)
5371 .sr(1)
5372 .m(5)
5373 .n(8)
5374 .k(k)
5375 .ks(3)
5376 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
5377 }
5378 }
5379 }
5380
5381 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, strided_cm_subtile) {
5382 TEST_REQUIRES_ARM_NEON_FMA;
5383 for (size_t k = 1; k <= 40; k += 9) {
5384 for (uint32_t m = 1; m <= 5; m++) {
5385 for (uint32_t n = 1; n <= 8; n++) {
5386 GemmMicrokernelTester()
5387 .mr(5)
5388 .nr(8)
5389 .kr(1)
5390 .sr(1)
5391 .m(m)
5392 .n(n)
5393 .k(k)
5394 .cm_stride(11)
5395 .iterations(1)
5396 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
5397 }
5398 }
5399 }
5400 }
5401
5402 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, a_offset) {
5403 TEST_REQUIRES_ARM_NEON_FMA;
5404 for (size_t k = 1; k <= 40; k += 9) {
5405 GemmMicrokernelTester()
5406 .mr(5)
5407 .nr(8)
5408 .kr(1)
5409 .sr(1)
5410 .m(5)
5411 .n(8)
5412 .k(k)
5413 .ks(3)
5414 .a_offset(211)
5415 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
5416 }
5417 }
5418
5419 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, zero) {
5420 TEST_REQUIRES_ARM_NEON_FMA;
5421 for (uint32_t mz = 0; mz < 5; mz++) {
5422 for (size_t k = 1; k <= 40; k += 9) {
5423 GemmMicrokernelTester()
5424 .mr(5)
5425 .nr(8)
5426 .kr(1)
5427 .sr(1)
5428 .m(5)
5429 .n(8)
5430 .k(k)
5431 .ks(3)
5432 .a_offset(211)
5433 .zero_index(mz)
5434 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
5435 }
5436 }
5437 }
5438
5439 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, qmin) {
5440 TEST_REQUIRES_ARM_NEON_FMA;
5441 GemmMicrokernelTester()
5442 .mr(5)
5443 .nr(8)
5444 .kr(1)
5445 .sr(1)
5446 .m(5)
5447 .n(8)
5448 .k(8)
5449 .qmin(128)
5450 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
5451 }
5452
5453 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, qmax) {
5454 TEST_REQUIRES_ARM_NEON_FMA;
5455 GemmMicrokernelTester()
5456 .mr(5)
5457 .nr(8)
5458 .kr(1)
5459 .sr(1)
5460 .m(5)
5461 .n(8)
5462 .k(8)
5463 .qmax(128)
5464 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
5465 }
5466
5467 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, strided_cm) {
5468 TEST_REQUIRES_ARM_NEON_FMA;
5469 GemmMicrokernelTester()
5470 .mr(5)
5471 .nr(8)
5472 .kr(1)
5473 .sr(1)
5474 .m(5)
5475 .n(8)
5476 .k(8)
5477 .cm_stride(11)
5478 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
5479 }
Frank Barchard7e955972019-10-11 10:34:25 -07005480#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07005481
5482
Frank Barchard7e955972019-10-11 10:34:25 -07005483#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
Frank Barchard80b537a2019-11-18 10:51:33 -08005484 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07005485 TEST_REQUIRES_ARM_NEON_FMA;
5486 GemmMicrokernelTester()
5487 .mr(6)
5488 .nr(8)
5489 .kr(1)
5490 .sr(1)
5491 .m(6)
5492 .n(8)
Frank Barchard80b537a2019-11-18 10:51:33 -08005493 .k(4)
Frank Barchard46fb8072019-10-25 12:54:22 -07005494 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
XNNPACK Teamb455b122019-09-27 18:10:33 -07005495 }
5496
Frank Barchard46fb8072019-10-25 12:54:22 -07005497 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07005498 TEST_REQUIRES_ARM_NEON_FMA;
5499 GemmMicrokernelTester()
5500 .mr(6)
5501 .nr(8)
5502 .kr(1)
5503 .sr(1)
5504 .m(6)
5505 .n(8)
Frank Barchard80b537a2019-11-18 10:51:33 -08005506 .k(4)
XNNPACK Teamb455b122019-09-27 18:10:33 -07005507 .cn_stride(11)
Frank Barchard46fb8072019-10-25 12:54:22 -07005508 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
XNNPACK Teamb455b122019-09-27 18:10:33 -07005509 }
5510
Frank Barchard80b537a2019-11-18 10:51:33 -08005511 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07005512 TEST_REQUIRES_ARM_NEON_FMA;
5513 for (uint32_t m = 1; m <= 6; m++) {
5514 for (uint32_t n = 1; n <= 8; n++) {
5515 GemmMicrokernelTester()
5516 .mr(6)
5517 .nr(8)
5518 .kr(1)
5519 .sr(1)
5520 .m(m)
5521 .n(n)
Frank Barchard80b537a2019-11-18 10:51:33 -08005522 .k(4)
XNNPACK Teamb455b122019-09-27 18:10:33 -07005523 .iterations(1)
Frank Barchard46fb8072019-10-25 12:54:22 -07005524 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
XNNPACK Teamb455b122019-09-27 18:10:33 -07005525 }
5526 }
5527 }
5528
Frank Barchard80b537a2019-11-18 10:51:33 -08005529 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile_m) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07005530 TEST_REQUIRES_ARM_NEON_FMA;
5531 for (uint32_t m = 1; m <= 6; m++) {
5532 GemmMicrokernelTester()
5533 .mr(6)
5534 .nr(8)
5535 .kr(1)
5536 .sr(1)
5537 .m(m)
5538 .n(8)
Frank Barchard80b537a2019-11-18 10:51:33 -08005539 .k(4)
XNNPACK Teamb455b122019-09-27 18:10:33 -07005540 .iterations(1)
Frank Barchard46fb8072019-10-25 12:54:22 -07005541 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
XNNPACK Teamb455b122019-09-27 18:10:33 -07005542 }
5543 }
5544
Frank Barchard80b537a2019-11-18 10:51:33 -08005545 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile_n) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07005546 TEST_REQUIRES_ARM_NEON_FMA;
5547 for (uint32_t n = 1; n <= 8; n++) {
5548 GemmMicrokernelTester()
5549 .mr(6)
5550 .nr(8)
5551 .kr(1)
5552 .sr(1)
5553 .m(6)
5554 .n(n)
Frank Barchard80b537a2019-11-18 10:51:33 -08005555 .k(4)
XNNPACK Teamb455b122019-09-27 18:10:33 -07005556 .iterations(1)
Frank Barchard46fb8072019-10-25 12:54:22 -07005557 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
XNNPACK Teamb455b122019-09-27 18:10:33 -07005558 }
5559 }
5560
Frank Barchard80b537a2019-11-18 10:51:33 -08005561 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07005562 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barchard80b537a2019-11-18 10:51:33 -08005563 GemmMicrokernelTester()
5564 .mr(6)
5565 .nr(8)
5566 .kr(1)
5567 .sr(1)
5568 .m(6)
5569 .n(8)
5570 .k(8)
5571 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
5572 }
5573
5574 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_subtile) {
5575 TEST_REQUIRES_ARM_NEON_FMA;
5576 for (uint32_t m = 1; m <= 6; m++) {
5577 for (uint32_t n = 1; n <= 8; n++) {
5578 GemmMicrokernelTester()
5579 .mr(6)
5580 .nr(8)
5581 .kr(1)
5582 .sr(1)
5583 .m(m)
5584 .n(n)
5585 .k(8)
5586 .iterations(1)
5587 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
5588 }
5589 }
5590 }
5591
5592 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_8) {
5593 TEST_REQUIRES_ARM_NEON_FMA;
5594 for (size_t k = 1; k < 8; k++) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07005595 GemmMicrokernelTester()
5596 .mr(6)
5597 .nr(8)
5598 .kr(1)
5599 .sr(1)
5600 .m(6)
5601 .n(8)
5602 .k(k)
Frank Barchard46fb8072019-10-25 12:54:22 -07005603 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
XNNPACK Teamb455b122019-09-27 18:10:33 -07005604 }
5605 }
5606
Frank Barchard80b537a2019-11-18 10:51:33 -08005607 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07005608 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barchard80b537a2019-11-18 10:51:33 -08005609 for (size_t k = 1; k < 8; k++) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07005610 for (uint32_t m = 1; m <= 6; m++) {
5611 for (uint32_t n = 1; n <= 8; n++) {
5612 GemmMicrokernelTester()
5613 .mr(6)
5614 .nr(8)
5615 .kr(1)
5616 .sr(1)
5617 .m(m)
5618 .n(n)
5619 .k(k)
5620 .iterations(1)
Frank Barchard46fb8072019-10-25 12:54:22 -07005621 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
XNNPACK Teamb455b122019-09-27 18:10:33 -07005622 }
5623 }
5624 }
5625 }
5626
Frank Barchard80b537a2019-11-18 10:51:33 -08005627 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07005628 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barchard80b537a2019-11-18 10:51:33 -08005629 for (size_t k = 9; k < 8; k++) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07005630 GemmMicrokernelTester()
5631 .mr(6)
5632 .nr(8)
5633 .kr(1)
5634 .sr(1)
5635 .m(6)
5636 .n(8)
5637 .k(k)
Frank Barchard46fb8072019-10-25 12:54:22 -07005638 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
XNNPACK Teamb455b122019-09-27 18:10:33 -07005639 }
5640 }
5641
Frank Barchard80b537a2019-11-18 10:51:33 -08005642 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_4_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07005643 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barchard80b537a2019-11-18 10:51:33 -08005644 for (size_t k = 9; k < 8; k++) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07005645 for (uint32_t m = 1; m <= 6; m++) {
5646 for (uint32_t n = 1; n <= 8; n++) {
5647 GemmMicrokernelTester()
5648 .mr(6)
5649 .nr(8)
5650 .kr(1)
5651 .sr(1)
5652 .m(m)
5653 .n(n)
5654 .k(k)
5655 .iterations(1)
Frank Barchard46fb8072019-10-25 12:54:22 -07005656 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
XNNPACK Teamb455b122019-09-27 18:10:33 -07005657 }
5658 }
5659 }
5660 }
5661
Frank Barchard80b537a2019-11-18 10:51:33 -08005662 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, k_div_4) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07005663 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barchard80b537a2019-11-18 10:51:33 -08005664 for (size_t k = 12; k <= 40; k += 4) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07005665 GemmMicrokernelTester()
5666 .mr(6)
5667 .nr(8)
5668 .kr(1)
5669 .sr(1)
5670 .m(6)
5671 .n(8)
5672 .k(k)
Frank Barchard46fb8072019-10-25 12:54:22 -07005673 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
XNNPACK Teamb455b122019-09-27 18:10:33 -07005674 }
5675 }
5676
Frank Barchard80b537a2019-11-18 10:51:33 -08005677 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, k_div_4_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07005678 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barchard80b537a2019-11-18 10:51:33 -08005679 for (size_t k = 12; k <= 40; k += 4) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07005680 for (uint32_t m = 1; m <= 6; m++) {
5681 for (uint32_t n = 1; n <= 8; n++) {
5682 GemmMicrokernelTester()
5683 .mr(6)
5684 .nr(8)
5685 .kr(1)
5686 .sr(1)
5687 .m(m)
5688 .n(n)
5689 .k(k)
5690 .iterations(1)
Frank Barchard46fb8072019-10-25 12:54:22 -07005691 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
XNNPACK Teamb455b122019-09-27 18:10:33 -07005692 }
5693 }
5694 }
5695 }
5696
Frank Barchard46fb8072019-10-25 12:54:22 -07005697 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07005698 TEST_REQUIRES_ARM_NEON_FMA;
5699 for (uint32_t n = 9; n < 16; n++) {
Frank Barchard80b537a2019-11-18 10:51:33 -08005700 for (size_t k = 1; k <= 20; k += 5) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07005701 GemmMicrokernelTester()
5702 .mr(6)
5703 .nr(8)
5704 .kr(1)
5705 .sr(1)
5706 .m(6)
5707 .n(8)
5708 .k(k)
Frank Barchard46fb8072019-10-25 12:54:22 -07005709 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
XNNPACK Teamb455b122019-09-27 18:10:33 -07005710 }
5711 }
5712 }
5713
Frank Barchard46fb8072019-10-25 12:54:22 -07005714 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07005715 TEST_REQUIRES_ARM_NEON_FMA;
5716 for (uint32_t n = 9; n < 16; n++) {
Frank Barchard80b537a2019-11-18 10:51:33 -08005717 for (size_t k = 1; k <= 20; k += 5) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07005718 GemmMicrokernelTester()
5719 .mr(6)
5720 .nr(8)
5721 .kr(1)
5722 .sr(1)
5723 .m(6)
5724 .n(8)
5725 .k(k)
5726 .cn_stride(11)
Frank Barchard46fb8072019-10-25 12:54:22 -07005727 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
XNNPACK Teamb455b122019-09-27 18:10:33 -07005728 }
5729 }
5730 }
5731
Frank Barchard46fb8072019-10-25 12:54:22 -07005732 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07005733 TEST_REQUIRES_ARM_NEON_FMA;
5734 for (uint32_t n = 9; n < 16; n++) {
Frank Barchard80b537a2019-11-18 10:51:33 -08005735 for (size_t k = 1; k <= 20; k += 5) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07005736 for (uint32_t m = 1; m <= 6; m++) {
5737 GemmMicrokernelTester()
5738 .mr(6)
5739 .nr(8)
5740 .kr(1)
5741 .sr(1)
5742 .m(m)
5743 .n(n)
5744 .k(k)
5745 .iterations(1)
Frank Barchard46fb8072019-10-25 12:54:22 -07005746 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
XNNPACK Teamb455b122019-09-27 18:10:33 -07005747 }
5748 }
5749 }
5750 }
5751
Frank Barchard46fb8072019-10-25 12:54:22 -07005752 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07005753 TEST_REQUIRES_ARM_NEON_FMA;
5754 for (uint32_t n = 16; n <= 24; n += 8) {
Frank Barchard80b537a2019-11-18 10:51:33 -08005755 for (size_t k = 1; k <= 20; k += 5) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07005756 GemmMicrokernelTester()
5757 .mr(6)
5758 .nr(8)
5759 .kr(1)
5760 .sr(1)
5761 .m(6)
5762 .n(8)
5763 .k(k)
Frank Barchard46fb8072019-10-25 12:54:22 -07005764 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
XNNPACK Teamb455b122019-09-27 18:10:33 -07005765 }
5766 }
5767 }
5768
Frank Barchard46fb8072019-10-25 12:54:22 -07005769 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07005770 TEST_REQUIRES_ARM_NEON_FMA;
5771 for (uint32_t n = 16; n <= 24; n += 8) {
Frank Barchard80b537a2019-11-18 10:51:33 -08005772 for (size_t k = 1; k <= 20; k += 5) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07005773 GemmMicrokernelTester()
5774 .mr(6)
5775 .nr(8)
5776 .kr(1)
5777 .sr(1)
5778 .m(6)
5779 .n(n)
5780 .k(k)
5781 .cn_stride(11)
Frank Barchard46fb8072019-10-25 12:54:22 -07005782 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
XNNPACK Teamb455b122019-09-27 18:10:33 -07005783 }
5784 }
5785 }
5786
Frank Barchard46fb8072019-10-25 12:54:22 -07005787 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07005788 TEST_REQUIRES_ARM_NEON_FMA;
5789 for (uint32_t n = 16; n <= 24; n += 8) {
Frank Barchard80b537a2019-11-18 10:51:33 -08005790 for (size_t k = 1; k <= 20; k += 5) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07005791 for (uint32_t m = 1; m <= 6; m++) {
5792 GemmMicrokernelTester()
5793 .mr(6)
5794 .nr(8)
5795 .kr(1)
5796 .sr(1)
5797 .m(m)
5798 .n(n)
5799 .k(k)
5800 .iterations(1)
Frank Barchard46fb8072019-10-25 12:54:22 -07005801 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
XNNPACK Teamb455b122019-09-27 18:10:33 -07005802 }
5803 }
5804 }
5805 }
5806
Frank Barchard46fb8072019-10-25 12:54:22 -07005807 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, small_kernel) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07005808 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barchard80b537a2019-11-18 10:51:33 -08005809 for (size_t k = 1; k <= 20; k += 5) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07005810 GemmMicrokernelTester()
5811 .mr(6)
5812 .nr(8)
5813 .kr(1)
5814 .sr(1)
5815 .m(6)
5816 .n(8)
5817 .k(k)
5818 .ks(3)
Frank Barchard46fb8072019-10-25 12:54:22 -07005819 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
XNNPACK Teamb455b122019-09-27 18:10:33 -07005820 }
5821 }
5822
Frank Barchard46fb8072019-10-25 12:54:22 -07005823 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, small_kernel_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07005824 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barchard80b537a2019-11-18 10:51:33 -08005825 for (size_t k = 1; k <= 20; k += 5) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07005826 for (uint32_t m = 1; m <= 6; m++) {
5827 for (uint32_t n = 1; n <= 8; n++) {
5828 GemmMicrokernelTester()
5829 .mr(6)
5830 .nr(8)
5831 .kr(1)
5832 .sr(1)
5833 .m(m)
5834 .n(n)
5835 .k(k)
5836 .ks(3)
5837 .iterations(1)
Frank Barchard46fb8072019-10-25 12:54:22 -07005838 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
XNNPACK Teamb455b122019-09-27 18:10:33 -07005839 }
5840 }
5841 }
5842 }
5843
Frank Barchard46fb8072019-10-25 12:54:22 -07005844 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_small_kernel) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07005845 TEST_REQUIRES_ARM_NEON_FMA;
5846 for (uint32_t n = 9; n < 16; n++) {
Frank Barchard80b537a2019-11-18 10:51:33 -08005847 for (size_t k = 1; k <= 20; k += 5) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07005848 GemmMicrokernelTester()
5849 .mr(6)
5850 .nr(8)
5851 .kr(1)
5852 .sr(1)
5853 .m(6)
5854 .n(8)
5855 .k(k)
5856 .ks(3)
Frank Barchard46fb8072019-10-25 12:54:22 -07005857 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
XNNPACK Teamb455b122019-09-27 18:10:33 -07005858 }
5859 }
5860 }
5861
Frank Barchard46fb8072019-10-25 12:54:22 -07005862 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_small_kernel) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07005863 TEST_REQUIRES_ARM_NEON_FMA;
5864 for (uint32_t n = 16; n <= 24; n += 8) {
Frank Barchard80b537a2019-11-18 10:51:33 -08005865 for (size_t k = 1; k <= 20; k += 5) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07005866 GemmMicrokernelTester()
5867 .mr(6)
5868 .nr(8)
5869 .kr(1)
5870 .sr(1)
5871 .m(6)
5872 .n(8)
5873 .k(k)
5874 .ks(3)
Frank Barchard46fb8072019-10-25 12:54:22 -07005875 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
XNNPACK Teamb455b122019-09-27 18:10:33 -07005876 }
5877 }
5878 }
5879
Frank Barchard46fb8072019-10-25 12:54:22 -07005880 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, strided_cm_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07005881 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barchard80b537a2019-11-18 10:51:33 -08005882 for (size_t k = 1; k <= 20; k += 5) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07005883 for (uint32_t m = 1; m <= 6; m++) {
5884 for (uint32_t n = 1; n <= 8; n++) {
5885 GemmMicrokernelTester()
5886 .mr(6)
5887 .nr(8)
5888 .kr(1)
5889 .sr(1)
5890 .m(m)
5891 .n(n)
5892 .k(k)
5893 .cm_stride(11)
5894 .iterations(1)
Frank Barchard46fb8072019-10-25 12:54:22 -07005895 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
XNNPACK Teamb455b122019-09-27 18:10:33 -07005896 }
5897 }
5898 }
5899 }
5900
Frank Barchard46fb8072019-10-25 12:54:22 -07005901 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, a_offset) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07005902 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barchard80b537a2019-11-18 10:51:33 -08005903 for (size_t k = 1; k <= 20; k += 5) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07005904 GemmMicrokernelTester()
5905 .mr(6)
5906 .nr(8)
5907 .kr(1)
5908 .sr(1)
5909 .m(6)
5910 .n(8)
5911 .k(k)
5912 .ks(3)
Frank Barchard80b537a2019-11-18 10:51:33 -08005913 .a_offset(127)
Frank Barchard46fb8072019-10-25 12:54:22 -07005914 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
XNNPACK Teamb455b122019-09-27 18:10:33 -07005915 }
5916 }
5917
Frank Barchard46fb8072019-10-25 12:54:22 -07005918 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, zero) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07005919 TEST_REQUIRES_ARM_NEON_FMA;
5920 for (uint32_t mz = 0; mz < 6; mz++) {
Frank Barchard80b537a2019-11-18 10:51:33 -08005921 for (size_t k = 1; k <= 20; k += 5) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07005922 GemmMicrokernelTester()
5923 .mr(6)
5924 .nr(8)
5925 .kr(1)
5926 .sr(1)
5927 .m(6)
5928 .n(8)
5929 .k(k)
5930 .ks(3)
Frank Barchard80b537a2019-11-18 10:51:33 -08005931 .a_offset(127)
XNNPACK Teamb455b122019-09-27 18:10:33 -07005932 .zero_index(mz)
Frank Barchard46fb8072019-10-25 12:54:22 -07005933 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
XNNPACK Teamb455b122019-09-27 18:10:33 -07005934 }
5935 }
5936 }
5937
Frank Barchard46fb8072019-10-25 12:54:22 -07005938 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, qmin) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07005939 TEST_REQUIRES_ARM_NEON_FMA;
5940 GemmMicrokernelTester()
5941 .mr(6)
5942 .nr(8)
5943 .kr(1)
5944 .sr(1)
5945 .m(6)
5946 .n(8)
Frank Barchard80b537a2019-11-18 10:51:33 -08005947 .k(4)
XNNPACK Teamb455b122019-09-27 18:10:33 -07005948 .qmin(128)
Frank Barchard46fb8072019-10-25 12:54:22 -07005949 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
XNNPACK Teamb455b122019-09-27 18:10:33 -07005950 }
5951
Frank Barchard46fb8072019-10-25 12:54:22 -07005952 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, qmax) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07005953 TEST_REQUIRES_ARM_NEON_FMA;
5954 GemmMicrokernelTester()
5955 .mr(6)
5956 .nr(8)
5957 .kr(1)
5958 .sr(1)
5959 .m(6)
5960 .n(8)
Frank Barchard80b537a2019-11-18 10:51:33 -08005961 .k(4)
XNNPACK Teamb455b122019-09-27 18:10:33 -07005962 .qmax(128)
Frank Barchard46fb8072019-10-25 12:54:22 -07005963 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
XNNPACK Teamb455b122019-09-27 18:10:33 -07005964 }
5965
Frank Barchard46fb8072019-10-25 12:54:22 -07005966 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, strided_cm) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07005967 TEST_REQUIRES_ARM_NEON_FMA;
5968 GemmMicrokernelTester()
5969 .mr(6)
5970 .nr(8)
5971 .kr(1)
5972 .sr(1)
5973 .m(6)
5974 .n(8)
Frank Barchard80b537a2019-11-18 10:51:33 -08005975 .k(4)
XNNPACK Teamb455b122019-09-27 18:10:33 -07005976 .cm_stride(11)
Frank Barchard46fb8072019-10-25 12:54:22 -07005977 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
XNNPACK Teamb455b122019-09-27 18:10:33 -07005978 }
Frank Barchard7e955972019-10-11 10:34:25 -07005979#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07005980
5981
Frank Barchard7e955972019-10-11 10:34:25 -07005982#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
Frank Barchard46fb8072019-10-25 12:54:22 -07005983 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, k_eq_8) {
5984 TEST_REQUIRES_ARM_NEON_FMA;
5985 GemmMicrokernelTester()
5986 .mr(6)
5987 .nr(8)
5988 .kr(1)
5989 .sr(1)
5990 .m(6)
5991 .n(8)
5992 .k(8)
5993 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
5994 }
5995
5996 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, strided_cn) {
5997 TEST_REQUIRES_ARM_NEON_FMA;
5998 GemmMicrokernelTester()
5999 .mr(6)
6000 .nr(8)
6001 .kr(1)
6002 .sr(1)
6003 .m(6)
6004 .n(8)
6005 .k(8)
6006 .cn_stride(11)
6007 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
6008 }
6009
6010 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, k_eq_8_subtile) {
6011 TEST_REQUIRES_ARM_NEON_FMA;
6012 for (uint32_t m = 1; m <= 6; m++) {
6013 for (uint32_t n = 1; n <= 8; n++) {
6014 GemmMicrokernelTester()
6015 .mr(6)
6016 .nr(8)
6017 .kr(1)
6018 .sr(1)
6019 .m(m)
6020 .n(n)
6021 .k(8)
6022 .iterations(1)
6023 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
6024 }
6025 }
6026 }
6027
6028 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, k_eq_8_subtile_m) {
6029 TEST_REQUIRES_ARM_NEON_FMA;
6030 for (uint32_t m = 1; m <= 6; m++) {
6031 GemmMicrokernelTester()
6032 .mr(6)
6033 .nr(8)
6034 .kr(1)
6035 .sr(1)
6036 .m(m)
6037 .n(8)
6038 .k(8)
6039 .iterations(1)
6040 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
6041 }
6042 }
6043
6044 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, k_eq_8_subtile_n) {
6045 TEST_REQUIRES_ARM_NEON_FMA;
6046 for (uint32_t n = 1; n <= 8; n++) {
6047 GemmMicrokernelTester()
6048 .mr(6)
6049 .nr(8)
6050 .kr(1)
6051 .sr(1)
6052 .m(6)
6053 .n(n)
6054 .k(8)
6055 .iterations(1)
6056 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
6057 }
6058 }
6059
6060 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, k_eq_16) {
6061 TEST_REQUIRES_ARM_NEON_FMA;
6062 GemmMicrokernelTester()
6063 .mr(6)
6064 .nr(8)
6065 .kr(1)
6066 .sr(1)
6067 .m(6)
6068 .n(8)
6069 .k(16)
6070 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
6071 }
6072
6073 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, k_eq_16_subtile) {
6074 TEST_REQUIRES_ARM_NEON_FMA;
6075 for (uint32_t m = 1; m <= 6; m++) {
6076 for (uint32_t n = 1; n <= 8; n++) {
6077 GemmMicrokernelTester()
6078 .mr(6)
6079 .nr(8)
6080 .kr(1)
6081 .sr(1)
6082 .m(m)
6083 .n(n)
6084 .k(16)
6085 .iterations(1)
6086 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
6087 }
6088 }
6089 }
6090
6091 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, k_lt_16) {
6092 TEST_REQUIRES_ARM_NEON_FMA;
6093 for (size_t k = 1; k < 16; k++) {
6094 GemmMicrokernelTester()
6095 .mr(6)
6096 .nr(8)
6097 .kr(1)
6098 .sr(1)
6099 .m(6)
6100 .n(8)
6101 .k(k)
6102 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
6103 }
6104 }
6105
6106 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, k_lt_16_subtile) {
6107 TEST_REQUIRES_ARM_NEON_FMA;
6108 for (size_t k = 1; k < 16; k++) {
6109 for (uint32_t m = 1; m <= 6; m++) {
6110 for (uint32_t n = 1; n <= 8; n++) {
6111 GemmMicrokernelTester()
6112 .mr(6)
6113 .nr(8)
6114 .kr(1)
6115 .sr(1)
6116 .m(m)
6117 .n(n)
6118 .k(k)
6119 .iterations(1)
6120 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
6121 }
6122 }
6123 }
6124 }
6125
6126 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, k_gt_16) {
6127 TEST_REQUIRES_ARM_NEON_FMA;
6128 for (size_t k = 17; k < 16; k++) {
6129 GemmMicrokernelTester()
6130 .mr(6)
6131 .nr(8)
6132 .kr(1)
6133 .sr(1)
6134 .m(6)
6135 .n(8)
6136 .k(k)
6137 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
6138 }
6139 }
6140
6141 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, k_gt_8_subtile) {
6142 TEST_REQUIRES_ARM_NEON_FMA;
6143 for (size_t k = 17; k < 16; k++) {
6144 for (uint32_t m = 1; m <= 6; m++) {
6145 for (uint32_t n = 1; n <= 8; n++) {
6146 GemmMicrokernelTester()
6147 .mr(6)
6148 .nr(8)
6149 .kr(1)
6150 .sr(1)
6151 .m(m)
6152 .n(n)
6153 .k(k)
6154 .iterations(1)
6155 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
6156 }
6157 }
6158 }
6159 }
6160
6161 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, k_div_8) {
6162 TEST_REQUIRES_ARM_NEON_FMA;
6163 for (size_t k = 24; k <= 80; k += 8) {
6164 GemmMicrokernelTester()
6165 .mr(6)
6166 .nr(8)
6167 .kr(1)
6168 .sr(1)
6169 .m(6)
6170 .n(8)
6171 .k(k)
6172 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
6173 }
6174 }
6175
6176 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, k_div_8_subtile) {
6177 TEST_REQUIRES_ARM_NEON_FMA;
6178 for (size_t k = 24; k <= 80; k += 8) {
6179 for (uint32_t m = 1; m <= 6; m++) {
6180 for (uint32_t n = 1; n <= 8; n++) {
6181 GemmMicrokernelTester()
6182 .mr(6)
6183 .nr(8)
6184 .kr(1)
6185 .sr(1)
6186 .m(m)
6187 .n(n)
6188 .k(k)
6189 .iterations(1)
6190 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
6191 }
6192 }
6193 }
6194 }
6195
6196 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, n_gt_8) {
6197 TEST_REQUIRES_ARM_NEON_FMA;
6198 for (uint32_t n = 9; n < 16; n++) {
6199 for (size_t k = 1; k <= 40; k += 9) {
6200 GemmMicrokernelTester()
6201 .mr(6)
6202 .nr(8)
6203 .kr(1)
6204 .sr(1)
6205 .m(6)
6206 .n(8)
6207 .k(k)
6208 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
6209 }
6210 }
6211 }
6212
6213 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, n_gt_8_strided_cn) {
6214 TEST_REQUIRES_ARM_NEON_FMA;
6215 for (uint32_t n = 9; n < 16; n++) {
6216 for (size_t k = 1; k <= 40; k += 9) {
6217 GemmMicrokernelTester()
6218 .mr(6)
6219 .nr(8)
6220 .kr(1)
6221 .sr(1)
6222 .m(6)
6223 .n(8)
6224 .k(k)
6225 .cn_stride(11)
6226 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
6227 }
6228 }
6229 }
6230
6231 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, n_gt_8_subtile) {
6232 TEST_REQUIRES_ARM_NEON_FMA;
6233 for (uint32_t n = 9; n < 16; n++) {
6234 for (size_t k = 1; k <= 40; k += 9) {
6235 for (uint32_t m = 1; m <= 6; m++) {
6236 GemmMicrokernelTester()
6237 .mr(6)
6238 .nr(8)
6239 .kr(1)
6240 .sr(1)
6241 .m(m)
6242 .n(n)
6243 .k(k)
6244 .iterations(1)
6245 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
6246 }
6247 }
6248 }
6249 }
6250
6251 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, n_div_8) {
6252 TEST_REQUIRES_ARM_NEON_FMA;
6253 for (uint32_t n = 16; n <= 24; n += 8) {
6254 for (size_t k = 1; k <= 40; k += 9) {
6255 GemmMicrokernelTester()
6256 .mr(6)
6257 .nr(8)
6258 .kr(1)
6259 .sr(1)
6260 .m(6)
6261 .n(8)
6262 .k(k)
6263 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
6264 }
6265 }
6266 }
6267
6268 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, n_div_8_strided_cn) {
6269 TEST_REQUIRES_ARM_NEON_FMA;
6270 for (uint32_t n = 16; n <= 24; n += 8) {
6271 for (size_t k = 1; k <= 40; k += 9) {
6272 GemmMicrokernelTester()
6273 .mr(6)
6274 .nr(8)
6275 .kr(1)
6276 .sr(1)
6277 .m(6)
6278 .n(n)
6279 .k(k)
6280 .cn_stride(11)
6281 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
6282 }
6283 }
6284 }
6285
6286 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, n_div_8_subtile) {
6287 TEST_REQUIRES_ARM_NEON_FMA;
6288 for (uint32_t n = 16; n <= 24; n += 8) {
6289 for (size_t k = 1; k <= 40; k += 9) {
6290 for (uint32_t m = 1; m <= 6; m++) {
6291 GemmMicrokernelTester()
6292 .mr(6)
6293 .nr(8)
6294 .kr(1)
6295 .sr(1)
6296 .m(m)
6297 .n(n)
6298 .k(k)
6299 .iterations(1)
6300 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
6301 }
6302 }
6303 }
6304 }
6305
6306 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, small_kernel) {
6307 TEST_REQUIRES_ARM_NEON_FMA;
6308 for (size_t k = 1; k <= 40; k += 9) {
6309 GemmMicrokernelTester()
6310 .mr(6)
6311 .nr(8)
6312 .kr(1)
6313 .sr(1)
6314 .m(6)
6315 .n(8)
6316 .k(k)
6317 .ks(3)
6318 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
6319 }
6320 }
6321
6322 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, small_kernel_subtile) {
6323 TEST_REQUIRES_ARM_NEON_FMA;
6324 for (size_t k = 1; k <= 40; k += 9) {
6325 for (uint32_t m = 1; m <= 6; m++) {
6326 for (uint32_t n = 1; n <= 8; n++) {
6327 GemmMicrokernelTester()
6328 .mr(6)
6329 .nr(8)
6330 .kr(1)
6331 .sr(1)
6332 .m(m)
6333 .n(n)
6334 .k(k)
6335 .ks(3)
6336 .iterations(1)
6337 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
6338 }
6339 }
6340 }
6341 }
6342
6343 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, n_gt_8_small_kernel) {
6344 TEST_REQUIRES_ARM_NEON_FMA;
6345 for (uint32_t n = 9; n < 16; n++) {
6346 for (size_t k = 1; k <= 40; k += 9) {
6347 GemmMicrokernelTester()
6348 .mr(6)
6349 .nr(8)
6350 .kr(1)
6351 .sr(1)
6352 .m(6)
6353 .n(8)
6354 .k(k)
6355 .ks(3)
6356 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
6357 }
6358 }
6359 }
6360
6361 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, n_div_8_small_kernel) {
6362 TEST_REQUIRES_ARM_NEON_FMA;
6363 for (uint32_t n = 16; n <= 24; n += 8) {
6364 for (size_t k = 1; k <= 40; k += 9) {
6365 GemmMicrokernelTester()
6366 .mr(6)
6367 .nr(8)
6368 .kr(1)
6369 .sr(1)
6370 .m(6)
6371 .n(8)
6372 .k(k)
6373 .ks(3)
6374 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
6375 }
6376 }
6377 }
6378
6379 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, strided_cm_subtile) {
6380 TEST_REQUIRES_ARM_NEON_FMA;
6381 for (size_t k = 1; k <= 40; k += 9) {
6382 for (uint32_t m = 1; m <= 6; m++) {
6383 for (uint32_t n = 1; n <= 8; n++) {
6384 GemmMicrokernelTester()
6385 .mr(6)
6386 .nr(8)
6387 .kr(1)
6388 .sr(1)
6389 .m(m)
6390 .n(n)
6391 .k(k)
6392 .cm_stride(11)
6393 .iterations(1)
6394 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
6395 }
6396 }
6397 }
6398 }
6399
6400 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, a_offset) {
6401 TEST_REQUIRES_ARM_NEON_FMA;
6402 for (size_t k = 1; k <= 40; k += 9) {
6403 GemmMicrokernelTester()
6404 .mr(6)
6405 .nr(8)
6406 .kr(1)
6407 .sr(1)
6408 .m(6)
6409 .n(8)
6410 .k(k)
6411 .ks(3)
6412 .a_offset(251)
6413 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
6414 }
6415 }
6416
6417 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, zero) {
6418 TEST_REQUIRES_ARM_NEON_FMA;
6419 for (uint32_t mz = 0; mz < 6; mz++) {
6420 for (size_t k = 1; k <= 40; k += 9) {
6421 GemmMicrokernelTester()
6422 .mr(6)
6423 .nr(8)
6424 .kr(1)
6425 .sr(1)
6426 .m(6)
6427 .n(8)
6428 .k(k)
6429 .ks(3)
6430 .a_offset(251)
6431 .zero_index(mz)
6432 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
6433 }
6434 }
6435 }
6436
6437 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, qmin) {
6438 TEST_REQUIRES_ARM_NEON_FMA;
6439 GemmMicrokernelTester()
6440 .mr(6)
6441 .nr(8)
6442 .kr(1)
6443 .sr(1)
6444 .m(6)
6445 .n(8)
6446 .k(8)
6447 .qmin(128)
6448 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
6449 }
6450
6451 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, qmax) {
6452 TEST_REQUIRES_ARM_NEON_FMA;
6453 GemmMicrokernelTester()
6454 .mr(6)
6455 .nr(8)
6456 .kr(1)
6457 .sr(1)
6458 .m(6)
6459 .n(8)
6460 .k(8)
6461 .qmax(128)
6462 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
6463 }
6464
6465 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, strided_cm) {
6466 TEST_REQUIRES_ARM_NEON_FMA;
6467 GemmMicrokernelTester()
6468 .mr(6)
6469 .nr(8)
6470 .kr(1)
6471 .sr(1)
6472 .m(6)
6473 .n(8)
6474 .k(8)
6475 .cm_stride(11)
6476 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
6477 }
6478#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
6479
6480
6481#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
Frank Barchard387c2d12019-12-16 19:14:07 -08006482 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8) {
6483 TEST_REQUIRES_ARM_NEON_FMA;
6484 GemmMicrokernelTester()
6485 .mr(6)
6486 .nr(8)
6487 .kr(1)
6488 .sr(1)
6489 .m(6)
6490 .n(8)
6491 .k(8)
6492 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
6493 }
6494
6495 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, strided_cn) {
6496 TEST_REQUIRES_ARM_NEON_FMA;
6497 GemmMicrokernelTester()
6498 .mr(6)
6499 .nr(8)
6500 .kr(1)
6501 .sr(1)
6502 .m(6)
6503 .n(8)
6504 .k(8)
6505 .cn_stride(11)
6506 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
6507 }
6508
6509 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_subtile) {
6510 TEST_REQUIRES_ARM_NEON_FMA;
6511 for (uint32_t m = 1; m <= 6; m++) {
6512 for (uint32_t n = 1; n <= 8; n++) {
6513 GemmMicrokernelTester()
6514 .mr(6)
6515 .nr(8)
6516 .kr(1)
6517 .sr(1)
6518 .m(m)
6519 .n(n)
6520 .k(8)
6521 .iterations(1)
6522 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
6523 }
6524 }
6525 }
6526
6527 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_subtile_m) {
6528 TEST_REQUIRES_ARM_NEON_FMA;
6529 for (uint32_t m = 1; m <= 6; m++) {
6530 GemmMicrokernelTester()
6531 .mr(6)
6532 .nr(8)
6533 .kr(1)
6534 .sr(1)
6535 .m(m)
6536 .n(8)
6537 .k(8)
6538 .iterations(1)
6539 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
6540 }
6541 }
6542
6543 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_subtile_n) {
6544 TEST_REQUIRES_ARM_NEON_FMA;
6545 for (uint32_t n = 1; n <= 8; n++) {
6546 GemmMicrokernelTester()
6547 .mr(6)
6548 .nr(8)
6549 .kr(1)
6550 .sr(1)
6551 .m(6)
6552 .n(n)
6553 .k(8)
6554 .iterations(1)
6555 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
6556 }
6557 }
6558
6559 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_16) {
6560 TEST_REQUIRES_ARM_NEON_FMA;
6561 GemmMicrokernelTester()
6562 .mr(6)
6563 .nr(8)
6564 .kr(1)
6565 .sr(1)
6566 .m(6)
6567 .n(8)
6568 .k(16)
6569 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
6570 }
6571
6572 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_16_subtile) {
6573 TEST_REQUIRES_ARM_NEON_FMA;
6574 for (uint32_t m = 1; m <= 6; m++) {
6575 for (uint32_t n = 1; n <= 8; n++) {
6576 GemmMicrokernelTester()
6577 .mr(6)
6578 .nr(8)
6579 .kr(1)
6580 .sr(1)
6581 .m(m)
6582 .n(n)
6583 .k(16)
6584 .iterations(1)
6585 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
6586 }
6587 }
6588 }
6589
6590 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, k_lt_16) {
6591 TEST_REQUIRES_ARM_NEON_FMA;
6592 for (size_t k = 1; k < 16; k++) {
6593 GemmMicrokernelTester()
6594 .mr(6)
6595 .nr(8)
6596 .kr(1)
6597 .sr(1)
6598 .m(6)
6599 .n(8)
6600 .k(k)
6601 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
6602 }
6603 }
6604
6605 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, k_lt_16_subtile) {
6606 TEST_REQUIRES_ARM_NEON_FMA;
6607 for (size_t k = 1; k < 16; k++) {
6608 for (uint32_t m = 1; m <= 6; m++) {
6609 for (uint32_t n = 1; n <= 8; n++) {
6610 GemmMicrokernelTester()
6611 .mr(6)
6612 .nr(8)
6613 .kr(1)
6614 .sr(1)
6615 .m(m)
6616 .n(n)
6617 .k(k)
6618 .iterations(1)
6619 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
6620 }
6621 }
6622 }
6623 }
6624
6625 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, k_gt_16) {
6626 TEST_REQUIRES_ARM_NEON_FMA;
6627 for (size_t k = 17; k < 16; k++) {
6628 GemmMicrokernelTester()
6629 .mr(6)
6630 .nr(8)
6631 .kr(1)
6632 .sr(1)
6633 .m(6)
6634 .n(8)
6635 .k(k)
6636 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
6637 }
6638 }
6639
6640 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, k_gt_8_subtile) {
6641 TEST_REQUIRES_ARM_NEON_FMA;
6642 for (size_t k = 17; k < 16; k++) {
6643 for (uint32_t m = 1; m <= 6; m++) {
6644 for (uint32_t n = 1; n <= 8; n++) {
6645 GemmMicrokernelTester()
6646 .mr(6)
6647 .nr(8)
6648 .kr(1)
6649 .sr(1)
6650 .m(m)
6651 .n(n)
6652 .k(k)
6653 .iterations(1)
6654 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
6655 }
6656 }
6657 }
6658 }
6659
6660 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, k_div_8) {
6661 TEST_REQUIRES_ARM_NEON_FMA;
6662 for (size_t k = 24; k <= 80; k += 8) {
6663 GemmMicrokernelTester()
6664 .mr(6)
6665 .nr(8)
6666 .kr(1)
6667 .sr(1)
6668 .m(6)
6669 .n(8)
6670 .k(k)
6671 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
6672 }
6673 }
6674
6675 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, k_div_8_subtile) {
6676 TEST_REQUIRES_ARM_NEON_FMA;
6677 for (size_t k = 24; k <= 80; k += 8) {
6678 for (uint32_t m = 1; m <= 6; m++) {
6679 for (uint32_t n = 1; n <= 8; n++) {
6680 GemmMicrokernelTester()
6681 .mr(6)
6682 .nr(8)
6683 .kr(1)
6684 .sr(1)
6685 .m(m)
6686 .n(n)
6687 .k(k)
6688 .iterations(1)
6689 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
6690 }
6691 }
6692 }
6693 }
6694
6695 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8) {
6696 TEST_REQUIRES_ARM_NEON_FMA;
6697 for (uint32_t n = 9; n < 16; n++) {
6698 for (size_t k = 1; k <= 40; k += 9) {
6699 GemmMicrokernelTester()
6700 .mr(6)
6701 .nr(8)
6702 .kr(1)
6703 .sr(1)
6704 .m(6)
6705 .n(8)
6706 .k(k)
6707 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
6708 }
6709 }
6710 }
6711
6712 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_strided_cn) {
6713 TEST_REQUIRES_ARM_NEON_FMA;
6714 for (uint32_t n = 9; n < 16; n++) {
6715 for (size_t k = 1; k <= 40; k += 9) {
6716 GemmMicrokernelTester()
6717 .mr(6)
6718 .nr(8)
6719 .kr(1)
6720 .sr(1)
6721 .m(6)
6722 .n(8)
6723 .k(k)
6724 .cn_stride(11)
6725 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
6726 }
6727 }
6728 }
6729
6730 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_subtile) {
6731 TEST_REQUIRES_ARM_NEON_FMA;
6732 for (uint32_t n = 9; n < 16; n++) {
6733 for (size_t k = 1; k <= 40; k += 9) {
6734 for (uint32_t m = 1; m <= 6; m++) {
6735 GemmMicrokernelTester()
6736 .mr(6)
6737 .nr(8)
6738 .kr(1)
6739 .sr(1)
6740 .m(m)
6741 .n(n)
6742 .k(k)
6743 .iterations(1)
6744 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
6745 }
6746 }
6747 }
6748 }
6749
6750 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8) {
6751 TEST_REQUIRES_ARM_NEON_FMA;
6752 for (uint32_t n = 16; n <= 24; n += 8) {
6753 for (size_t k = 1; k <= 40; k += 9) {
6754 GemmMicrokernelTester()
6755 .mr(6)
6756 .nr(8)
6757 .kr(1)
6758 .sr(1)
6759 .m(6)
6760 .n(8)
6761 .k(k)
6762 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
6763 }
6764 }
6765 }
6766
6767 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_strided_cn) {
6768 TEST_REQUIRES_ARM_NEON_FMA;
6769 for (uint32_t n = 16; n <= 24; n += 8) {
6770 for (size_t k = 1; k <= 40; k += 9) {
6771 GemmMicrokernelTester()
6772 .mr(6)
6773 .nr(8)
6774 .kr(1)
6775 .sr(1)
6776 .m(6)
6777 .n(n)
6778 .k(k)
6779 .cn_stride(11)
6780 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
6781 }
6782 }
6783 }
6784
6785 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_subtile) {
6786 TEST_REQUIRES_ARM_NEON_FMA;
6787 for (uint32_t n = 16; n <= 24; n += 8) {
6788 for (size_t k = 1; k <= 40; k += 9) {
6789 for (uint32_t m = 1; m <= 6; m++) {
6790 GemmMicrokernelTester()
6791 .mr(6)
6792 .nr(8)
6793 .kr(1)
6794 .sr(1)
6795 .m(m)
6796 .n(n)
6797 .k(k)
6798 .iterations(1)
6799 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
6800 }
6801 }
6802 }
6803 }
6804
6805 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, small_kernel) {
6806 TEST_REQUIRES_ARM_NEON_FMA;
6807 for (size_t k = 1; k <= 40; k += 9) {
6808 GemmMicrokernelTester()
6809 .mr(6)
6810 .nr(8)
6811 .kr(1)
6812 .sr(1)
6813 .m(6)
6814 .n(8)
6815 .k(k)
6816 .ks(3)
6817 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
6818 }
6819 }
6820
6821 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, small_kernel_subtile) {
6822 TEST_REQUIRES_ARM_NEON_FMA;
6823 for (size_t k = 1; k <= 40; k += 9) {
6824 for (uint32_t m = 1; m <= 6; m++) {
6825 for (uint32_t n = 1; n <= 8; n++) {
6826 GemmMicrokernelTester()
6827 .mr(6)
6828 .nr(8)
6829 .kr(1)
6830 .sr(1)
6831 .m(m)
6832 .n(n)
6833 .k(k)
6834 .ks(3)
6835 .iterations(1)
6836 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
6837 }
6838 }
6839 }
6840 }
6841
6842 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_small_kernel) {
6843 TEST_REQUIRES_ARM_NEON_FMA;
6844 for (uint32_t n = 9; n < 16; n++) {
6845 for (size_t k = 1; k <= 40; k += 9) {
6846 GemmMicrokernelTester()
6847 .mr(6)
6848 .nr(8)
6849 .kr(1)
6850 .sr(1)
6851 .m(6)
6852 .n(8)
6853 .k(k)
6854 .ks(3)
6855 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
6856 }
6857 }
6858 }
6859
6860 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_small_kernel) {
6861 TEST_REQUIRES_ARM_NEON_FMA;
6862 for (uint32_t n = 16; n <= 24; n += 8) {
6863 for (size_t k = 1; k <= 40; k += 9) {
6864 GemmMicrokernelTester()
6865 .mr(6)
6866 .nr(8)
6867 .kr(1)
6868 .sr(1)
6869 .m(6)
6870 .n(8)
6871 .k(k)
6872 .ks(3)
6873 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
6874 }
6875 }
6876 }
6877
6878 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, strided_cm_subtile) {
6879 TEST_REQUIRES_ARM_NEON_FMA;
6880 for (size_t k = 1; k <= 40; k += 9) {
6881 for (uint32_t m = 1; m <= 6; m++) {
6882 for (uint32_t n = 1; n <= 8; n++) {
6883 GemmMicrokernelTester()
6884 .mr(6)
6885 .nr(8)
6886 .kr(1)
6887 .sr(1)
6888 .m(m)
6889 .n(n)
6890 .k(k)
6891 .cm_stride(11)
6892 .iterations(1)
6893 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
6894 }
6895 }
6896 }
6897 }
6898
6899 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, a_offset) {
6900 TEST_REQUIRES_ARM_NEON_FMA;
6901 for (size_t k = 1; k <= 40; k += 9) {
6902 GemmMicrokernelTester()
6903 .mr(6)
6904 .nr(8)
6905 .kr(1)
6906 .sr(1)
6907 .m(6)
6908 .n(8)
6909 .k(k)
6910 .ks(3)
6911 .a_offset(251)
6912 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
6913 }
6914 }
6915
6916 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, zero) {
6917 TEST_REQUIRES_ARM_NEON_FMA;
6918 for (uint32_t mz = 0; mz < 6; mz++) {
6919 for (size_t k = 1; k <= 40; k += 9) {
6920 GemmMicrokernelTester()
6921 .mr(6)
6922 .nr(8)
6923 .kr(1)
6924 .sr(1)
6925 .m(6)
6926 .n(8)
6927 .k(k)
6928 .ks(3)
6929 .a_offset(251)
6930 .zero_index(mz)
6931 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
6932 }
6933 }
6934 }
6935
6936 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, qmin) {
6937 TEST_REQUIRES_ARM_NEON_FMA;
6938 GemmMicrokernelTester()
6939 .mr(6)
6940 .nr(8)
6941 .kr(1)
6942 .sr(1)
6943 .m(6)
6944 .n(8)
6945 .k(8)
6946 .qmin(128)
6947 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
6948 }
6949
6950 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, qmax) {
6951 TEST_REQUIRES_ARM_NEON_FMA;
6952 GemmMicrokernelTester()
6953 .mr(6)
6954 .nr(8)
6955 .kr(1)
6956 .sr(1)
6957 .m(6)
6958 .n(8)
6959 .k(8)
6960 .qmax(128)
6961 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
6962 }
6963
6964 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, strided_cm) {
6965 TEST_REQUIRES_ARM_NEON_FMA;
6966 GemmMicrokernelTester()
6967 .mr(6)
6968 .nr(8)
6969 .kr(1)
6970 .sr(1)
6971 .m(6)
6972 .n(8)
6973 .k(8)
6974 .cm_stride(11)
6975 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
6976 }
6977#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
6978
6979
6980#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07006981 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8) {
6982 TEST_REQUIRES_ARM_NEON_FMA;
6983 GemmMicrokernelTester()
6984 .mr(6)
6985 .nr(8)
6986 .kr(1)
6987 .sr(1)
6988 .m(6)
6989 .n(8)
6990 .k(8)
6991 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
6992 }
6993
6994 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, strided_cn) {
6995 TEST_REQUIRES_ARM_NEON_FMA;
6996 GemmMicrokernelTester()
6997 .mr(6)
6998 .nr(8)
6999 .kr(1)
7000 .sr(1)
7001 .m(6)
7002 .n(8)
7003 .k(8)
7004 .cn_stride(11)
7005 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
7006 }
7007
7008 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile) {
7009 TEST_REQUIRES_ARM_NEON_FMA;
7010 for (uint32_t m = 1; m <= 6; m++) {
7011 for (uint32_t n = 1; n <= 8; n++) {
7012 GemmMicrokernelTester()
7013 .mr(6)
7014 .nr(8)
7015 .kr(1)
7016 .sr(1)
7017 .m(m)
7018 .n(n)
7019 .k(8)
7020 .iterations(1)
7021 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
7022 }
7023 }
7024 }
7025
7026 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile_m) {
7027 TEST_REQUIRES_ARM_NEON_FMA;
7028 for (uint32_t m = 1; m <= 6; m++) {
7029 GemmMicrokernelTester()
7030 .mr(6)
7031 .nr(8)
7032 .kr(1)
7033 .sr(1)
7034 .m(m)
7035 .n(8)
7036 .k(8)
7037 .iterations(1)
7038 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
7039 }
7040 }
7041
7042 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile_n) {
7043 TEST_REQUIRES_ARM_NEON_FMA;
7044 for (uint32_t n = 1; n <= 8; n++) {
7045 GemmMicrokernelTester()
7046 .mr(6)
7047 .nr(8)
7048 .kr(1)
7049 .sr(1)
7050 .m(6)
7051 .n(n)
7052 .k(8)
7053 .iterations(1)
7054 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
7055 }
7056 }
7057
7058 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16) {
7059 TEST_REQUIRES_ARM_NEON_FMA;
7060 GemmMicrokernelTester()
7061 .mr(6)
7062 .nr(8)
7063 .kr(1)
7064 .sr(1)
7065 .m(6)
7066 .n(8)
7067 .k(16)
7068 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
7069 }
7070
7071 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16_subtile) {
7072 TEST_REQUIRES_ARM_NEON_FMA;
7073 for (uint32_t m = 1; m <= 6; m++) {
7074 for (uint32_t n = 1; n <= 8; n++) {
7075 GemmMicrokernelTester()
7076 .mr(6)
7077 .nr(8)
7078 .kr(1)
7079 .sr(1)
7080 .m(m)
7081 .n(n)
7082 .k(16)
7083 .iterations(1)
7084 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
7085 }
7086 }
7087 }
7088
7089 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16) {
7090 TEST_REQUIRES_ARM_NEON_FMA;
7091 for (size_t k = 1; k < 16; k++) {
7092 GemmMicrokernelTester()
7093 .mr(6)
7094 .nr(8)
7095 .kr(1)
7096 .sr(1)
7097 .m(6)
7098 .n(8)
7099 .k(k)
7100 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
7101 }
7102 }
7103
7104 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16_subtile) {
7105 TEST_REQUIRES_ARM_NEON_FMA;
7106 for (size_t k = 1; k < 16; k++) {
7107 for (uint32_t m = 1; m <= 6; m++) {
7108 for (uint32_t n = 1; n <= 8; n++) {
7109 GemmMicrokernelTester()
7110 .mr(6)
7111 .nr(8)
7112 .kr(1)
7113 .sr(1)
7114 .m(m)
7115 .n(n)
7116 .k(k)
7117 .iterations(1)
7118 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
7119 }
7120 }
7121 }
7122 }
7123
7124 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_16) {
7125 TEST_REQUIRES_ARM_NEON_FMA;
7126 for (size_t k = 17; k < 16; k++) {
7127 GemmMicrokernelTester()
7128 .mr(6)
7129 .nr(8)
7130 .kr(1)
7131 .sr(1)
7132 .m(6)
7133 .n(8)
7134 .k(k)
7135 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
7136 }
7137 }
7138
7139 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_8_subtile) {
7140 TEST_REQUIRES_ARM_NEON_FMA;
7141 for (size_t k = 17; k < 16; k++) {
7142 for (uint32_t m = 1; m <= 6; m++) {
7143 for (uint32_t n = 1; n <= 8; n++) {
7144 GemmMicrokernelTester()
7145 .mr(6)
7146 .nr(8)
7147 .kr(1)
7148 .sr(1)
7149 .m(m)
7150 .n(n)
7151 .k(k)
7152 .iterations(1)
7153 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
7154 }
7155 }
7156 }
7157 }
7158
7159 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8) {
7160 TEST_REQUIRES_ARM_NEON_FMA;
7161 for (size_t k = 24; k <= 80; k += 8) {
7162 GemmMicrokernelTester()
7163 .mr(6)
7164 .nr(8)
7165 .kr(1)
7166 .sr(1)
7167 .m(6)
7168 .n(8)
7169 .k(k)
7170 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
7171 }
7172 }
7173
7174 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8_subtile) {
7175 TEST_REQUIRES_ARM_NEON_FMA;
7176 for (size_t k = 24; k <= 80; k += 8) {
7177 for (uint32_t m = 1; m <= 6; m++) {
7178 for (uint32_t n = 1; n <= 8; n++) {
7179 GemmMicrokernelTester()
7180 .mr(6)
7181 .nr(8)
7182 .kr(1)
7183 .sr(1)
7184 .m(m)
7185 .n(n)
7186 .k(k)
7187 .iterations(1)
7188 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
7189 }
7190 }
7191 }
7192 }
7193
7194 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8) {
7195 TEST_REQUIRES_ARM_NEON_FMA;
7196 for (uint32_t n = 9; n < 16; n++) {
7197 for (size_t k = 1; k <= 40; k += 9) {
7198 GemmMicrokernelTester()
7199 .mr(6)
7200 .nr(8)
7201 .kr(1)
7202 .sr(1)
7203 .m(6)
7204 .n(8)
7205 .k(k)
7206 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
7207 }
7208 }
7209 }
7210
7211 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_strided_cn) {
7212 TEST_REQUIRES_ARM_NEON_FMA;
7213 for (uint32_t n = 9; n < 16; n++) {
7214 for (size_t k = 1; k <= 40; k += 9) {
7215 GemmMicrokernelTester()
7216 .mr(6)
7217 .nr(8)
7218 .kr(1)
7219 .sr(1)
7220 .m(6)
7221 .n(8)
7222 .k(k)
7223 .cn_stride(11)
7224 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
7225 }
7226 }
7227 }
7228
7229 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_subtile) {
7230 TEST_REQUIRES_ARM_NEON_FMA;
7231 for (uint32_t n = 9; n < 16; n++) {
7232 for (size_t k = 1; k <= 40; k += 9) {
7233 for (uint32_t m = 1; m <= 6; m++) {
7234 GemmMicrokernelTester()
7235 .mr(6)
7236 .nr(8)
7237 .kr(1)
7238 .sr(1)
7239 .m(m)
7240 .n(n)
7241 .k(k)
7242 .iterations(1)
7243 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
7244 }
7245 }
7246 }
7247 }
7248
7249 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8) {
7250 TEST_REQUIRES_ARM_NEON_FMA;
7251 for (uint32_t n = 16; n <= 24; n += 8) {
7252 for (size_t k = 1; k <= 40; k += 9) {
7253 GemmMicrokernelTester()
7254 .mr(6)
7255 .nr(8)
7256 .kr(1)
7257 .sr(1)
7258 .m(6)
7259 .n(8)
7260 .k(k)
7261 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
7262 }
7263 }
7264 }
7265
7266 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_strided_cn) {
7267 TEST_REQUIRES_ARM_NEON_FMA;
7268 for (uint32_t n = 16; n <= 24; n += 8) {
7269 for (size_t k = 1; k <= 40; k += 9) {
7270 GemmMicrokernelTester()
7271 .mr(6)
7272 .nr(8)
7273 .kr(1)
7274 .sr(1)
7275 .m(6)
7276 .n(n)
7277 .k(k)
7278 .cn_stride(11)
7279 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
7280 }
7281 }
7282 }
7283
7284 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_subtile) {
7285 TEST_REQUIRES_ARM_NEON_FMA;
7286 for (uint32_t n = 16; n <= 24; n += 8) {
7287 for (size_t k = 1; k <= 40; k += 9) {
7288 for (uint32_t m = 1; m <= 6; m++) {
7289 GemmMicrokernelTester()
7290 .mr(6)
7291 .nr(8)
7292 .kr(1)
7293 .sr(1)
7294 .m(m)
7295 .n(n)
7296 .k(k)
7297 .iterations(1)
7298 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
7299 }
7300 }
7301 }
7302 }
7303
7304 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, small_kernel) {
7305 TEST_REQUIRES_ARM_NEON_FMA;
7306 for (size_t k = 1; k <= 40; k += 9) {
7307 GemmMicrokernelTester()
7308 .mr(6)
7309 .nr(8)
7310 .kr(1)
7311 .sr(1)
7312 .m(6)
7313 .n(8)
7314 .k(k)
7315 .ks(3)
7316 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
7317 }
7318 }
7319
7320 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, small_kernel_subtile) {
7321 TEST_REQUIRES_ARM_NEON_FMA;
7322 for (size_t k = 1; k <= 40; k += 9) {
7323 for (uint32_t m = 1; m <= 6; m++) {
7324 for (uint32_t n = 1; n <= 8; n++) {
7325 GemmMicrokernelTester()
7326 .mr(6)
7327 .nr(8)
7328 .kr(1)
7329 .sr(1)
7330 .m(m)
7331 .n(n)
7332 .k(k)
7333 .ks(3)
7334 .iterations(1)
7335 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
7336 }
7337 }
7338 }
7339 }
7340
7341 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_small_kernel) {
7342 TEST_REQUIRES_ARM_NEON_FMA;
7343 for (uint32_t n = 9; n < 16; n++) {
7344 for (size_t k = 1; k <= 40; k += 9) {
7345 GemmMicrokernelTester()
7346 .mr(6)
7347 .nr(8)
7348 .kr(1)
7349 .sr(1)
7350 .m(6)
7351 .n(8)
7352 .k(k)
7353 .ks(3)
7354 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
7355 }
7356 }
7357 }
7358
7359 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_small_kernel) {
7360 TEST_REQUIRES_ARM_NEON_FMA;
7361 for (uint32_t n = 16; n <= 24; n += 8) {
7362 for (size_t k = 1; k <= 40; k += 9) {
7363 GemmMicrokernelTester()
7364 .mr(6)
7365 .nr(8)
7366 .kr(1)
7367 .sr(1)
7368 .m(6)
7369 .n(8)
7370 .k(k)
7371 .ks(3)
7372 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
7373 }
7374 }
7375 }
7376
7377 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, strided_cm_subtile) {
7378 TEST_REQUIRES_ARM_NEON_FMA;
7379 for (size_t k = 1; k <= 40; k += 9) {
7380 for (uint32_t m = 1; m <= 6; m++) {
7381 for (uint32_t n = 1; n <= 8; n++) {
7382 GemmMicrokernelTester()
7383 .mr(6)
7384 .nr(8)
7385 .kr(1)
7386 .sr(1)
7387 .m(m)
7388 .n(n)
7389 .k(k)
7390 .cm_stride(11)
7391 .iterations(1)
7392 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
7393 }
7394 }
7395 }
7396 }
7397
7398 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, a_offset) {
7399 TEST_REQUIRES_ARM_NEON_FMA;
7400 for (size_t k = 1; k <= 40; k += 9) {
7401 GemmMicrokernelTester()
7402 .mr(6)
7403 .nr(8)
7404 .kr(1)
7405 .sr(1)
7406 .m(6)
7407 .n(8)
7408 .k(k)
7409 .ks(3)
7410 .a_offset(251)
7411 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
7412 }
7413 }
7414
7415 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, zero) {
7416 TEST_REQUIRES_ARM_NEON_FMA;
7417 for (uint32_t mz = 0; mz < 6; mz++) {
7418 for (size_t k = 1; k <= 40; k += 9) {
7419 GemmMicrokernelTester()
7420 .mr(6)
7421 .nr(8)
7422 .kr(1)
7423 .sr(1)
7424 .m(6)
7425 .n(8)
7426 .k(k)
7427 .ks(3)
7428 .a_offset(251)
7429 .zero_index(mz)
7430 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
7431 }
7432 }
7433 }
7434
7435 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, qmin) {
7436 TEST_REQUIRES_ARM_NEON_FMA;
7437 GemmMicrokernelTester()
7438 .mr(6)
7439 .nr(8)
7440 .kr(1)
7441 .sr(1)
7442 .m(6)
7443 .n(8)
7444 .k(8)
7445 .qmin(128)
7446 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
7447 }
7448
7449 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, qmax) {
7450 TEST_REQUIRES_ARM_NEON_FMA;
7451 GemmMicrokernelTester()
7452 .mr(6)
7453 .nr(8)
7454 .kr(1)
7455 .sr(1)
7456 .m(6)
7457 .n(8)
7458 .k(8)
7459 .qmax(128)
7460 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
7461 }
7462
7463 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, strided_cm) {
7464 TEST_REQUIRES_ARM_NEON_FMA;
7465 GemmMicrokernelTester()
7466 .mr(6)
7467 .nr(8)
7468 .kr(1)
7469 .sr(1)
7470 .m(6)
7471 .n(8)
7472 .k(8)
7473 .cm_stride(11)
7474 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
7475 }
Frank Barchard7e955972019-10-11 10:34:25 -07007476#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07007477
7478
Frank Barchard7e955972019-10-11 10:34:25 -07007479#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07007480 TEST(F32_IGEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4) {
7481 TEST_REQUIRES_ARM_NEON_FMA;
7482 GemmMicrokernelTester()
7483 .mr(1)
7484 .nr(12)
7485 .kr(1)
7486 .sr(1)
7487 .m(1)
7488 .n(12)
7489 .k(4)
7490 .Test(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
7491 }
7492
7493 TEST(F32_IGEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, strided_cn) {
7494 TEST_REQUIRES_ARM_NEON_FMA;
7495 GemmMicrokernelTester()
7496 .mr(1)
7497 .nr(12)
7498 .kr(1)
7499 .sr(1)
7500 .m(1)
7501 .n(12)
7502 .k(4)
7503 .cn_stride(17)
7504 .Test(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
7505 }
7506
7507 TEST(F32_IGEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile) {
7508 TEST_REQUIRES_ARM_NEON_FMA;
7509 for (uint32_t m = 1; m <= 1; m++) {
7510 for (uint32_t n = 1; n <= 12; n++) {
7511 GemmMicrokernelTester()
7512 .mr(1)
7513 .nr(12)
7514 .kr(1)
7515 .sr(1)
7516 .m(m)
7517 .n(n)
7518 .k(4)
7519 .iterations(1)
7520 .Test(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
7521 }
7522 }
7523 }
7524
7525 TEST(F32_IGEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile_m) {
7526 TEST_REQUIRES_ARM_NEON_FMA;
7527 for (uint32_t m = 1; m <= 1; m++) {
7528 GemmMicrokernelTester()
7529 .mr(1)
7530 .nr(12)
7531 .kr(1)
7532 .sr(1)
7533 .m(m)
7534 .n(12)
7535 .k(4)
7536 .iterations(1)
7537 .Test(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
7538 }
7539 }
7540
7541 TEST(F32_IGEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile_n) {
7542 TEST_REQUIRES_ARM_NEON_FMA;
7543 for (uint32_t n = 1; n <= 12; n++) {
7544 GemmMicrokernelTester()
7545 .mr(1)
7546 .nr(12)
7547 .kr(1)
7548 .sr(1)
7549 .m(1)
7550 .n(n)
7551 .k(4)
7552 .iterations(1)
7553 .Test(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
7554 }
7555 }
7556
7557 TEST(F32_IGEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_8) {
7558 TEST_REQUIRES_ARM_NEON_FMA;
7559 GemmMicrokernelTester()
7560 .mr(1)
7561 .nr(12)
7562 .kr(1)
7563 .sr(1)
7564 .m(1)
7565 .n(12)
7566 .k(8)
7567 .Test(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
7568 }
7569
7570 TEST(F32_IGEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_subtile) {
7571 TEST_REQUIRES_ARM_NEON_FMA;
7572 for (uint32_t m = 1; m <= 1; m++) {
7573 for (uint32_t n = 1; n <= 12; n++) {
7574 GemmMicrokernelTester()
7575 .mr(1)
7576 .nr(12)
7577 .kr(1)
7578 .sr(1)
7579 .m(m)
7580 .n(n)
7581 .k(8)
7582 .iterations(1)
7583 .Test(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
7584 }
7585 }
7586 }
7587
7588 TEST(F32_IGEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_lt_8) {
7589 TEST_REQUIRES_ARM_NEON_FMA;
7590 for (size_t k = 1; k < 8; k++) {
7591 GemmMicrokernelTester()
7592 .mr(1)
7593 .nr(12)
7594 .kr(1)
7595 .sr(1)
7596 .m(1)
7597 .n(12)
7598 .k(k)
7599 .Test(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
7600 }
7601 }
7602
7603 TEST(F32_IGEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_lt_8_subtile) {
7604 TEST_REQUIRES_ARM_NEON_FMA;
7605 for (size_t k = 1; k < 8; k++) {
7606 for (uint32_t m = 1; m <= 1; m++) {
7607 for (uint32_t n = 1; n <= 12; n++) {
7608 GemmMicrokernelTester()
7609 .mr(1)
7610 .nr(12)
7611 .kr(1)
7612 .sr(1)
7613 .m(m)
7614 .n(n)
7615 .k(k)
7616 .iterations(1)
7617 .Test(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
7618 }
7619 }
7620 }
7621 }
7622
7623 TEST(F32_IGEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_gt_8) {
7624 TEST_REQUIRES_ARM_NEON_FMA;
7625 for (size_t k = 9; k < 8; k++) {
7626 GemmMicrokernelTester()
7627 .mr(1)
7628 .nr(12)
7629 .kr(1)
7630 .sr(1)
7631 .m(1)
7632 .n(12)
7633 .k(k)
7634 .Test(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
7635 }
7636 }
7637
7638 TEST(F32_IGEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_gt_4_subtile) {
7639 TEST_REQUIRES_ARM_NEON_FMA;
7640 for (size_t k = 9; k < 8; k++) {
7641 for (uint32_t m = 1; m <= 1; m++) {
7642 for (uint32_t n = 1; n <= 12; n++) {
7643 GemmMicrokernelTester()
7644 .mr(1)
7645 .nr(12)
7646 .kr(1)
7647 .sr(1)
7648 .m(m)
7649 .n(n)
7650 .k(k)
7651 .iterations(1)
7652 .Test(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
7653 }
7654 }
7655 }
7656 }
7657
7658 TEST(F32_IGEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_div_4) {
7659 TEST_REQUIRES_ARM_NEON_FMA;
7660 for (size_t k = 12; k <= 40; k += 4) {
7661 GemmMicrokernelTester()
7662 .mr(1)
7663 .nr(12)
7664 .kr(1)
7665 .sr(1)
7666 .m(1)
7667 .n(12)
7668 .k(k)
7669 .Test(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
7670 }
7671 }
7672
7673 TEST(F32_IGEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_div_4_subtile) {
7674 TEST_REQUIRES_ARM_NEON_FMA;
7675 for (size_t k = 12; k <= 40; k += 4) {
7676 for (uint32_t m = 1; m <= 1; m++) {
7677 for (uint32_t n = 1; n <= 12; n++) {
7678 GemmMicrokernelTester()
7679 .mr(1)
7680 .nr(12)
7681 .kr(1)
7682 .sr(1)
7683 .m(m)
7684 .n(n)
7685 .k(k)
7686 .iterations(1)
7687 .Test(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
7688 }
7689 }
7690 }
7691 }
7692
7693 TEST(F32_IGEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, n_gt_12) {
7694 TEST_REQUIRES_ARM_NEON_FMA;
7695 for (uint32_t n = 13; n < 24; n++) {
7696 for (size_t k = 1; k <= 20; k += 5) {
7697 GemmMicrokernelTester()
7698 .mr(1)
7699 .nr(12)
7700 .kr(1)
7701 .sr(1)
7702 .m(1)
7703 .n(12)
7704 .k(k)
7705 .Test(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
7706 }
7707 }
7708 }
7709
7710 TEST(F32_IGEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, n_gt_12_strided_cn) {
7711 TEST_REQUIRES_ARM_NEON_FMA;
7712 for (uint32_t n = 13; n < 24; n++) {
7713 for (size_t k = 1; k <= 20; k += 5) {
7714 GemmMicrokernelTester()
7715 .mr(1)
7716 .nr(12)
7717 .kr(1)
7718 .sr(1)
7719 .m(1)
7720 .n(12)
7721 .k(k)
7722 .cn_stride(17)
7723 .Test(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
7724 }
7725 }
7726 }
7727
7728 TEST(F32_IGEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, n_gt_12_subtile) {
7729 TEST_REQUIRES_ARM_NEON_FMA;
7730 for (uint32_t n = 13; n < 24; n++) {
7731 for (size_t k = 1; k <= 20; k += 5) {
7732 for (uint32_t m = 1; m <= 1; m++) {
7733 GemmMicrokernelTester()
7734 .mr(1)
7735 .nr(12)
7736 .kr(1)
7737 .sr(1)
7738 .m(m)
7739 .n(n)
7740 .k(k)
7741 .iterations(1)
7742 .Test(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
7743 }
7744 }
7745 }
7746 }
7747
7748 TEST(F32_IGEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, n_div_12) {
7749 TEST_REQUIRES_ARM_NEON_FMA;
7750 for (uint32_t n = 24; n <= 36; n += 12) {
7751 for (size_t k = 1; k <= 20; k += 5) {
7752 GemmMicrokernelTester()
7753 .mr(1)
7754 .nr(12)
7755 .kr(1)
7756 .sr(1)
7757 .m(1)
7758 .n(12)
7759 .k(k)
7760 .Test(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
7761 }
7762 }
7763 }
7764
7765 TEST(F32_IGEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, n_div_12_strided_cn) {
7766 TEST_REQUIRES_ARM_NEON_FMA;
7767 for (uint32_t n = 24; n <= 36; n += 12) {
7768 for (size_t k = 1; k <= 20; k += 5) {
7769 GemmMicrokernelTester()
7770 .mr(1)
7771 .nr(12)
7772 .kr(1)
7773 .sr(1)
7774 .m(1)
7775 .n(n)
7776 .k(k)
7777 .cn_stride(17)
7778 .Test(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
7779 }
7780 }
7781 }
7782
7783 TEST(F32_IGEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, n_div_12_subtile) {
7784 TEST_REQUIRES_ARM_NEON_FMA;
7785 for (uint32_t n = 24; n <= 36; n += 12) {
7786 for (size_t k = 1; k <= 20; k += 5) {
7787 for (uint32_t m = 1; m <= 1; m++) {
7788 GemmMicrokernelTester()
7789 .mr(1)
7790 .nr(12)
7791 .kr(1)
7792 .sr(1)
7793 .m(m)
7794 .n(n)
7795 .k(k)
7796 .iterations(1)
7797 .Test(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
7798 }
7799 }
7800 }
7801 }
7802
7803 TEST(F32_IGEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, small_kernel) {
7804 TEST_REQUIRES_ARM_NEON_FMA;
7805 for (size_t k = 1; k <= 20; k += 5) {
7806 GemmMicrokernelTester()
7807 .mr(1)
7808 .nr(12)
7809 .kr(1)
7810 .sr(1)
7811 .m(1)
7812 .n(12)
7813 .k(k)
7814 .ks(3)
7815 .Test(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
7816 }
7817 }
7818
7819 TEST(F32_IGEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, small_kernel_subtile) {
7820 TEST_REQUIRES_ARM_NEON_FMA;
7821 for (size_t k = 1; k <= 20; k += 5) {
7822 for (uint32_t m = 1; m <= 1; m++) {
7823 for (uint32_t n = 1; n <= 12; n++) {
7824 GemmMicrokernelTester()
7825 .mr(1)
7826 .nr(12)
7827 .kr(1)
7828 .sr(1)
7829 .m(m)
7830 .n(n)
7831 .k(k)
7832 .ks(3)
7833 .iterations(1)
7834 .Test(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
7835 }
7836 }
7837 }
7838 }
7839
7840 TEST(F32_IGEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, n_gt_12_small_kernel) {
7841 TEST_REQUIRES_ARM_NEON_FMA;
7842 for (uint32_t n = 13; n < 24; n++) {
7843 for (size_t k = 1; k <= 20; k += 5) {
7844 GemmMicrokernelTester()
7845 .mr(1)
7846 .nr(12)
7847 .kr(1)
7848 .sr(1)
7849 .m(1)
7850 .n(12)
7851 .k(k)
7852 .ks(3)
7853 .Test(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
7854 }
7855 }
7856 }
7857
7858 TEST(F32_IGEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, n_div_12_small_kernel) {
7859 TEST_REQUIRES_ARM_NEON_FMA;
7860 for (uint32_t n = 24; n <= 36; n += 12) {
7861 for (size_t k = 1; k <= 20; k += 5) {
7862 GemmMicrokernelTester()
7863 .mr(1)
7864 .nr(12)
7865 .kr(1)
7866 .sr(1)
7867 .m(1)
7868 .n(12)
7869 .k(k)
7870 .ks(3)
7871 .Test(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
7872 }
7873 }
7874 }
7875
7876 TEST(F32_IGEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, strided_cm_subtile) {
7877 TEST_REQUIRES_ARM_NEON_FMA;
7878 for (size_t k = 1; k <= 20; k += 5) {
7879 for (uint32_t m = 1; m <= 1; m++) {
7880 for (uint32_t n = 1; n <= 12; n++) {
7881 GemmMicrokernelTester()
7882 .mr(1)
7883 .nr(12)
7884 .kr(1)
7885 .sr(1)
7886 .m(m)
7887 .n(n)
7888 .k(k)
7889 .cm_stride(17)
7890 .iterations(1)
7891 .Test(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
7892 }
7893 }
7894 }
7895 }
7896
7897 TEST(F32_IGEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, a_offset) {
7898 TEST_REQUIRES_ARM_NEON_FMA;
7899 for (size_t k = 1; k <= 20; k += 5) {
7900 GemmMicrokernelTester()
7901 .mr(1)
7902 .nr(12)
7903 .kr(1)
7904 .sr(1)
7905 .m(1)
7906 .n(12)
7907 .k(k)
7908 .ks(3)
7909 .a_offset(23)
7910 .Test(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
7911 }
7912 }
7913
7914 TEST(F32_IGEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, zero) {
7915 TEST_REQUIRES_ARM_NEON_FMA;
7916 for (uint32_t mz = 0; mz < 1; mz++) {
7917 for (size_t k = 1; k <= 20; k += 5) {
7918 GemmMicrokernelTester()
7919 .mr(1)
7920 .nr(12)
7921 .kr(1)
7922 .sr(1)
7923 .m(1)
7924 .n(12)
7925 .k(k)
7926 .ks(3)
7927 .a_offset(23)
7928 .zero_index(mz)
7929 .Test(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
7930 }
7931 }
7932 }
7933
7934 TEST(F32_IGEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, qmin) {
7935 TEST_REQUIRES_ARM_NEON_FMA;
7936 GemmMicrokernelTester()
7937 .mr(1)
7938 .nr(12)
7939 .kr(1)
7940 .sr(1)
7941 .m(1)
7942 .n(12)
7943 .k(4)
7944 .qmin(128)
7945 .Test(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
7946 }
7947
7948 TEST(F32_IGEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, qmax) {
7949 TEST_REQUIRES_ARM_NEON_FMA;
7950 GemmMicrokernelTester()
7951 .mr(1)
7952 .nr(12)
7953 .kr(1)
7954 .sr(1)
7955 .m(1)
7956 .n(12)
7957 .k(4)
7958 .qmax(128)
7959 .Test(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
7960 }
7961
7962 TEST(F32_IGEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, strided_cm) {
7963 TEST_REQUIRES_ARM_NEON_FMA;
7964 GemmMicrokernelTester()
7965 .mr(1)
7966 .nr(12)
7967 .kr(1)
7968 .sr(1)
7969 .m(1)
7970 .n(12)
7971 .k(4)
7972 .cm_stride(17)
7973 .Test(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
7974 }
Frank Barchard7e955972019-10-11 10:34:25 -07007975#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07007976
7977
Frank Barchard7e955972019-10-11 10:34:25 -07007978#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07007979 TEST(F32_IGEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4) {
7980 TEST_REQUIRES_ARM_NEON_FMA;
7981 GemmMicrokernelTester()
7982 .mr(4)
7983 .nr(12)
7984 .kr(1)
7985 .sr(1)
7986 .m(4)
7987 .n(12)
7988 .k(4)
7989 .Test(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
7990 }
7991
7992 TEST(F32_IGEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, strided_cn) {
7993 TEST_REQUIRES_ARM_NEON_FMA;
7994 GemmMicrokernelTester()
7995 .mr(4)
7996 .nr(12)
7997 .kr(1)
7998 .sr(1)
7999 .m(4)
8000 .n(12)
8001 .k(4)
8002 .cn_stride(17)
8003 .Test(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
8004 }
8005
8006 TEST(F32_IGEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile) {
8007 TEST_REQUIRES_ARM_NEON_FMA;
8008 for (uint32_t m = 1; m <= 4; m++) {
8009 for (uint32_t n = 1; n <= 12; n++) {
8010 GemmMicrokernelTester()
8011 .mr(4)
8012 .nr(12)
8013 .kr(1)
8014 .sr(1)
8015 .m(m)
8016 .n(n)
8017 .k(4)
8018 .iterations(1)
8019 .Test(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
8020 }
8021 }
8022 }
8023
8024 TEST(F32_IGEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile_m) {
8025 TEST_REQUIRES_ARM_NEON_FMA;
8026 for (uint32_t m = 1; m <= 4; m++) {
8027 GemmMicrokernelTester()
8028 .mr(4)
8029 .nr(12)
8030 .kr(1)
8031 .sr(1)
8032 .m(m)
8033 .n(12)
8034 .k(4)
8035 .iterations(1)
8036 .Test(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
8037 }
8038 }
8039
8040 TEST(F32_IGEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile_n) {
8041 TEST_REQUIRES_ARM_NEON_FMA;
8042 for (uint32_t n = 1; n <= 12; n++) {
8043 GemmMicrokernelTester()
8044 .mr(4)
8045 .nr(12)
8046 .kr(1)
8047 .sr(1)
8048 .m(4)
8049 .n(n)
8050 .k(4)
8051 .iterations(1)
8052 .Test(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
8053 }
8054 }
8055
8056 TEST(F32_IGEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_8) {
8057 TEST_REQUIRES_ARM_NEON_FMA;
8058 GemmMicrokernelTester()
8059 .mr(4)
8060 .nr(12)
8061 .kr(1)
8062 .sr(1)
8063 .m(4)
8064 .n(12)
8065 .k(8)
8066 .Test(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
8067 }
8068
8069 TEST(F32_IGEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_subtile) {
8070 TEST_REQUIRES_ARM_NEON_FMA;
8071 for (uint32_t m = 1; m <= 4; m++) {
8072 for (uint32_t n = 1; n <= 12; n++) {
8073 GemmMicrokernelTester()
8074 .mr(4)
8075 .nr(12)
8076 .kr(1)
8077 .sr(1)
8078 .m(m)
8079 .n(n)
8080 .k(8)
8081 .iterations(1)
8082 .Test(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
8083 }
8084 }
8085 }
8086
8087 TEST(F32_IGEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, k_lt_8) {
8088 TEST_REQUIRES_ARM_NEON_FMA;
8089 for (size_t k = 1; k < 8; k++) {
8090 GemmMicrokernelTester()
8091 .mr(4)
8092 .nr(12)
8093 .kr(1)
8094 .sr(1)
8095 .m(4)
8096 .n(12)
8097 .k(k)
8098 .Test(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
8099 }
8100 }
8101
8102 TEST(F32_IGEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, k_lt_8_subtile) {
8103 TEST_REQUIRES_ARM_NEON_FMA;
8104 for (size_t k = 1; k < 8; k++) {
8105 for (uint32_t m = 1; m <= 4; m++) {
8106 for (uint32_t n = 1; n <= 12; n++) {
8107 GemmMicrokernelTester()
8108 .mr(4)
8109 .nr(12)
8110 .kr(1)
8111 .sr(1)
8112 .m(m)
8113 .n(n)
8114 .k(k)
8115 .iterations(1)
8116 .Test(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
8117 }
8118 }
8119 }
8120 }
8121
8122 TEST(F32_IGEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, k_gt_8) {
8123 TEST_REQUIRES_ARM_NEON_FMA;
8124 for (size_t k = 9; k < 8; k++) {
8125 GemmMicrokernelTester()
8126 .mr(4)
8127 .nr(12)
8128 .kr(1)
8129 .sr(1)
8130 .m(4)
8131 .n(12)
8132 .k(k)
8133 .Test(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
8134 }
8135 }
8136
8137 TEST(F32_IGEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, k_gt_4_subtile) {
8138 TEST_REQUIRES_ARM_NEON_FMA;
8139 for (size_t k = 9; k < 8; k++) {
8140 for (uint32_t m = 1; m <= 4; m++) {
8141 for (uint32_t n = 1; n <= 12; n++) {
8142 GemmMicrokernelTester()
8143 .mr(4)
8144 .nr(12)
8145 .kr(1)
8146 .sr(1)
8147 .m(m)
8148 .n(n)
8149 .k(k)
8150 .iterations(1)
8151 .Test(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
8152 }
8153 }
8154 }
8155 }
8156
8157 TEST(F32_IGEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, k_div_4) {
8158 TEST_REQUIRES_ARM_NEON_FMA;
8159 for (size_t k = 12; k <= 40; k += 4) {
8160 GemmMicrokernelTester()
8161 .mr(4)
8162 .nr(12)
8163 .kr(1)
8164 .sr(1)
8165 .m(4)
8166 .n(12)
8167 .k(k)
8168 .Test(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
8169 }
8170 }
8171
8172 TEST(F32_IGEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, k_div_4_subtile) {
8173 TEST_REQUIRES_ARM_NEON_FMA;
8174 for (size_t k = 12; k <= 40; k += 4) {
8175 for (uint32_t m = 1; m <= 4; m++) {
8176 for (uint32_t n = 1; n <= 12; n++) {
8177 GemmMicrokernelTester()
8178 .mr(4)
8179 .nr(12)
8180 .kr(1)
8181 .sr(1)
8182 .m(m)
8183 .n(n)
8184 .k(k)
8185 .iterations(1)
8186 .Test(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
8187 }
8188 }
8189 }
8190 }
8191
8192 TEST(F32_IGEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, n_gt_12) {
8193 TEST_REQUIRES_ARM_NEON_FMA;
8194 for (uint32_t n = 13; n < 24; n++) {
8195 for (size_t k = 1; k <= 20; k += 5) {
8196 GemmMicrokernelTester()
8197 .mr(4)
8198 .nr(12)
8199 .kr(1)
8200 .sr(1)
8201 .m(4)
8202 .n(12)
8203 .k(k)
8204 .Test(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
8205 }
8206 }
8207 }
8208
8209 TEST(F32_IGEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, n_gt_12_strided_cn) {
8210 TEST_REQUIRES_ARM_NEON_FMA;
8211 for (uint32_t n = 13; n < 24; n++) {
8212 for (size_t k = 1; k <= 20; k += 5) {
8213 GemmMicrokernelTester()
8214 .mr(4)
8215 .nr(12)
8216 .kr(1)
8217 .sr(1)
8218 .m(4)
8219 .n(12)
8220 .k(k)
8221 .cn_stride(17)
8222 .Test(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
8223 }
8224 }
8225 }
8226
8227 TEST(F32_IGEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, n_gt_12_subtile) {
8228 TEST_REQUIRES_ARM_NEON_FMA;
8229 for (uint32_t n = 13; n < 24; n++) {
8230 for (size_t k = 1; k <= 20; k += 5) {
8231 for (uint32_t m = 1; m <= 4; m++) {
8232 GemmMicrokernelTester()
8233 .mr(4)
8234 .nr(12)
8235 .kr(1)
8236 .sr(1)
8237 .m(m)
8238 .n(n)
8239 .k(k)
8240 .iterations(1)
8241 .Test(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
8242 }
8243 }
8244 }
8245 }
8246
8247 TEST(F32_IGEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, n_div_12) {
8248 TEST_REQUIRES_ARM_NEON_FMA;
8249 for (uint32_t n = 24; n <= 36; n += 12) {
8250 for (size_t k = 1; k <= 20; k += 5) {
8251 GemmMicrokernelTester()
8252 .mr(4)
8253 .nr(12)
8254 .kr(1)
8255 .sr(1)
8256 .m(4)
8257 .n(12)
8258 .k(k)
8259 .Test(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
8260 }
8261 }
8262 }
8263
8264 TEST(F32_IGEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, n_div_12_strided_cn) {
8265 TEST_REQUIRES_ARM_NEON_FMA;
8266 for (uint32_t n = 24; n <= 36; n += 12) {
8267 for (size_t k = 1; k <= 20; k += 5) {
8268 GemmMicrokernelTester()
8269 .mr(4)
8270 .nr(12)
8271 .kr(1)
8272 .sr(1)
8273 .m(4)
8274 .n(n)
8275 .k(k)
8276 .cn_stride(17)
8277 .Test(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
8278 }
8279 }
8280 }
8281
8282 TEST(F32_IGEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, n_div_12_subtile) {
8283 TEST_REQUIRES_ARM_NEON_FMA;
8284 for (uint32_t n = 24; n <= 36; n += 12) {
8285 for (size_t k = 1; k <= 20; k += 5) {
8286 for (uint32_t m = 1; m <= 4; m++) {
8287 GemmMicrokernelTester()
8288 .mr(4)
8289 .nr(12)
8290 .kr(1)
8291 .sr(1)
8292 .m(m)
8293 .n(n)
8294 .k(k)
8295 .iterations(1)
8296 .Test(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
8297 }
8298 }
8299 }
8300 }
8301
8302 TEST(F32_IGEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, small_kernel) {
8303 TEST_REQUIRES_ARM_NEON_FMA;
8304 for (size_t k = 1; k <= 20; k += 5) {
8305 GemmMicrokernelTester()
8306 .mr(4)
8307 .nr(12)
8308 .kr(1)
8309 .sr(1)
8310 .m(4)
8311 .n(12)
8312 .k(k)
8313 .ks(3)
8314 .Test(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
8315 }
8316 }
8317
8318 TEST(F32_IGEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, small_kernel_subtile) {
8319 TEST_REQUIRES_ARM_NEON_FMA;
8320 for (size_t k = 1; k <= 20; k += 5) {
8321 for (uint32_t m = 1; m <= 4; m++) {
8322 for (uint32_t n = 1; n <= 12; n++) {
8323 GemmMicrokernelTester()
8324 .mr(4)
8325 .nr(12)
8326 .kr(1)
8327 .sr(1)
8328 .m(m)
8329 .n(n)
8330 .k(k)
8331 .ks(3)
8332 .iterations(1)
8333 .Test(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
8334 }
8335 }
8336 }
8337 }
8338
8339 TEST(F32_IGEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, n_gt_12_small_kernel) {
8340 TEST_REQUIRES_ARM_NEON_FMA;
8341 for (uint32_t n = 13; n < 24; n++) {
8342 for (size_t k = 1; k <= 20; k += 5) {
8343 GemmMicrokernelTester()
8344 .mr(4)
8345 .nr(12)
8346 .kr(1)
8347 .sr(1)
8348 .m(4)
8349 .n(12)
8350 .k(k)
8351 .ks(3)
8352 .Test(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
8353 }
8354 }
8355 }
8356
8357 TEST(F32_IGEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, n_div_12_small_kernel) {
8358 TEST_REQUIRES_ARM_NEON_FMA;
8359 for (uint32_t n = 24; n <= 36; n += 12) {
8360 for (size_t k = 1; k <= 20; k += 5) {
8361 GemmMicrokernelTester()
8362 .mr(4)
8363 .nr(12)
8364 .kr(1)
8365 .sr(1)
8366 .m(4)
8367 .n(12)
8368 .k(k)
8369 .ks(3)
8370 .Test(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
8371 }
8372 }
8373 }
8374
8375 TEST(F32_IGEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, strided_cm_subtile) {
8376 TEST_REQUIRES_ARM_NEON_FMA;
8377 for (size_t k = 1; k <= 20; k += 5) {
8378 for (uint32_t m = 1; m <= 4; m++) {
8379 for (uint32_t n = 1; n <= 12; n++) {
8380 GemmMicrokernelTester()
8381 .mr(4)
8382 .nr(12)
8383 .kr(1)
8384 .sr(1)
8385 .m(m)
8386 .n(n)
8387 .k(k)
8388 .cm_stride(17)
8389 .iterations(1)
8390 .Test(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
8391 }
8392 }
8393 }
8394 }
8395
8396 TEST(F32_IGEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, a_offset) {
8397 TEST_REQUIRES_ARM_NEON_FMA;
8398 for (size_t k = 1; k <= 20; k += 5) {
8399 GemmMicrokernelTester()
8400 .mr(4)
8401 .nr(12)
8402 .kr(1)
8403 .sr(1)
8404 .m(4)
8405 .n(12)
8406 .k(k)
8407 .ks(3)
8408 .a_offset(83)
8409 .Test(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
8410 }
8411 }
8412
8413 TEST(F32_IGEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, zero) {
8414 TEST_REQUIRES_ARM_NEON_FMA;
8415 for (uint32_t mz = 0; mz < 4; mz++) {
8416 for (size_t k = 1; k <= 20; k += 5) {
8417 GemmMicrokernelTester()
8418 .mr(4)
8419 .nr(12)
8420 .kr(1)
8421 .sr(1)
8422 .m(4)
8423 .n(12)
8424 .k(k)
8425 .ks(3)
8426 .a_offset(83)
8427 .zero_index(mz)
8428 .Test(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
8429 }
8430 }
8431 }
8432
8433 TEST(F32_IGEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, qmin) {
8434 TEST_REQUIRES_ARM_NEON_FMA;
8435 GemmMicrokernelTester()
8436 .mr(4)
8437 .nr(12)
8438 .kr(1)
8439 .sr(1)
8440 .m(4)
8441 .n(12)
8442 .k(4)
8443 .qmin(128)
8444 .Test(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
8445 }
8446
8447 TEST(F32_IGEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, qmax) {
8448 TEST_REQUIRES_ARM_NEON_FMA;
8449 GemmMicrokernelTester()
8450 .mr(4)
8451 .nr(12)
8452 .kr(1)
8453 .sr(1)
8454 .m(4)
8455 .n(12)
8456 .k(4)
8457 .qmax(128)
8458 .Test(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
8459 }
8460
8461 TEST(F32_IGEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, strided_cm) {
8462 TEST_REQUIRES_ARM_NEON_FMA;
8463 GemmMicrokernelTester()
8464 .mr(4)
8465 .nr(12)
8466 .kr(1)
8467 .sr(1)
8468 .m(4)
8469 .n(12)
8470 .k(4)
8471 .cm_stride(17)
8472 .Test(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
8473 }
Frank Barchard7e955972019-10-11 10:34:25 -07008474#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07008475
8476
Frank Barchard5243bb02019-11-22 16:37:50 -08008477#if XNN_ARCH_ARM || XNN_ARCH_ARM64
Frank Barchard91317c52019-11-22 10:54:35 -08008478 TEST(F32_IGEMM_1X8__NEON_LANE_LD64, k_eq_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008479 TEST_REQUIRES_ARM_NEON;
8480 GemmMicrokernelTester()
8481 .mr(1)
8482 .nr(8)
8483 .kr(1)
8484 .sr(1)
8485 .m(1)
8486 .n(8)
8487 .k(2)
Frank Barchard91317c52019-11-22 10:54:35 -08008488 .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008489 }
8490
Frank Barchard91317c52019-11-22 10:54:35 -08008491 TEST(F32_IGEMM_1X8__NEON_LANE_LD64, strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008492 TEST_REQUIRES_ARM_NEON;
8493 GemmMicrokernelTester()
8494 .mr(1)
8495 .nr(8)
8496 .kr(1)
8497 .sr(1)
8498 .m(1)
8499 .n(8)
8500 .k(2)
8501 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -08008502 .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008503 }
8504
Frank Barchard91317c52019-11-22 10:54:35 -08008505 TEST(F32_IGEMM_1X8__NEON_LANE_LD64, k_eq_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008506 TEST_REQUIRES_ARM_NEON;
8507 for (uint32_t m = 1; m <= 1; m++) {
8508 for (uint32_t n = 1; n <= 8; n++) {
8509 GemmMicrokernelTester()
8510 .mr(1)
8511 .nr(8)
8512 .kr(1)
8513 .sr(1)
8514 .m(m)
8515 .n(n)
8516 .k(2)
8517 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08008518 .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008519 }
8520 }
8521 }
8522
Frank Barchard91317c52019-11-22 10:54:35 -08008523 TEST(F32_IGEMM_1X8__NEON_LANE_LD64, k_eq_2_subtile_m) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008524 TEST_REQUIRES_ARM_NEON;
8525 for (uint32_t m = 1; m <= 1; m++) {
8526 GemmMicrokernelTester()
8527 .mr(1)
8528 .nr(8)
8529 .kr(1)
8530 .sr(1)
8531 .m(m)
8532 .n(8)
8533 .k(2)
8534 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08008535 .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008536 }
8537 }
8538
Frank Barchard91317c52019-11-22 10:54:35 -08008539 TEST(F32_IGEMM_1X8__NEON_LANE_LD64, k_eq_2_subtile_n) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008540 TEST_REQUIRES_ARM_NEON;
8541 for (uint32_t n = 1; n <= 8; n++) {
8542 GemmMicrokernelTester()
8543 .mr(1)
8544 .nr(8)
8545 .kr(1)
8546 .sr(1)
8547 .m(1)
8548 .n(n)
8549 .k(2)
8550 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08008551 .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008552 }
8553 }
8554
Frank Barchard91317c52019-11-22 10:54:35 -08008555 TEST(F32_IGEMM_1X8__NEON_LANE_LD64, k_lt_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008556 TEST_REQUIRES_ARM_NEON;
8557 for (size_t k = 1; k < 2; k++) {
8558 GemmMicrokernelTester()
8559 .mr(1)
8560 .nr(8)
8561 .kr(1)
8562 .sr(1)
8563 .m(1)
8564 .n(8)
8565 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -08008566 .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008567 }
8568 }
8569
Frank Barchard91317c52019-11-22 10:54:35 -08008570 TEST(F32_IGEMM_1X8__NEON_LANE_LD64, k_lt_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008571 TEST_REQUIRES_ARM_NEON;
8572 for (size_t k = 1; k < 2; k++) {
8573 for (uint32_t m = 1; m <= 1; m++) {
8574 for (uint32_t n = 1; n <= 8; n++) {
8575 GemmMicrokernelTester()
8576 .mr(1)
8577 .nr(8)
8578 .kr(1)
8579 .sr(1)
8580 .m(m)
8581 .n(n)
8582 .k(k)
8583 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08008584 .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008585 }
8586 }
8587 }
8588 }
8589
Frank Barchard91317c52019-11-22 10:54:35 -08008590 TEST(F32_IGEMM_1X8__NEON_LANE_LD64, k_gt_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008591 TEST_REQUIRES_ARM_NEON;
8592 for (size_t k = 3; k < 4; k++) {
8593 GemmMicrokernelTester()
8594 .mr(1)
8595 .nr(8)
8596 .kr(1)
8597 .sr(1)
8598 .m(1)
8599 .n(8)
8600 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -08008601 .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008602 }
8603 }
8604
Frank Barchard91317c52019-11-22 10:54:35 -08008605 TEST(F32_IGEMM_1X8__NEON_LANE_LD64, k_gt_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008606 TEST_REQUIRES_ARM_NEON;
8607 for (size_t k = 3; k < 4; k++) {
8608 for (uint32_t m = 1; m <= 1; m++) {
8609 for (uint32_t n = 1; n <= 8; n++) {
8610 GemmMicrokernelTester()
8611 .mr(1)
8612 .nr(8)
8613 .kr(1)
8614 .sr(1)
8615 .m(m)
8616 .n(n)
8617 .k(k)
8618 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08008619 .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008620 }
8621 }
8622 }
8623 }
8624
Frank Barchard91317c52019-11-22 10:54:35 -08008625 TEST(F32_IGEMM_1X8__NEON_LANE_LD64, k_div_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008626 TEST_REQUIRES_ARM_NEON;
8627 for (size_t k = 4; k <= 20; k += 2) {
8628 GemmMicrokernelTester()
8629 .mr(1)
8630 .nr(8)
8631 .kr(1)
8632 .sr(1)
8633 .m(1)
8634 .n(8)
8635 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -08008636 .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008637 }
8638 }
8639
Frank Barchard91317c52019-11-22 10:54:35 -08008640 TEST(F32_IGEMM_1X8__NEON_LANE_LD64, k_div_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008641 TEST_REQUIRES_ARM_NEON;
8642 for (size_t k = 4; k <= 20; k += 2) {
8643 for (uint32_t m = 1; m <= 1; m++) {
8644 for (uint32_t n = 1; n <= 8; n++) {
8645 GemmMicrokernelTester()
8646 .mr(1)
8647 .nr(8)
8648 .kr(1)
8649 .sr(1)
8650 .m(m)
8651 .n(n)
8652 .k(k)
8653 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08008654 .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008655 }
8656 }
8657 }
8658 }
8659
Frank Barchard91317c52019-11-22 10:54:35 -08008660 TEST(F32_IGEMM_1X8__NEON_LANE_LD64, n_gt_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008661 TEST_REQUIRES_ARM_NEON;
8662 for (uint32_t n = 9; n < 16; n++) {
8663 for (size_t k = 1; k <= 10; k += 3) {
8664 GemmMicrokernelTester()
8665 .mr(1)
8666 .nr(8)
8667 .kr(1)
8668 .sr(1)
8669 .m(1)
8670 .n(8)
8671 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -08008672 .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008673 }
8674 }
8675 }
8676
Frank Barchard91317c52019-11-22 10:54:35 -08008677 TEST(F32_IGEMM_1X8__NEON_LANE_LD64, n_gt_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008678 TEST_REQUIRES_ARM_NEON;
8679 for (uint32_t n = 9; n < 16; n++) {
8680 for (size_t k = 1; k <= 10; k += 3) {
8681 GemmMicrokernelTester()
8682 .mr(1)
8683 .nr(8)
8684 .kr(1)
8685 .sr(1)
8686 .m(1)
8687 .n(8)
8688 .k(k)
8689 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -08008690 .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008691 }
8692 }
8693 }
8694
Frank Barchard91317c52019-11-22 10:54:35 -08008695 TEST(F32_IGEMM_1X8__NEON_LANE_LD64, n_gt_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008696 TEST_REQUIRES_ARM_NEON;
8697 for (uint32_t n = 9; n < 16; n++) {
8698 for (size_t k = 1; k <= 10; k += 3) {
8699 for (uint32_t m = 1; m <= 1; m++) {
8700 GemmMicrokernelTester()
8701 .mr(1)
8702 .nr(8)
8703 .kr(1)
8704 .sr(1)
8705 .m(m)
8706 .n(n)
8707 .k(k)
8708 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08008709 .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008710 }
8711 }
8712 }
8713 }
8714
Frank Barchard91317c52019-11-22 10:54:35 -08008715 TEST(F32_IGEMM_1X8__NEON_LANE_LD64, n_div_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008716 TEST_REQUIRES_ARM_NEON;
8717 for (uint32_t n = 16; n <= 24; n += 8) {
8718 for (size_t k = 1; k <= 10; k += 3) {
8719 GemmMicrokernelTester()
8720 .mr(1)
8721 .nr(8)
8722 .kr(1)
8723 .sr(1)
8724 .m(1)
8725 .n(8)
8726 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -08008727 .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008728 }
8729 }
8730 }
8731
Frank Barchard91317c52019-11-22 10:54:35 -08008732 TEST(F32_IGEMM_1X8__NEON_LANE_LD64, n_div_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008733 TEST_REQUIRES_ARM_NEON;
8734 for (uint32_t n = 16; n <= 24; n += 8) {
8735 for (size_t k = 1; k <= 10; k += 3) {
8736 GemmMicrokernelTester()
8737 .mr(1)
8738 .nr(8)
8739 .kr(1)
8740 .sr(1)
8741 .m(1)
8742 .n(n)
8743 .k(k)
8744 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -08008745 .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008746 }
8747 }
8748 }
8749
Frank Barchard91317c52019-11-22 10:54:35 -08008750 TEST(F32_IGEMM_1X8__NEON_LANE_LD64, n_div_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008751 TEST_REQUIRES_ARM_NEON;
8752 for (uint32_t n = 16; n <= 24; n += 8) {
8753 for (size_t k = 1; k <= 10; k += 3) {
8754 for (uint32_t m = 1; m <= 1; m++) {
8755 GemmMicrokernelTester()
8756 .mr(1)
8757 .nr(8)
8758 .kr(1)
8759 .sr(1)
8760 .m(m)
8761 .n(n)
8762 .k(k)
8763 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08008764 .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008765 }
8766 }
8767 }
8768 }
8769
Frank Barchard91317c52019-11-22 10:54:35 -08008770 TEST(F32_IGEMM_1X8__NEON_LANE_LD64, small_kernel) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008771 TEST_REQUIRES_ARM_NEON;
8772 for (size_t k = 1; k <= 10; k += 3) {
8773 GemmMicrokernelTester()
8774 .mr(1)
8775 .nr(8)
8776 .kr(1)
8777 .sr(1)
8778 .m(1)
8779 .n(8)
8780 .k(k)
8781 .ks(3)
Frank Barchard91317c52019-11-22 10:54:35 -08008782 .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008783 }
8784 }
8785
Frank Barchard91317c52019-11-22 10:54:35 -08008786 TEST(F32_IGEMM_1X8__NEON_LANE_LD64, small_kernel_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008787 TEST_REQUIRES_ARM_NEON;
8788 for (size_t k = 1; k <= 10; k += 3) {
8789 for (uint32_t m = 1; m <= 1; m++) {
8790 for (uint32_t n = 1; n <= 8; n++) {
8791 GemmMicrokernelTester()
8792 .mr(1)
8793 .nr(8)
8794 .kr(1)
8795 .sr(1)
8796 .m(m)
8797 .n(n)
8798 .k(k)
8799 .ks(3)
8800 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08008801 .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008802 }
8803 }
8804 }
8805 }
8806
Frank Barchard91317c52019-11-22 10:54:35 -08008807 TEST(F32_IGEMM_1X8__NEON_LANE_LD64, n_gt_8_small_kernel) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008808 TEST_REQUIRES_ARM_NEON;
8809 for (uint32_t n = 9; n < 16; n++) {
8810 for (size_t k = 1; k <= 10; k += 3) {
8811 GemmMicrokernelTester()
8812 .mr(1)
8813 .nr(8)
8814 .kr(1)
8815 .sr(1)
8816 .m(1)
8817 .n(8)
8818 .k(k)
8819 .ks(3)
Frank Barchard91317c52019-11-22 10:54:35 -08008820 .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008821 }
8822 }
8823 }
8824
Frank Barchard91317c52019-11-22 10:54:35 -08008825 TEST(F32_IGEMM_1X8__NEON_LANE_LD64, n_div_8_small_kernel) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008826 TEST_REQUIRES_ARM_NEON;
8827 for (uint32_t n = 16; n <= 24; n += 8) {
8828 for (size_t k = 1; k <= 10; k += 3) {
8829 GemmMicrokernelTester()
8830 .mr(1)
8831 .nr(8)
8832 .kr(1)
8833 .sr(1)
8834 .m(1)
8835 .n(8)
8836 .k(k)
8837 .ks(3)
Frank Barchard91317c52019-11-22 10:54:35 -08008838 .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008839 }
8840 }
8841 }
8842
Frank Barchard91317c52019-11-22 10:54:35 -08008843 TEST(F32_IGEMM_1X8__NEON_LANE_LD64, strided_cm_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008844 TEST_REQUIRES_ARM_NEON;
8845 for (size_t k = 1; k <= 10; k += 3) {
8846 for (uint32_t m = 1; m <= 1; m++) {
8847 for (uint32_t n = 1; n <= 8; n++) {
8848 GemmMicrokernelTester()
8849 .mr(1)
8850 .nr(8)
8851 .kr(1)
8852 .sr(1)
8853 .m(m)
8854 .n(n)
8855 .k(k)
8856 .cm_stride(11)
8857 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08008858 .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008859 }
8860 }
8861 }
8862 }
8863
Frank Barchard91317c52019-11-22 10:54:35 -08008864 TEST(F32_IGEMM_1X8__NEON_LANE_LD64, a_offset) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008865 TEST_REQUIRES_ARM_NEON;
8866 for (size_t k = 1; k <= 10; k += 3) {
8867 GemmMicrokernelTester()
8868 .mr(1)
8869 .nr(8)
8870 .kr(1)
8871 .sr(1)
8872 .m(1)
8873 .n(8)
8874 .k(k)
8875 .ks(3)
8876 .a_offset(13)
Frank Barchard91317c52019-11-22 10:54:35 -08008877 .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008878 }
8879 }
8880
Frank Barchard91317c52019-11-22 10:54:35 -08008881 TEST(F32_IGEMM_1X8__NEON_LANE_LD64, zero) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008882 TEST_REQUIRES_ARM_NEON;
8883 for (uint32_t mz = 0; mz < 1; mz++) {
8884 for (size_t k = 1; k <= 10; k += 3) {
8885 GemmMicrokernelTester()
8886 .mr(1)
8887 .nr(8)
8888 .kr(1)
8889 .sr(1)
8890 .m(1)
8891 .n(8)
8892 .k(k)
8893 .ks(3)
8894 .a_offset(13)
8895 .zero_index(mz)
Frank Barchard91317c52019-11-22 10:54:35 -08008896 .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008897 }
8898 }
8899 }
8900
Frank Barchard91317c52019-11-22 10:54:35 -08008901 TEST(F32_IGEMM_1X8__NEON_LANE_LD64, qmin) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008902 TEST_REQUIRES_ARM_NEON;
8903 GemmMicrokernelTester()
8904 .mr(1)
8905 .nr(8)
8906 .kr(1)
8907 .sr(1)
8908 .m(1)
8909 .n(8)
8910 .k(2)
8911 .qmin(128)
Frank Barchard91317c52019-11-22 10:54:35 -08008912 .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008913 }
8914
Frank Barchard91317c52019-11-22 10:54:35 -08008915 TEST(F32_IGEMM_1X8__NEON_LANE_LD64, qmax) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008916 TEST_REQUIRES_ARM_NEON;
8917 GemmMicrokernelTester()
8918 .mr(1)
8919 .nr(8)
8920 .kr(1)
8921 .sr(1)
8922 .m(1)
8923 .n(8)
8924 .k(2)
8925 .qmax(128)
Frank Barchard91317c52019-11-22 10:54:35 -08008926 .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008927 }
8928
Frank Barchard91317c52019-11-22 10:54:35 -08008929 TEST(F32_IGEMM_1X8__NEON_LANE_LD64, strided_cm) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008930 TEST_REQUIRES_ARM_NEON;
8931 GemmMicrokernelTester()
8932 .mr(1)
8933 .nr(8)
8934 .kr(1)
8935 .sr(1)
8936 .m(1)
8937 .n(8)
8938 .k(2)
8939 .cm_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -08008940 .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008941 }
Frank Barchard5243bb02019-11-22 16:37:50 -08008942#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
XNNPACK Teamb455b122019-09-27 18:10:33 -07008943
8944
Marat Dukhan1dadbf72019-10-01 10:46:20 -07008945#if XNN_ARCH_ARM || XNN_ARCH_ARM64
Frank Barchard91317c52019-11-22 10:54:35 -08008946 TEST(F32_IGEMM_4X2__NEON_LANE_LD64, k_eq_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008947 TEST_REQUIRES_ARM_NEON;
8948 GemmMicrokernelTester()
8949 .mr(4)
8950 .nr(2)
8951 .kr(1)
8952 .sr(1)
8953 .m(4)
8954 .n(2)
8955 .k(2)
Frank Barchard91317c52019-11-22 10:54:35 -08008956 .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008957 }
8958
Frank Barchard91317c52019-11-22 10:54:35 -08008959 TEST(F32_IGEMM_4X2__NEON_LANE_LD64, strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008960 TEST_REQUIRES_ARM_NEON;
8961 GemmMicrokernelTester()
8962 .mr(4)
8963 .nr(2)
8964 .kr(1)
8965 .sr(1)
8966 .m(4)
8967 .n(2)
8968 .k(2)
8969 .cn_stride(5)
Frank Barchard91317c52019-11-22 10:54:35 -08008970 .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008971 }
8972
Frank Barchard91317c52019-11-22 10:54:35 -08008973 TEST(F32_IGEMM_4X2__NEON_LANE_LD64, k_eq_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008974 TEST_REQUIRES_ARM_NEON;
8975 for (uint32_t m = 1; m <= 4; m++) {
8976 for (uint32_t n = 1; n <= 2; n++) {
8977 GemmMicrokernelTester()
8978 .mr(4)
8979 .nr(2)
8980 .kr(1)
8981 .sr(1)
8982 .m(m)
8983 .n(n)
8984 .k(2)
8985 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08008986 .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008987 }
8988 }
8989 }
8990
Frank Barchard91317c52019-11-22 10:54:35 -08008991 TEST(F32_IGEMM_4X2__NEON_LANE_LD64, k_eq_2_subtile_m) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008992 TEST_REQUIRES_ARM_NEON;
8993 for (uint32_t m = 1; m <= 4; m++) {
8994 GemmMicrokernelTester()
8995 .mr(4)
8996 .nr(2)
8997 .kr(1)
8998 .sr(1)
8999 .m(m)
9000 .n(2)
9001 .k(2)
9002 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08009003 .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009004 }
9005 }
9006
Frank Barchard91317c52019-11-22 10:54:35 -08009007 TEST(F32_IGEMM_4X2__NEON_LANE_LD64, k_eq_2_subtile_n) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009008 TEST_REQUIRES_ARM_NEON;
9009 for (uint32_t n = 1; n <= 2; n++) {
9010 GemmMicrokernelTester()
9011 .mr(4)
9012 .nr(2)
9013 .kr(1)
9014 .sr(1)
9015 .m(4)
9016 .n(n)
9017 .k(2)
9018 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08009019 .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009020 }
9021 }
9022
Frank Barchard91317c52019-11-22 10:54:35 -08009023 TEST(F32_IGEMM_4X2__NEON_LANE_LD64, k_lt_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009024 TEST_REQUIRES_ARM_NEON;
9025 for (size_t k = 1; k < 2; k++) {
9026 GemmMicrokernelTester()
9027 .mr(4)
9028 .nr(2)
9029 .kr(1)
9030 .sr(1)
9031 .m(4)
9032 .n(2)
9033 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -08009034 .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009035 }
9036 }
9037
Frank Barchard91317c52019-11-22 10:54:35 -08009038 TEST(F32_IGEMM_4X2__NEON_LANE_LD64, k_lt_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009039 TEST_REQUIRES_ARM_NEON;
9040 for (size_t k = 1; k < 2; k++) {
9041 for (uint32_t m = 1; m <= 4; m++) {
9042 for (uint32_t n = 1; n <= 2; n++) {
9043 GemmMicrokernelTester()
9044 .mr(4)
9045 .nr(2)
9046 .kr(1)
9047 .sr(1)
9048 .m(m)
9049 .n(n)
9050 .k(k)
9051 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08009052 .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009053 }
9054 }
9055 }
9056 }
9057
Frank Barchard91317c52019-11-22 10:54:35 -08009058 TEST(F32_IGEMM_4X2__NEON_LANE_LD64, k_gt_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009059 TEST_REQUIRES_ARM_NEON;
9060 for (size_t k = 3; k < 4; k++) {
9061 GemmMicrokernelTester()
9062 .mr(4)
9063 .nr(2)
9064 .kr(1)
9065 .sr(1)
9066 .m(4)
9067 .n(2)
9068 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -08009069 .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009070 }
9071 }
9072
Frank Barchard91317c52019-11-22 10:54:35 -08009073 TEST(F32_IGEMM_4X2__NEON_LANE_LD64, k_gt_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009074 TEST_REQUIRES_ARM_NEON;
9075 for (size_t k = 3; k < 4; k++) {
9076 for (uint32_t m = 1; m <= 4; m++) {
9077 for (uint32_t n = 1; n <= 2; n++) {
9078 GemmMicrokernelTester()
9079 .mr(4)
9080 .nr(2)
9081 .kr(1)
9082 .sr(1)
9083 .m(m)
9084 .n(n)
9085 .k(k)
9086 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08009087 .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009088 }
9089 }
9090 }
9091 }
9092
Frank Barchard91317c52019-11-22 10:54:35 -08009093 TEST(F32_IGEMM_4X2__NEON_LANE_LD64, k_div_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009094 TEST_REQUIRES_ARM_NEON;
9095 for (size_t k = 4; k <= 20; k += 2) {
9096 GemmMicrokernelTester()
9097 .mr(4)
9098 .nr(2)
9099 .kr(1)
9100 .sr(1)
9101 .m(4)
9102 .n(2)
9103 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -08009104 .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009105 }
9106 }
9107
Frank Barchard91317c52019-11-22 10:54:35 -08009108 TEST(F32_IGEMM_4X2__NEON_LANE_LD64, k_div_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009109 TEST_REQUIRES_ARM_NEON;
9110 for (size_t k = 4; k <= 20; k += 2) {
9111 for (uint32_t m = 1; m <= 4; m++) {
9112 for (uint32_t n = 1; n <= 2; n++) {
9113 GemmMicrokernelTester()
9114 .mr(4)
9115 .nr(2)
9116 .kr(1)
9117 .sr(1)
9118 .m(m)
9119 .n(n)
9120 .k(k)
9121 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08009122 .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009123 }
9124 }
9125 }
9126 }
9127
Frank Barchard91317c52019-11-22 10:54:35 -08009128 TEST(F32_IGEMM_4X2__NEON_LANE_LD64, n_gt_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009129 TEST_REQUIRES_ARM_NEON;
9130 for (uint32_t n = 3; n < 4; n++) {
9131 for (size_t k = 1; k <= 10; k += 3) {
9132 GemmMicrokernelTester()
9133 .mr(4)
9134 .nr(2)
9135 .kr(1)
9136 .sr(1)
9137 .m(4)
9138 .n(2)
9139 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -08009140 .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009141 }
9142 }
9143 }
9144
Frank Barchard91317c52019-11-22 10:54:35 -08009145 TEST(F32_IGEMM_4X2__NEON_LANE_LD64, n_gt_2_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009146 TEST_REQUIRES_ARM_NEON;
9147 for (uint32_t n = 3; n < 4; n++) {
9148 for (size_t k = 1; k <= 10; k += 3) {
9149 GemmMicrokernelTester()
9150 .mr(4)
9151 .nr(2)
9152 .kr(1)
9153 .sr(1)
9154 .m(4)
9155 .n(2)
9156 .k(k)
9157 .cn_stride(5)
Frank Barchard91317c52019-11-22 10:54:35 -08009158 .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009159 }
9160 }
9161 }
9162
Frank Barchard91317c52019-11-22 10:54:35 -08009163 TEST(F32_IGEMM_4X2__NEON_LANE_LD64, n_gt_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009164 TEST_REQUIRES_ARM_NEON;
9165 for (uint32_t n = 3; n < 4; n++) {
9166 for (size_t k = 1; k <= 10; k += 3) {
9167 for (uint32_t m = 1; m <= 4; m++) {
9168 GemmMicrokernelTester()
9169 .mr(4)
9170 .nr(2)
9171 .kr(1)
9172 .sr(1)
9173 .m(m)
9174 .n(n)
9175 .k(k)
9176 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08009177 .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009178 }
9179 }
9180 }
9181 }
9182
Frank Barchard91317c52019-11-22 10:54:35 -08009183 TEST(F32_IGEMM_4X2__NEON_LANE_LD64, n_div_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009184 TEST_REQUIRES_ARM_NEON;
9185 for (uint32_t n = 4; n <= 6; n += 2) {
9186 for (size_t k = 1; k <= 10; k += 3) {
9187 GemmMicrokernelTester()
9188 .mr(4)
9189 .nr(2)
9190 .kr(1)
9191 .sr(1)
9192 .m(4)
9193 .n(2)
9194 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -08009195 .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009196 }
9197 }
9198 }
9199
Frank Barchard91317c52019-11-22 10:54:35 -08009200 TEST(F32_IGEMM_4X2__NEON_LANE_LD64, n_div_2_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009201 TEST_REQUIRES_ARM_NEON;
9202 for (uint32_t n = 4; n <= 6; n += 2) {
9203 for (size_t k = 1; k <= 10; k += 3) {
9204 GemmMicrokernelTester()
9205 .mr(4)
9206 .nr(2)
9207 .kr(1)
9208 .sr(1)
9209 .m(4)
9210 .n(n)
9211 .k(k)
9212 .cn_stride(5)
Frank Barchard91317c52019-11-22 10:54:35 -08009213 .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009214 }
9215 }
9216 }
9217
Frank Barchard91317c52019-11-22 10:54:35 -08009218 TEST(F32_IGEMM_4X2__NEON_LANE_LD64, n_div_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009219 TEST_REQUIRES_ARM_NEON;
9220 for (uint32_t n = 4; n <= 6; n += 2) {
9221 for (size_t k = 1; k <= 10; k += 3) {
9222 for (uint32_t m = 1; m <= 4; m++) {
9223 GemmMicrokernelTester()
9224 .mr(4)
9225 .nr(2)
9226 .kr(1)
9227 .sr(1)
9228 .m(m)
9229 .n(n)
9230 .k(k)
9231 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08009232 .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009233 }
9234 }
9235 }
9236 }
9237
Frank Barchard91317c52019-11-22 10:54:35 -08009238 TEST(F32_IGEMM_4X2__NEON_LANE_LD64, small_kernel) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009239 TEST_REQUIRES_ARM_NEON;
9240 for (size_t k = 1; k <= 10; k += 3) {
9241 GemmMicrokernelTester()
9242 .mr(4)
9243 .nr(2)
9244 .kr(1)
9245 .sr(1)
9246 .m(4)
9247 .n(2)
9248 .k(k)
9249 .ks(3)
Frank Barchard91317c52019-11-22 10:54:35 -08009250 .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009251 }
9252 }
9253
Frank Barchard91317c52019-11-22 10:54:35 -08009254 TEST(F32_IGEMM_4X2__NEON_LANE_LD64, small_kernel_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009255 TEST_REQUIRES_ARM_NEON;
9256 for (size_t k = 1; k <= 10; k += 3) {
9257 for (uint32_t m = 1; m <= 4; m++) {
9258 for (uint32_t n = 1; n <= 2; n++) {
9259 GemmMicrokernelTester()
9260 .mr(4)
9261 .nr(2)
9262 .kr(1)
9263 .sr(1)
9264 .m(m)
9265 .n(n)
9266 .k(k)
9267 .ks(3)
9268 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08009269 .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009270 }
9271 }
9272 }
9273 }
9274
Frank Barchard91317c52019-11-22 10:54:35 -08009275 TEST(F32_IGEMM_4X2__NEON_LANE_LD64, n_gt_2_small_kernel) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009276 TEST_REQUIRES_ARM_NEON;
9277 for (uint32_t n = 3; n < 4; n++) {
9278 for (size_t k = 1; k <= 10; k += 3) {
9279 GemmMicrokernelTester()
9280 .mr(4)
9281 .nr(2)
9282 .kr(1)
9283 .sr(1)
9284 .m(4)
9285 .n(2)
9286 .k(k)
9287 .ks(3)
Frank Barchard91317c52019-11-22 10:54:35 -08009288 .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009289 }
9290 }
9291 }
9292
Frank Barchard91317c52019-11-22 10:54:35 -08009293 TEST(F32_IGEMM_4X2__NEON_LANE_LD64, n_div_2_small_kernel) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009294 TEST_REQUIRES_ARM_NEON;
9295 for (uint32_t n = 4; n <= 6; n += 2) {
9296 for (size_t k = 1; k <= 10; k += 3) {
9297 GemmMicrokernelTester()
9298 .mr(4)
9299 .nr(2)
9300 .kr(1)
9301 .sr(1)
9302 .m(4)
9303 .n(2)
9304 .k(k)
9305 .ks(3)
Frank Barchard91317c52019-11-22 10:54:35 -08009306 .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009307 }
9308 }
9309 }
9310
Frank Barchard91317c52019-11-22 10:54:35 -08009311 TEST(F32_IGEMM_4X2__NEON_LANE_LD64, strided_cm_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009312 TEST_REQUIRES_ARM_NEON;
9313 for (size_t k = 1; k <= 10; k += 3) {
9314 for (uint32_t m = 1; m <= 4; m++) {
9315 for (uint32_t n = 1; n <= 2; n++) {
9316 GemmMicrokernelTester()
9317 .mr(4)
9318 .nr(2)
9319 .kr(1)
9320 .sr(1)
9321 .m(m)
9322 .n(n)
9323 .k(k)
9324 .cm_stride(5)
9325 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08009326 .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009327 }
9328 }
9329 }
9330 }
9331
Frank Barchard91317c52019-11-22 10:54:35 -08009332 TEST(F32_IGEMM_4X2__NEON_LANE_LD64, a_offset) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009333 TEST_REQUIRES_ARM_NEON;
9334 for (size_t k = 1; k <= 10; k += 3) {
9335 GemmMicrokernelTester()
9336 .mr(4)
9337 .nr(2)
9338 .kr(1)
9339 .sr(1)
9340 .m(4)
9341 .n(2)
9342 .k(k)
9343 .ks(3)
9344 .a_offset(43)
Frank Barchard91317c52019-11-22 10:54:35 -08009345 .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009346 }
9347 }
9348
Frank Barchard91317c52019-11-22 10:54:35 -08009349 TEST(F32_IGEMM_4X2__NEON_LANE_LD64, zero) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009350 TEST_REQUIRES_ARM_NEON;
9351 for (uint32_t mz = 0; mz < 4; mz++) {
9352 for (size_t k = 1; k <= 10; k += 3) {
9353 GemmMicrokernelTester()
9354 .mr(4)
9355 .nr(2)
9356 .kr(1)
9357 .sr(1)
9358 .m(4)
9359 .n(2)
9360 .k(k)
9361 .ks(3)
9362 .a_offset(43)
9363 .zero_index(mz)
Frank Barchard91317c52019-11-22 10:54:35 -08009364 .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009365 }
9366 }
9367 }
9368
Frank Barchard91317c52019-11-22 10:54:35 -08009369 TEST(F32_IGEMM_4X2__NEON_LANE_LD64, qmin) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009370 TEST_REQUIRES_ARM_NEON;
9371 GemmMicrokernelTester()
9372 .mr(4)
9373 .nr(2)
9374 .kr(1)
9375 .sr(1)
9376 .m(4)
9377 .n(2)
9378 .k(2)
9379 .qmin(128)
Frank Barchard91317c52019-11-22 10:54:35 -08009380 .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009381 }
9382
Frank Barchard91317c52019-11-22 10:54:35 -08009383 TEST(F32_IGEMM_4X2__NEON_LANE_LD64, qmax) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009384 TEST_REQUIRES_ARM_NEON;
9385 GemmMicrokernelTester()
9386 .mr(4)
9387 .nr(2)
9388 .kr(1)
9389 .sr(1)
9390 .m(4)
9391 .n(2)
9392 .k(2)
9393 .qmax(128)
Frank Barchard91317c52019-11-22 10:54:35 -08009394 .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009395 }
9396
Frank Barchard91317c52019-11-22 10:54:35 -08009397 TEST(F32_IGEMM_4X2__NEON_LANE_LD64, strided_cm) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009398 TEST_REQUIRES_ARM_NEON;
9399 GemmMicrokernelTester()
9400 .mr(4)
9401 .nr(2)
9402 .kr(1)
9403 .sr(1)
9404 .m(4)
9405 .n(2)
9406 .k(2)
9407 .cm_stride(5)
Frank Barchard91317c52019-11-22 10:54:35 -08009408 .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009409 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -07009410#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
XNNPACK Teamb455b122019-09-27 18:10:33 -07009411
9412
Marat Dukhan1dadbf72019-10-01 10:46:20 -07009413#if XNN_ARCH_ARM || XNN_ARCH_ARM64
Frank Barchard91317c52019-11-22 10:54:35 -08009414 TEST(F32_IGEMM_4X4__NEON_LANE_LD64, k_eq_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009415 TEST_REQUIRES_ARM_NEON;
9416 GemmMicrokernelTester()
9417 .mr(4)
9418 .nr(4)
9419 .kr(1)
9420 .sr(1)
9421 .m(4)
9422 .n(4)
9423 .k(2)
Frank Barchard91317c52019-11-22 10:54:35 -08009424 .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009425 }
9426
Frank Barchard91317c52019-11-22 10:54:35 -08009427 TEST(F32_IGEMM_4X4__NEON_LANE_LD64, strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009428 TEST_REQUIRES_ARM_NEON;
9429 GemmMicrokernelTester()
9430 .mr(4)
9431 .nr(4)
9432 .kr(1)
9433 .sr(1)
9434 .m(4)
9435 .n(4)
9436 .k(2)
9437 .cn_stride(7)
Frank Barchard91317c52019-11-22 10:54:35 -08009438 .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009439 }
9440
Frank Barchard91317c52019-11-22 10:54:35 -08009441 TEST(F32_IGEMM_4X4__NEON_LANE_LD64, k_eq_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009442 TEST_REQUIRES_ARM_NEON;
9443 for (uint32_t m = 1; m <= 4; m++) {
9444 for (uint32_t n = 1; n <= 4; n++) {
9445 GemmMicrokernelTester()
9446 .mr(4)
9447 .nr(4)
9448 .kr(1)
9449 .sr(1)
9450 .m(m)
9451 .n(n)
9452 .k(2)
9453 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08009454 .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009455 }
9456 }
9457 }
9458
Frank Barchard91317c52019-11-22 10:54:35 -08009459 TEST(F32_IGEMM_4X4__NEON_LANE_LD64, k_eq_2_subtile_m) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009460 TEST_REQUIRES_ARM_NEON;
9461 for (uint32_t m = 1; m <= 4; m++) {
9462 GemmMicrokernelTester()
9463 .mr(4)
9464 .nr(4)
9465 .kr(1)
9466 .sr(1)
9467 .m(m)
9468 .n(4)
9469 .k(2)
9470 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08009471 .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009472 }
9473 }
9474
Frank Barchard91317c52019-11-22 10:54:35 -08009475 TEST(F32_IGEMM_4X4__NEON_LANE_LD64, k_eq_2_subtile_n) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009476 TEST_REQUIRES_ARM_NEON;
9477 for (uint32_t n = 1; n <= 4; n++) {
9478 GemmMicrokernelTester()
9479 .mr(4)
9480 .nr(4)
9481 .kr(1)
9482 .sr(1)
9483 .m(4)
9484 .n(n)
9485 .k(2)
9486 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08009487 .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009488 }
9489 }
9490
Frank Barchard91317c52019-11-22 10:54:35 -08009491 TEST(F32_IGEMM_4X4__NEON_LANE_LD64, k_lt_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009492 TEST_REQUIRES_ARM_NEON;
9493 for (size_t k = 1; k < 2; k++) {
9494 GemmMicrokernelTester()
9495 .mr(4)
9496 .nr(4)
9497 .kr(1)
9498 .sr(1)
9499 .m(4)
9500 .n(4)
9501 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -08009502 .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009503 }
9504 }
9505
Frank Barchard91317c52019-11-22 10:54:35 -08009506 TEST(F32_IGEMM_4X4__NEON_LANE_LD64, k_lt_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009507 TEST_REQUIRES_ARM_NEON;
9508 for (size_t k = 1; k < 2; k++) {
9509 for (uint32_t m = 1; m <= 4; m++) {
9510 for (uint32_t n = 1; n <= 4; n++) {
9511 GemmMicrokernelTester()
9512 .mr(4)
9513 .nr(4)
9514 .kr(1)
9515 .sr(1)
9516 .m(m)
9517 .n(n)
9518 .k(k)
9519 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08009520 .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009521 }
9522 }
9523 }
9524 }
9525
Frank Barchard91317c52019-11-22 10:54:35 -08009526 TEST(F32_IGEMM_4X4__NEON_LANE_LD64, k_gt_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009527 TEST_REQUIRES_ARM_NEON;
9528 for (size_t k = 3; k < 4; k++) {
9529 GemmMicrokernelTester()
9530 .mr(4)
9531 .nr(4)
9532 .kr(1)
9533 .sr(1)
9534 .m(4)
9535 .n(4)
9536 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -08009537 .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009538 }
9539 }
9540
Frank Barchard91317c52019-11-22 10:54:35 -08009541 TEST(F32_IGEMM_4X4__NEON_LANE_LD64, k_gt_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009542 TEST_REQUIRES_ARM_NEON;
9543 for (size_t k = 3; k < 4; k++) {
9544 for (uint32_t m = 1; m <= 4; m++) {
9545 for (uint32_t n = 1; n <= 4; n++) {
9546 GemmMicrokernelTester()
9547 .mr(4)
9548 .nr(4)
9549 .kr(1)
9550 .sr(1)
9551 .m(m)
9552 .n(n)
9553 .k(k)
9554 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08009555 .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009556 }
9557 }
9558 }
9559 }
9560
Frank Barchard91317c52019-11-22 10:54:35 -08009561 TEST(F32_IGEMM_4X4__NEON_LANE_LD64, k_div_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009562 TEST_REQUIRES_ARM_NEON;
9563 for (size_t k = 4; k <= 20; k += 2) {
9564 GemmMicrokernelTester()
9565 .mr(4)
9566 .nr(4)
9567 .kr(1)
9568 .sr(1)
9569 .m(4)
9570 .n(4)
9571 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -08009572 .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009573 }
9574 }
9575
Frank Barchard91317c52019-11-22 10:54:35 -08009576 TEST(F32_IGEMM_4X4__NEON_LANE_LD64, k_div_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009577 TEST_REQUIRES_ARM_NEON;
9578 for (size_t k = 4; k <= 20; k += 2) {
9579 for (uint32_t m = 1; m <= 4; m++) {
9580 for (uint32_t n = 1; n <= 4; n++) {
9581 GemmMicrokernelTester()
9582 .mr(4)
9583 .nr(4)
9584 .kr(1)
9585 .sr(1)
9586 .m(m)
9587 .n(n)
9588 .k(k)
9589 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08009590 .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009591 }
9592 }
9593 }
9594 }
9595
Frank Barchard91317c52019-11-22 10:54:35 -08009596 TEST(F32_IGEMM_4X4__NEON_LANE_LD64, n_gt_4) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009597 TEST_REQUIRES_ARM_NEON;
9598 for (uint32_t n = 5; n < 8; n++) {
9599 for (size_t k = 1; k <= 10; k += 3) {
9600 GemmMicrokernelTester()
9601 .mr(4)
9602 .nr(4)
9603 .kr(1)
9604 .sr(1)
9605 .m(4)
9606 .n(4)
9607 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -08009608 .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009609 }
9610 }
9611 }
9612
Frank Barchard91317c52019-11-22 10:54:35 -08009613 TEST(F32_IGEMM_4X4__NEON_LANE_LD64, n_gt_4_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009614 TEST_REQUIRES_ARM_NEON;
9615 for (uint32_t n = 5; n < 8; n++) {
9616 for (size_t k = 1; k <= 10; k += 3) {
9617 GemmMicrokernelTester()
9618 .mr(4)
9619 .nr(4)
9620 .kr(1)
9621 .sr(1)
9622 .m(4)
9623 .n(4)
9624 .k(k)
9625 .cn_stride(7)
Frank Barchard91317c52019-11-22 10:54:35 -08009626 .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009627 }
9628 }
9629 }
9630
Frank Barchard91317c52019-11-22 10:54:35 -08009631 TEST(F32_IGEMM_4X4__NEON_LANE_LD64, n_gt_4_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009632 TEST_REQUIRES_ARM_NEON;
9633 for (uint32_t n = 5; n < 8; n++) {
9634 for (size_t k = 1; k <= 10; k += 3) {
9635 for (uint32_t m = 1; m <= 4; m++) {
9636 GemmMicrokernelTester()
9637 .mr(4)
9638 .nr(4)
9639 .kr(1)
9640 .sr(1)
9641 .m(m)
9642 .n(n)
9643 .k(k)
9644 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08009645 .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009646 }
9647 }
9648 }
9649 }
9650
Frank Barchard91317c52019-11-22 10:54:35 -08009651 TEST(F32_IGEMM_4X4__NEON_LANE_LD64, n_div_4) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009652 TEST_REQUIRES_ARM_NEON;
9653 for (uint32_t n = 8; n <= 12; n += 4) {
9654 for (size_t k = 1; k <= 10; k += 3) {
9655 GemmMicrokernelTester()
9656 .mr(4)
9657 .nr(4)
9658 .kr(1)
9659 .sr(1)
9660 .m(4)
9661 .n(4)
9662 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -08009663 .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009664 }
9665 }
9666 }
9667
Frank Barchard91317c52019-11-22 10:54:35 -08009668 TEST(F32_IGEMM_4X4__NEON_LANE_LD64, n_div_4_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009669 TEST_REQUIRES_ARM_NEON;
9670 for (uint32_t n = 8; n <= 12; n += 4) {
9671 for (size_t k = 1; k <= 10; k += 3) {
9672 GemmMicrokernelTester()
9673 .mr(4)
9674 .nr(4)
9675 .kr(1)
9676 .sr(1)
9677 .m(4)
9678 .n(n)
9679 .k(k)
9680 .cn_stride(7)
Frank Barchard91317c52019-11-22 10:54:35 -08009681 .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009682 }
9683 }
9684 }
9685
Frank Barchard91317c52019-11-22 10:54:35 -08009686 TEST(F32_IGEMM_4X4__NEON_LANE_LD64, n_div_4_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009687 TEST_REQUIRES_ARM_NEON;
9688 for (uint32_t n = 8; n <= 12; n += 4) {
9689 for (size_t k = 1; k <= 10; k += 3) {
9690 for (uint32_t m = 1; m <= 4; m++) {
9691 GemmMicrokernelTester()
9692 .mr(4)
9693 .nr(4)
9694 .kr(1)
9695 .sr(1)
9696 .m(m)
9697 .n(n)
9698 .k(k)
9699 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08009700 .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009701 }
9702 }
9703 }
9704 }
9705
Frank Barchard91317c52019-11-22 10:54:35 -08009706 TEST(F32_IGEMM_4X4__NEON_LANE_LD64, small_kernel) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009707 TEST_REQUIRES_ARM_NEON;
9708 for (size_t k = 1; k <= 10; k += 3) {
9709 GemmMicrokernelTester()
9710 .mr(4)
9711 .nr(4)
9712 .kr(1)
9713 .sr(1)
9714 .m(4)
9715 .n(4)
9716 .k(k)
9717 .ks(3)
Frank Barchard91317c52019-11-22 10:54:35 -08009718 .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009719 }
9720 }
9721
Frank Barchard91317c52019-11-22 10:54:35 -08009722 TEST(F32_IGEMM_4X4__NEON_LANE_LD64, small_kernel_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009723 TEST_REQUIRES_ARM_NEON;
9724 for (size_t k = 1; k <= 10; k += 3) {
9725 for (uint32_t m = 1; m <= 4; m++) {
9726 for (uint32_t n = 1; n <= 4; n++) {
9727 GemmMicrokernelTester()
9728 .mr(4)
9729 .nr(4)
9730 .kr(1)
9731 .sr(1)
9732 .m(m)
9733 .n(n)
9734 .k(k)
9735 .ks(3)
9736 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08009737 .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009738 }
9739 }
9740 }
9741 }
9742
Frank Barchard91317c52019-11-22 10:54:35 -08009743 TEST(F32_IGEMM_4X4__NEON_LANE_LD64, n_gt_4_small_kernel) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009744 TEST_REQUIRES_ARM_NEON;
9745 for (uint32_t n = 5; n < 8; n++) {
9746 for (size_t k = 1; k <= 10; k += 3) {
9747 GemmMicrokernelTester()
9748 .mr(4)
9749 .nr(4)
9750 .kr(1)
9751 .sr(1)
9752 .m(4)
9753 .n(4)
9754 .k(k)
9755 .ks(3)
Frank Barchard91317c52019-11-22 10:54:35 -08009756 .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009757 }
9758 }
9759 }
9760
Frank Barchard91317c52019-11-22 10:54:35 -08009761 TEST(F32_IGEMM_4X4__NEON_LANE_LD64, n_div_4_small_kernel) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009762 TEST_REQUIRES_ARM_NEON;
9763 for (uint32_t n = 8; n <= 12; n += 4) {
9764 for (size_t k = 1; k <= 10; k += 3) {
9765 GemmMicrokernelTester()
9766 .mr(4)
9767 .nr(4)
9768 .kr(1)
9769 .sr(1)
9770 .m(4)
9771 .n(4)
9772 .k(k)
9773 .ks(3)
Frank Barchard91317c52019-11-22 10:54:35 -08009774 .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009775 }
9776 }
9777 }
9778
Frank Barchard91317c52019-11-22 10:54:35 -08009779 TEST(F32_IGEMM_4X4__NEON_LANE_LD64, strided_cm_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009780 TEST_REQUIRES_ARM_NEON;
9781 for (size_t k = 1; k <= 10; k += 3) {
9782 for (uint32_t m = 1; m <= 4; m++) {
9783 for (uint32_t n = 1; n <= 4; n++) {
9784 GemmMicrokernelTester()
9785 .mr(4)
9786 .nr(4)
9787 .kr(1)
9788 .sr(1)
9789 .m(m)
9790 .n(n)
9791 .k(k)
9792 .cm_stride(7)
9793 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08009794 .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009795 }
9796 }
9797 }
9798 }
9799
Frank Barchard91317c52019-11-22 10:54:35 -08009800 TEST(F32_IGEMM_4X4__NEON_LANE_LD64, a_offset) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009801 TEST_REQUIRES_ARM_NEON;
9802 for (size_t k = 1; k <= 10; k += 3) {
9803 GemmMicrokernelTester()
9804 .mr(4)
9805 .nr(4)
9806 .kr(1)
9807 .sr(1)
9808 .m(4)
9809 .n(4)
9810 .k(k)
9811 .ks(3)
9812 .a_offset(43)
Frank Barchard91317c52019-11-22 10:54:35 -08009813 .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009814 }
9815 }
9816
Frank Barchard91317c52019-11-22 10:54:35 -08009817 TEST(F32_IGEMM_4X4__NEON_LANE_LD64, zero) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009818 TEST_REQUIRES_ARM_NEON;
9819 for (uint32_t mz = 0; mz < 4; mz++) {
9820 for (size_t k = 1; k <= 10; k += 3) {
9821 GemmMicrokernelTester()
9822 .mr(4)
9823 .nr(4)
9824 .kr(1)
9825 .sr(1)
9826 .m(4)
9827 .n(4)
9828 .k(k)
9829 .ks(3)
9830 .a_offset(43)
9831 .zero_index(mz)
Frank Barchard91317c52019-11-22 10:54:35 -08009832 .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009833 }
9834 }
9835 }
9836
Frank Barchard91317c52019-11-22 10:54:35 -08009837 TEST(F32_IGEMM_4X4__NEON_LANE_LD64, qmin) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009838 TEST_REQUIRES_ARM_NEON;
9839 GemmMicrokernelTester()
9840 .mr(4)
9841 .nr(4)
9842 .kr(1)
9843 .sr(1)
9844 .m(4)
9845 .n(4)
9846 .k(2)
9847 .qmin(128)
Frank Barchard91317c52019-11-22 10:54:35 -08009848 .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009849 }
9850
Frank Barchard91317c52019-11-22 10:54:35 -08009851 TEST(F32_IGEMM_4X4__NEON_LANE_LD64, qmax) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009852 TEST_REQUIRES_ARM_NEON;
9853 GemmMicrokernelTester()
9854 .mr(4)
9855 .nr(4)
9856 .kr(1)
9857 .sr(1)
9858 .m(4)
9859 .n(4)
9860 .k(2)
9861 .qmax(128)
Frank Barchard91317c52019-11-22 10:54:35 -08009862 .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009863 }
9864
Frank Barchard91317c52019-11-22 10:54:35 -08009865 TEST(F32_IGEMM_4X4__NEON_LANE_LD64, strided_cm) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009866 TEST_REQUIRES_ARM_NEON;
9867 GemmMicrokernelTester()
9868 .mr(4)
9869 .nr(4)
9870 .kr(1)
9871 .sr(1)
9872 .m(4)
9873 .n(4)
9874 .k(2)
9875 .cm_stride(7)
Frank Barchard91317c52019-11-22 10:54:35 -08009876 .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009877 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -07009878#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
XNNPACK Teamb455b122019-09-27 18:10:33 -07009879
9880
Marat Dukhan1dadbf72019-10-01 10:46:20 -07009881#if XNN_ARCH_ARM || XNN_ARCH_ARM64
Frank Barchard91317c52019-11-22 10:54:35 -08009882 TEST(F32_IGEMM_4X8__NEON_LANE_LD128, k_eq_4) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009883 TEST_REQUIRES_ARM_NEON;
9884 GemmMicrokernelTester()
9885 .mr(4)
9886 .nr(8)
9887 .kr(1)
9888 .sr(1)
9889 .m(4)
9890 .n(8)
9891 .k(4)
Frank Barchard91317c52019-11-22 10:54:35 -08009892 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009893 }
9894
Frank Barchard91317c52019-11-22 10:54:35 -08009895 TEST(F32_IGEMM_4X8__NEON_LANE_LD128, strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009896 TEST_REQUIRES_ARM_NEON;
9897 GemmMicrokernelTester()
9898 .mr(4)
9899 .nr(8)
9900 .kr(1)
9901 .sr(1)
9902 .m(4)
9903 .n(8)
9904 .k(4)
9905 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -08009906 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009907 }
9908
Frank Barchard91317c52019-11-22 10:54:35 -08009909 TEST(F32_IGEMM_4X8__NEON_LANE_LD128, k_eq_4_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009910 TEST_REQUIRES_ARM_NEON;
9911 for (uint32_t m = 1; m <= 4; m++) {
9912 for (uint32_t n = 1; n <= 8; n++) {
9913 GemmMicrokernelTester()
9914 .mr(4)
9915 .nr(8)
9916 .kr(1)
9917 .sr(1)
9918 .m(m)
9919 .n(n)
9920 .k(4)
9921 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08009922 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009923 }
9924 }
9925 }
9926
Frank Barchard91317c52019-11-22 10:54:35 -08009927 TEST(F32_IGEMM_4X8__NEON_LANE_LD128, k_eq_4_subtile_m) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009928 TEST_REQUIRES_ARM_NEON;
9929 for (uint32_t m = 1; m <= 4; m++) {
9930 GemmMicrokernelTester()
9931 .mr(4)
9932 .nr(8)
9933 .kr(1)
9934 .sr(1)
9935 .m(m)
9936 .n(8)
9937 .k(4)
9938 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08009939 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009940 }
9941 }
9942
Frank Barchard91317c52019-11-22 10:54:35 -08009943 TEST(F32_IGEMM_4X8__NEON_LANE_LD128, k_eq_4_subtile_n) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009944 TEST_REQUIRES_ARM_NEON;
9945 for (uint32_t n = 1; n <= 8; n++) {
9946 GemmMicrokernelTester()
9947 .mr(4)
9948 .nr(8)
9949 .kr(1)
9950 .sr(1)
9951 .m(4)
9952 .n(n)
9953 .k(4)
9954 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08009955 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009956 }
9957 }
9958
Frank Barchard91317c52019-11-22 10:54:35 -08009959 TEST(F32_IGEMM_4X8__NEON_LANE_LD128, k_lt_4) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009960 TEST_REQUIRES_ARM_NEON;
9961 for (size_t k = 1; k < 4; k++) {
9962 GemmMicrokernelTester()
9963 .mr(4)
9964 .nr(8)
9965 .kr(1)
9966 .sr(1)
9967 .m(4)
9968 .n(8)
9969 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -08009970 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009971 }
9972 }
9973
Frank Barchard91317c52019-11-22 10:54:35 -08009974 TEST(F32_IGEMM_4X8__NEON_LANE_LD128, k_lt_4_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009975 TEST_REQUIRES_ARM_NEON;
9976 for (size_t k = 1; k < 4; k++) {
9977 for (uint32_t m = 1; m <= 4; m++) {
9978 for (uint32_t n = 1; n <= 8; n++) {
9979 GemmMicrokernelTester()
9980 .mr(4)
9981 .nr(8)
9982 .kr(1)
9983 .sr(1)
9984 .m(m)
9985 .n(n)
9986 .k(k)
9987 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08009988 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009989 }
9990 }
9991 }
9992 }
9993
Frank Barchard91317c52019-11-22 10:54:35 -08009994 TEST(F32_IGEMM_4X8__NEON_LANE_LD128, k_gt_4) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009995 TEST_REQUIRES_ARM_NEON;
9996 for (size_t k = 5; k < 8; k++) {
9997 GemmMicrokernelTester()
9998 .mr(4)
9999 .nr(8)
10000 .kr(1)
10001 .sr(1)
10002 .m(4)
10003 .n(8)
10004 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080010005 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010006 }
10007 }
10008
Frank Barchard91317c52019-11-22 10:54:35 -080010009 TEST(F32_IGEMM_4X8__NEON_LANE_LD128, k_gt_4_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010010 TEST_REQUIRES_ARM_NEON;
10011 for (size_t k = 5; k < 8; k++) {
10012 for (uint32_t m = 1; m <= 4; m++) {
10013 for (uint32_t n = 1; n <= 8; n++) {
10014 GemmMicrokernelTester()
10015 .mr(4)
10016 .nr(8)
10017 .kr(1)
10018 .sr(1)
10019 .m(m)
10020 .n(n)
10021 .k(k)
10022 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080010023 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010024 }
10025 }
10026 }
10027 }
10028
Frank Barchard91317c52019-11-22 10:54:35 -080010029 TEST(F32_IGEMM_4X8__NEON_LANE_LD128, k_div_4) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010030 TEST_REQUIRES_ARM_NEON;
10031 for (size_t k = 8; k <= 40; k += 4) {
10032 GemmMicrokernelTester()
10033 .mr(4)
10034 .nr(8)
10035 .kr(1)
10036 .sr(1)
10037 .m(4)
10038 .n(8)
10039 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080010040 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010041 }
10042 }
10043
Frank Barchard91317c52019-11-22 10:54:35 -080010044 TEST(F32_IGEMM_4X8__NEON_LANE_LD128, k_div_4_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010045 TEST_REQUIRES_ARM_NEON;
10046 for (size_t k = 8; k <= 40; k += 4) {
10047 for (uint32_t m = 1; m <= 4; m++) {
10048 for (uint32_t n = 1; n <= 8; n++) {
10049 GemmMicrokernelTester()
10050 .mr(4)
10051 .nr(8)
10052 .kr(1)
10053 .sr(1)
10054 .m(m)
10055 .n(n)
10056 .k(k)
10057 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080010058 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010059 }
10060 }
10061 }
10062 }
10063
Frank Barchard91317c52019-11-22 10:54:35 -080010064 TEST(F32_IGEMM_4X8__NEON_LANE_LD128, n_gt_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010065 TEST_REQUIRES_ARM_NEON;
10066 for (uint32_t n = 9; n < 16; n++) {
10067 for (size_t k = 1; k <= 20; k += 5) {
10068 GemmMicrokernelTester()
10069 .mr(4)
10070 .nr(8)
10071 .kr(1)
10072 .sr(1)
10073 .m(4)
10074 .n(8)
10075 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080010076 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010077 }
10078 }
10079 }
10080
Frank Barchard91317c52019-11-22 10:54:35 -080010081 TEST(F32_IGEMM_4X8__NEON_LANE_LD128, n_gt_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010082 TEST_REQUIRES_ARM_NEON;
10083 for (uint32_t n = 9; n < 16; n++) {
10084 for (size_t k = 1; k <= 20; k += 5) {
10085 GemmMicrokernelTester()
10086 .mr(4)
10087 .nr(8)
10088 .kr(1)
10089 .sr(1)
10090 .m(4)
10091 .n(8)
10092 .k(k)
10093 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080010094 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010095 }
10096 }
10097 }
10098
Frank Barchard91317c52019-11-22 10:54:35 -080010099 TEST(F32_IGEMM_4X8__NEON_LANE_LD128, n_gt_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010100 TEST_REQUIRES_ARM_NEON;
10101 for (uint32_t n = 9; n < 16; n++) {
10102 for (size_t k = 1; k <= 20; k += 5) {
10103 for (uint32_t m = 1; m <= 4; m++) {
10104 GemmMicrokernelTester()
10105 .mr(4)
10106 .nr(8)
10107 .kr(1)
10108 .sr(1)
10109 .m(m)
10110 .n(n)
10111 .k(k)
10112 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080010113 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010114 }
10115 }
10116 }
10117 }
10118
Frank Barchard91317c52019-11-22 10:54:35 -080010119 TEST(F32_IGEMM_4X8__NEON_LANE_LD128, n_div_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010120 TEST_REQUIRES_ARM_NEON;
10121 for (uint32_t n = 16; n <= 24; n += 8) {
10122 for (size_t k = 1; k <= 20; k += 5) {
10123 GemmMicrokernelTester()
10124 .mr(4)
10125 .nr(8)
10126 .kr(1)
10127 .sr(1)
10128 .m(4)
10129 .n(8)
10130 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080010131 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010132 }
10133 }
10134 }
10135
Frank Barchard91317c52019-11-22 10:54:35 -080010136 TEST(F32_IGEMM_4X8__NEON_LANE_LD128, n_div_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010137 TEST_REQUIRES_ARM_NEON;
10138 for (uint32_t n = 16; n <= 24; n += 8) {
10139 for (size_t k = 1; k <= 20; k += 5) {
10140 GemmMicrokernelTester()
10141 .mr(4)
10142 .nr(8)
10143 .kr(1)
10144 .sr(1)
10145 .m(4)
10146 .n(n)
10147 .k(k)
10148 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080010149 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010150 }
10151 }
10152 }
10153
Frank Barchard91317c52019-11-22 10:54:35 -080010154 TEST(F32_IGEMM_4X8__NEON_LANE_LD128, n_div_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010155 TEST_REQUIRES_ARM_NEON;
10156 for (uint32_t n = 16; n <= 24; n += 8) {
10157 for (size_t k = 1; k <= 20; k += 5) {
10158 for (uint32_t m = 1; m <= 4; m++) {
10159 GemmMicrokernelTester()
10160 .mr(4)
10161 .nr(8)
10162 .kr(1)
10163 .sr(1)
10164 .m(m)
10165 .n(n)
10166 .k(k)
10167 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080010168 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010169 }
10170 }
10171 }
10172 }
10173
Frank Barchard91317c52019-11-22 10:54:35 -080010174 TEST(F32_IGEMM_4X8__NEON_LANE_LD128, small_kernel) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010175 TEST_REQUIRES_ARM_NEON;
10176 for (size_t k = 1; k <= 20; k += 5) {
10177 GemmMicrokernelTester()
10178 .mr(4)
10179 .nr(8)
10180 .kr(1)
10181 .sr(1)
10182 .m(4)
10183 .n(8)
10184 .k(k)
10185 .ks(3)
Frank Barchard91317c52019-11-22 10:54:35 -080010186 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010187 }
10188 }
10189
Frank Barchard91317c52019-11-22 10:54:35 -080010190 TEST(F32_IGEMM_4X8__NEON_LANE_LD128, small_kernel_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010191 TEST_REQUIRES_ARM_NEON;
10192 for (size_t k = 1; k <= 20; k += 5) {
10193 for (uint32_t m = 1; m <= 4; m++) {
10194 for (uint32_t n = 1; n <= 8; n++) {
10195 GemmMicrokernelTester()
10196 .mr(4)
10197 .nr(8)
10198 .kr(1)
10199 .sr(1)
10200 .m(m)
10201 .n(n)
10202 .k(k)
10203 .ks(3)
10204 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080010205 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010206 }
10207 }
10208 }
10209 }
10210
Frank Barchard91317c52019-11-22 10:54:35 -080010211 TEST(F32_IGEMM_4X8__NEON_LANE_LD128, n_gt_8_small_kernel) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010212 TEST_REQUIRES_ARM_NEON;
10213 for (uint32_t n = 9; n < 16; n++) {
10214 for (size_t k = 1; k <= 20; k += 5) {
10215 GemmMicrokernelTester()
10216 .mr(4)
10217 .nr(8)
10218 .kr(1)
10219 .sr(1)
10220 .m(4)
10221 .n(8)
10222 .k(k)
10223 .ks(3)
Frank Barchard91317c52019-11-22 10:54:35 -080010224 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010225 }
10226 }
10227 }
10228
Frank Barchard91317c52019-11-22 10:54:35 -080010229 TEST(F32_IGEMM_4X8__NEON_LANE_LD128, n_div_8_small_kernel) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010230 TEST_REQUIRES_ARM_NEON;
10231 for (uint32_t n = 16; n <= 24; n += 8) {
10232 for (size_t k = 1; k <= 20; k += 5) {
10233 GemmMicrokernelTester()
10234 .mr(4)
10235 .nr(8)
10236 .kr(1)
10237 .sr(1)
10238 .m(4)
10239 .n(8)
10240 .k(k)
10241 .ks(3)
Frank Barchard91317c52019-11-22 10:54:35 -080010242 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010243 }
10244 }
10245 }
10246
Frank Barchard91317c52019-11-22 10:54:35 -080010247 TEST(F32_IGEMM_4X8__NEON_LANE_LD128, strided_cm_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010248 TEST_REQUIRES_ARM_NEON;
10249 for (size_t k = 1; k <= 20; k += 5) {
10250 for (uint32_t m = 1; m <= 4; m++) {
10251 for (uint32_t n = 1; n <= 8; n++) {
10252 GemmMicrokernelTester()
10253 .mr(4)
10254 .nr(8)
10255 .kr(1)
10256 .sr(1)
10257 .m(m)
10258 .n(n)
10259 .k(k)
10260 .cm_stride(11)
10261 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080010262 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010263 }
10264 }
10265 }
10266 }
10267
Frank Barchard91317c52019-11-22 10:54:35 -080010268 TEST(F32_IGEMM_4X8__NEON_LANE_LD128, a_offset) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010269 TEST_REQUIRES_ARM_NEON;
10270 for (size_t k = 1; k <= 20; k += 5) {
10271 GemmMicrokernelTester()
10272 .mr(4)
10273 .nr(8)
10274 .kr(1)
10275 .sr(1)
10276 .m(4)
10277 .n(8)
10278 .k(k)
10279 .ks(3)
10280 .a_offset(83)
Frank Barchard91317c52019-11-22 10:54:35 -080010281 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010282 }
10283 }
10284
Frank Barchard91317c52019-11-22 10:54:35 -080010285 TEST(F32_IGEMM_4X8__NEON_LANE_LD128, zero) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010286 TEST_REQUIRES_ARM_NEON;
10287 for (uint32_t mz = 0; mz < 4; mz++) {
10288 for (size_t k = 1; k <= 20; k += 5) {
10289 GemmMicrokernelTester()
10290 .mr(4)
10291 .nr(8)
10292 .kr(1)
10293 .sr(1)
10294 .m(4)
10295 .n(8)
10296 .k(k)
10297 .ks(3)
10298 .a_offset(83)
10299 .zero_index(mz)
Frank Barchard91317c52019-11-22 10:54:35 -080010300 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010301 }
10302 }
10303 }
10304
Frank Barchard91317c52019-11-22 10:54:35 -080010305 TEST(F32_IGEMM_4X8__NEON_LANE_LD128, qmin) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010306 TEST_REQUIRES_ARM_NEON;
10307 GemmMicrokernelTester()
10308 .mr(4)
10309 .nr(8)
10310 .kr(1)
10311 .sr(1)
10312 .m(4)
10313 .n(8)
10314 .k(4)
10315 .qmin(128)
Frank Barchard91317c52019-11-22 10:54:35 -080010316 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010317 }
10318
Frank Barchard91317c52019-11-22 10:54:35 -080010319 TEST(F32_IGEMM_4X8__NEON_LANE_LD128, qmax) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010320 TEST_REQUIRES_ARM_NEON;
10321 GemmMicrokernelTester()
10322 .mr(4)
10323 .nr(8)
10324 .kr(1)
10325 .sr(1)
10326 .m(4)
10327 .n(8)
10328 .k(4)
10329 .qmax(128)
Frank Barchard91317c52019-11-22 10:54:35 -080010330 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010331 }
10332
Frank Barchard91317c52019-11-22 10:54:35 -080010333 TEST(F32_IGEMM_4X8__NEON_LANE_LD128, strided_cm) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010334 TEST_REQUIRES_ARM_NEON;
10335 GemmMicrokernelTester()
10336 .mr(4)
10337 .nr(8)
10338 .kr(1)
10339 .sr(1)
10340 .m(4)
10341 .n(8)
10342 .k(4)
10343 .cm_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080010344 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010345 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -070010346#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
XNNPACK Teamb455b122019-09-27 18:10:33 -070010347
10348
Marat Dukhan1dadbf72019-10-01 10:46:20 -070010349#if XNN_ARCH_ARM || XNN_ARCH_ARM64
Frank Barchard91317c52019-11-22 10:54:35 -080010350 TEST(F32_IGEMM_4X8__NEON_LANE_LD64, k_eq_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010351 TEST_REQUIRES_ARM_NEON;
10352 GemmMicrokernelTester()
10353 .mr(4)
10354 .nr(8)
10355 .kr(1)
10356 .sr(1)
10357 .m(4)
10358 .n(8)
10359 .k(2)
Frank Barchard91317c52019-11-22 10:54:35 -080010360 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010361 }
10362
Frank Barchard91317c52019-11-22 10:54:35 -080010363 TEST(F32_IGEMM_4X8__NEON_LANE_LD64, strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010364 TEST_REQUIRES_ARM_NEON;
10365 GemmMicrokernelTester()
10366 .mr(4)
10367 .nr(8)
10368 .kr(1)
10369 .sr(1)
10370 .m(4)
10371 .n(8)
10372 .k(2)
10373 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080010374 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010375 }
10376
Frank Barchard91317c52019-11-22 10:54:35 -080010377 TEST(F32_IGEMM_4X8__NEON_LANE_LD64, k_eq_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010378 TEST_REQUIRES_ARM_NEON;
10379 for (uint32_t m = 1; m <= 4; m++) {
10380 for (uint32_t n = 1; n <= 8; n++) {
10381 GemmMicrokernelTester()
10382 .mr(4)
10383 .nr(8)
10384 .kr(1)
10385 .sr(1)
10386 .m(m)
10387 .n(n)
10388 .k(2)
10389 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080010390 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010391 }
10392 }
10393 }
10394
Frank Barchard91317c52019-11-22 10:54:35 -080010395 TEST(F32_IGEMM_4X8__NEON_LANE_LD64, k_eq_2_subtile_m) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010396 TEST_REQUIRES_ARM_NEON;
10397 for (uint32_t m = 1; m <= 4; m++) {
10398 GemmMicrokernelTester()
10399 .mr(4)
10400 .nr(8)
10401 .kr(1)
10402 .sr(1)
10403 .m(m)
10404 .n(8)
10405 .k(2)
10406 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080010407 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010408 }
10409 }
10410
Frank Barchard91317c52019-11-22 10:54:35 -080010411 TEST(F32_IGEMM_4X8__NEON_LANE_LD64, k_eq_2_subtile_n) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010412 TEST_REQUIRES_ARM_NEON;
10413 for (uint32_t n = 1; n <= 8; n++) {
10414 GemmMicrokernelTester()
10415 .mr(4)
10416 .nr(8)
10417 .kr(1)
10418 .sr(1)
10419 .m(4)
10420 .n(n)
10421 .k(2)
10422 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080010423 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010424 }
10425 }
10426
Frank Barchard91317c52019-11-22 10:54:35 -080010427 TEST(F32_IGEMM_4X8__NEON_LANE_LD64, k_lt_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010428 TEST_REQUIRES_ARM_NEON;
10429 for (size_t k = 1; k < 2; k++) {
10430 GemmMicrokernelTester()
10431 .mr(4)
10432 .nr(8)
10433 .kr(1)
10434 .sr(1)
10435 .m(4)
10436 .n(8)
10437 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080010438 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010439 }
10440 }
10441
Frank Barchard91317c52019-11-22 10:54:35 -080010442 TEST(F32_IGEMM_4X8__NEON_LANE_LD64, k_lt_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010443 TEST_REQUIRES_ARM_NEON;
10444 for (size_t k = 1; k < 2; k++) {
10445 for (uint32_t m = 1; m <= 4; m++) {
10446 for (uint32_t n = 1; n <= 8; n++) {
10447 GemmMicrokernelTester()
10448 .mr(4)
10449 .nr(8)
10450 .kr(1)
10451 .sr(1)
10452 .m(m)
10453 .n(n)
10454 .k(k)
10455 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080010456 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010457 }
10458 }
10459 }
10460 }
10461
Frank Barchard91317c52019-11-22 10:54:35 -080010462 TEST(F32_IGEMM_4X8__NEON_LANE_LD64, k_gt_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010463 TEST_REQUIRES_ARM_NEON;
10464 for (size_t k = 3; k < 4; k++) {
10465 GemmMicrokernelTester()
10466 .mr(4)
10467 .nr(8)
10468 .kr(1)
10469 .sr(1)
10470 .m(4)
10471 .n(8)
10472 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080010473 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010474 }
10475 }
10476
Frank Barchard91317c52019-11-22 10:54:35 -080010477 TEST(F32_IGEMM_4X8__NEON_LANE_LD64, k_gt_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010478 TEST_REQUIRES_ARM_NEON;
10479 for (size_t k = 3; k < 4; k++) {
10480 for (uint32_t m = 1; m <= 4; m++) {
10481 for (uint32_t n = 1; n <= 8; n++) {
10482 GemmMicrokernelTester()
10483 .mr(4)
10484 .nr(8)
10485 .kr(1)
10486 .sr(1)
10487 .m(m)
10488 .n(n)
10489 .k(k)
10490 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080010491 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010492 }
10493 }
10494 }
10495 }
10496
Frank Barchard91317c52019-11-22 10:54:35 -080010497 TEST(F32_IGEMM_4X8__NEON_LANE_LD64, k_div_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010498 TEST_REQUIRES_ARM_NEON;
10499 for (size_t k = 4; k <= 20; k += 2) {
10500 GemmMicrokernelTester()
10501 .mr(4)
10502 .nr(8)
10503 .kr(1)
10504 .sr(1)
10505 .m(4)
10506 .n(8)
10507 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080010508 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010509 }
10510 }
10511
Frank Barchard91317c52019-11-22 10:54:35 -080010512 TEST(F32_IGEMM_4X8__NEON_LANE_LD64, k_div_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010513 TEST_REQUIRES_ARM_NEON;
10514 for (size_t k = 4; k <= 20; k += 2) {
10515 for (uint32_t m = 1; m <= 4; m++) {
10516 for (uint32_t n = 1; n <= 8; n++) {
10517 GemmMicrokernelTester()
10518 .mr(4)
10519 .nr(8)
10520 .kr(1)
10521 .sr(1)
10522 .m(m)
10523 .n(n)
10524 .k(k)
10525 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080010526 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010527 }
10528 }
10529 }
10530 }
10531
Frank Barchard91317c52019-11-22 10:54:35 -080010532 TEST(F32_IGEMM_4X8__NEON_LANE_LD64, n_gt_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010533 TEST_REQUIRES_ARM_NEON;
10534 for (uint32_t n = 9; n < 16; n++) {
10535 for (size_t k = 1; k <= 10; k += 3) {
10536 GemmMicrokernelTester()
10537 .mr(4)
10538 .nr(8)
10539 .kr(1)
10540 .sr(1)
10541 .m(4)
10542 .n(8)
10543 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080010544 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010545 }
10546 }
10547 }
10548
Frank Barchard91317c52019-11-22 10:54:35 -080010549 TEST(F32_IGEMM_4X8__NEON_LANE_LD64, n_gt_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010550 TEST_REQUIRES_ARM_NEON;
10551 for (uint32_t n = 9; n < 16; n++) {
10552 for (size_t k = 1; k <= 10; k += 3) {
10553 GemmMicrokernelTester()
10554 .mr(4)
10555 .nr(8)
10556 .kr(1)
10557 .sr(1)
10558 .m(4)
10559 .n(8)
10560 .k(k)
10561 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080010562 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010563 }
10564 }
10565 }
10566
Frank Barchard91317c52019-11-22 10:54:35 -080010567 TEST(F32_IGEMM_4X8__NEON_LANE_LD64, n_gt_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010568 TEST_REQUIRES_ARM_NEON;
10569 for (uint32_t n = 9; n < 16; n++) {
10570 for (size_t k = 1; k <= 10; k += 3) {
10571 for (uint32_t m = 1; m <= 4; m++) {
10572 GemmMicrokernelTester()
10573 .mr(4)
10574 .nr(8)
10575 .kr(1)
10576 .sr(1)
10577 .m(m)
10578 .n(n)
10579 .k(k)
10580 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080010581 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010582 }
10583 }
10584 }
10585 }
10586
Frank Barchard91317c52019-11-22 10:54:35 -080010587 TEST(F32_IGEMM_4X8__NEON_LANE_LD64, n_div_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010588 TEST_REQUIRES_ARM_NEON;
10589 for (uint32_t n = 16; n <= 24; n += 8) {
10590 for (size_t k = 1; k <= 10; k += 3) {
10591 GemmMicrokernelTester()
10592 .mr(4)
10593 .nr(8)
10594 .kr(1)
10595 .sr(1)
10596 .m(4)
10597 .n(8)
10598 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080010599 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010600 }
10601 }
10602 }
10603
Frank Barchard91317c52019-11-22 10:54:35 -080010604 TEST(F32_IGEMM_4X8__NEON_LANE_LD64, n_div_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010605 TEST_REQUIRES_ARM_NEON;
10606 for (uint32_t n = 16; n <= 24; n += 8) {
10607 for (size_t k = 1; k <= 10; k += 3) {
10608 GemmMicrokernelTester()
10609 .mr(4)
10610 .nr(8)
10611 .kr(1)
10612 .sr(1)
10613 .m(4)
10614 .n(n)
10615 .k(k)
10616 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080010617 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010618 }
10619 }
10620 }
10621
Frank Barchard91317c52019-11-22 10:54:35 -080010622 TEST(F32_IGEMM_4X8__NEON_LANE_LD64, n_div_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010623 TEST_REQUIRES_ARM_NEON;
10624 for (uint32_t n = 16; n <= 24; n += 8) {
10625 for (size_t k = 1; k <= 10; k += 3) {
10626 for (uint32_t m = 1; m <= 4; m++) {
10627 GemmMicrokernelTester()
10628 .mr(4)
10629 .nr(8)
10630 .kr(1)
10631 .sr(1)
10632 .m(m)
10633 .n(n)
10634 .k(k)
10635 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080010636 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010637 }
10638 }
10639 }
10640 }
10641
Frank Barchard91317c52019-11-22 10:54:35 -080010642 TEST(F32_IGEMM_4X8__NEON_LANE_LD64, small_kernel) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010643 TEST_REQUIRES_ARM_NEON;
10644 for (size_t k = 1; k <= 10; k += 3) {
10645 GemmMicrokernelTester()
10646 .mr(4)
10647 .nr(8)
10648 .kr(1)
10649 .sr(1)
10650 .m(4)
10651 .n(8)
10652 .k(k)
10653 .ks(3)
Frank Barchard91317c52019-11-22 10:54:35 -080010654 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010655 }
10656 }
10657
Frank Barchard91317c52019-11-22 10:54:35 -080010658 TEST(F32_IGEMM_4X8__NEON_LANE_LD64, small_kernel_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010659 TEST_REQUIRES_ARM_NEON;
10660 for (size_t k = 1; k <= 10; k += 3) {
10661 for (uint32_t m = 1; m <= 4; m++) {
10662 for (uint32_t n = 1; n <= 8; n++) {
10663 GemmMicrokernelTester()
10664 .mr(4)
10665 .nr(8)
10666 .kr(1)
10667 .sr(1)
10668 .m(m)
10669 .n(n)
10670 .k(k)
10671 .ks(3)
10672 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080010673 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010674 }
10675 }
10676 }
10677 }
10678
Frank Barchard91317c52019-11-22 10:54:35 -080010679 TEST(F32_IGEMM_4X8__NEON_LANE_LD64, n_gt_8_small_kernel) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010680 TEST_REQUIRES_ARM_NEON;
10681 for (uint32_t n = 9; n < 16; n++) {
10682 for (size_t k = 1; k <= 10; k += 3) {
10683 GemmMicrokernelTester()
10684 .mr(4)
10685 .nr(8)
10686 .kr(1)
10687 .sr(1)
10688 .m(4)
10689 .n(8)
10690 .k(k)
10691 .ks(3)
Frank Barchard91317c52019-11-22 10:54:35 -080010692 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010693 }
10694 }
10695 }
10696
Frank Barchard91317c52019-11-22 10:54:35 -080010697 TEST(F32_IGEMM_4X8__NEON_LANE_LD64, n_div_8_small_kernel) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010698 TEST_REQUIRES_ARM_NEON;
10699 for (uint32_t n = 16; n <= 24; n += 8) {
10700 for (size_t k = 1; k <= 10; k += 3) {
10701 GemmMicrokernelTester()
10702 .mr(4)
10703 .nr(8)
10704 .kr(1)
10705 .sr(1)
10706 .m(4)
10707 .n(8)
10708 .k(k)
10709 .ks(3)
Frank Barchard91317c52019-11-22 10:54:35 -080010710 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010711 }
10712 }
10713 }
10714
Frank Barchard91317c52019-11-22 10:54:35 -080010715 TEST(F32_IGEMM_4X8__NEON_LANE_LD64, strided_cm_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010716 TEST_REQUIRES_ARM_NEON;
10717 for (size_t k = 1; k <= 10; k += 3) {
10718 for (uint32_t m = 1; m <= 4; m++) {
10719 for (uint32_t n = 1; n <= 8; n++) {
10720 GemmMicrokernelTester()
10721 .mr(4)
10722 .nr(8)
10723 .kr(1)
10724 .sr(1)
10725 .m(m)
10726 .n(n)
10727 .k(k)
10728 .cm_stride(11)
10729 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080010730 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010731 }
10732 }
10733 }
10734 }
10735
Frank Barchard91317c52019-11-22 10:54:35 -080010736 TEST(F32_IGEMM_4X8__NEON_LANE_LD64, a_offset) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010737 TEST_REQUIRES_ARM_NEON;
10738 for (size_t k = 1; k <= 10; k += 3) {
10739 GemmMicrokernelTester()
10740 .mr(4)
10741 .nr(8)
10742 .kr(1)
10743 .sr(1)
10744 .m(4)
10745 .n(8)
10746 .k(k)
10747 .ks(3)
10748 .a_offset(43)
Frank Barchard91317c52019-11-22 10:54:35 -080010749 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010750 }
10751 }
10752
Frank Barchard91317c52019-11-22 10:54:35 -080010753 TEST(F32_IGEMM_4X8__NEON_LANE_LD64, zero) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010754 TEST_REQUIRES_ARM_NEON;
10755 for (uint32_t mz = 0; mz < 4; mz++) {
10756 for (size_t k = 1; k <= 10; k += 3) {
10757 GemmMicrokernelTester()
10758 .mr(4)
10759 .nr(8)
10760 .kr(1)
10761 .sr(1)
10762 .m(4)
10763 .n(8)
10764 .k(k)
10765 .ks(3)
10766 .a_offset(43)
10767 .zero_index(mz)
Frank Barchard91317c52019-11-22 10:54:35 -080010768 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010769 }
10770 }
10771 }
10772
Frank Barchard91317c52019-11-22 10:54:35 -080010773 TEST(F32_IGEMM_4X8__NEON_LANE_LD64, qmin) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010774 TEST_REQUIRES_ARM_NEON;
10775 GemmMicrokernelTester()
10776 .mr(4)
10777 .nr(8)
10778 .kr(1)
10779 .sr(1)
10780 .m(4)
10781 .n(8)
10782 .k(2)
10783 .qmin(128)
Frank Barchard91317c52019-11-22 10:54:35 -080010784 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010785 }
10786
Frank Barchard91317c52019-11-22 10:54:35 -080010787 TEST(F32_IGEMM_4X8__NEON_LANE_LD64, qmax) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010788 TEST_REQUIRES_ARM_NEON;
10789 GemmMicrokernelTester()
10790 .mr(4)
10791 .nr(8)
10792 .kr(1)
10793 .sr(1)
10794 .m(4)
10795 .n(8)
10796 .k(2)
10797 .qmax(128)
Frank Barchard91317c52019-11-22 10:54:35 -080010798 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010799 }
10800
Frank Barchard91317c52019-11-22 10:54:35 -080010801 TEST(F32_IGEMM_4X8__NEON_LANE_LD64, strided_cm) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010802 TEST_REQUIRES_ARM_NEON;
10803 GemmMicrokernelTester()
10804 .mr(4)
10805 .nr(8)
10806 .kr(1)
10807 .sr(1)
10808 .m(4)
10809 .n(8)
10810 .k(2)
10811 .cm_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080010812 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010813 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -070010814#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
XNNPACK Teamb455b122019-09-27 18:10:33 -070010815
10816
Marat Dukhan1dadbf72019-10-01 10:46:20 -070010817#if XNN_ARCH_ARM || XNN_ARCH_ARM64
Frank Barchard91317c52019-11-22 10:54:35 -080010818 TEST(F32_IGEMM_6X8__NEON_LANE_LD64, k_eq_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010819 TEST_REQUIRES_ARM_NEON;
10820 GemmMicrokernelTester()
10821 .mr(6)
10822 .nr(8)
10823 .kr(1)
10824 .sr(1)
10825 .m(6)
10826 .n(8)
10827 .k(2)
Frank Barchard91317c52019-11-22 10:54:35 -080010828 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010829 }
10830
Frank Barchard91317c52019-11-22 10:54:35 -080010831 TEST(F32_IGEMM_6X8__NEON_LANE_LD64, strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010832 TEST_REQUIRES_ARM_NEON;
10833 GemmMicrokernelTester()
10834 .mr(6)
10835 .nr(8)
10836 .kr(1)
10837 .sr(1)
10838 .m(6)
10839 .n(8)
10840 .k(2)
10841 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080010842 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010843 }
10844
Frank Barchard91317c52019-11-22 10:54:35 -080010845 TEST(F32_IGEMM_6X8__NEON_LANE_LD64, k_eq_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010846 TEST_REQUIRES_ARM_NEON;
10847 for (uint32_t m = 1; m <= 6; m++) {
10848 for (uint32_t n = 1; n <= 8; n++) {
10849 GemmMicrokernelTester()
10850 .mr(6)
10851 .nr(8)
10852 .kr(1)
10853 .sr(1)
10854 .m(m)
10855 .n(n)
10856 .k(2)
10857 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080010858 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010859 }
10860 }
10861 }
10862
Frank Barchard91317c52019-11-22 10:54:35 -080010863 TEST(F32_IGEMM_6X8__NEON_LANE_LD64, k_eq_2_subtile_m) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010864 TEST_REQUIRES_ARM_NEON;
10865 for (uint32_t m = 1; m <= 6; m++) {
10866 GemmMicrokernelTester()
10867 .mr(6)
10868 .nr(8)
10869 .kr(1)
10870 .sr(1)
10871 .m(m)
10872 .n(8)
10873 .k(2)
10874 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080010875 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010876 }
10877 }
10878
Frank Barchard91317c52019-11-22 10:54:35 -080010879 TEST(F32_IGEMM_6X8__NEON_LANE_LD64, k_eq_2_subtile_n) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010880 TEST_REQUIRES_ARM_NEON;
10881 for (uint32_t n = 1; n <= 8; n++) {
10882 GemmMicrokernelTester()
10883 .mr(6)
10884 .nr(8)
10885 .kr(1)
10886 .sr(1)
10887 .m(6)
10888 .n(n)
10889 .k(2)
10890 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080010891 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010892 }
10893 }
10894
Frank Barchard91317c52019-11-22 10:54:35 -080010895 TEST(F32_IGEMM_6X8__NEON_LANE_LD64, k_lt_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010896 TEST_REQUIRES_ARM_NEON;
10897 for (size_t k = 1; k < 2; k++) {
10898 GemmMicrokernelTester()
10899 .mr(6)
10900 .nr(8)
10901 .kr(1)
10902 .sr(1)
10903 .m(6)
10904 .n(8)
10905 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080010906 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010907 }
10908 }
10909
Frank Barchard91317c52019-11-22 10:54:35 -080010910 TEST(F32_IGEMM_6X8__NEON_LANE_LD64, k_lt_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010911 TEST_REQUIRES_ARM_NEON;
10912 for (size_t k = 1; k < 2; k++) {
10913 for (uint32_t m = 1; m <= 6; m++) {
10914 for (uint32_t n = 1; n <= 8; n++) {
10915 GemmMicrokernelTester()
10916 .mr(6)
10917 .nr(8)
10918 .kr(1)
10919 .sr(1)
10920 .m(m)
10921 .n(n)
10922 .k(k)
10923 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080010924 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010925 }
10926 }
10927 }
10928 }
10929
Frank Barchard91317c52019-11-22 10:54:35 -080010930 TEST(F32_IGEMM_6X8__NEON_LANE_LD64, k_gt_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010931 TEST_REQUIRES_ARM_NEON;
10932 for (size_t k = 3; k < 4; k++) {
10933 GemmMicrokernelTester()
10934 .mr(6)
10935 .nr(8)
10936 .kr(1)
10937 .sr(1)
10938 .m(6)
10939 .n(8)
10940 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080010941 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010942 }
10943 }
10944
Frank Barchard91317c52019-11-22 10:54:35 -080010945 TEST(F32_IGEMM_6X8__NEON_LANE_LD64, k_gt_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010946 TEST_REQUIRES_ARM_NEON;
10947 for (size_t k = 3; k < 4; k++) {
10948 for (uint32_t m = 1; m <= 6; m++) {
10949 for (uint32_t n = 1; n <= 8; n++) {
10950 GemmMicrokernelTester()
10951 .mr(6)
10952 .nr(8)
10953 .kr(1)
10954 .sr(1)
10955 .m(m)
10956 .n(n)
10957 .k(k)
10958 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080010959 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010960 }
10961 }
10962 }
10963 }
10964
Frank Barchard91317c52019-11-22 10:54:35 -080010965 TEST(F32_IGEMM_6X8__NEON_LANE_LD64, k_div_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010966 TEST_REQUIRES_ARM_NEON;
10967 for (size_t k = 4; k <= 20; k += 2) {
10968 GemmMicrokernelTester()
10969 .mr(6)
10970 .nr(8)
10971 .kr(1)
10972 .sr(1)
10973 .m(6)
10974 .n(8)
10975 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080010976 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010977 }
10978 }
10979
Frank Barchard91317c52019-11-22 10:54:35 -080010980 TEST(F32_IGEMM_6X8__NEON_LANE_LD64, k_div_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010981 TEST_REQUIRES_ARM_NEON;
10982 for (size_t k = 4; k <= 20; k += 2) {
10983 for (uint32_t m = 1; m <= 6; m++) {
10984 for (uint32_t n = 1; n <= 8; n++) {
10985 GemmMicrokernelTester()
10986 .mr(6)
10987 .nr(8)
10988 .kr(1)
10989 .sr(1)
10990 .m(m)
10991 .n(n)
10992 .k(k)
10993 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080010994 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010995 }
10996 }
10997 }
10998 }
10999
Frank Barchard91317c52019-11-22 10:54:35 -080011000 TEST(F32_IGEMM_6X8__NEON_LANE_LD64, n_gt_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011001 TEST_REQUIRES_ARM_NEON;
11002 for (uint32_t n = 9; n < 16; n++) {
11003 for (size_t k = 1; k <= 10; k += 3) {
11004 GemmMicrokernelTester()
11005 .mr(6)
11006 .nr(8)
11007 .kr(1)
11008 .sr(1)
11009 .m(6)
11010 .n(8)
11011 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080011012 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011013 }
11014 }
11015 }
11016
Frank Barchard91317c52019-11-22 10:54:35 -080011017 TEST(F32_IGEMM_6X8__NEON_LANE_LD64, n_gt_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011018 TEST_REQUIRES_ARM_NEON;
11019 for (uint32_t n = 9; n < 16; n++) {
11020 for (size_t k = 1; k <= 10; k += 3) {
11021 GemmMicrokernelTester()
11022 .mr(6)
11023 .nr(8)
11024 .kr(1)
11025 .sr(1)
11026 .m(6)
11027 .n(8)
11028 .k(k)
11029 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080011030 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011031 }
11032 }
11033 }
11034
Frank Barchard91317c52019-11-22 10:54:35 -080011035 TEST(F32_IGEMM_6X8__NEON_LANE_LD64, n_gt_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011036 TEST_REQUIRES_ARM_NEON;
11037 for (uint32_t n = 9; n < 16; n++) {
11038 for (size_t k = 1; k <= 10; k += 3) {
11039 for (uint32_t m = 1; m <= 6; m++) {
11040 GemmMicrokernelTester()
11041 .mr(6)
11042 .nr(8)
11043 .kr(1)
11044 .sr(1)
11045 .m(m)
11046 .n(n)
11047 .k(k)
11048 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080011049 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011050 }
11051 }
11052 }
11053 }
11054
Frank Barchard91317c52019-11-22 10:54:35 -080011055 TEST(F32_IGEMM_6X8__NEON_LANE_LD64, n_div_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011056 TEST_REQUIRES_ARM_NEON;
11057 for (uint32_t n = 16; n <= 24; n += 8) {
11058 for (size_t k = 1; k <= 10; k += 3) {
11059 GemmMicrokernelTester()
11060 .mr(6)
11061 .nr(8)
11062 .kr(1)
11063 .sr(1)
11064 .m(6)
11065 .n(8)
11066 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080011067 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011068 }
11069 }
11070 }
11071
Frank Barchard91317c52019-11-22 10:54:35 -080011072 TEST(F32_IGEMM_6X8__NEON_LANE_LD64, n_div_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011073 TEST_REQUIRES_ARM_NEON;
11074 for (uint32_t n = 16; n <= 24; n += 8) {
11075 for (size_t k = 1; k <= 10; k += 3) {
11076 GemmMicrokernelTester()
11077 .mr(6)
11078 .nr(8)
11079 .kr(1)
11080 .sr(1)
11081 .m(6)
11082 .n(n)
11083 .k(k)
11084 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080011085 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011086 }
11087 }
11088 }
11089
Frank Barchard91317c52019-11-22 10:54:35 -080011090 TEST(F32_IGEMM_6X8__NEON_LANE_LD64, n_div_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011091 TEST_REQUIRES_ARM_NEON;
11092 for (uint32_t n = 16; n <= 24; n += 8) {
11093 for (size_t k = 1; k <= 10; k += 3) {
11094 for (uint32_t m = 1; m <= 6; m++) {
11095 GemmMicrokernelTester()
11096 .mr(6)
11097 .nr(8)
11098 .kr(1)
11099 .sr(1)
11100 .m(m)
11101 .n(n)
11102 .k(k)
11103 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080011104 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011105 }
11106 }
11107 }
11108 }
11109
Frank Barchard91317c52019-11-22 10:54:35 -080011110 TEST(F32_IGEMM_6X8__NEON_LANE_LD64, small_kernel) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011111 TEST_REQUIRES_ARM_NEON;
11112 for (size_t k = 1; k <= 10; k += 3) {
11113 GemmMicrokernelTester()
11114 .mr(6)
11115 .nr(8)
11116 .kr(1)
11117 .sr(1)
11118 .m(6)
11119 .n(8)
11120 .k(k)
11121 .ks(3)
Frank Barchard91317c52019-11-22 10:54:35 -080011122 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011123 }
11124 }
11125
Frank Barchard91317c52019-11-22 10:54:35 -080011126 TEST(F32_IGEMM_6X8__NEON_LANE_LD64, small_kernel_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011127 TEST_REQUIRES_ARM_NEON;
11128 for (size_t k = 1; k <= 10; k += 3) {
11129 for (uint32_t m = 1; m <= 6; m++) {
11130 for (uint32_t n = 1; n <= 8; n++) {
11131 GemmMicrokernelTester()
11132 .mr(6)
11133 .nr(8)
11134 .kr(1)
11135 .sr(1)
11136 .m(m)
11137 .n(n)
11138 .k(k)
11139 .ks(3)
11140 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080011141 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011142 }
11143 }
11144 }
11145 }
11146
Frank Barchard91317c52019-11-22 10:54:35 -080011147 TEST(F32_IGEMM_6X8__NEON_LANE_LD64, n_gt_8_small_kernel) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011148 TEST_REQUIRES_ARM_NEON;
11149 for (uint32_t n = 9; n < 16; n++) {
11150 for (size_t k = 1; k <= 10; k += 3) {
11151 GemmMicrokernelTester()
11152 .mr(6)
11153 .nr(8)
11154 .kr(1)
11155 .sr(1)
11156 .m(6)
11157 .n(8)
11158 .k(k)
11159 .ks(3)
Frank Barchard91317c52019-11-22 10:54:35 -080011160 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011161 }
11162 }
11163 }
11164
Frank Barchard91317c52019-11-22 10:54:35 -080011165 TEST(F32_IGEMM_6X8__NEON_LANE_LD64, n_div_8_small_kernel) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011166 TEST_REQUIRES_ARM_NEON;
11167 for (uint32_t n = 16; n <= 24; n += 8) {
11168 for (size_t k = 1; k <= 10; k += 3) {
11169 GemmMicrokernelTester()
11170 .mr(6)
11171 .nr(8)
11172 .kr(1)
11173 .sr(1)
11174 .m(6)
11175 .n(8)
11176 .k(k)
11177 .ks(3)
Frank Barchard91317c52019-11-22 10:54:35 -080011178 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011179 }
11180 }
11181 }
11182
Frank Barchard91317c52019-11-22 10:54:35 -080011183 TEST(F32_IGEMM_6X8__NEON_LANE_LD64, strided_cm_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011184 TEST_REQUIRES_ARM_NEON;
11185 for (size_t k = 1; k <= 10; k += 3) {
11186 for (uint32_t m = 1; m <= 6; m++) {
11187 for (uint32_t n = 1; n <= 8; n++) {
11188 GemmMicrokernelTester()
11189 .mr(6)
11190 .nr(8)
11191 .kr(1)
11192 .sr(1)
11193 .m(m)
11194 .n(n)
11195 .k(k)
11196 .cm_stride(11)
11197 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080011198 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011199 }
11200 }
11201 }
11202 }
11203
Frank Barchard91317c52019-11-22 10:54:35 -080011204 TEST(F32_IGEMM_6X8__NEON_LANE_LD64, a_offset) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011205 TEST_REQUIRES_ARM_NEON;
11206 for (size_t k = 1; k <= 10; k += 3) {
11207 GemmMicrokernelTester()
11208 .mr(6)
11209 .nr(8)
11210 .kr(1)
11211 .sr(1)
11212 .m(6)
11213 .n(8)
11214 .k(k)
11215 .ks(3)
11216 .a_offset(67)
Frank Barchard91317c52019-11-22 10:54:35 -080011217 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011218 }
11219 }
11220
Frank Barchard91317c52019-11-22 10:54:35 -080011221 TEST(F32_IGEMM_6X8__NEON_LANE_LD64, zero) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011222 TEST_REQUIRES_ARM_NEON;
11223 for (uint32_t mz = 0; mz < 6; mz++) {
11224 for (size_t k = 1; k <= 10; k += 3) {
11225 GemmMicrokernelTester()
11226 .mr(6)
11227 .nr(8)
11228 .kr(1)
11229 .sr(1)
11230 .m(6)
11231 .n(8)
11232 .k(k)
11233 .ks(3)
11234 .a_offset(67)
11235 .zero_index(mz)
Frank Barchard91317c52019-11-22 10:54:35 -080011236 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011237 }
11238 }
11239 }
11240
Frank Barchard91317c52019-11-22 10:54:35 -080011241 TEST(F32_IGEMM_6X8__NEON_LANE_LD64, qmin) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011242 TEST_REQUIRES_ARM_NEON;
11243 GemmMicrokernelTester()
11244 .mr(6)
11245 .nr(8)
11246 .kr(1)
11247 .sr(1)
11248 .m(6)
11249 .n(8)
11250 .k(2)
11251 .qmin(128)
Frank Barchard91317c52019-11-22 10:54:35 -080011252 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011253 }
11254
Frank Barchard91317c52019-11-22 10:54:35 -080011255 TEST(F32_IGEMM_6X8__NEON_LANE_LD64, qmax) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011256 TEST_REQUIRES_ARM_NEON;
11257 GemmMicrokernelTester()
11258 .mr(6)
11259 .nr(8)
11260 .kr(1)
11261 .sr(1)
11262 .m(6)
11263 .n(8)
11264 .k(2)
11265 .qmax(128)
Frank Barchard91317c52019-11-22 10:54:35 -080011266 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011267 }
11268
Frank Barchard91317c52019-11-22 10:54:35 -080011269 TEST(F32_IGEMM_6X8__NEON_LANE_LD64, strided_cm) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011270 TEST_REQUIRES_ARM_NEON;
11271 GemmMicrokernelTester()
11272 .mr(6)
11273 .nr(8)
11274 .kr(1)
11275 .sr(1)
11276 .m(6)
11277 .n(8)
11278 .k(2)
11279 .cm_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080011280 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011281 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -070011282#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
XNNPACK Teamb455b122019-09-27 18:10:33 -070011283
11284
Frank Barchard69172d92019-11-26 16:22:39 -080011285#if XNN_ARCH_ARM || XNN_ARCH_ARM64
11286 TEST(F32_IGEMM_6X8__NEON_LANE_LD128, k_eq_4) {
11287 TEST_REQUIRES_ARM_NEON;
11288 GemmMicrokernelTester()
11289 .mr(6)
11290 .nr(8)
11291 .kr(1)
11292 .sr(1)
11293 .m(6)
11294 .n(8)
11295 .k(4)
11296 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld128);
11297 }
11298
11299 TEST(F32_IGEMM_6X8__NEON_LANE_LD128, strided_cn) {
11300 TEST_REQUIRES_ARM_NEON;
11301 GemmMicrokernelTester()
11302 .mr(6)
11303 .nr(8)
11304 .kr(1)
11305 .sr(1)
11306 .m(6)
11307 .n(8)
11308 .k(4)
11309 .cn_stride(11)
11310 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld128);
11311 }
11312
11313 TEST(F32_IGEMM_6X8__NEON_LANE_LD128, k_eq_4_subtile) {
11314 TEST_REQUIRES_ARM_NEON;
11315 for (uint32_t m = 1; m <= 6; m++) {
11316 for (uint32_t n = 1; n <= 8; n++) {
11317 GemmMicrokernelTester()
11318 .mr(6)
11319 .nr(8)
11320 .kr(1)
11321 .sr(1)
11322 .m(m)
11323 .n(n)
11324 .k(4)
11325 .iterations(1)
11326 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld128);
11327 }
11328 }
11329 }
11330
11331 TEST(F32_IGEMM_6X8__NEON_LANE_LD128, k_eq_4_subtile_m) {
11332 TEST_REQUIRES_ARM_NEON;
11333 for (uint32_t m = 1; m <= 6; m++) {
11334 GemmMicrokernelTester()
11335 .mr(6)
11336 .nr(8)
11337 .kr(1)
11338 .sr(1)
11339 .m(m)
11340 .n(8)
11341 .k(4)
11342 .iterations(1)
11343 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld128);
11344 }
11345 }
11346
11347 TEST(F32_IGEMM_6X8__NEON_LANE_LD128, k_eq_4_subtile_n) {
11348 TEST_REQUIRES_ARM_NEON;
11349 for (uint32_t n = 1; n <= 8; n++) {
11350 GemmMicrokernelTester()
11351 .mr(6)
11352 .nr(8)
11353 .kr(1)
11354 .sr(1)
11355 .m(6)
11356 .n(n)
11357 .k(4)
11358 .iterations(1)
11359 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld128);
11360 }
11361 }
11362
11363 TEST(F32_IGEMM_6X8__NEON_LANE_LD128, k_lt_4) {
11364 TEST_REQUIRES_ARM_NEON;
11365 for (size_t k = 1; k < 4; k++) {
11366 GemmMicrokernelTester()
11367 .mr(6)
11368 .nr(8)
11369 .kr(1)
11370 .sr(1)
11371 .m(6)
11372 .n(8)
11373 .k(k)
11374 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld128);
11375 }
11376 }
11377
11378 TEST(F32_IGEMM_6X8__NEON_LANE_LD128, k_lt_4_subtile) {
11379 TEST_REQUIRES_ARM_NEON;
11380 for (size_t k = 1; k < 4; k++) {
11381 for (uint32_t m = 1; m <= 6; m++) {
11382 for (uint32_t n = 1; n <= 8; n++) {
11383 GemmMicrokernelTester()
11384 .mr(6)
11385 .nr(8)
11386 .kr(1)
11387 .sr(1)
11388 .m(m)
11389 .n(n)
11390 .k(k)
11391 .iterations(1)
11392 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld128);
11393 }
11394 }
11395 }
11396 }
11397
11398 TEST(F32_IGEMM_6X8__NEON_LANE_LD128, k_gt_4) {
11399 TEST_REQUIRES_ARM_NEON;
11400 for (size_t k = 5; k < 8; k++) {
11401 GemmMicrokernelTester()
11402 .mr(6)
11403 .nr(8)
11404 .kr(1)
11405 .sr(1)
11406 .m(6)
11407 .n(8)
11408 .k(k)
11409 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld128);
11410 }
11411 }
11412
11413 TEST(F32_IGEMM_6X8__NEON_LANE_LD128, k_gt_4_subtile) {
11414 TEST_REQUIRES_ARM_NEON;
11415 for (size_t k = 5; k < 8; k++) {
11416 for (uint32_t m = 1; m <= 6; m++) {
11417 for (uint32_t n = 1; n <= 8; n++) {
11418 GemmMicrokernelTester()
11419 .mr(6)
11420 .nr(8)
11421 .kr(1)
11422 .sr(1)
11423 .m(m)
11424 .n(n)
11425 .k(k)
11426 .iterations(1)
11427 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld128);
11428 }
11429 }
11430 }
11431 }
11432
11433 TEST(F32_IGEMM_6X8__NEON_LANE_LD128, k_div_4) {
11434 TEST_REQUIRES_ARM_NEON;
11435 for (size_t k = 8; k <= 40; k += 4) {
11436 GemmMicrokernelTester()
11437 .mr(6)
11438 .nr(8)
11439 .kr(1)
11440 .sr(1)
11441 .m(6)
11442 .n(8)
11443 .k(k)
11444 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld128);
11445 }
11446 }
11447
11448 TEST(F32_IGEMM_6X8__NEON_LANE_LD128, k_div_4_subtile) {
11449 TEST_REQUIRES_ARM_NEON;
11450 for (size_t k = 8; k <= 40; k += 4) {
11451 for (uint32_t m = 1; m <= 6; m++) {
11452 for (uint32_t n = 1; n <= 8; n++) {
11453 GemmMicrokernelTester()
11454 .mr(6)
11455 .nr(8)
11456 .kr(1)
11457 .sr(1)
11458 .m(m)
11459 .n(n)
11460 .k(k)
11461 .iterations(1)
11462 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld128);
11463 }
11464 }
11465 }
11466 }
11467
11468 TEST(F32_IGEMM_6X8__NEON_LANE_LD128, n_gt_8) {
11469 TEST_REQUIRES_ARM_NEON;
11470 for (uint32_t n = 9; n < 16; n++) {
11471 for (size_t k = 1; k <= 20; k += 5) {
11472 GemmMicrokernelTester()
11473 .mr(6)
11474 .nr(8)
11475 .kr(1)
11476 .sr(1)
11477 .m(6)
11478 .n(8)
11479 .k(k)
11480 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld128);
11481 }
11482 }
11483 }
11484
11485 TEST(F32_IGEMM_6X8__NEON_LANE_LD128, n_gt_8_strided_cn) {
11486 TEST_REQUIRES_ARM_NEON;
11487 for (uint32_t n = 9; n < 16; n++) {
11488 for (size_t k = 1; k <= 20; k += 5) {
11489 GemmMicrokernelTester()
11490 .mr(6)
11491 .nr(8)
11492 .kr(1)
11493 .sr(1)
11494 .m(6)
11495 .n(8)
11496 .k(k)
11497 .cn_stride(11)
11498 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld128);
11499 }
11500 }
11501 }
11502
11503 TEST(F32_IGEMM_6X8__NEON_LANE_LD128, n_gt_8_subtile) {
11504 TEST_REQUIRES_ARM_NEON;
11505 for (uint32_t n = 9; n < 16; n++) {
11506 for (size_t k = 1; k <= 20; k += 5) {
11507 for (uint32_t m = 1; m <= 6; m++) {
11508 GemmMicrokernelTester()
11509 .mr(6)
11510 .nr(8)
11511 .kr(1)
11512 .sr(1)
11513 .m(m)
11514 .n(n)
11515 .k(k)
11516 .iterations(1)
11517 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld128);
11518 }
11519 }
11520 }
11521 }
11522
11523 TEST(F32_IGEMM_6X8__NEON_LANE_LD128, n_div_8) {
11524 TEST_REQUIRES_ARM_NEON;
11525 for (uint32_t n = 16; n <= 24; n += 8) {
11526 for (size_t k = 1; k <= 20; k += 5) {
11527 GemmMicrokernelTester()
11528 .mr(6)
11529 .nr(8)
11530 .kr(1)
11531 .sr(1)
11532 .m(6)
11533 .n(8)
11534 .k(k)
11535 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld128);
11536 }
11537 }
11538 }
11539
11540 TEST(F32_IGEMM_6X8__NEON_LANE_LD128, n_div_8_strided_cn) {
11541 TEST_REQUIRES_ARM_NEON;
11542 for (uint32_t n = 16; n <= 24; n += 8) {
11543 for (size_t k = 1; k <= 20; k += 5) {
11544 GemmMicrokernelTester()
11545 .mr(6)
11546 .nr(8)
11547 .kr(1)
11548 .sr(1)
11549 .m(6)
11550 .n(n)
11551 .k(k)
11552 .cn_stride(11)
11553 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld128);
11554 }
11555 }
11556 }
11557
11558 TEST(F32_IGEMM_6X8__NEON_LANE_LD128, n_div_8_subtile) {
11559 TEST_REQUIRES_ARM_NEON;
11560 for (uint32_t n = 16; n <= 24; n += 8) {
11561 for (size_t k = 1; k <= 20; k += 5) {
11562 for (uint32_t m = 1; m <= 6; m++) {
11563 GemmMicrokernelTester()
11564 .mr(6)
11565 .nr(8)
11566 .kr(1)
11567 .sr(1)
11568 .m(m)
11569 .n(n)
11570 .k(k)
11571 .iterations(1)
11572 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld128);
11573 }
11574 }
11575 }
11576 }
11577
11578 TEST(F32_IGEMM_6X8__NEON_LANE_LD128, small_kernel) {
11579 TEST_REQUIRES_ARM_NEON;
11580 for (size_t k = 1; k <= 20; k += 5) {
11581 GemmMicrokernelTester()
11582 .mr(6)
11583 .nr(8)
11584 .kr(1)
11585 .sr(1)
11586 .m(6)
11587 .n(8)
11588 .k(k)
11589 .ks(3)
11590 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld128);
11591 }
11592 }
11593
11594 TEST(F32_IGEMM_6X8__NEON_LANE_LD128, small_kernel_subtile) {
11595 TEST_REQUIRES_ARM_NEON;
11596 for (size_t k = 1; k <= 20; k += 5) {
11597 for (uint32_t m = 1; m <= 6; m++) {
11598 for (uint32_t n = 1; n <= 8; n++) {
11599 GemmMicrokernelTester()
11600 .mr(6)
11601 .nr(8)
11602 .kr(1)
11603 .sr(1)
11604 .m(m)
11605 .n(n)
11606 .k(k)
11607 .ks(3)
11608 .iterations(1)
11609 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld128);
11610 }
11611 }
11612 }
11613 }
11614
11615 TEST(F32_IGEMM_6X8__NEON_LANE_LD128, n_gt_8_small_kernel) {
11616 TEST_REQUIRES_ARM_NEON;
11617 for (uint32_t n = 9; n < 16; n++) {
11618 for (size_t k = 1; k <= 20; k += 5) {
11619 GemmMicrokernelTester()
11620 .mr(6)
11621 .nr(8)
11622 .kr(1)
11623 .sr(1)
11624 .m(6)
11625 .n(8)
11626 .k(k)
11627 .ks(3)
11628 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld128);
11629 }
11630 }
11631 }
11632
11633 TEST(F32_IGEMM_6X8__NEON_LANE_LD128, n_div_8_small_kernel) {
11634 TEST_REQUIRES_ARM_NEON;
11635 for (uint32_t n = 16; n <= 24; n += 8) {
11636 for (size_t k = 1; k <= 20; k += 5) {
11637 GemmMicrokernelTester()
11638 .mr(6)
11639 .nr(8)
11640 .kr(1)
11641 .sr(1)
11642 .m(6)
11643 .n(8)
11644 .k(k)
11645 .ks(3)
11646 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld128);
11647 }
11648 }
11649 }
11650
11651 TEST(F32_IGEMM_6X8__NEON_LANE_LD128, strided_cm_subtile) {
11652 TEST_REQUIRES_ARM_NEON;
11653 for (size_t k = 1; k <= 20; k += 5) {
11654 for (uint32_t m = 1; m <= 6; m++) {
11655 for (uint32_t n = 1; n <= 8; n++) {
11656 GemmMicrokernelTester()
11657 .mr(6)
11658 .nr(8)
11659 .kr(1)
11660 .sr(1)
11661 .m(m)
11662 .n(n)
11663 .k(k)
11664 .cm_stride(11)
11665 .iterations(1)
11666 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld128);
11667 }
11668 }
11669 }
11670 }
11671
11672 TEST(F32_IGEMM_6X8__NEON_LANE_LD128, a_offset) {
11673 TEST_REQUIRES_ARM_NEON;
11674 for (size_t k = 1; k <= 20; k += 5) {
11675 GemmMicrokernelTester()
11676 .mr(6)
11677 .nr(8)
11678 .kr(1)
11679 .sr(1)
11680 .m(6)
11681 .n(8)
11682 .k(k)
11683 .ks(3)
11684 .a_offset(127)
11685 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld128);
11686 }
11687 }
11688
11689 TEST(F32_IGEMM_6X8__NEON_LANE_LD128, zero) {
11690 TEST_REQUIRES_ARM_NEON;
11691 for (uint32_t mz = 0; mz < 6; mz++) {
11692 for (size_t k = 1; k <= 20; k += 5) {
11693 GemmMicrokernelTester()
11694 .mr(6)
11695 .nr(8)
11696 .kr(1)
11697 .sr(1)
11698 .m(6)
11699 .n(8)
11700 .k(k)
11701 .ks(3)
11702 .a_offset(127)
11703 .zero_index(mz)
11704 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld128);
11705 }
11706 }
11707 }
11708
11709 TEST(F32_IGEMM_6X8__NEON_LANE_LD128, qmin) {
11710 TEST_REQUIRES_ARM_NEON;
11711 GemmMicrokernelTester()
11712 .mr(6)
11713 .nr(8)
11714 .kr(1)
11715 .sr(1)
11716 .m(6)
11717 .n(8)
11718 .k(4)
11719 .qmin(128)
11720 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld128);
11721 }
11722
11723 TEST(F32_IGEMM_6X8__NEON_LANE_LD128, qmax) {
11724 TEST_REQUIRES_ARM_NEON;
11725 GemmMicrokernelTester()
11726 .mr(6)
11727 .nr(8)
11728 .kr(1)
11729 .sr(1)
11730 .m(6)
11731 .n(8)
11732 .k(4)
11733 .qmax(128)
11734 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld128);
11735 }
11736
11737 TEST(F32_IGEMM_6X8__NEON_LANE_LD128, strided_cm) {
11738 TEST_REQUIRES_ARM_NEON;
11739 GemmMicrokernelTester()
11740 .mr(6)
11741 .nr(8)
11742 .kr(1)
11743 .sr(1)
11744 .m(6)
11745 .n(8)
11746 .k(4)
11747 .cm_stride(11)
11748 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld128);
11749 }
11750#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
11751
11752
Frank Barchard5243bb02019-11-22 16:37:50 -080011753#if XNN_ARCH_ARM64
11754 TEST(F32_IGEMM_1X8__NEONFMA_LANE_LD64, k_eq_2) {
11755 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barcharddf06d802019-11-20 15:53:46 -080011756 GemmMicrokernelTester()
11757 .mr(1)
11758 .nr(8)
11759 .kr(1)
Frank Barchard5243bb02019-11-22 16:37:50 -080011760 .sr(1)
Frank Barcharddf06d802019-11-20 15:53:46 -080011761 .m(1)
11762 .n(8)
Frank Barchard5243bb02019-11-22 16:37:50 -080011763 .k(2)
11764 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64);
Frank Barcharddf06d802019-11-20 15:53:46 -080011765 }
11766
Frank Barchard5243bb02019-11-22 16:37:50 -080011767 TEST(F32_IGEMM_1X8__NEONFMA_LANE_LD64, strided_cn) {
11768 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barcharddf06d802019-11-20 15:53:46 -080011769 GemmMicrokernelTester()
11770 .mr(1)
11771 .nr(8)
11772 .kr(1)
Frank Barchard5243bb02019-11-22 16:37:50 -080011773 .sr(1)
Frank Barcharddf06d802019-11-20 15:53:46 -080011774 .m(1)
11775 .n(8)
Frank Barchard5243bb02019-11-22 16:37:50 -080011776 .k(2)
Frank Barcharddf06d802019-11-20 15:53:46 -080011777 .cn_stride(11)
Frank Barchard5243bb02019-11-22 16:37:50 -080011778 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64);
Frank Barcharddf06d802019-11-20 15:53:46 -080011779 }
11780
Frank Barchard5243bb02019-11-22 16:37:50 -080011781 TEST(F32_IGEMM_1X8__NEONFMA_LANE_LD64, k_eq_2_subtile) {
11782 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barcharddf06d802019-11-20 15:53:46 -080011783 for (uint32_t m = 1; m <= 1; m++) {
11784 for (uint32_t n = 1; n <= 8; n++) {
11785 GemmMicrokernelTester()
11786 .mr(1)
11787 .nr(8)
11788 .kr(1)
Frank Barchard5243bb02019-11-22 16:37:50 -080011789 .sr(1)
Frank Barcharddf06d802019-11-20 15:53:46 -080011790 .m(m)
11791 .n(n)
Frank Barchard5243bb02019-11-22 16:37:50 -080011792 .k(2)
Frank Barcharddf06d802019-11-20 15:53:46 -080011793 .iterations(1)
Frank Barchard5243bb02019-11-22 16:37:50 -080011794 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64);
Frank Barcharddf06d802019-11-20 15:53:46 -080011795 }
11796 }
11797 }
11798
Frank Barchard5243bb02019-11-22 16:37:50 -080011799 TEST(F32_IGEMM_1X8__NEONFMA_LANE_LD64, k_eq_2_subtile_m) {
11800 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barcharddf06d802019-11-20 15:53:46 -080011801 for (uint32_t m = 1; m <= 1; m++) {
11802 GemmMicrokernelTester()
11803 .mr(1)
11804 .nr(8)
11805 .kr(1)
Frank Barchard5243bb02019-11-22 16:37:50 -080011806 .sr(1)
Frank Barcharddf06d802019-11-20 15:53:46 -080011807 .m(m)
11808 .n(8)
Frank Barchard5243bb02019-11-22 16:37:50 -080011809 .k(2)
Frank Barcharddf06d802019-11-20 15:53:46 -080011810 .iterations(1)
Frank Barchard5243bb02019-11-22 16:37:50 -080011811 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64);
Frank Barcharddf06d802019-11-20 15:53:46 -080011812 }
11813 }
11814
Frank Barchard5243bb02019-11-22 16:37:50 -080011815 TEST(F32_IGEMM_1X8__NEONFMA_LANE_LD64, k_eq_2_subtile_n) {
11816 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barcharddf06d802019-11-20 15:53:46 -080011817 for (uint32_t n = 1; n <= 8; n++) {
11818 GemmMicrokernelTester()
11819 .mr(1)
11820 .nr(8)
11821 .kr(1)
Frank Barchard5243bb02019-11-22 16:37:50 -080011822 .sr(1)
Frank Barcharddf06d802019-11-20 15:53:46 -080011823 .m(1)
11824 .n(n)
Frank Barchard5243bb02019-11-22 16:37:50 -080011825 .k(2)
Frank Barcharddf06d802019-11-20 15:53:46 -080011826 .iterations(1)
Frank Barchard5243bb02019-11-22 16:37:50 -080011827 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64);
Frank Barcharddf06d802019-11-20 15:53:46 -080011828 }
11829 }
11830
Frank Barchard5243bb02019-11-22 16:37:50 -080011831 TEST(F32_IGEMM_1X8__NEONFMA_LANE_LD64, k_lt_2) {
11832 TEST_REQUIRES_ARM_NEON_FMA;
11833 for (size_t k = 1; k < 2; k++) {
Frank Barcharddf06d802019-11-20 15:53:46 -080011834 GemmMicrokernelTester()
11835 .mr(1)
11836 .nr(8)
11837 .kr(1)
Frank Barchard5243bb02019-11-22 16:37:50 -080011838 .sr(1)
Frank Barcharddf06d802019-11-20 15:53:46 -080011839 .m(1)
11840 .n(8)
11841 .k(k)
Frank Barchard5243bb02019-11-22 16:37:50 -080011842 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64);
Frank Barcharddf06d802019-11-20 15:53:46 -080011843 }
11844 }
11845
Frank Barchard5243bb02019-11-22 16:37:50 -080011846 TEST(F32_IGEMM_1X8__NEONFMA_LANE_LD64, k_lt_2_subtile) {
11847 TEST_REQUIRES_ARM_NEON_FMA;
11848 for (size_t k = 1; k < 2; k++) {
Frank Barcharddf06d802019-11-20 15:53:46 -080011849 for (uint32_t m = 1; m <= 1; m++) {
11850 for (uint32_t n = 1; n <= 8; n++) {
11851 GemmMicrokernelTester()
11852 .mr(1)
11853 .nr(8)
11854 .kr(1)
Frank Barchard5243bb02019-11-22 16:37:50 -080011855 .sr(1)
Frank Barcharddf06d802019-11-20 15:53:46 -080011856 .m(m)
11857 .n(n)
11858 .k(k)
11859 .iterations(1)
Frank Barchard5243bb02019-11-22 16:37:50 -080011860 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64);
Frank Barcharddf06d802019-11-20 15:53:46 -080011861 }
11862 }
11863 }
11864 }
11865
Frank Barchard5243bb02019-11-22 16:37:50 -080011866 TEST(F32_IGEMM_1X8__NEONFMA_LANE_LD64, k_gt_2) {
11867 TEST_REQUIRES_ARM_NEON_FMA;
11868 for (size_t k = 3; k < 4; k++) {
Frank Barcharddf06d802019-11-20 15:53:46 -080011869 GemmMicrokernelTester()
11870 .mr(1)
11871 .nr(8)
11872 .kr(1)
Frank Barchard5243bb02019-11-22 16:37:50 -080011873 .sr(1)
Frank Barcharddf06d802019-11-20 15:53:46 -080011874 .m(1)
11875 .n(8)
11876 .k(k)
Frank Barchard5243bb02019-11-22 16:37:50 -080011877 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64);
Frank Barcharddf06d802019-11-20 15:53:46 -080011878 }
11879 }
11880
Frank Barchard5243bb02019-11-22 16:37:50 -080011881 TEST(F32_IGEMM_1X8__NEONFMA_LANE_LD64, k_gt_2_subtile) {
11882 TEST_REQUIRES_ARM_NEON_FMA;
11883 for (size_t k = 3; k < 4; k++) {
Frank Barcharddf06d802019-11-20 15:53:46 -080011884 for (uint32_t m = 1; m <= 1; m++) {
11885 for (uint32_t n = 1; n <= 8; n++) {
11886 GemmMicrokernelTester()
11887 .mr(1)
11888 .nr(8)
11889 .kr(1)
Frank Barchard5243bb02019-11-22 16:37:50 -080011890 .sr(1)
Frank Barcharddf06d802019-11-20 15:53:46 -080011891 .m(m)
11892 .n(n)
11893 .k(k)
11894 .iterations(1)
Frank Barchard5243bb02019-11-22 16:37:50 -080011895 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64);
Frank Barcharddf06d802019-11-20 15:53:46 -080011896 }
11897 }
11898 }
11899 }
11900
Frank Barchard5243bb02019-11-22 16:37:50 -080011901 TEST(F32_IGEMM_1X8__NEONFMA_LANE_LD64, k_div_2) {
11902 TEST_REQUIRES_ARM_NEON_FMA;
11903 for (size_t k = 4; k <= 20; k += 2) {
Frank Barcharddf06d802019-11-20 15:53:46 -080011904 GemmMicrokernelTester()
11905 .mr(1)
11906 .nr(8)
11907 .kr(1)
Frank Barchard5243bb02019-11-22 16:37:50 -080011908 .sr(1)
Frank Barcharddf06d802019-11-20 15:53:46 -080011909 .m(1)
11910 .n(8)
11911 .k(k)
Frank Barchard5243bb02019-11-22 16:37:50 -080011912 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64);
Frank Barcharddf06d802019-11-20 15:53:46 -080011913 }
11914 }
11915
Frank Barchard5243bb02019-11-22 16:37:50 -080011916 TEST(F32_IGEMM_1X8__NEONFMA_LANE_LD64, k_div_2_subtile) {
11917 TEST_REQUIRES_ARM_NEON_FMA;
11918 for (size_t k = 4; k <= 20; k += 2) {
Frank Barcharddf06d802019-11-20 15:53:46 -080011919 for (uint32_t m = 1; m <= 1; m++) {
11920 for (uint32_t n = 1; n <= 8; n++) {
11921 GemmMicrokernelTester()
11922 .mr(1)
11923 .nr(8)
11924 .kr(1)
Frank Barchard5243bb02019-11-22 16:37:50 -080011925 .sr(1)
Frank Barcharddf06d802019-11-20 15:53:46 -080011926 .m(m)
11927 .n(n)
11928 .k(k)
11929 .iterations(1)
Frank Barchard5243bb02019-11-22 16:37:50 -080011930 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64);
Frank Barcharddf06d802019-11-20 15:53:46 -080011931 }
11932 }
11933 }
11934 }
11935
Frank Barchard5243bb02019-11-22 16:37:50 -080011936 TEST(F32_IGEMM_1X8__NEONFMA_LANE_LD64, n_gt_8) {
11937 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barcharddf06d802019-11-20 15:53:46 -080011938 for (uint32_t n = 9; n < 16; n++) {
Frank Barchard5243bb02019-11-22 16:37:50 -080011939 for (size_t k = 1; k <= 10; k += 3) {
Frank Barcharddf06d802019-11-20 15:53:46 -080011940 GemmMicrokernelTester()
11941 .mr(1)
11942 .nr(8)
11943 .kr(1)
Frank Barchard5243bb02019-11-22 16:37:50 -080011944 .sr(1)
Frank Barcharddf06d802019-11-20 15:53:46 -080011945 .m(1)
11946 .n(8)
11947 .k(k)
Frank Barchard5243bb02019-11-22 16:37:50 -080011948 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64);
Frank Barcharddf06d802019-11-20 15:53:46 -080011949 }
11950 }
11951 }
11952
Frank Barchard5243bb02019-11-22 16:37:50 -080011953 TEST(F32_IGEMM_1X8__NEONFMA_LANE_LD64, n_gt_8_strided_cn) {
11954 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barcharddf06d802019-11-20 15:53:46 -080011955 for (uint32_t n = 9; n < 16; n++) {
Frank Barchard5243bb02019-11-22 16:37:50 -080011956 for (size_t k = 1; k <= 10; k += 3) {
Frank Barcharddf06d802019-11-20 15:53:46 -080011957 GemmMicrokernelTester()
11958 .mr(1)
11959 .nr(8)
11960 .kr(1)
Frank Barchard5243bb02019-11-22 16:37:50 -080011961 .sr(1)
Frank Barcharddf06d802019-11-20 15:53:46 -080011962 .m(1)
11963 .n(8)
11964 .k(k)
11965 .cn_stride(11)
Frank Barchard5243bb02019-11-22 16:37:50 -080011966 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64);
Frank Barcharddf06d802019-11-20 15:53:46 -080011967 }
11968 }
11969 }
11970
Frank Barchard5243bb02019-11-22 16:37:50 -080011971 TEST(F32_IGEMM_1X8__NEONFMA_LANE_LD64, n_gt_8_subtile) {
11972 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barcharddf06d802019-11-20 15:53:46 -080011973 for (uint32_t n = 9; n < 16; n++) {
Frank Barchard5243bb02019-11-22 16:37:50 -080011974 for (size_t k = 1; k <= 10; k += 3) {
Frank Barcharddf06d802019-11-20 15:53:46 -080011975 for (uint32_t m = 1; m <= 1; m++) {
11976 GemmMicrokernelTester()
11977 .mr(1)
11978 .nr(8)
11979 .kr(1)
Frank Barchard5243bb02019-11-22 16:37:50 -080011980 .sr(1)
Frank Barcharddf06d802019-11-20 15:53:46 -080011981 .m(m)
11982 .n(n)
11983 .k(k)
11984 .iterations(1)
Frank Barchard5243bb02019-11-22 16:37:50 -080011985 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64);
Frank Barcharddf06d802019-11-20 15:53:46 -080011986 }
11987 }
11988 }
11989 }
11990
Frank Barchard5243bb02019-11-22 16:37:50 -080011991 TEST(F32_IGEMM_1X8__NEONFMA_LANE_LD64, n_div_8) {
11992 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barcharddf06d802019-11-20 15:53:46 -080011993 for (uint32_t n = 16; n <= 24; n += 8) {
Frank Barchard5243bb02019-11-22 16:37:50 -080011994 for (size_t k = 1; k <= 10; k += 3) {
Frank Barcharddf06d802019-11-20 15:53:46 -080011995 GemmMicrokernelTester()
11996 .mr(1)
11997 .nr(8)
11998 .kr(1)
Frank Barchard5243bb02019-11-22 16:37:50 -080011999 .sr(1)
Frank Barcharddf06d802019-11-20 15:53:46 -080012000 .m(1)
12001 .n(8)
12002 .k(k)
Frank Barchard5243bb02019-11-22 16:37:50 -080012003 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64);
Frank Barcharddf06d802019-11-20 15:53:46 -080012004 }
12005 }
12006 }
12007
Frank Barchard5243bb02019-11-22 16:37:50 -080012008 TEST(F32_IGEMM_1X8__NEONFMA_LANE_LD64, n_div_8_strided_cn) {
12009 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barcharddf06d802019-11-20 15:53:46 -080012010 for (uint32_t n = 16; n <= 24; n += 8) {
Frank Barchard5243bb02019-11-22 16:37:50 -080012011 for (size_t k = 1; k <= 10; k += 3) {
Frank Barcharddf06d802019-11-20 15:53:46 -080012012 GemmMicrokernelTester()
12013 .mr(1)
12014 .nr(8)
12015 .kr(1)
Frank Barchard5243bb02019-11-22 16:37:50 -080012016 .sr(1)
Frank Barcharddf06d802019-11-20 15:53:46 -080012017 .m(1)
12018 .n(n)
12019 .k(k)
12020 .cn_stride(11)
Frank Barchard5243bb02019-11-22 16:37:50 -080012021 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64);
Frank Barcharddf06d802019-11-20 15:53:46 -080012022 }
12023 }
12024 }
12025
Frank Barchard5243bb02019-11-22 16:37:50 -080012026 TEST(F32_IGEMM_1X8__NEONFMA_LANE_LD64, n_div_8_subtile) {
12027 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barcharddf06d802019-11-20 15:53:46 -080012028 for (uint32_t n = 16; n <= 24; n += 8) {
Frank Barchard5243bb02019-11-22 16:37:50 -080012029 for (size_t k = 1; k <= 10; k += 3) {
Frank Barcharddf06d802019-11-20 15:53:46 -080012030 for (uint32_t m = 1; m <= 1; m++) {
12031 GemmMicrokernelTester()
12032 .mr(1)
12033 .nr(8)
12034 .kr(1)
Frank Barchard5243bb02019-11-22 16:37:50 -080012035 .sr(1)
Frank Barcharddf06d802019-11-20 15:53:46 -080012036 .m(m)
12037 .n(n)
12038 .k(k)
12039 .iterations(1)
Frank Barchard5243bb02019-11-22 16:37:50 -080012040 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64);
Frank Barcharddf06d802019-11-20 15:53:46 -080012041 }
12042 }
12043 }
12044 }
12045
Frank Barchard5243bb02019-11-22 16:37:50 -080012046 TEST(F32_IGEMM_1X8__NEONFMA_LANE_LD64, small_kernel) {
12047 TEST_REQUIRES_ARM_NEON_FMA;
12048 for (size_t k = 1; k <= 10; k += 3) {
Frank Barcharddf06d802019-11-20 15:53:46 -080012049 GemmMicrokernelTester()
12050 .mr(1)
12051 .nr(8)
12052 .kr(1)
Frank Barchard5243bb02019-11-22 16:37:50 -080012053 .sr(1)
Frank Barcharddf06d802019-11-20 15:53:46 -080012054 .m(1)
12055 .n(8)
12056 .k(k)
12057 .ks(3)
Frank Barchard5243bb02019-11-22 16:37:50 -080012058 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64);
Frank Barcharddf06d802019-11-20 15:53:46 -080012059 }
12060 }
12061
Frank Barchard5243bb02019-11-22 16:37:50 -080012062 TEST(F32_IGEMM_1X8__NEONFMA_LANE_LD64, small_kernel_subtile) {
12063 TEST_REQUIRES_ARM_NEON_FMA;
12064 for (size_t k = 1; k <= 10; k += 3) {
Frank Barcharddf06d802019-11-20 15:53:46 -080012065 for (uint32_t m = 1; m <= 1; m++) {
12066 for (uint32_t n = 1; n <= 8; n++) {
12067 GemmMicrokernelTester()
12068 .mr(1)
12069 .nr(8)
12070 .kr(1)
Frank Barchard5243bb02019-11-22 16:37:50 -080012071 .sr(1)
Frank Barcharddf06d802019-11-20 15:53:46 -080012072 .m(m)
12073 .n(n)
12074 .k(k)
12075 .ks(3)
12076 .iterations(1)
Frank Barchard5243bb02019-11-22 16:37:50 -080012077 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64);
Frank Barcharddf06d802019-11-20 15:53:46 -080012078 }
12079 }
12080 }
12081 }
12082
Frank Barchard5243bb02019-11-22 16:37:50 -080012083 TEST(F32_IGEMM_1X8__NEONFMA_LANE_LD64, n_gt_8_small_kernel) {
12084 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barcharddf06d802019-11-20 15:53:46 -080012085 for (uint32_t n = 9; n < 16; n++) {
Frank Barchard5243bb02019-11-22 16:37:50 -080012086 for (size_t k = 1; k <= 10; k += 3) {
Frank Barcharddf06d802019-11-20 15:53:46 -080012087 GemmMicrokernelTester()
12088 .mr(1)
12089 .nr(8)
12090 .kr(1)
Frank Barchard5243bb02019-11-22 16:37:50 -080012091 .sr(1)
Frank Barcharddf06d802019-11-20 15:53:46 -080012092 .m(1)
12093 .n(8)
12094 .k(k)
12095 .ks(3)
Frank Barchard5243bb02019-11-22 16:37:50 -080012096 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64);
Frank Barcharddf06d802019-11-20 15:53:46 -080012097 }
12098 }
12099 }
12100
Frank Barchard5243bb02019-11-22 16:37:50 -080012101 TEST(F32_IGEMM_1X8__NEONFMA_LANE_LD64, n_div_8_small_kernel) {
12102 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barcharddf06d802019-11-20 15:53:46 -080012103 for (uint32_t n = 16; n <= 24; n += 8) {
Frank Barchard5243bb02019-11-22 16:37:50 -080012104 for (size_t k = 1; k <= 10; k += 3) {
Frank Barcharddf06d802019-11-20 15:53:46 -080012105 GemmMicrokernelTester()
12106 .mr(1)
12107 .nr(8)
12108 .kr(1)
Frank Barchard5243bb02019-11-22 16:37:50 -080012109 .sr(1)
Frank Barcharddf06d802019-11-20 15:53:46 -080012110 .m(1)
12111 .n(8)
12112 .k(k)
12113 .ks(3)
Frank Barchard5243bb02019-11-22 16:37:50 -080012114 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64);
Frank Barcharddf06d802019-11-20 15:53:46 -080012115 }
12116 }
12117 }
12118
Frank Barchard5243bb02019-11-22 16:37:50 -080012119 TEST(F32_IGEMM_1X8__NEONFMA_LANE_LD64, strided_cm_subtile) {
12120 TEST_REQUIRES_ARM_NEON_FMA;
12121 for (size_t k = 1; k <= 10; k += 3) {
Frank Barcharddf06d802019-11-20 15:53:46 -080012122 for (uint32_t m = 1; m <= 1; m++) {
12123 for (uint32_t n = 1; n <= 8; n++) {
12124 GemmMicrokernelTester()
12125 .mr(1)
12126 .nr(8)
12127 .kr(1)
Frank Barchard5243bb02019-11-22 16:37:50 -080012128 .sr(1)
Frank Barcharddf06d802019-11-20 15:53:46 -080012129 .m(m)
12130 .n(n)
12131 .k(k)
12132 .cm_stride(11)
12133 .iterations(1)
Frank Barchard5243bb02019-11-22 16:37:50 -080012134 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64);
Frank Barcharddf06d802019-11-20 15:53:46 -080012135 }
12136 }
12137 }
12138 }
12139
Frank Barchard5243bb02019-11-22 16:37:50 -080012140 TEST(F32_IGEMM_1X8__NEONFMA_LANE_LD64, a_offset) {
12141 TEST_REQUIRES_ARM_NEON_FMA;
12142 for (size_t k = 1; k <= 10; k += 3) {
Frank Barcharddf06d802019-11-20 15:53:46 -080012143 GemmMicrokernelTester()
12144 .mr(1)
12145 .nr(8)
12146 .kr(1)
Frank Barchard5243bb02019-11-22 16:37:50 -080012147 .sr(1)
Frank Barcharddf06d802019-11-20 15:53:46 -080012148 .m(1)
12149 .n(8)
12150 .k(k)
12151 .ks(3)
Frank Barchard5243bb02019-11-22 16:37:50 -080012152 .a_offset(13)
12153 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64);
Frank Barcharddf06d802019-11-20 15:53:46 -080012154 }
12155 }
12156
Frank Barchard5243bb02019-11-22 16:37:50 -080012157 TEST(F32_IGEMM_1X8__NEONFMA_LANE_LD64, zero) {
12158 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barcharddf06d802019-11-20 15:53:46 -080012159 for (uint32_t mz = 0; mz < 1; mz++) {
Frank Barchard5243bb02019-11-22 16:37:50 -080012160 for (size_t k = 1; k <= 10; k += 3) {
Frank Barcharddf06d802019-11-20 15:53:46 -080012161 GemmMicrokernelTester()
12162 .mr(1)
12163 .nr(8)
12164 .kr(1)
Frank Barchard5243bb02019-11-22 16:37:50 -080012165 .sr(1)
Frank Barcharddf06d802019-11-20 15:53:46 -080012166 .m(1)
12167 .n(8)
12168 .k(k)
12169 .ks(3)
Frank Barchard5243bb02019-11-22 16:37:50 -080012170 .a_offset(13)
Frank Barcharddf06d802019-11-20 15:53:46 -080012171 .zero_index(mz)
Frank Barchard5243bb02019-11-22 16:37:50 -080012172 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64);
Frank Barcharddf06d802019-11-20 15:53:46 -080012173 }
12174 }
12175 }
12176
Frank Barchard5243bb02019-11-22 16:37:50 -080012177 TEST(F32_IGEMM_1X8__NEONFMA_LANE_LD64, qmin) {
12178 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barcharddf06d802019-11-20 15:53:46 -080012179 GemmMicrokernelTester()
12180 .mr(1)
12181 .nr(8)
12182 .kr(1)
Frank Barchard5243bb02019-11-22 16:37:50 -080012183 .sr(1)
Frank Barcharddf06d802019-11-20 15:53:46 -080012184 .m(1)
12185 .n(8)
Frank Barchard5243bb02019-11-22 16:37:50 -080012186 .k(2)
Frank Barcharddf06d802019-11-20 15:53:46 -080012187 .qmin(128)
Frank Barchard5243bb02019-11-22 16:37:50 -080012188 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64);
Frank Barcharddf06d802019-11-20 15:53:46 -080012189 }
12190
Frank Barchard5243bb02019-11-22 16:37:50 -080012191 TEST(F32_IGEMM_1X8__NEONFMA_LANE_LD64, qmax) {
12192 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barcharddf06d802019-11-20 15:53:46 -080012193 GemmMicrokernelTester()
12194 .mr(1)
12195 .nr(8)
12196 .kr(1)
Frank Barchard5243bb02019-11-22 16:37:50 -080012197 .sr(1)
Frank Barcharddf06d802019-11-20 15:53:46 -080012198 .m(1)
12199 .n(8)
Frank Barchard5243bb02019-11-22 16:37:50 -080012200 .k(2)
Frank Barcharddf06d802019-11-20 15:53:46 -080012201 .qmax(128)
Frank Barchard5243bb02019-11-22 16:37:50 -080012202 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64);
Frank Barcharddf06d802019-11-20 15:53:46 -080012203 }
12204
Frank Barchard5243bb02019-11-22 16:37:50 -080012205 TEST(F32_IGEMM_1X8__NEONFMA_LANE_LD64, strided_cm) {
12206 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barcharddf06d802019-11-20 15:53:46 -080012207 GemmMicrokernelTester()
12208 .mr(1)
12209 .nr(8)
12210 .kr(1)
Frank Barchard5243bb02019-11-22 16:37:50 -080012211 .sr(1)
Frank Barcharddf06d802019-11-20 15:53:46 -080012212 .m(1)
12213 .n(8)
Frank Barchard5243bb02019-11-22 16:37:50 -080012214 .k(2)
Frank Barcharddf06d802019-11-20 15:53:46 -080012215 .cm_stride(11)
Frank Barchard5243bb02019-11-22 16:37:50 -080012216 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64);
Frank Barcharddf06d802019-11-20 15:53:46 -080012217 }
Frank Barchard5243bb02019-11-22 16:37:50 -080012218#endif // XNN_ARCH_ARM64
Frank Barcharddf06d802019-11-20 15:53:46 -080012219
12220
Frank Barchard91317c52019-11-22 10:54:35 -080012221#if XNN_ARCH_ARM64
12222 TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, k_eq_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012223 TEST_REQUIRES_ARM_NEON_FMA;
12224 GemmMicrokernelTester()
12225 .mr(4)
12226 .nr(2)
12227 .kr(1)
12228 .sr(1)
12229 .m(4)
12230 .n(2)
12231 .k(2)
Frank Barchard91317c52019-11-22 10:54:35 -080012232 .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012233 }
12234
Frank Barchard91317c52019-11-22 10:54:35 -080012235 TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012236 TEST_REQUIRES_ARM_NEON_FMA;
12237 GemmMicrokernelTester()
12238 .mr(4)
12239 .nr(2)
12240 .kr(1)
12241 .sr(1)
12242 .m(4)
12243 .n(2)
12244 .k(2)
12245 .cn_stride(5)
Frank Barchard91317c52019-11-22 10:54:35 -080012246 .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012247 }
12248
Frank Barchard91317c52019-11-22 10:54:35 -080012249 TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, k_eq_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012250 TEST_REQUIRES_ARM_NEON_FMA;
12251 for (uint32_t m = 1; m <= 4; m++) {
12252 for (uint32_t n = 1; n <= 2; n++) {
12253 GemmMicrokernelTester()
12254 .mr(4)
12255 .nr(2)
12256 .kr(1)
12257 .sr(1)
12258 .m(m)
12259 .n(n)
12260 .k(2)
12261 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080012262 .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012263 }
12264 }
12265 }
12266
Frank Barchard91317c52019-11-22 10:54:35 -080012267 TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, k_eq_2_subtile_m) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012268 TEST_REQUIRES_ARM_NEON_FMA;
12269 for (uint32_t m = 1; m <= 4; m++) {
12270 GemmMicrokernelTester()
12271 .mr(4)
12272 .nr(2)
12273 .kr(1)
12274 .sr(1)
12275 .m(m)
12276 .n(2)
12277 .k(2)
12278 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080012279 .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012280 }
12281 }
12282
Frank Barchard91317c52019-11-22 10:54:35 -080012283 TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, k_eq_2_subtile_n) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012284 TEST_REQUIRES_ARM_NEON_FMA;
12285 for (uint32_t n = 1; n <= 2; n++) {
12286 GemmMicrokernelTester()
12287 .mr(4)
12288 .nr(2)
12289 .kr(1)
12290 .sr(1)
12291 .m(4)
12292 .n(n)
12293 .k(2)
12294 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080012295 .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012296 }
12297 }
12298
Frank Barchard91317c52019-11-22 10:54:35 -080012299 TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, k_lt_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012300 TEST_REQUIRES_ARM_NEON_FMA;
12301 for (size_t k = 1; k < 2; k++) {
12302 GemmMicrokernelTester()
12303 .mr(4)
12304 .nr(2)
12305 .kr(1)
12306 .sr(1)
12307 .m(4)
12308 .n(2)
12309 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080012310 .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012311 }
12312 }
12313
Frank Barchard91317c52019-11-22 10:54:35 -080012314 TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, k_lt_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012315 TEST_REQUIRES_ARM_NEON_FMA;
12316 for (size_t k = 1; k < 2; k++) {
12317 for (uint32_t m = 1; m <= 4; m++) {
12318 for (uint32_t n = 1; n <= 2; n++) {
12319 GemmMicrokernelTester()
12320 .mr(4)
12321 .nr(2)
12322 .kr(1)
12323 .sr(1)
12324 .m(m)
12325 .n(n)
12326 .k(k)
12327 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080012328 .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012329 }
12330 }
12331 }
12332 }
12333
Frank Barchard91317c52019-11-22 10:54:35 -080012334 TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, k_gt_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012335 TEST_REQUIRES_ARM_NEON_FMA;
12336 for (size_t k = 3; k < 4; k++) {
12337 GemmMicrokernelTester()
12338 .mr(4)
12339 .nr(2)
12340 .kr(1)
12341 .sr(1)
12342 .m(4)
12343 .n(2)
12344 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080012345 .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012346 }
12347 }
12348
Frank Barchard91317c52019-11-22 10:54:35 -080012349 TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, k_gt_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012350 TEST_REQUIRES_ARM_NEON_FMA;
12351 for (size_t k = 3; k < 4; k++) {
12352 for (uint32_t m = 1; m <= 4; m++) {
12353 for (uint32_t n = 1; n <= 2; n++) {
12354 GemmMicrokernelTester()
12355 .mr(4)
12356 .nr(2)
12357 .kr(1)
12358 .sr(1)
12359 .m(m)
12360 .n(n)
12361 .k(k)
12362 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080012363 .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012364 }
12365 }
12366 }
12367 }
12368
Frank Barchard91317c52019-11-22 10:54:35 -080012369 TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, k_div_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012370 TEST_REQUIRES_ARM_NEON_FMA;
12371 for (size_t k = 4; k <= 20; k += 2) {
12372 GemmMicrokernelTester()
12373 .mr(4)
12374 .nr(2)
12375 .kr(1)
12376 .sr(1)
12377 .m(4)
12378 .n(2)
12379 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080012380 .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012381 }
12382 }
12383
Frank Barchard91317c52019-11-22 10:54:35 -080012384 TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, k_div_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012385 TEST_REQUIRES_ARM_NEON_FMA;
12386 for (size_t k = 4; k <= 20; k += 2) {
12387 for (uint32_t m = 1; m <= 4; m++) {
12388 for (uint32_t n = 1; n <= 2; n++) {
12389 GemmMicrokernelTester()
12390 .mr(4)
12391 .nr(2)
12392 .kr(1)
12393 .sr(1)
12394 .m(m)
12395 .n(n)
12396 .k(k)
12397 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080012398 .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012399 }
12400 }
12401 }
12402 }
12403
Frank Barchard91317c52019-11-22 10:54:35 -080012404 TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, n_gt_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012405 TEST_REQUIRES_ARM_NEON_FMA;
12406 for (uint32_t n = 3; n < 4; n++) {
12407 for (size_t k = 1; k <= 10; k += 3) {
12408 GemmMicrokernelTester()
12409 .mr(4)
12410 .nr(2)
12411 .kr(1)
12412 .sr(1)
12413 .m(4)
12414 .n(2)
12415 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080012416 .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012417 }
12418 }
12419 }
12420
Frank Barchard91317c52019-11-22 10:54:35 -080012421 TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, n_gt_2_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012422 TEST_REQUIRES_ARM_NEON_FMA;
12423 for (uint32_t n = 3; n < 4; n++) {
12424 for (size_t k = 1; k <= 10; k += 3) {
12425 GemmMicrokernelTester()
12426 .mr(4)
12427 .nr(2)
12428 .kr(1)
12429 .sr(1)
12430 .m(4)
12431 .n(2)
12432 .k(k)
12433 .cn_stride(5)
Frank Barchard91317c52019-11-22 10:54:35 -080012434 .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012435 }
12436 }
12437 }
12438
Frank Barchard91317c52019-11-22 10:54:35 -080012439 TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, n_gt_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012440 TEST_REQUIRES_ARM_NEON_FMA;
12441 for (uint32_t n = 3; n < 4; n++) {
12442 for (size_t k = 1; k <= 10; k += 3) {
12443 for (uint32_t m = 1; m <= 4; m++) {
12444 GemmMicrokernelTester()
12445 .mr(4)
12446 .nr(2)
12447 .kr(1)
12448 .sr(1)
12449 .m(m)
12450 .n(n)
12451 .k(k)
12452 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080012453 .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012454 }
12455 }
12456 }
12457 }
12458
Frank Barchard91317c52019-11-22 10:54:35 -080012459 TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, n_div_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012460 TEST_REQUIRES_ARM_NEON_FMA;
12461 for (uint32_t n = 4; n <= 6; n += 2) {
12462 for (size_t k = 1; k <= 10; k += 3) {
12463 GemmMicrokernelTester()
12464 .mr(4)
12465 .nr(2)
12466 .kr(1)
12467 .sr(1)
12468 .m(4)
12469 .n(2)
12470 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080012471 .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012472 }
12473 }
12474 }
12475
Frank Barchard91317c52019-11-22 10:54:35 -080012476 TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, n_div_2_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012477 TEST_REQUIRES_ARM_NEON_FMA;
12478 for (uint32_t n = 4; n <= 6; n += 2) {
12479 for (size_t k = 1; k <= 10; k += 3) {
12480 GemmMicrokernelTester()
12481 .mr(4)
12482 .nr(2)
12483 .kr(1)
12484 .sr(1)
12485 .m(4)
12486 .n(n)
12487 .k(k)
12488 .cn_stride(5)
Frank Barchard91317c52019-11-22 10:54:35 -080012489 .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012490 }
12491 }
12492 }
12493
Frank Barchard91317c52019-11-22 10:54:35 -080012494 TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, n_div_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012495 TEST_REQUIRES_ARM_NEON_FMA;
12496 for (uint32_t n = 4; n <= 6; n += 2) {
12497 for (size_t k = 1; k <= 10; k += 3) {
12498 for (uint32_t m = 1; m <= 4; m++) {
12499 GemmMicrokernelTester()
12500 .mr(4)
12501 .nr(2)
12502 .kr(1)
12503 .sr(1)
12504 .m(m)
12505 .n(n)
12506 .k(k)
12507 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080012508 .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012509 }
12510 }
12511 }
12512 }
12513
Frank Barchard91317c52019-11-22 10:54:35 -080012514 TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, small_kernel) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012515 TEST_REQUIRES_ARM_NEON_FMA;
12516 for (size_t k = 1; k <= 10; k += 3) {
12517 GemmMicrokernelTester()
12518 .mr(4)
12519 .nr(2)
12520 .kr(1)
12521 .sr(1)
12522 .m(4)
12523 .n(2)
12524 .k(k)
12525 .ks(3)
Frank Barchard91317c52019-11-22 10:54:35 -080012526 .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012527 }
12528 }
12529
Frank Barchard91317c52019-11-22 10:54:35 -080012530 TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, small_kernel_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012531 TEST_REQUIRES_ARM_NEON_FMA;
12532 for (size_t k = 1; k <= 10; k += 3) {
12533 for (uint32_t m = 1; m <= 4; m++) {
12534 for (uint32_t n = 1; n <= 2; n++) {
12535 GemmMicrokernelTester()
12536 .mr(4)
12537 .nr(2)
12538 .kr(1)
12539 .sr(1)
12540 .m(m)
12541 .n(n)
12542 .k(k)
12543 .ks(3)
12544 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080012545 .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012546 }
12547 }
12548 }
12549 }
12550
Frank Barchard91317c52019-11-22 10:54:35 -080012551 TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, n_gt_2_small_kernel) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012552 TEST_REQUIRES_ARM_NEON_FMA;
12553 for (uint32_t n = 3; n < 4; n++) {
12554 for (size_t k = 1; k <= 10; k += 3) {
12555 GemmMicrokernelTester()
12556 .mr(4)
12557 .nr(2)
12558 .kr(1)
12559 .sr(1)
12560 .m(4)
12561 .n(2)
12562 .k(k)
12563 .ks(3)
Frank Barchard91317c52019-11-22 10:54:35 -080012564 .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012565 }
12566 }
12567 }
12568
Frank Barchard91317c52019-11-22 10:54:35 -080012569 TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, n_div_2_small_kernel) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012570 TEST_REQUIRES_ARM_NEON_FMA;
12571 for (uint32_t n = 4; n <= 6; n += 2) {
12572 for (size_t k = 1; k <= 10; k += 3) {
12573 GemmMicrokernelTester()
12574 .mr(4)
12575 .nr(2)
12576 .kr(1)
12577 .sr(1)
12578 .m(4)
12579 .n(2)
12580 .k(k)
12581 .ks(3)
Frank Barchard91317c52019-11-22 10:54:35 -080012582 .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012583 }
12584 }
12585 }
12586
Frank Barchard91317c52019-11-22 10:54:35 -080012587 TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, strided_cm_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012588 TEST_REQUIRES_ARM_NEON_FMA;
12589 for (size_t k = 1; k <= 10; k += 3) {
12590 for (uint32_t m = 1; m <= 4; m++) {
12591 for (uint32_t n = 1; n <= 2; n++) {
12592 GemmMicrokernelTester()
12593 .mr(4)
12594 .nr(2)
12595 .kr(1)
12596 .sr(1)
12597 .m(m)
12598 .n(n)
12599 .k(k)
12600 .cm_stride(5)
12601 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080012602 .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012603 }
12604 }
12605 }
12606 }
12607
Frank Barchard91317c52019-11-22 10:54:35 -080012608 TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, a_offset) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012609 TEST_REQUIRES_ARM_NEON_FMA;
12610 for (size_t k = 1; k <= 10; k += 3) {
12611 GemmMicrokernelTester()
12612 .mr(4)
12613 .nr(2)
12614 .kr(1)
12615 .sr(1)
12616 .m(4)
12617 .n(2)
12618 .k(k)
12619 .ks(3)
12620 .a_offset(43)
Frank Barchard91317c52019-11-22 10:54:35 -080012621 .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012622 }
12623 }
12624
Frank Barchard91317c52019-11-22 10:54:35 -080012625 TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, zero) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012626 TEST_REQUIRES_ARM_NEON_FMA;
12627 for (uint32_t mz = 0; mz < 4; mz++) {
12628 for (size_t k = 1; k <= 10; k += 3) {
12629 GemmMicrokernelTester()
12630 .mr(4)
12631 .nr(2)
12632 .kr(1)
12633 .sr(1)
12634 .m(4)
12635 .n(2)
12636 .k(k)
12637 .ks(3)
12638 .a_offset(43)
12639 .zero_index(mz)
Frank Barchard91317c52019-11-22 10:54:35 -080012640 .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012641 }
12642 }
12643 }
12644
Frank Barchard91317c52019-11-22 10:54:35 -080012645 TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, qmin) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012646 TEST_REQUIRES_ARM_NEON_FMA;
12647 GemmMicrokernelTester()
12648 .mr(4)
12649 .nr(2)
12650 .kr(1)
12651 .sr(1)
12652 .m(4)
12653 .n(2)
12654 .k(2)
12655 .qmin(128)
Frank Barchard91317c52019-11-22 10:54:35 -080012656 .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012657 }
12658
Frank Barchard91317c52019-11-22 10:54:35 -080012659 TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, qmax) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012660 TEST_REQUIRES_ARM_NEON_FMA;
12661 GemmMicrokernelTester()
12662 .mr(4)
12663 .nr(2)
12664 .kr(1)
12665 .sr(1)
12666 .m(4)
12667 .n(2)
12668 .k(2)
12669 .qmax(128)
Frank Barchard91317c52019-11-22 10:54:35 -080012670 .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012671 }
12672
Frank Barchard91317c52019-11-22 10:54:35 -080012673 TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, strided_cm) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012674 TEST_REQUIRES_ARM_NEON_FMA;
12675 GemmMicrokernelTester()
12676 .mr(4)
12677 .nr(2)
12678 .kr(1)
12679 .sr(1)
12680 .m(4)
12681 .n(2)
12682 .k(2)
12683 .cm_stride(5)
Frank Barchard91317c52019-11-22 10:54:35 -080012684 .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012685 }
Frank Barchard91317c52019-11-22 10:54:35 -080012686#endif // XNN_ARCH_ARM64
XNNPACK Teamb455b122019-09-27 18:10:33 -070012687
12688
Frank Barchard91317c52019-11-22 10:54:35 -080012689#if XNN_ARCH_ARM64
12690 TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, k_eq_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012691 TEST_REQUIRES_ARM_NEON_FMA;
12692 GemmMicrokernelTester()
12693 .mr(4)
12694 .nr(4)
12695 .kr(1)
12696 .sr(1)
12697 .m(4)
12698 .n(4)
12699 .k(2)
Frank Barchard91317c52019-11-22 10:54:35 -080012700 .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012701 }
12702
Frank Barchard91317c52019-11-22 10:54:35 -080012703 TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012704 TEST_REQUIRES_ARM_NEON_FMA;
12705 GemmMicrokernelTester()
12706 .mr(4)
12707 .nr(4)
12708 .kr(1)
12709 .sr(1)
12710 .m(4)
12711 .n(4)
12712 .k(2)
12713 .cn_stride(7)
Frank Barchard91317c52019-11-22 10:54:35 -080012714 .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012715 }
12716
Frank Barchard91317c52019-11-22 10:54:35 -080012717 TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, k_eq_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012718 TEST_REQUIRES_ARM_NEON_FMA;
12719 for (uint32_t m = 1; m <= 4; m++) {
12720 for (uint32_t n = 1; n <= 4; n++) {
12721 GemmMicrokernelTester()
12722 .mr(4)
12723 .nr(4)
12724 .kr(1)
12725 .sr(1)
12726 .m(m)
12727 .n(n)
12728 .k(2)
12729 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080012730 .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012731 }
12732 }
12733 }
12734
Frank Barchard91317c52019-11-22 10:54:35 -080012735 TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, k_eq_2_subtile_m) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012736 TEST_REQUIRES_ARM_NEON_FMA;
12737 for (uint32_t m = 1; m <= 4; m++) {
12738 GemmMicrokernelTester()
12739 .mr(4)
12740 .nr(4)
12741 .kr(1)
12742 .sr(1)
12743 .m(m)
12744 .n(4)
12745 .k(2)
12746 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080012747 .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012748 }
12749 }
12750
Frank Barchard91317c52019-11-22 10:54:35 -080012751 TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, k_eq_2_subtile_n) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012752 TEST_REQUIRES_ARM_NEON_FMA;
12753 for (uint32_t n = 1; n <= 4; n++) {
12754 GemmMicrokernelTester()
12755 .mr(4)
12756 .nr(4)
12757 .kr(1)
12758 .sr(1)
12759 .m(4)
12760 .n(n)
12761 .k(2)
12762 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080012763 .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012764 }
12765 }
12766
Frank Barchard91317c52019-11-22 10:54:35 -080012767 TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, k_lt_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012768 TEST_REQUIRES_ARM_NEON_FMA;
12769 for (size_t k = 1; k < 2; k++) {
12770 GemmMicrokernelTester()
12771 .mr(4)
12772 .nr(4)
12773 .kr(1)
12774 .sr(1)
12775 .m(4)
12776 .n(4)
12777 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080012778 .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012779 }
12780 }
12781
Frank Barchard91317c52019-11-22 10:54:35 -080012782 TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, k_lt_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012783 TEST_REQUIRES_ARM_NEON_FMA;
12784 for (size_t k = 1; k < 2; k++) {
12785 for (uint32_t m = 1; m <= 4; m++) {
12786 for (uint32_t n = 1; n <= 4; n++) {
12787 GemmMicrokernelTester()
12788 .mr(4)
12789 .nr(4)
12790 .kr(1)
12791 .sr(1)
12792 .m(m)
12793 .n(n)
12794 .k(k)
12795 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080012796 .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012797 }
12798 }
12799 }
12800 }
12801
Frank Barchard91317c52019-11-22 10:54:35 -080012802 TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, k_gt_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012803 TEST_REQUIRES_ARM_NEON_FMA;
12804 for (size_t k = 3; k < 4; k++) {
12805 GemmMicrokernelTester()
12806 .mr(4)
12807 .nr(4)
12808 .kr(1)
12809 .sr(1)
12810 .m(4)
12811 .n(4)
12812 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080012813 .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012814 }
12815 }
12816
Frank Barchard91317c52019-11-22 10:54:35 -080012817 TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, k_gt_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012818 TEST_REQUIRES_ARM_NEON_FMA;
12819 for (size_t k = 3; k < 4; k++) {
12820 for (uint32_t m = 1; m <= 4; m++) {
12821 for (uint32_t n = 1; n <= 4; n++) {
12822 GemmMicrokernelTester()
12823 .mr(4)
12824 .nr(4)
12825 .kr(1)
12826 .sr(1)
12827 .m(m)
12828 .n(n)
12829 .k(k)
12830 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080012831 .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012832 }
12833 }
12834 }
12835 }
12836
Frank Barchard91317c52019-11-22 10:54:35 -080012837 TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, k_div_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012838 TEST_REQUIRES_ARM_NEON_FMA;
12839 for (size_t k = 4; k <= 20; k += 2) {
12840 GemmMicrokernelTester()
12841 .mr(4)
12842 .nr(4)
12843 .kr(1)
12844 .sr(1)
12845 .m(4)
12846 .n(4)
12847 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080012848 .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012849 }
12850 }
12851
Frank Barchard91317c52019-11-22 10:54:35 -080012852 TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, k_div_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012853 TEST_REQUIRES_ARM_NEON_FMA;
12854 for (size_t k = 4; k <= 20; k += 2) {
12855 for (uint32_t m = 1; m <= 4; m++) {
12856 for (uint32_t n = 1; n <= 4; n++) {
12857 GemmMicrokernelTester()
12858 .mr(4)
12859 .nr(4)
12860 .kr(1)
12861 .sr(1)
12862 .m(m)
12863 .n(n)
12864 .k(k)
12865 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080012866 .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012867 }
12868 }
12869 }
12870 }
12871
Frank Barchard91317c52019-11-22 10:54:35 -080012872 TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, n_gt_4) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012873 TEST_REQUIRES_ARM_NEON_FMA;
12874 for (uint32_t n = 5; n < 8; n++) {
12875 for (size_t k = 1; k <= 10; k += 3) {
12876 GemmMicrokernelTester()
12877 .mr(4)
12878 .nr(4)
12879 .kr(1)
12880 .sr(1)
12881 .m(4)
12882 .n(4)
12883 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080012884 .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012885 }
12886 }
12887 }
12888
Frank Barchard91317c52019-11-22 10:54:35 -080012889 TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, n_gt_4_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012890 TEST_REQUIRES_ARM_NEON_FMA;
12891 for (uint32_t n = 5; n < 8; n++) {
12892 for (size_t k = 1; k <= 10; k += 3) {
12893 GemmMicrokernelTester()
12894 .mr(4)
12895 .nr(4)
12896 .kr(1)
12897 .sr(1)
12898 .m(4)
12899 .n(4)
12900 .k(k)
12901 .cn_stride(7)
Frank Barchard91317c52019-11-22 10:54:35 -080012902 .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012903 }
12904 }
12905 }
12906
Frank Barchard91317c52019-11-22 10:54:35 -080012907 TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, n_gt_4_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012908 TEST_REQUIRES_ARM_NEON_FMA;
12909 for (uint32_t n = 5; n < 8; n++) {
12910 for (size_t k = 1; k <= 10; k += 3) {
12911 for (uint32_t m = 1; m <= 4; m++) {
12912 GemmMicrokernelTester()
12913 .mr(4)
12914 .nr(4)
12915 .kr(1)
12916 .sr(1)
12917 .m(m)
12918 .n(n)
12919 .k(k)
12920 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080012921 .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012922 }
12923 }
12924 }
12925 }
12926
Frank Barchard91317c52019-11-22 10:54:35 -080012927 TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, n_div_4) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012928 TEST_REQUIRES_ARM_NEON_FMA;
12929 for (uint32_t n = 8; n <= 12; n += 4) {
12930 for (size_t k = 1; k <= 10; k += 3) {
12931 GemmMicrokernelTester()
12932 .mr(4)
12933 .nr(4)
12934 .kr(1)
12935 .sr(1)
12936 .m(4)
12937 .n(4)
12938 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080012939 .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012940 }
12941 }
12942 }
12943
Frank Barchard91317c52019-11-22 10:54:35 -080012944 TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, n_div_4_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012945 TEST_REQUIRES_ARM_NEON_FMA;
12946 for (uint32_t n = 8; n <= 12; n += 4) {
12947 for (size_t k = 1; k <= 10; k += 3) {
12948 GemmMicrokernelTester()
12949 .mr(4)
12950 .nr(4)
12951 .kr(1)
12952 .sr(1)
12953 .m(4)
12954 .n(n)
12955 .k(k)
12956 .cn_stride(7)
Frank Barchard91317c52019-11-22 10:54:35 -080012957 .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012958 }
12959 }
12960 }
12961
Frank Barchard91317c52019-11-22 10:54:35 -080012962 TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, n_div_4_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012963 TEST_REQUIRES_ARM_NEON_FMA;
12964 for (uint32_t n = 8; n <= 12; n += 4) {
12965 for (size_t k = 1; k <= 10; k += 3) {
12966 for (uint32_t m = 1; m <= 4; m++) {
12967 GemmMicrokernelTester()
12968 .mr(4)
12969 .nr(4)
12970 .kr(1)
12971 .sr(1)
12972 .m(m)
12973 .n(n)
12974 .k(k)
12975 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080012976 .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012977 }
12978 }
12979 }
12980 }
12981
Frank Barchard91317c52019-11-22 10:54:35 -080012982 TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, small_kernel) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012983 TEST_REQUIRES_ARM_NEON_FMA;
12984 for (size_t k = 1; k <= 10; k += 3) {
12985 GemmMicrokernelTester()
12986 .mr(4)
12987 .nr(4)
12988 .kr(1)
12989 .sr(1)
12990 .m(4)
12991 .n(4)
12992 .k(k)
12993 .ks(3)
Frank Barchard91317c52019-11-22 10:54:35 -080012994 .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012995 }
12996 }
12997
Frank Barchard91317c52019-11-22 10:54:35 -080012998 TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, small_kernel_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012999 TEST_REQUIRES_ARM_NEON_FMA;
13000 for (size_t k = 1; k <= 10; k += 3) {
13001 for (uint32_t m = 1; m <= 4; m++) {
13002 for (uint32_t n = 1; n <= 4; n++) {
13003 GemmMicrokernelTester()
13004 .mr(4)
13005 .nr(4)
13006 .kr(1)
13007 .sr(1)
13008 .m(m)
13009 .n(n)
13010 .k(k)
13011 .ks(3)
13012 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080013013 .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013014 }
13015 }
13016 }
13017 }
13018
Frank Barchard91317c52019-11-22 10:54:35 -080013019 TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, n_gt_4_small_kernel) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013020 TEST_REQUIRES_ARM_NEON_FMA;
13021 for (uint32_t n = 5; n < 8; n++) {
13022 for (size_t k = 1; k <= 10; k += 3) {
13023 GemmMicrokernelTester()
13024 .mr(4)
13025 .nr(4)
13026 .kr(1)
13027 .sr(1)
13028 .m(4)
13029 .n(4)
13030 .k(k)
13031 .ks(3)
Frank Barchard91317c52019-11-22 10:54:35 -080013032 .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013033 }
13034 }
13035 }
13036
Frank Barchard91317c52019-11-22 10:54:35 -080013037 TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, n_div_4_small_kernel) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013038 TEST_REQUIRES_ARM_NEON_FMA;
13039 for (uint32_t n = 8; n <= 12; n += 4) {
13040 for (size_t k = 1; k <= 10; k += 3) {
13041 GemmMicrokernelTester()
13042 .mr(4)
13043 .nr(4)
13044 .kr(1)
13045 .sr(1)
13046 .m(4)
13047 .n(4)
13048 .k(k)
13049 .ks(3)
Frank Barchard91317c52019-11-22 10:54:35 -080013050 .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013051 }
13052 }
13053 }
13054
Frank Barchard91317c52019-11-22 10:54:35 -080013055 TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, strided_cm_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013056 TEST_REQUIRES_ARM_NEON_FMA;
13057 for (size_t k = 1; k <= 10; k += 3) {
13058 for (uint32_t m = 1; m <= 4; m++) {
13059 for (uint32_t n = 1; n <= 4; n++) {
13060 GemmMicrokernelTester()
13061 .mr(4)
13062 .nr(4)
13063 .kr(1)
13064 .sr(1)
13065 .m(m)
13066 .n(n)
13067 .k(k)
13068 .cm_stride(7)
13069 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080013070 .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013071 }
13072 }
13073 }
13074 }
13075
Frank Barchard91317c52019-11-22 10:54:35 -080013076 TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, a_offset) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013077 TEST_REQUIRES_ARM_NEON_FMA;
13078 for (size_t k = 1; k <= 10; k += 3) {
13079 GemmMicrokernelTester()
13080 .mr(4)
13081 .nr(4)
13082 .kr(1)
13083 .sr(1)
13084 .m(4)
13085 .n(4)
13086 .k(k)
13087 .ks(3)
13088 .a_offset(43)
Frank Barchard91317c52019-11-22 10:54:35 -080013089 .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013090 }
13091 }
13092
Frank Barchard91317c52019-11-22 10:54:35 -080013093 TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, zero) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013094 TEST_REQUIRES_ARM_NEON_FMA;
13095 for (uint32_t mz = 0; mz < 4; mz++) {
13096 for (size_t k = 1; k <= 10; k += 3) {
13097 GemmMicrokernelTester()
13098 .mr(4)
13099 .nr(4)
13100 .kr(1)
13101 .sr(1)
13102 .m(4)
13103 .n(4)
13104 .k(k)
13105 .ks(3)
13106 .a_offset(43)
13107 .zero_index(mz)
Frank Barchard91317c52019-11-22 10:54:35 -080013108 .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013109 }
13110 }
13111 }
13112
Frank Barchard91317c52019-11-22 10:54:35 -080013113 TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, qmin) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013114 TEST_REQUIRES_ARM_NEON_FMA;
13115 GemmMicrokernelTester()
13116 .mr(4)
13117 .nr(4)
13118 .kr(1)
13119 .sr(1)
13120 .m(4)
13121 .n(4)
13122 .k(2)
13123 .qmin(128)
Frank Barchard91317c52019-11-22 10:54:35 -080013124 .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013125 }
13126
Frank Barchard91317c52019-11-22 10:54:35 -080013127 TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, qmax) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013128 TEST_REQUIRES_ARM_NEON_FMA;
13129 GemmMicrokernelTester()
13130 .mr(4)
13131 .nr(4)
13132 .kr(1)
13133 .sr(1)
13134 .m(4)
13135 .n(4)
13136 .k(2)
13137 .qmax(128)
Frank Barchard91317c52019-11-22 10:54:35 -080013138 .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013139 }
13140
Frank Barchard91317c52019-11-22 10:54:35 -080013141 TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, strided_cm) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013142 TEST_REQUIRES_ARM_NEON_FMA;
13143 GemmMicrokernelTester()
13144 .mr(4)
13145 .nr(4)
13146 .kr(1)
13147 .sr(1)
13148 .m(4)
13149 .n(4)
13150 .k(2)
13151 .cm_stride(7)
Frank Barchard91317c52019-11-22 10:54:35 -080013152 .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013153 }
Frank Barchard91317c52019-11-22 10:54:35 -080013154#endif // XNN_ARCH_ARM64
XNNPACK Teamb455b122019-09-27 18:10:33 -070013155
13156
Frank Barchard91317c52019-11-22 10:54:35 -080013157#if XNN_ARCH_ARM64
13158 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, k_eq_4) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013159 TEST_REQUIRES_ARM_NEON_FMA;
13160 GemmMicrokernelTester()
13161 .mr(4)
13162 .nr(8)
13163 .kr(1)
13164 .sr(1)
13165 .m(4)
13166 .n(8)
13167 .k(4)
Frank Barchard91317c52019-11-22 10:54:35 -080013168 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013169 }
13170
Frank Barchard91317c52019-11-22 10:54:35 -080013171 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013172 TEST_REQUIRES_ARM_NEON_FMA;
13173 GemmMicrokernelTester()
13174 .mr(4)
13175 .nr(8)
13176 .kr(1)
13177 .sr(1)
13178 .m(4)
13179 .n(8)
13180 .k(4)
13181 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080013182 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013183 }
13184
Frank Barchard91317c52019-11-22 10:54:35 -080013185 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, k_eq_4_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013186 TEST_REQUIRES_ARM_NEON_FMA;
13187 for (uint32_t m = 1; m <= 4; m++) {
13188 for (uint32_t n = 1; n <= 8; n++) {
13189 GemmMicrokernelTester()
13190 .mr(4)
13191 .nr(8)
13192 .kr(1)
13193 .sr(1)
13194 .m(m)
13195 .n(n)
13196 .k(4)
13197 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080013198 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013199 }
13200 }
13201 }
13202
Frank Barchard91317c52019-11-22 10:54:35 -080013203 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, k_eq_4_subtile_m) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013204 TEST_REQUIRES_ARM_NEON_FMA;
13205 for (uint32_t m = 1; m <= 4; m++) {
13206 GemmMicrokernelTester()
13207 .mr(4)
13208 .nr(8)
13209 .kr(1)
13210 .sr(1)
13211 .m(m)
13212 .n(8)
13213 .k(4)
13214 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080013215 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013216 }
13217 }
13218
Frank Barchard91317c52019-11-22 10:54:35 -080013219 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, k_eq_4_subtile_n) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013220 TEST_REQUIRES_ARM_NEON_FMA;
13221 for (uint32_t n = 1; n <= 8; n++) {
13222 GemmMicrokernelTester()
13223 .mr(4)
13224 .nr(8)
13225 .kr(1)
13226 .sr(1)
13227 .m(4)
13228 .n(n)
13229 .k(4)
13230 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080013231 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013232 }
13233 }
13234
Frank Barchard91317c52019-11-22 10:54:35 -080013235 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, k_lt_4) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013236 TEST_REQUIRES_ARM_NEON_FMA;
13237 for (size_t k = 1; k < 4; k++) {
13238 GemmMicrokernelTester()
13239 .mr(4)
13240 .nr(8)
13241 .kr(1)
13242 .sr(1)
13243 .m(4)
13244 .n(8)
13245 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080013246 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013247 }
13248 }
13249
Frank Barchard91317c52019-11-22 10:54:35 -080013250 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, k_lt_4_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013251 TEST_REQUIRES_ARM_NEON_FMA;
13252 for (size_t k = 1; k < 4; k++) {
13253 for (uint32_t m = 1; m <= 4; m++) {
13254 for (uint32_t n = 1; n <= 8; n++) {
13255 GemmMicrokernelTester()
13256 .mr(4)
13257 .nr(8)
13258 .kr(1)
13259 .sr(1)
13260 .m(m)
13261 .n(n)
13262 .k(k)
13263 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080013264 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013265 }
13266 }
13267 }
13268 }
13269
Frank Barchard91317c52019-11-22 10:54:35 -080013270 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, k_gt_4) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013271 TEST_REQUIRES_ARM_NEON_FMA;
13272 for (size_t k = 5; k < 8; k++) {
13273 GemmMicrokernelTester()
13274 .mr(4)
13275 .nr(8)
13276 .kr(1)
13277 .sr(1)
13278 .m(4)
13279 .n(8)
13280 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080013281 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013282 }
13283 }
13284
Frank Barchard91317c52019-11-22 10:54:35 -080013285 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, k_gt_4_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013286 TEST_REQUIRES_ARM_NEON_FMA;
13287 for (size_t k = 5; k < 8; k++) {
13288 for (uint32_t m = 1; m <= 4; m++) {
13289 for (uint32_t n = 1; n <= 8; n++) {
13290 GemmMicrokernelTester()
13291 .mr(4)
13292 .nr(8)
13293 .kr(1)
13294 .sr(1)
13295 .m(m)
13296 .n(n)
13297 .k(k)
13298 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080013299 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013300 }
13301 }
13302 }
13303 }
13304
Frank Barchard91317c52019-11-22 10:54:35 -080013305 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, k_div_4) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013306 TEST_REQUIRES_ARM_NEON_FMA;
13307 for (size_t k = 8; k <= 40; k += 4) {
13308 GemmMicrokernelTester()
13309 .mr(4)
13310 .nr(8)
13311 .kr(1)
13312 .sr(1)
13313 .m(4)
13314 .n(8)
13315 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080013316 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013317 }
13318 }
13319
Frank Barchard91317c52019-11-22 10:54:35 -080013320 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, k_div_4_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013321 TEST_REQUIRES_ARM_NEON_FMA;
13322 for (size_t k = 8; k <= 40; k += 4) {
13323 for (uint32_t m = 1; m <= 4; m++) {
13324 for (uint32_t n = 1; n <= 8; n++) {
13325 GemmMicrokernelTester()
13326 .mr(4)
13327 .nr(8)
13328 .kr(1)
13329 .sr(1)
13330 .m(m)
13331 .n(n)
13332 .k(k)
13333 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080013334 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013335 }
13336 }
13337 }
13338 }
13339
Frank Barchard91317c52019-11-22 10:54:35 -080013340 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, n_gt_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013341 TEST_REQUIRES_ARM_NEON_FMA;
13342 for (uint32_t n = 9; n < 16; n++) {
13343 for (size_t k = 1; k <= 20; k += 5) {
13344 GemmMicrokernelTester()
13345 .mr(4)
13346 .nr(8)
13347 .kr(1)
13348 .sr(1)
13349 .m(4)
13350 .n(8)
13351 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080013352 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013353 }
13354 }
13355 }
13356
Frank Barchard91317c52019-11-22 10:54:35 -080013357 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, n_gt_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013358 TEST_REQUIRES_ARM_NEON_FMA;
13359 for (uint32_t n = 9; n < 16; n++) {
13360 for (size_t k = 1; k <= 20; k += 5) {
13361 GemmMicrokernelTester()
13362 .mr(4)
13363 .nr(8)
13364 .kr(1)
13365 .sr(1)
13366 .m(4)
13367 .n(8)
13368 .k(k)
13369 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080013370 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013371 }
13372 }
13373 }
13374
Frank Barchard91317c52019-11-22 10:54:35 -080013375 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, n_gt_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013376 TEST_REQUIRES_ARM_NEON_FMA;
13377 for (uint32_t n = 9; n < 16; n++) {
13378 for (size_t k = 1; k <= 20; k += 5) {
13379 for (uint32_t m = 1; m <= 4; m++) {
13380 GemmMicrokernelTester()
13381 .mr(4)
13382 .nr(8)
13383 .kr(1)
13384 .sr(1)
13385 .m(m)
13386 .n(n)
13387 .k(k)
13388 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080013389 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013390 }
13391 }
13392 }
13393 }
13394
Frank Barchard91317c52019-11-22 10:54:35 -080013395 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, n_div_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013396 TEST_REQUIRES_ARM_NEON_FMA;
13397 for (uint32_t n = 16; n <= 24; n += 8) {
13398 for (size_t k = 1; k <= 20; k += 5) {
13399 GemmMicrokernelTester()
13400 .mr(4)
13401 .nr(8)
13402 .kr(1)
13403 .sr(1)
13404 .m(4)
13405 .n(8)
13406 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080013407 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013408 }
13409 }
13410 }
13411
Frank Barchard91317c52019-11-22 10:54:35 -080013412 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, n_div_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013413 TEST_REQUIRES_ARM_NEON_FMA;
13414 for (uint32_t n = 16; n <= 24; n += 8) {
13415 for (size_t k = 1; k <= 20; k += 5) {
13416 GemmMicrokernelTester()
13417 .mr(4)
13418 .nr(8)
13419 .kr(1)
13420 .sr(1)
13421 .m(4)
13422 .n(n)
13423 .k(k)
13424 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080013425 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013426 }
13427 }
13428 }
13429
Frank Barchard91317c52019-11-22 10:54:35 -080013430 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, n_div_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013431 TEST_REQUIRES_ARM_NEON_FMA;
13432 for (uint32_t n = 16; n <= 24; n += 8) {
13433 for (size_t k = 1; k <= 20; k += 5) {
13434 for (uint32_t m = 1; m <= 4; m++) {
13435 GemmMicrokernelTester()
13436 .mr(4)
13437 .nr(8)
13438 .kr(1)
13439 .sr(1)
13440 .m(m)
13441 .n(n)
13442 .k(k)
13443 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080013444 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013445 }
13446 }
13447 }
13448 }
13449
Frank Barchard91317c52019-11-22 10:54:35 -080013450 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, small_kernel) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013451 TEST_REQUIRES_ARM_NEON_FMA;
13452 for (size_t k = 1; k <= 20; k += 5) {
13453 GemmMicrokernelTester()
13454 .mr(4)
13455 .nr(8)
13456 .kr(1)
13457 .sr(1)
13458 .m(4)
13459 .n(8)
13460 .k(k)
13461 .ks(3)
Frank Barchard91317c52019-11-22 10:54:35 -080013462 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013463 }
13464 }
13465
Frank Barchard91317c52019-11-22 10:54:35 -080013466 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, small_kernel_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013467 TEST_REQUIRES_ARM_NEON_FMA;
13468 for (size_t k = 1; k <= 20; k += 5) {
13469 for (uint32_t m = 1; m <= 4; m++) {
13470 for (uint32_t n = 1; n <= 8; n++) {
13471 GemmMicrokernelTester()
13472 .mr(4)
13473 .nr(8)
13474 .kr(1)
13475 .sr(1)
13476 .m(m)
13477 .n(n)
13478 .k(k)
13479 .ks(3)
13480 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080013481 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013482 }
13483 }
13484 }
13485 }
13486
Frank Barchard91317c52019-11-22 10:54:35 -080013487 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, n_gt_8_small_kernel) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013488 TEST_REQUIRES_ARM_NEON_FMA;
13489 for (uint32_t n = 9; n < 16; n++) {
13490 for (size_t k = 1; k <= 20; k += 5) {
13491 GemmMicrokernelTester()
13492 .mr(4)
13493 .nr(8)
13494 .kr(1)
13495 .sr(1)
13496 .m(4)
13497 .n(8)
13498 .k(k)
13499 .ks(3)
Frank Barchard91317c52019-11-22 10:54:35 -080013500 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013501 }
13502 }
13503 }
13504
Frank Barchard91317c52019-11-22 10:54:35 -080013505 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, n_div_8_small_kernel) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013506 TEST_REQUIRES_ARM_NEON_FMA;
13507 for (uint32_t n = 16; n <= 24; n += 8) {
13508 for (size_t k = 1; k <= 20; k += 5) {
13509 GemmMicrokernelTester()
13510 .mr(4)
13511 .nr(8)
13512 .kr(1)
13513 .sr(1)
13514 .m(4)
13515 .n(8)
13516 .k(k)
13517 .ks(3)
Frank Barchard91317c52019-11-22 10:54:35 -080013518 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013519 }
13520 }
13521 }
13522
Frank Barchard91317c52019-11-22 10:54:35 -080013523 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, strided_cm_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013524 TEST_REQUIRES_ARM_NEON_FMA;
13525 for (size_t k = 1; k <= 20; k += 5) {
13526 for (uint32_t m = 1; m <= 4; m++) {
13527 for (uint32_t n = 1; n <= 8; n++) {
13528 GemmMicrokernelTester()
13529 .mr(4)
13530 .nr(8)
13531 .kr(1)
13532 .sr(1)
13533 .m(m)
13534 .n(n)
13535 .k(k)
13536 .cm_stride(11)
13537 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080013538 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013539 }
13540 }
13541 }
13542 }
13543
Frank Barchard91317c52019-11-22 10:54:35 -080013544 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, a_offset) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013545 TEST_REQUIRES_ARM_NEON_FMA;
13546 for (size_t k = 1; k <= 20; k += 5) {
13547 GemmMicrokernelTester()
13548 .mr(4)
13549 .nr(8)
13550 .kr(1)
13551 .sr(1)
13552 .m(4)
13553 .n(8)
13554 .k(k)
13555 .ks(3)
13556 .a_offset(83)
Frank Barchard91317c52019-11-22 10:54:35 -080013557 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013558 }
13559 }
13560
Frank Barchard91317c52019-11-22 10:54:35 -080013561 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, zero) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013562 TEST_REQUIRES_ARM_NEON_FMA;
13563 for (uint32_t mz = 0; mz < 4; mz++) {
13564 for (size_t k = 1; k <= 20; k += 5) {
13565 GemmMicrokernelTester()
13566 .mr(4)
13567 .nr(8)
13568 .kr(1)
13569 .sr(1)
13570 .m(4)
13571 .n(8)
13572 .k(k)
13573 .ks(3)
13574 .a_offset(83)
13575 .zero_index(mz)
Frank Barchard91317c52019-11-22 10:54:35 -080013576 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013577 }
13578 }
13579 }
13580
Frank Barchard91317c52019-11-22 10:54:35 -080013581 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, qmin) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013582 TEST_REQUIRES_ARM_NEON_FMA;
13583 GemmMicrokernelTester()
13584 .mr(4)
13585 .nr(8)
13586 .kr(1)
13587 .sr(1)
13588 .m(4)
13589 .n(8)
13590 .k(4)
13591 .qmin(128)
Frank Barchard91317c52019-11-22 10:54:35 -080013592 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013593 }
13594
Frank Barchard91317c52019-11-22 10:54:35 -080013595 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, qmax) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013596 TEST_REQUIRES_ARM_NEON_FMA;
13597 GemmMicrokernelTester()
13598 .mr(4)
13599 .nr(8)
13600 .kr(1)
13601 .sr(1)
13602 .m(4)
13603 .n(8)
13604 .k(4)
13605 .qmax(128)
Frank Barchard91317c52019-11-22 10:54:35 -080013606 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013607 }
13608
Frank Barchard91317c52019-11-22 10:54:35 -080013609 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, strided_cm) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013610 TEST_REQUIRES_ARM_NEON_FMA;
13611 GemmMicrokernelTester()
13612 .mr(4)
13613 .nr(8)
13614 .kr(1)
13615 .sr(1)
13616 .m(4)
13617 .n(8)
13618 .k(4)
13619 .cm_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080013620 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013621 }
Frank Barchard91317c52019-11-22 10:54:35 -080013622#endif // XNN_ARCH_ARM64
XNNPACK Teamb455b122019-09-27 18:10:33 -070013623
13624
Frank Barchard91317c52019-11-22 10:54:35 -080013625#if XNN_ARCH_ARM64
13626 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, k_eq_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013627 TEST_REQUIRES_ARM_NEON_FMA;
13628 GemmMicrokernelTester()
13629 .mr(4)
13630 .nr(8)
13631 .kr(1)
13632 .sr(1)
13633 .m(4)
13634 .n(8)
13635 .k(2)
Frank Barchard91317c52019-11-22 10:54:35 -080013636 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013637 }
13638
Frank Barchard91317c52019-11-22 10:54:35 -080013639 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013640 TEST_REQUIRES_ARM_NEON_FMA;
13641 GemmMicrokernelTester()
13642 .mr(4)
13643 .nr(8)
13644 .kr(1)
13645 .sr(1)
13646 .m(4)
13647 .n(8)
13648 .k(2)
13649 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080013650 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013651 }
13652
Frank Barchard91317c52019-11-22 10:54:35 -080013653 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, k_eq_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013654 TEST_REQUIRES_ARM_NEON_FMA;
13655 for (uint32_t m = 1; m <= 4; m++) {
13656 for (uint32_t n = 1; n <= 8; n++) {
13657 GemmMicrokernelTester()
13658 .mr(4)
13659 .nr(8)
13660 .kr(1)
13661 .sr(1)
13662 .m(m)
13663 .n(n)
13664 .k(2)
13665 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080013666 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013667 }
13668 }
13669 }
13670
Frank Barchard91317c52019-11-22 10:54:35 -080013671 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, k_eq_2_subtile_m) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013672 TEST_REQUIRES_ARM_NEON_FMA;
13673 for (uint32_t m = 1; m <= 4; m++) {
13674 GemmMicrokernelTester()
13675 .mr(4)
13676 .nr(8)
13677 .kr(1)
13678 .sr(1)
13679 .m(m)
13680 .n(8)
13681 .k(2)
13682 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080013683 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013684 }
13685 }
13686
Frank Barchard91317c52019-11-22 10:54:35 -080013687 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, k_eq_2_subtile_n) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013688 TEST_REQUIRES_ARM_NEON_FMA;
13689 for (uint32_t n = 1; n <= 8; n++) {
13690 GemmMicrokernelTester()
13691 .mr(4)
13692 .nr(8)
13693 .kr(1)
13694 .sr(1)
13695 .m(4)
13696 .n(n)
13697 .k(2)
13698 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080013699 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013700 }
13701 }
13702
Frank Barchard91317c52019-11-22 10:54:35 -080013703 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, k_lt_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013704 TEST_REQUIRES_ARM_NEON_FMA;
13705 for (size_t k = 1; k < 2; k++) {
13706 GemmMicrokernelTester()
13707 .mr(4)
13708 .nr(8)
13709 .kr(1)
13710 .sr(1)
13711 .m(4)
13712 .n(8)
13713 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080013714 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013715 }
13716 }
13717
Frank Barchard91317c52019-11-22 10:54:35 -080013718 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, k_lt_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013719 TEST_REQUIRES_ARM_NEON_FMA;
13720 for (size_t k = 1; k < 2; k++) {
13721 for (uint32_t m = 1; m <= 4; m++) {
13722 for (uint32_t n = 1; n <= 8; n++) {
13723 GemmMicrokernelTester()
13724 .mr(4)
13725 .nr(8)
13726 .kr(1)
13727 .sr(1)
13728 .m(m)
13729 .n(n)
13730 .k(k)
13731 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080013732 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013733 }
13734 }
13735 }
13736 }
13737
Frank Barchard91317c52019-11-22 10:54:35 -080013738 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, k_gt_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013739 TEST_REQUIRES_ARM_NEON_FMA;
13740 for (size_t k = 3; k < 4; k++) {
13741 GemmMicrokernelTester()
13742 .mr(4)
13743 .nr(8)
13744 .kr(1)
13745 .sr(1)
13746 .m(4)
13747 .n(8)
13748 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080013749 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013750 }
13751 }
13752
Frank Barchard91317c52019-11-22 10:54:35 -080013753 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, k_gt_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013754 TEST_REQUIRES_ARM_NEON_FMA;
13755 for (size_t k = 3; k < 4; k++) {
13756 for (uint32_t m = 1; m <= 4; m++) {
13757 for (uint32_t n = 1; n <= 8; n++) {
13758 GemmMicrokernelTester()
13759 .mr(4)
13760 .nr(8)
13761 .kr(1)
13762 .sr(1)
13763 .m(m)
13764 .n(n)
13765 .k(k)
13766 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080013767 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013768 }
13769 }
13770 }
13771 }
13772
Frank Barchard91317c52019-11-22 10:54:35 -080013773 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, k_div_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013774 TEST_REQUIRES_ARM_NEON_FMA;
13775 for (size_t k = 4; k <= 20; k += 2) {
13776 GemmMicrokernelTester()
13777 .mr(4)
13778 .nr(8)
13779 .kr(1)
13780 .sr(1)
13781 .m(4)
13782 .n(8)
13783 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080013784 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013785 }
13786 }
13787
Frank Barchard91317c52019-11-22 10:54:35 -080013788 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, k_div_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013789 TEST_REQUIRES_ARM_NEON_FMA;
13790 for (size_t k = 4; k <= 20; k += 2) {
13791 for (uint32_t m = 1; m <= 4; m++) {
13792 for (uint32_t n = 1; n <= 8; n++) {
13793 GemmMicrokernelTester()
13794 .mr(4)
13795 .nr(8)
13796 .kr(1)
13797 .sr(1)
13798 .m(m)
13799 .n(n)
13800 .k(k)
13801 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080013802 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013803 }
13804 }
13805 }
13806 }
13807
Frank Barchard91317c52019-11-22 10:54:35 -080013808 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, n_gt_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013809 TEST_REQUIRES_ARM_NEON_FMA;
13810 for (uint32_t n = 9; n < 16; n++) {
13811 for (size_t k = 1; k <= 10; k += 3) {
13812 GemmMicrokernelTester()
13813 .mr(4)
13814 .nr(8)
13815 .kr(1)
13816 .sr(1)
13817 .m(4)
13818 .n(8)
13819 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080013820 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013821 }
13822 }
13823 }
13824
Frank Barchard91317c52019-11-22 10:54:35 -080013825 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, n_gt_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013826 TEST_REQUIRES_ARM_NEON_FMA;
13827 for (uint32_t n = 9; n < 16; n++) {
13828 for (size_t k = 1; k <= 10; k += 3) {
13829 GemmMicrokernelTester()
13830 .mr(4)
13831 .nr(8)
13832 .kr(1)
13833 .sr(1)
13834 .m(4)
13835 .n(8)
13836 .k(k)
13837 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080013838 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013839 }
13840 }
13841 }
13842
Frank Barchard91317c52019-11-22 10:54:35 -080013843 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, n_gt_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013844 TEST_REQUIRES_ARM_NEON_FMA;
13845 for (uint32_t n = 9; n < 16; n++) {
13846 for (size_t k = 1; k <= 10; k += 3) {
13847 for (uint32_t m = 1; m <= 4; m++) {
13848 GemmMicrokernelTester()
13849 .mr(4)
13850 .nr(8)
13851 .kr(1)
13852 .sr(1)
13853 .m(m)
13854 .n(n)
13855 .k(k)
13856 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080013857 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013858 }
13859 }
13860 }
13861 }
13862
Frank Barchard91317c52019-11-22 10:54:35 -080013863 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, n_div_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013864 TEST_REQUIRES_ARM_NEON_FMA;
13865 for (uint32_t n = 16; n <= 24; n += 8) {
13866 for (size_t k = 1; k <= 10; k += 3) {
13867 GemmMicrokernelTester()
13868 .mr(4)
13869 .nr(8)
13870 .kr(1)
13871 .sr(1)
13872 .m(4)
13873 .n(8)
13874 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080013875 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013876 }
13877 }
13878 }
13879
Frank Barchard91317c52019-11-22 10:54:35 -080013880 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, n_div_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013881 TEST_REQUIRES_ARM_NEON_FMA;
13882 for (uint32_t n = 16; n <= 24; n += 8) {
13883 for (size_t k = 1; k <= 10; k += 3) {
13884 GemmMicrokernelTester()
13885 .mr(4)
13886 .nr(8)
13887 .kr(1)
13888 .sr(1)
13889 .m(4)
13890 .n(n)
13891 .k(k)
13892 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080013893 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013894 }
13895 }
13896 }
13897
Frank Barchard91317c52019-11-22 10:54:35 -080013898 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, n_div_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013899 TEST_REQUIRES_ARM_NEON_FMA;
13900 for (uint32_t n = 16; n <= 24; n += 8) {
13901 for (size_t k = 1; k <= 10; k += 3) {
13902 for (uint32_t m = 1; m <= 4; m++) {
13903 GemmMicrokernelTester()
13904 .mr(4)
13905 .nr(8)
13906 .kr(1)
13907 .sr(1)
13908 .m(m)
13909 .n(n)
13910 .k(k)
13911 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080013912 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013913 }
13914 }
13915 }
13916 }
13917
Frank Barchard91317c52019-11-22 10:54:35 -080013918 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, small_kernel) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013919 TEST_REQUIRES_ARM_NEON_FMA;
13920 for (size_t k = 1; k <= 10; k += 3) {
13921 GemmMicrokernelTester()
13922 .mr(4)
13923 .nr(8)
13924 .kr(1)
13925 .sr(1)
13926 .m(4)
13927 .n(8)
13928 .k(k)
13929 .ks(3)
Frank Barchard91317c52019-11-22 10:54:35 -080013930 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013931 }
13932 }
13933
Frank Barchard91317c52019-11-22 10:54:35 -080013934 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, small_kernel_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013935 TEST_REQUIRES_ARM_NEON_FMA;
13936 for (size_t k = 1; k <= 10; k += 3) {
13937 for (uint32_t m = 1; m <= 4; m++) {
13938 for (uint32_t n = 1; n <= 8; n++) {
13939 GemmMicrokernelTester()
13940 .mr(4)
13941 .nr(8)
13942 .kr(1)
13943 .sr(1)
13944 .m(m)
13945 .n(n)
13946 .k(k)
13947 .ks(3)
13948 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080013949 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013950 }
13951 }
13952 }
13953 }
13954
Frank Barchard91317c52019-11-22 10:54:35 -080013955 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, n_gt_8_small_kernel) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013956 TEST_REQUIRES_ARM_NEON_FMA;
13957 for (uint32_t n = 9; n < 16; n++) {
13958 for (size_t k = 1; k <= 10; k += 3) {
13959 GemmMicrokernelTester()
13960 .mr(4)
13961 .nr(8)
13962 .kr(1)
13963 .sr(1)
13964 .m(4)
13965 .n(8)
13966 .k(k)
13967 .ks(3)
Frank Barchard91317c52019-11-22 10:54:35 -080013968 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013969 }
13970 }
13971 }
13972
Frank Barchard91317c52019-11-22 10:54:35 -080013973 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, n_div_8_small_kernel) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013974 TEST_REQUIRES_ARM_NEON_FMA;
13975 for (uint32_t n = 16; n <= 24; n += 8) {
13976 for (size_t k = 1; k <= 10; k += 3) {
13977 GemmMicrokernelTester()
13978 .mr(4)
13979 .nr(8)
13980 .kr(1)
13981 .sr(1)
13982 .m(4)
13983 .n(8)
13984 .k(k)
13985 .ks(3)
Frank Barchard91317c52019-11-22 10:54:35 -080013986 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013987 }
13988 }
13989 }
13990
Frank Barchard91317c52019-11-22 10:54:35 -080013991 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, strided_cm_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013992 TEST_REQUIRES_ARM_NEON_FMA;
13993 for (size_t k = 1; k <= 10; k += 3) {
13994 for (uint32_t m = 1; m <= 4; m++) {
13995 for (uint32_t n = 1; n <= 8; n++) {
13996 GemmMicrokernelTester()
13997 .mr(4)
13998 .nr(8)
13999 .kr(1)
14000 .sr(1)
14001 .m(m)
14002 .n(n)
14003 .k(k)
14004 .cm_stride(11)
14005 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080014006 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014007 }
14008 }
14009 }
14010 }
14011
Frank Barchard91317c52019-11-22 10:54:35 -080014012 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, a_offset) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014013 TEST_REQUIRES_ARM_NEON_FMA;
14014 for (size_t k = 1; k <= 10; k += 3) {
14015 GemmMicrokernelTester()
14016 .mr(4)
14017 .nr(8)
14018 .kr(1)
14019 .sr(1)
14020 .m(4)
14021 .n(8)
14022 .k(k)
14023 .ks(3)
14024 .a_offset(43)
Frank Barchard91317c52019-11-22 10:54:35 -080014025 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014026 }
14027 }
14028
Frank Barchard91317c52019-11-22 10:54:35 -080014029 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, zero) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014030 TEST_REQUIRES_ARM_NEON_FMA;
14031 for (uint32_t mz = 0; mz < 4; mz++) {
14032 for (size_t k = 1; k <= 10; k += 3) {
14033 GemmMicrokernelTester()
14034 .mr(4)
14035 .nr(8)
14036 .kr(1)
14037 .sr(1)
14038 .m(4)
14039 .n(8)
14040 .k(k)
14041 .ks(3)
14042 .a_offset(43)
14043 .zero_index(mz)
Frank Barchard91317c52019-11-22 10:54:35 -080014044 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014045 }
14046 }
14047 }
14048
Frank Barchard91317c52019-11-22 10:54:35 -080014049 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, qmin) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014050 TEST_REQUIRES_ARM_NEON_FMA;
14051 GemmMicrokernelTester()
14052 .mr(4)
14053 .nr(8)
14054 .kr(1)
14055 .sr(1)
14056 .m(4)
14057 .n(8)
14058 .k(2)
14059 .qmin(128)
Frank Barchard91317c52019-11-22 10:54:35 -080014060 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014061 }
14062
Frank Barchard91317c52019-11-22 10:54:35 -080014063 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, qmax) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014064 TEST_REQUIRES_ARM_NEON_FMA;
14065 GemmMicrokernelTester()
14066 .mr(4)
14067 .nr(8)
14068 .kr(1)
14069 .sr(1)
14070 .m(4)
14071 .n(8)
14072 .k(2)
14073 .qmax(128)
Frank Barchard91317c52019-11-22 10:54:35 -080014074 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014075 }
14076
Frank Barchard91317c52019-11-22 10:54:35 -080014077 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, strided_cm) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014078 TEST_REQUIRES_ARM_NEON_FMA;
14079 GemmMicrokernelTester()
14080 .mr(4)
14081 .nr(8)
14082 .kr(1)
14083 .sr(1)
14084 .m(4)
14085 .n(8)
14086 .k(2)
14087 .cm_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080014088 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014089 }
Frank Barchard91317c52019-11-22 10:54:35 -080014090#endif // XNN_ARCH_ARM64
XNNPACK Teamb455b122019-09-27 18:10:33 -070014091
14092
Frank Barchard91317c52019-11-22 10:54:35 -080014093#if XNN_ARCH_ARM64
14094 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, k_eq_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014095 TEST_REQUIRES_ARM_NEON_FMA;
14096 GemmMicrokernelTester()
14097 .mr(6)
14098 .nr(8)
14099 .kr(1)
14100 .sr(1)
14101 .m(6)
14102 .n(8)
14103 .k(2)
Frank Barchard91317c52019-11-22 10:54:35 -080014104 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014105 }
14106
Frank Barchard91317c52019-11-22 10:54:35 -080014107 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014108 TEST_REQUIRES_ARM_NEON_FMA;
14109 GemmMicrokernelTester()
14110 .mr(6)
14111 .nr(8)
14112 .kr(1)
14113 .sr(1)
14114 .m(6)
14115 .n(8)
14116 .k(2)
14117 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080014118 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014119 }
14120
Frank Barchard91317c52019-11-22 10:54:35 -080014121 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, k_eq_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014122 TEST_REQUIRES_ARM_NEON_FMA;
14123 for (uint32_t m = 1; m <= 6; m++) {
14124 for (uint32_t n = 1; n <= 8; n++) {
14125 GemmMicrokernelTester()
14126 .mr(6)
14127 .nr(8)
14128 .kr(1)
14129 .sr(1)
14130 .m(m)
14131 .n(n)
14132 .k(2)
14133 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080014134 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014135 }
14136 }
14137 }
14138
Frank Barchard91317c52019-11-22 10:54:35 -080014139 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, k_eq_2_subtile_m) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014140 TEST_REQUIRES_ARM_NEON_FMA;
14141 for (uint32_t m = 1; m <= 6; m++) {
14142 GemmMicrokernelTester()
14143 .mr(6)
14144 .nr(8)
14145 .kr(1)
14146 .sr(1)
14147 .m(m)
14148 .n(8)
14149 .k(2)
14150 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080014151 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014152 }
14153 }
14154
Frank Barchard91317c52019-11-22 10:54:35 -080014155 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, k_eq_2_subtile_n) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014156 TEST_REQUIRES_ARM_NEON_FMA;
14157 for (uint32_t n = 1; n <= 8; n++) {
14158 GemmMicrokernelTester()
14159 .mr(6)
14160 .nr(8)
14161 .kr(1)
14162 .sr(1)
14163 .m(6)
14164 .n(n)
14165 .k(2)
14166 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080014167 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014168 }
14169 }
14170
Frank Barchard91317c52019-11-22 10:54:35 -080014171 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, k_lt_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014172 TEST_REQUIRES_ARM_NEON_FMA;
14173 for (size_t k = 1; k < 2; k++) {
14174 GemmMicrokernelTester()
14175 .mr(6)
14176 .nr(8)
14177 .kr(1)
14178 .sr(1)
14179 .m(6)
14180 .n(8)
14181 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080014182 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014183 }
14184 }
14185
Frank Barchard91317c52019-11-22 10:54:35 -080014186 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, k_lt_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014187 TEST_REQUIRES_ARM_NEON_FMA;
14188 for (size_t k = 1; k < 2; k++) {
14189 for (uint32_t m = 1; m <= 6; m++) {
14190 for (uint32_t n = 1; n <= 8; n++) {
14191 GemmMicrokernelTester()
14192 .mr(6)
14193 .nr(8)
14194 .kr(1)
14195 .sr(1)
14196 .m(m)
14197 .n(n)
14198 .k(k)
14199 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080014200 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014201 }
14202 }
14203 }
14204 }
14205
Frank Barchard91317c52019-11-22 10:54:35 -080014206 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, k_gt_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014207 TEST_REQUIRES_ARM_NEON_FMA;
14208 for (size_t k = 3; k < 4; k++) {
14209 GemmMicrokernelTester()
14210 .mr(6)
14211 .nr(8)
14212 .kr(1)
14213 .sr(1)
14214 .m(6)
14215 .n(8)
14216 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080014217 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014218 }
14219 }
14220
Frank Barchard91317c52019-11-22 10:54:35 -080014221 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, k_gt_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014222 TEST_REQUIRES_ARM_NEON_FMA;
14223 for (size_t k = 3; k < 4; k++) {
14224 for (uint32_t m = 1; m <= 6; m++) {
14225 for (uint32_t n = 1; n <= 8; n++) {
14226 GemmMicrokernelTester()
14227 .mr(6)
14228 .nr(8)
14229 .kr(1)
14230 .sr(1)
14231 .m(m)
14232 .n(n)
14233 .k(k)
14234 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080014235 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014236 }
14237 }
14238 }
14239 }
14240
Frank Barchard91317c52019-11-22 10:54:35 -080014241 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, k_div_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014242 TEST_REQUIRES_ARM_NEON_FMA;
14243 for (size_t k = 4; k <= 20; k += 2) {
14244 GemmMicrokernelTester()
14245 .mr(6)
14246 .nr(8)
14247 .kr(1)
14248 .sr(1)
14249 .m(6)
14250 .n(8)
14251 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080014252 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014253 }
14254 }
14255
Frank Barchard91317c52019-11-22 10:54:35 -080014256 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, k_div_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014257 TEST_REQUIRES_ARM_NEON_FMA;
14258 for (size_t k = 4; k <= 20; k += 2) {
14259 for (uint32_t m = 1; m <= 6; m++) {
14260 for (uint32_t n = 1; n <= 8; n++) {
14261 GemmMicrokernelTester()
14262 .mr(6)
14263 .nr(8)
14264 .kr(1)
14265 .sr(1)
14266 .m(m)
14267 .n(n)
14268 .k(k)
14269 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080014270 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014271 }
14272 }
14273 }
14274 }
14275
Frank Barchard91317c52019-11-22 10:54:35 -080014276 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, n_gt_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014277 TEST_REQUIRES_ARM_NEON_FMA;
14278 for (uint32_t n = 9; n < 16; n++) {
14279 for (size_t k = 1; k <= 10; k += 3) {
14280 GemmMicrokernelTester()
14281 .mr(6)
14282 .nr(8)
14283 .kr(1)
14284 .sr(1)
14285 .m(6)
14286 .n(8)
14287 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080014288 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014289 }
14290 }
14291 }
14292
Frank Barchard91317c52019-11-22 10:54:35 -080014293 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, n_gt_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014294 TEST_REQUIRES_ARM_NEON_FMA;
14295 for (uint32_t n = 9; n < 16; n++) {
14296 for (size_t k = 1; k <= 10; k += 3) {
14297 GemmMicrokernelTester()
14298 .mr(6)
14299 .nr(8)
14300 .kr(1)
14301 .sr(1)
14302 .m(6)
14303 .n(8)
14304 .k(k)
14305 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080014306 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014307 }
14308 }
14309 }
14310
Frank Barchard91317c52019-11-22 10:54:35 -080014311 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, n_gt_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014312 TEST_REQUIRES_ARM_NEON_FMA;
14313 for (uint32_t n = 9; n < 16; n++) {
14314 for (size_t k = 1; k <= 10; k += 3) {
14315 for (uint32_t m = 1; m <= 6; m++) {
14316 GemmMicrokernelTester()
14317 .mr(6)
14318 .nr(8)
14319 .kr(1)
14320 .sr(1)
14321 .m(m)
14322 .n(n)
14323 .k(k)
14324 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080014325 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014326 }
14327 }
14328 }
14329 }
14330
Frank Barchard91317c52019-11-22 10:54:35 -080014331 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, n_div_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014332 TEST_REQUIRES_ARM_NEON_FMA;
14333 for (uint32_t n = 16; n <= 24; n += 8) {
14334 for (size_t k = 1; k <= 10; k += 3) {
14335 GemmMicrokernelTester()
14336 .mr(6)
14337 .nr(8)
14338 .kr(1)
14339 .sr(1)
14340 .m(6)
14341 .n(8)
14342 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080014343 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014344 }
14345 }
14346 }
14347
Frank Barchard91317c52019-11-22 10:54:35 -080014348 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, n_div_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014349 TEST_REQUIRES_ARM_NEON_FMA;
14350 for (uint32_t n = 16; n <= 24; n += 8) {
14351 for (size_t k = 1; k <= 10; k += 3) {
14352 GemmMicrokernelTester()
14353 .mr(6)
14354 .nr(8)
14355 .kr(1)
14356 .sr(1)
14357 .m(6)
14358 .n(n)
14359 .k(k)
14360 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080014361 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014362 }
14363 }
14364 }
14365
Frank Barchard91317c52019-11-22 10:54:35 -080014366 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, n_div_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014367 TEST_REQUIRES_ARM_NEON_FMA;
14368 for (uint32_t n = 16; n <= 24; n += 8) {
14369 for (size_t k = 1; k <= 10; k += 3) {
14370 for (uint32_t m = 1; m <= 6; m++) {
14371 GemmMicrokernelTester()
14372 .mr(6)
14373 .nr(8)
14374 .kr(1)
14375 .sr(1)
14376 .m(m)
14377 .n(n)
14378 .k(k)
14379 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080014380 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014381 }
14382 }
14383 }
14384 }
14385
Frank Barchard91317c52019-11-22 10:54:35 -080014386 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, small_kernel) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014387 TEST_REQUIRES_ARM_NEON_FMA;
14388 for (size_t k = 1; k <= 10; k += 3) {
14389 GemmMicrokernelTester()
14390 .mr(6)
14391 .nr(8)
14392 .kr(1)
14393 .sr(1)
14394 .m(6)
14395 .n(8)
14396 .k(k)
14397 .ks(3)
Frank Barchard91317c52019-11-22 10:54:35 -080014398 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014399 }
14400 }
14401
Frank Barchard91317c52019-11-22 10:54:35 -080014402 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, small_kernel_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014403 TEST_REQUIRES_ARM_NEON_FMA;
14404 for (size_t k = 1; k <= 10; k += 3) {
14405 for (uint32_t m = 1; m <= 6; m++) {
14406 for (uint32_t n = 1; n <= 8; n++) {
14407 GemmMicrokernelTester()
14408 .mr(6)
14409 .nr(8)
14410 .kr(1)
14411 .sr(1)
14412 .m(m)
14413 .n(n)
14414 .k(k)
14415 .ks(3)
14416 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080014417 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014418 }
14419 }
14420 }
14421 }
14422
Frank Barchard91317c52019-11-22 10:54:35 -080014423 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, n_gt_8_small_kernel) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014424 TEST_REQUIRES_ARM_NEON_FMA;
14425 for (uint32_t n = 9; n < 16; n++) {
14426 for (size_t k = 1; k <= 10; k += 3) {
14427 GemmMicrokernelTester()
14428 .mr(6)
14429 .nr(8)
14430 .kr(1)
14431 .sr(1)
14432 .m(6)
14433 .n(8)
14434 .k(k)
14435 .ks(3)
Frank Barchard91317c52019-11-22 10:54:35 -080014436 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014437 }
14438 }
14439 }
14440
Frank Barchard91317c52019-11-22 10:54:35 -080014441 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, n_div_8_small_kernel) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014442 TEST_REQUIRES_ARM_NEON_FMA;
14443 for (uint32_t n = 16; n <= 24; n += 8) {
14444 for (size_t k = 1; k <= 10; k += 3) {
14445 GemmMicrokernelTester()
14446 .mr(6)
14447 .nr(8)
14448 .kr(1)
14449 .sr(1)
14450 .m(6)
14451 .n(8)
14452 .k(k)
14453 .ks(3)
Frank Barchard91317c52019-11-22 10:54:35 -080014454 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014455 }
14456 }
14457 }
14458
Frank Barchard91317c52019-11-22 10:54:35 -080014459 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, strided_cm_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014460 TEST_REQUIRES_ARM_NEON_FMA;
14461 for (size_t k = 1; k <= 10; k += 3) {
14462 for (uint32_t m = 1; m <= 6; m++) {
14463 for (uint32_t n = 1; n <= 8; n++) {
14464 GemmMicrokernelTester()
14465 .mr(6)
14466 .nr(8)
14467 .kr(1)
14468 .sr(1)
14469 .m(m)
14470 .n(n)
14471 .k(k)
14472 .cm_stride(11)
14473 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080014474 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014475 }
14476 }
14477 }
14478 }
14479
Frank Barchard91317c52019-11-22 10:54:35 -080014480 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, a_offset) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014481 TEST_REQUIRES_ARM_NEON_FMA;
14482 for (size_t k = 1; k <= 10; k += 3) {
14483 GemmMicrokernelTester()
14484 .mr(6)
14485 .nr(8)
14486 .kr(1)
14487 .sr(1)
14488 .m(6)
14489 .n(8)
14490 .k(k)
14491 .ks(3)
14492 .a_offset(67)
Frank Barchard91317c52019-11-22 10:54:35 -080014493 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014494 }
14495 }
14496
Frank Barchard91317c52019-11-22 10:54:35 -080014497 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, zero) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014498 TEST_REQUIRES_ARM_NEON_FMA;
14499 for (uint32_t mz = 0; mz < 6; mz++) {
14500 for (size_t k = 1; k <= 10; k += 3) {
14501 GemmMicrokernelTester()
14502 .mr(6)
14503 .nr(8)
14504 .kr(1)
14505 .sr(1)
14506 .m(6)
14507 .n(8)
14508 .k(k)
14509 .ks(3)
14510 .a_offset(67)
14511 .zero_index(mz)
Frank Barchard91317c52019-11-22 10:54:35 -080014512 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014513 }
14514 }
14515 }
14516
Frank Barchard91317c52019-11-22 10:54:35 -080014517 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, qmin) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014518 TEST_REQUIRES_ARM_NEON_FMA;
14519 GemmMicrokernelTester()
14520 .mr(6)
14521 .nr(8)
14522 .kr(1)
14523 .sr(1)
14524 .m(6)
14525 .n(8)
14526 .k(2)
14527 .qmin(128)
Frank Barchard91317c52019-11-22 10:54:35 -080014528 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014529 }
14530
Frank Barchard91317c52019-11-22 10:54:35 -080014531 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, qmax) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014532 TEST_REQUIRES_ARM_NEON_FMA;
14533 GemmMicrokernelTester()
14534 .mr(6)
14535 .nr(8)
14536 .kr(1)
14537 .sr(1)
14538 .m(6)
14539 .n(8)
14540 .k(2)
14541 .qmax(128)
Frank Barchard91317c52019-11-22 10:54:35 -080014542 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014543 }
14544
Frank Barchard91317c52019-11-22 10:54:35 -080014545 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, strided_cm) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014546 TEST_REQUIRES_ARM_NEON_FMA;
14547 GemmMicrokernelTester()
14548 .mr(6)
14549 .nr(8)
14550 .kr(1)
14551 .sr(1)
14552 .m(6)
14553 .n(8)
14554 .k(2)
14555 .cm_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080014556 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014557 }
Frank Barchard91317c52019-11-22 10:54:35 -080014558#endif // XNN_ARCH_ARM64
XNNPACK Teamb455b122019-09-27 18:10:33 -070014559
14560
Frank Barchard69172d92019-11-26 16:22:39 -080014561#if XNN_ARCH_ARM64
14562 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD128, k_eq_4) {
14563 TEST_REQUIRES_ARM_NEON_FMA;
14564 GemmMicrokernelTester()
14565 .mr(6)
14566 .nr(8)
14567 .kr(1)
14568 .sr(1)
14569 .m(6)
14570 .n(8)
14571 .k(4)
14572 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128);
14573 }
14574
14575 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD128, strided_cn) {
14576 TEST_REQUIRES_ARM_NEON_FMA;
14577 GemmMicrokernelTester()
14578 .mr(6)
14579 .nr(8)
14580 .kr(1)
14581 .sr(1)
14582 .m(6)
14583 .n(8)
14584 .k(4)
14585 .cn_stride(11)
14586 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128);
14587 }
14588
14589 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD128, k_eq_4_subtile) {
14590 TEST_REQUIRES_ARM_NEON_FMA;
14591 for (uint32_t m = 1; m <= 6; m++) {
14592 for (uint32_t n = 1; n <= 8; n++) {
14593 GemmMicrokernelTester()
14594 .mr(6)
14595 .nr(8)
14596 .kr(1)
14597 .sr(1)
14598 .m(m)
14599 .n(n)
14600 .k(4)
14601 .iterations(1)
14602 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128);
14603 }
14604 }
14605 }
14606
14607 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD128, k_eq_4_subtile_m) {
14608 TEST_REQUIRES_ARM_NEON_FMA;
14609 for (uint32_t m = 1; m <= 6; m++) {
14610 GemmMicrokernelTester()
14611 .mr(6)
14612 .nr(8)
14613 .kr(1)
14614 .sr(1)
14615 .m(m)
14616 .n(8)
14617 .k(4)
14618 .iterations(1)
14619 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128);
14620 }
14621 }
14622
14623 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD128, k_eq_4_subtile_n) {
14624 TEST_REQUIRES_ARM_NEON_FMA;
14625 for (uint32_t n = 1; n <= 8; n++) {
14626 GemmMicrokernelTester()
14627 .mr(6)
14628 .nr(8)
14629 .kr(1)
14630 .sr(1)
14631 .m(6)
14632 .n(n)
14633 .k(4)
14634 .iterations(1)
14635 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128);
14636 }
14637 }
14638
14639 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD128, k_lt_4) {
14640 TEST_REQUIRES_ARM_NEON_FMA;
14641 for (size_t k = 1; k < 4; k++) {
14642 GemmMicrokernelTester()
14643 .mr(6)
14644 .nr(8)
14645 .kr(1)
14646 .sr(1)
14647 .m(6)
14648 .n(8)
14649 .k(k)
14650 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128);
14651 }
14652 }
14653
14654 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD128, k_lt_4_subtile) {
14655 TEST_REQUIRES_ARM_NEON_FMA;
14656 for (size_t k = 1; k < 4; k++) {
14657 for (uint32_t m = 1; m <= 6; m++) {
14658 for (uint32_t n = 1; n <= 8; n++) {
14659 GemmMicrokernelTester()
14660 .mr(6)
14661 .nr(8)
14662 .kr(1)
14663 .sr(1)
14664 .m(m)
14665 .n(n)
14666 .k(k)
14667 .iterations(1)
14668 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128);
14669 }
14670 }
14671 }
14672 }
14673
14674 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD128, k_gt_4) {
14675 TEST_REQUIRES_ARM_NEON_FMA;
14676 for (size_t k = 5; k < 8; k++) {
14677 GemmMicrokernelTester()
14678 .mr(6)
14679 .nr(8)
14680 .kr(1)
14681 .sr(1)
14682 .m(6)
14683 .n(8)
14684 .k(k)
14685 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128);
14686 }
14687 }
14688
14689 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD128, k_gt_4_subtile) {
14690 TEST_REQUIRES_ARM_NEON_FMA;
14691 for (size_t k = 5; k < 8; k++) {
14692 for (uint32_t m = 1; m <= 6; m++) {
14693 for (uint32_t n = 1; n <= 8; n++) {
14694 GemmMicrokernelTester()
14695 .mr(6)
14696 .nr(8)
14697 .kr(1)
14698 .sr(1)
14699 .m(m)
14700 .n(n)
14701 .k(k)
14702 .iterations(1)
14703 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128);
14704 }
14705 }
14706 }
14707 }
14708
14709 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD128, k_div_4) {
14710 TEST_REQUIRES_ARM_NEON_FMA;
14711 for (size_t k = 8; k <= 40; k += 4) {
14712 GemmMicrokernelTester()
14713 .mr(6)
14714 .nr(8)
14715 .kr(1)
14716 .sr(1)
14717 .m(6)
14718 .n(8)
14719 .k(k)
14720 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128);
14721 }
14722 }
14723
14724 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD128, k_div_4_subtile) {
14725 TEST_REQUIRES_ARM_NEON_FMA;
14726 for (size_t k = 8; k <= 40; k += 4) {
14727 for (uint32_t m = 1; m <= 6; m++) {
14728 for (uint32_t n = 1; n <= 8; n++) {
14729 GemmMicrokernelTester()
14730 .mr(6)
14731 .nr(8)
14732 .kr(1)
14733 .sr(1)
14734 .m(m)
14735 .n(n)
14736 .k(k)
14737 .iterations(1)
14738 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128);
14739 }
14740 }
14741 }
14742 }
14743
14744 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD128, n_gt_8) {
14745 TEST_REQUIRES_ARM_NEON_FMA;
14746 for (uint32_t n = 9; n < 16; n++) {
14747 for (size_t k = 1; k <= 20; k += 5) {
14748 GemmMicrokernelTester()
14749 .mr(6)
14750 .nr(8)
14751 .kr(1)
14752 .sr(1)
14753 .m(6)
14754 .n(8)
14755 .k(k)
14756 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128);
14757 }
14758 }
14759 }
14760
14761 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD128, n_gt_8_strided_cn) {
14762 TEST_REQUIRES_ARM_NEON_FMA;
14763 for (uint32_t n = 9; n < 16; n++) {
14764 for (size_t k = 1; k <= 20; k += 5) {
14765 GemmMicrokernelTester()
14766 .mr(6)
14767 .nr(8)
14768 .kr(1)
14769 .sr(1)
14770 .m(6)
14771 .n(8)
14772 .k(k)
14773 .cn_stride(11)
14774 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128);
14775 }
14776 }
14777 }
14778
14779 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD128, n_gt_8_subtile) {
14780 TEST_REQUIRES_ARM_NEON_FMA;
14781 for (uint32_t n = 9; n < 16; n++) {
14782 for (size_t k = 1; k <= 20; k += 5) {
14783 for (uint32_t m = 1; m <= 6; m++) {
14784 GemmMicrokernelTester()
14785 .mr(6)
14786 .nr(8)
14787 .kr(1)
14788 .sr(1)
14789 .m(m)
14790 .n(n)
14791 .k(k)
14792 .iterations(1)
14793 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128);
14794 }
14795 }
14796 }
14797 }
14798
14799 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD128, n_div_8) {
14800 TEST_REQUIRES_ARM_NEON_FMA;
14801 for (uint32_t n = 16; n <= 24; n += 8) {
14802 for (size_t k = 1; k <= 20; k += 5) {
14803 GemmMicrokernelTester()
14804 .mr(6)
14805 .nr(8)
14806 .kr(1)
14807 .sr(1)
14808 .m(6)
14809 .n(8)
14810 .k(k)
14811 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128);
14812 }
14813 }
14814 }
14815
14816 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD128, n_div_8_strided_cn) {
14817 TEST_REQUIRES_ARM_NEON_FMA;
14818 for (uint32_t n = 16; n <= 24; n += 8) {
14819 for (size_t k = 1; k <= 20; k += 5) {
14820 GemmMicrokernelTester()
14821 .mr(6)
14822 .nr(8)
14823 .kr(1)
14824 .sr(1)
14825 .m(6)
14826 .n(n)
14827 .k(k)
14828 .cn_stride(11)
14829 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128);
14830 }
14831 }
14832 }
14833
14834 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD128, n_div_8_subtile) {
14835 TEST_REQUIRES_ARM_NEON_FMA;
14836 for (uint32_t n = 16; n <= 24; n += 8) {
14837 for (size_t k = 1; k <= 20; k += 5) {
14838 for (uint32_t m = 1; m <= 6; m++) {
14839 GemmMicrokernelTester()
14840 .mr(6)
14841 .nr(8)
14842 .kr(1)
14843 .sr(1)
14844 .m(m)
14845 .n(n)
14846 .k(k)
14847 .iterations(1)
14848 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128);
14849 }
14850 }
14851 }
14852 }
14853
14854 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD128, small_kernel) {
14855 TEST_REQUIRES_ARM_NEON_FMA;
14856 for (size_t k = 1; k <= 20; k += 5) {
14857 GemmMicrokernelTester()
14858 .mr(6)
14859 .nr(8)
14860 .kr(1)
14861 .sr(1)
14862 .m(6)
14863 .n(8)
14864 .k(k)
14865 .ks(3)
14866 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128);
14867 }
14868 }
14869
14870 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD128, small_kernel_subtile) {
14871 TEST_REQUIRES_ARM_NEON_FMA;
14872 for (size_t k = 1; k <= 20; k += 5) {
14873 for (uint32_t m = 1; m <= 6; m++) {
14874 for (uint32_t n = 1; n <= 8; n++) {
14875 GemmMicrokernelTester()
14876 .mr(6)
14877 .nr(8)
14878 .kr(1)
14879 .sr(1)
14880 .m(m)
14881 .n(n)
14882 .k(k)
14883 .ks(3)
14884 .iterations(1)
14885 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128);
14886 }
14887 }
14888 }
14889 }
14890
14891 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD128, n_gt_8_small_kernel) {
14892 TEST_REQUIRES_ARM_NEON_FMA;
14893 for (uint32_t n = 9; n < 16; n++) {
14894 for (size_t k = 1; k <= 20; k += 5) {
14895 GemmMicrokernelTester()
14896 .mr(6)
14897 .nr(8)
14898 .kr(1)
14899 .sr(1)
14900 .m(6)
14901 .n(8)
14902 .k(k)
14903 .ks(3)
14904 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128);
14905 }
14906 }
14907 }
14908
14909 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD128, n_div_8_small_kernel) {
14910 TEST_REQUIRES_ARM_NEON_FMA;
14911 for (uint32_t n = 16; n <= 24; n += 8) {
14912 for (size_t k = 1; k <= 20; k += 5) {
14913 GemmMicrokernelTester()
14914 .mr(6)
14915 .nr(8)
14916 .kr(1)
14917 .sr(1)
14918 .m(6)
14919 .n(8)
14920 .k(k)
14921 .ks(3)
14922 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128);
14923 }
14924 }
14925 }
14926
14927 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD128, strided_cm_subtile) {
14928 TEST_REQUIRES_ARM_NEON_FMA;
14929 for (size_t k = 1; k <= 20; k += 5) {
14930 for (uint32_t m = 1; m <= 6; m++) {
14931 for (uint32_t n = 1; n <= 8; n++) {
14932 GemmMicrokernelTester()
14933 .mr(6)
14934 .nr(8)
14935 .kr(1)
14936 .sr(1)
14937 .m(m)
14938 .n(n)
14939 .k(k)
14940 .cm_stride(11)
14941 .iterations(1)
14942 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128);
14943 }
14944 }
14945 }
14946 }
14947
14948 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD128, a_offset) {
14949 TEST_REQUIRES_ARM_NEON_FMA;
14950 for (size_t k = 1; k <= 20; k += 5) {
14951 GemmMicrokernelTester()
14952 .mr(6)
14953 .nr(8)
14954 .kr(1)
14955 .sr(1)
14956 .m(6)
14957 .n(8)
14958 .k(k)
14959 .ks(3)
14960 .a_offset(127)
14961 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128);
14962 }
14963 }
14964
14965 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD128, zero) {
14966 TEST_REQUIRES_ARM_NEON_FMA;
14967 for (uint32_t mz = 0; mz < 6; mz++) {
14968 for (size_t k = 1; k <= 20; k += 5) {
14969 GemmMicrokernelTester()
14970 .mr(6)
14971 .nr(8)
14972 .kr(1)
14973 .sr(1)
14974 .m(6)
14975 .n(8)
14976 .k(k)
14977 .ks(3)
14978 .a_offset(127)
14979 .zero_index(mz)
14980 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128);
14981 }
14982 }
14983 }
14984
14985 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD128, qmin) {
14986 TEST_REQUIRES_ARM_NEON_FMA;
14987 GemmMicrokernelTester()
14988 .mr(6)
14989 .nr(8)
14990 .kr(1)
14991 .sr(1)
14992 .m(6)
14993 .n(8)
14994 .k(4)
14995 .qmin(128)
14996 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128);
14997 }
14998
14999 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD128, qmax) {
15000 TEST_REQUIRES_ARM_NEON_FMA;
15001 GemmMicrokernelTester()
15002 .mr(6)
15003 .nr(8)
15004 .kr(1)
15005 .sr(1)
15006 .m(6)
15007 .n(8)
15008 .k(4)
15009 .qmax(128)
15010 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128);
15011 }
15012
15013 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD128, strided_cm) {
15014 TEST_REQUIRES_ARM_NEON_FMA;
15015 GemmMicrokernelTester()
15016 .mr(6)
15017 .nr(8)
15018 .kr(1)
15019 .sr(1)
15020 .m(6)
15021 .n(8)
15022 .k(4)
15023 .cm_stride(11)
15024 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128);
15025 }
15026#endif // XNN_ARCH_ARM64
15027
15028
Frank Barcharddf06d802019-11-20 15:53:46 -080015029#if XNN_ARCH_ARM || XNN_ARCH_ARM64
Frank Barchard5243bb02019-11-22 16:37:50 -080015030 TEST(F32_IGEMM_1X8__NEON_DUP_LD64, k_eq_2) {
15031 TEST_REQUIRES_ARM_NEON;
15032 GemmMicrokernelTester()
15033 .mr(1)
15034 .nr(8)
15035 .kr(1)
15036 .sr(1)
15037 .m(1)
15038 .n(8)
15039 .k(2)
15040 .Test(xnn_f32_igemm_ukernel_1x8__neon_dup_ld64);
15041 }
15042
15043 TEST(F32_IGEMM_1X8__NEON_DUP_LD64, strided_cn) {
15044 TEST_REQUIRES_ARM_NEON;
15045 GemmMicrokernelTester()
15046 .mr(1)
15047 .nr(8)
15048 .kr(1)
15049 .sr(1)
15050 .m(1)
15051 .n(8)
15052 .k(2)
15053 .cn_stride(11)
15054 .Test(xnn_f32_igemm_ukernel_1x8__neon_dup_ld64);
15055 }
15056
15057 TEST(F32_IGEMM_1X8__NEON_DUP_LD64, k_eq_2_subtile) {
15058 TEST_REQUIRES_ARM_NEON;
15059 for (uint32_t m = 1; m <= 1; m++) {
15060 for (uint32_t n = 1; n <= 8; n++) {
15061 GemmMicrokernelTester()
15062 .mr(1)
15063 .nr(8)
15064 .kr(1)
15065 .sr(1)
15066 .m(m)
15067 .n(n)
15068 .k(2)
15069 .iterations(1)
15070 .Test(xnn_f32_igemm_ukernel_1x8__neon_dup_ld64);
15071 }
15072 }
15073 }
15074
15075 TEST(F32_IGEMM_1X8__NEON_DUP_LD64, k_eq_2_subtile_m) {
15076 TEST_REQUIRES_ARM_NEON;
15077 for (uint32_t m = 1; m <= 1; m++) {
15078 GemmMicrokernelTester()
15079 .mr(1)
15080 .nr(8)
15081 .kr(1)
15082 .sr(1)
15083 .m(m)
15084 .n(8)
15085 .k(2)
15086 .iterations(1)
15087 .Test(xnn_f32_igemm_ukernel_1x8__neon_dup_ld64);
15088 }
15089 }
15090
15091 TEST(F32_IGEMM_1X8__NEON_DUP_LD64, k_eq_2_subtile_n) {
15092 TEST_REQUIRES_ARM_NEON;
15093 for (uint32_t n = 1; n <= 8; n++) {
15094 GemmMicrokernelTester()
15095 .mr(1)
15096 .nr(8)
15097 .kr(1)
15098 .sr(1)
15099 .m(1)
15100 .n(n)
15101 .k(2)
15102 .iterations(1)
15103 .Test(xnn_f32_igemm_ukernel_1x8__neon_dup_ld64);
15104 }
15105 }
15106
15107 TEST(F32_IGEMM_1X8__NEON_DUP_LD64, k_lt_2) {
15108 TEST_REQUIRES_ARM_NEON;
15109 for (size_t k = 1; k < 2; k++) {
15110 GemmMicrokernelTester()
15111 .mr(1)
15112 .nr(8)
15113 .kr(1)
15114 .sr(1)
15115 .m(1)
15116 .n(8)
15117 .k(k)
15118 .Test(xnn_f32_igemm_ukernel_1x8__neon_dup_ld64);
15119 }
15120 }
15121
15122 TEST(F32_IGEMM_1X8__NEON_DUP_LD64, k_lt_2_subtile) {
15123 TEST_REQUIRES_ARM_NEON;
15124 for (size_t k = 1; k < 2; k++) {
15125 for (uint32_t m = 1; m <= 1; m++) {
15126 for (uint32_t n = 1; n <= 8; n++) {
15127 GemmMicrokernelTester()
15128 .mr(1)
15129 .nr(8)
15130 .kr(1)
15131 .sr(1)
15132 .m(m)
15133 .n(n)
15134 .k(k)
15135 .iterations(1)
15136 .Test(xnn_f32_igemm_ukernel_1x8__neon_dup_ld64);
15137 }
15138 }
15139 }
15140 }
15141
15142 TEST(F32_IGEMM_1X8__NEON_DUP_LD64, k_gt_2) {
15143 TEST_REQUIRES_ARM_NEON;
15144 for (size_t k = 3; k < 4; k++) {
15145 GemmMicrokernelTester()
15146 .mr(1)
15147 .nr(8)
15148 .kr(1)
15149 .sr(1)
15150 .m(1)
15151 .n(8)
15152 .k(k)
15153 .Test(xnn_f32_igemm_ukernel_1x8__neon_dup_ld64);
15154 }
15155 }
15156
15157 TEST(F32_IGEMM_1X8__NEON_DUP_LD64, k_gt_2_subtile) {
15158 TEST_REQUIRES_ARM_NEON;
15159 for (size_t k = 3; k < 4; k++) {
15160 for (uint32_t m = 1; m <= 1; m++) {
15161 for (uint32_t n = 1; n <= 8; n++) {
15162 GemmMicrokernelTester()
15163 .mr(1)
15164 .nr(8)
15165 .kr(1)
15166 .sr(1)
15167 .m(m)
15168 .n(n)
15169 .k(k)
15170 .iterations(1)
15171 .Test(xnn_f32_igemm_ukernel_1x8__neon_dup_ld64);
15172 }
15173 }
15174 }
15175 }
15176
15177 TEST(F32_IGEMM_1X8__NEON_DUP_LD64, k_div_2) {
15178 TEST_REQUIRES_ARM_NEON;
15179 for (size_t k = 4; k <= 20; k += 2) {
15180 GemmMicrokernelTester()
15181 .mr(1)
15182 .nr(8)
15183 .kr(1)
15184 .sr(1)
15185 .m(1)
15186 .n(8)
15187 .k(k)
15188 .Test(xnn_f32_igemm_ukernel_1x8__neon_dup_ld64);
15189 }
15190 }
15191
15192 TEST(F32_IGEMM_1X8__NEON_DUP_LD64, k_div_2_subtile) {
15193 TEST_REQUIRES_ARM_NEON;
15194 for (size_t k = 4; k <= 20; k += 2) {
15195 for (uint32_t m = 1; m <= 1; m++) {
15196 for (uint32_t n = 1; n <= 8; n++) {
15197 GemmMicrokernelTester()
15198 .mr(1)
15199 .nr(8)
15200 .kr(1)
15201 .sr(1)
15202 .m(m)
15203 .n(n)
15204 .k(k)
15205 .iterations(1)
15206 .Test(xnn_f32_igemm_ukernel_1x8__neon_dup_ld64);
15207 }
15208 }
15209 }
15210 }
15211
15212 TEST(F32_IGEMM_1X8__NEON_DUP_LD64, n_gt_8) {
15213 TEST_REQUIRES_ARM_NEON;
15214 for (uint32_t n = 9; n < 16; n++) {
15215 for (size_t k = 1; k <= 10; k += 3) {
15216 GemmMicrokernelTester()
15217 .mr(1)
15218 .nr(8)
15219 .kr(1)
15220 .sr(1)
15221 .m(1)
15222 .n(8)
15223 .k(k)
15224 .Test(xnn_f32_igemm_ukernel_1x8__neon_dup_ld64);
15225 }
15226 }
15227 }
15228
15229 TEST(F32_IGEMM_1X8__NEON_DUP_LD64, n_gt_8_strided_cn) {
15230 TEST_REQUIRES_ARM_NEON;
15231 for (uint32_t n = 9; n < 16; n++) {
15232 for (size_t k = 1; k <= 10; k += 3) {
15233 GemmMicrokernelTester()
15234 .mr(1)
15235 .nr(8)
15236 .kr(1)
15237 .sr(1)
15238 .m(1)
15239 .n(8)
15240 .k(k)
15241 .cn_stride(11)
15242 .Test(xnn_f32_igemm_ukernel_1x8__neon_dup_ld64);
15243 }
15244 }
15245 }
15246
15247 TEST(F32_IGEMM_1X8__NEON_DUP_LD64, n_gt_8_subtile) {
15248 TEST_REQUIRES_ARM_NEON;
15249 for (uint32_t n = 9; n < 16; n++) {
15250 for (size_t k = 1; k <= 10; k += 3) {
15251 for (uint32_t m = 1; m <= 1; m++) {
15252 GemmMicrokernelTester()
15253 .mr(1)
15254 .nr(8)
15255 .kr(1)
15256 .sr(1)
15257 .m(m)
15258 .n(n)
15259 .k(k)
15260 .iterations(1)
15261 .Test(xnn_f32_igemm_ukernel_1x8__neon_dup_ld64);
15262 }
15263 }
15264 }
15265 }
15266
15267 TEST(F32_IGEMM_1X8__NEON_DUP_LD64, n_div_8) {
15268 TEST_REQUIRES_ARM_NEON;
15269 for (uint32_t n = 16; n <= 24; n += 8) {
15270 for (size_t k = 1; k <= 10; k += 3) {
15271 GemmMicrokernelTester()
15272 .mr(1)
15273 .nr(8)
15274 .kr(1)
15275 .sr(1)
15276 .m(1)
15277 .n(8)
15278 .k(k)
15279 .Test(xnn_f32_igemm_ukernel_1x8__neon_dup_ld64);
15280 }
15281 }
15282 }
15283
15284 TEST(F32_IGEMM_1X8__NEON_DUP_LD64, n_div_8_strided_cn) {
15285 TEST_REQUIRES_ARM_NEON;
15286 for (uint32_t n = 16; n <= 24; n += 8) {
15287 for (size_t k = 1; k <= 10; k += 3) {
15288 GemmMicrokernelTester()
15289 .mr(1)
15290 .nr(8)
15291 .kr(1)
15292 .sr(1)
15293 .m(1)
15294 .n(n)
15295 .k(k)
15296 .cn_stride(11)
15297 .Test(xnn_f32_igemm_ukernel_1x8__neon_dup_ld64);
15298 }
15299 }
15300 }
15301
15302 TEST(F32_IGEMM_1X8__NEON_DUP_LD64, n_div_8_subtile) {
15303 TEST_REQUIRES_ARM_NEON;
15304 for (uint32_t n = 16; n <= 24; n += 8) {
15305 for (size_t k = 1; k <= 10; k += 3) {
15306 for (uint32_t m = 1; m <= 1; m++) {
15307 GemmMicrokernelTester()
15308 .mr(1)
15309 .nr(8)
15310 .kr(1)
15311 .sr(1)
15312 .m(m)
15313 .n(n)
15314 .k(k)
15315 .iterations(1)
15316 .Test(xnn_f32_igemm_ukernel_1x8__neon_dup_ld64);
15317 }
15318 }
15319 }
15320 }
15321
15322 TEST(F32_IGEMM_1X8__NEON_DUP_LD64, small_kernel) {
15323 TEST_REQUIRES_ARM_NEON;
15324 for (size_t k = 1; k <= 10; k += 3) {
15325 GemmMicrokernelTester()
15326 .mr(1)
15327 .nr(8)
15328 .kr(1)
15329 .sr(1)
15330 .m(1)
15331 .n(8)
15332 .k(k)
15333 .ks(3)
15334 .Test(xnn_f32_igemm_ukernel_1x8__neon_dup_ld64);
15335 }
15336 }
15337
15338 TEST(F32_IGEMM_1X8__NEON_DUP_LD64, small_kernel_subtile) {
15339 TEST_REQUIRES_ARM_NEON;
15340 for (size_t k = 1; k <= 10; k += 3) {
15341 for (uint32_t m = 1; m <= 1; m++) {
15342 for (uint32_t n = 1; n <= 8; n++) {
15343 GemmMicrokernelTester()
15344 .mr(1)
15345 .nr(8)
15346 .kr(1)
15347 .sr(1)
15348 .m(m)
15349 .n(n)
15350 .k(k)
15351 .ks(3)
15352 .iterations(1)
15353 .Test(xnn_f32_igemm_ukernel_1x8__neon_dup_ld64);
15354 }
15355 }
15356 }
15357 }
15358
15359 TEST(F32_IGEMM_1X8__NEON_DUP_LD64, n_gt_8_small_kernel) {
15360 TEST_REQUIRES_ARM_NEON;
15361 for (uint32_t n = 9; n < 16; n++) {
15362 for (size_t k = 1; k <= 10; k += 3) {
15363 GemmMicrokernelTester()
15364 .mr(1)
15365 .nr(8)
15366 .kr(1)
15367 .sr(1)
15368 .m(1)
15369 .n(8)
15370 .k(k)
15371 .ks(3)
15372 .Test(xnn_f32_igemm_ukernel_1x8__neon_dup_ld64);
15373 }
15374 }
15375 }
15376
15377 TEST(F32_IGEMM_1X8__NEON_DUP_LD64, n_div_8_small_kernel) {
15378 TEST_REQUIRES_ARM_NEON;
15379 for (uint32_t n = 16; n <= 24; n += 8) {
15380 for (size_t k = 1; k <= 10; k += 3) {
15381 GemmMicrokernelTester()
15382 .mr(1)
15383 .nr(8)
15384 .kr(1)
15385 .sr(1)
15386 .m(1)
15387 .n(8)
15388 .k(k)
15389 .ks(3)
15390 .Test(xnn_f32_igemm_ukernel_1x8__neon_dup_ld64);
15391 }
15392 }
15393 }
15394
15395 TEST(F32_IGEMM_1X8__NEON_DUP_LD64, strided_cm_subtile) {
15396 TEST_REQUIRES_ARM_NEON;
15397 for (size_t k = 1; k <= 10; k += 3) {
15398 for (uint32_t m = 1; m <= 1; m++) {
15399 for (uint32_t n = 1; n <= 8; n++) {
15400 GemmMicrokernelTester()
15401 .mr(1)
15402 .nr(8)
15403 .kr(1)
15404 .sr(1)
15405 .m(m)
15406 .n(n)
15407 .k(k)
15408 .cm_stride(11)
15409 .iterations(1)
15410 .Test(xnn_f32_igemm_ukernel_1x8__neon_dup_ld64);
15411 }
15412 }
15413 }
15414 }
15415
15416 TEST(F32_IGEMM_1X8__NEON_DUP_LD64, a_offset) {
15417 TEST_REQUIRES_ARM_NEON;
15418 for (size_t k = 1; k <= 10; k += 3) {
15419 GemmMicrokernelTester()
15420 .mr(1)
15421 .nr(8)
15422 .kr(1)
15423 .sr(1)
15424 .m(1)
15425 .n(8)
15426 .k(k)
15427 .ks(3)
15428 .a_offset(13)
15429 .Test(xnn_f32_igemm_ukernel_1x8__neon_dup_ld64);
15430 }
15431 }
15432
15433 TEST(F32_IGEMM_1X8__NEON_DUP_LD64, zero) {
15434 TEST_REQUIRES_ARM_NEON;
15435 for (uint32_t mz = 0; mz < 1; mz++) {
15436 for (size_t k = 1; k <= 10; k += 3) {
15437 GemmMicrokernelTester()
15438 .mr(1)
15439 .nr(8)
15440 .kr(1)
15441 .sr(1)
15442 .m(1)
15443 .n(8)
15444 .k(k)
15445 .ks(3)
15446 .a_offset(13)
15447 .zero_index(mz)
15448 .Test(xnn_f32_igemm_ukernel_1x8__neon_dup_ld64);
15449 }
15450 }
15451 }
15452
15453 TEST(F32_IGEMM_1X8__NEON_DUP_LD64, qmin) {
15454 TEST_REQUIRES_ARM_NEON;
15455 GemmMicrokernelTester()
15456 .mr(1)
15457 .nr(8)
15458 .kr(1)
15459 .sr(1)
15460 .m(1)
15461 .n(8)
15462 .k(2)
15463 .qmin(128)
15464 .Test(xnn_f32_igemm_ukernel_1x8__neon_dup_ld64);
15465 }
15466
15467 TEST(F32_IGEMM_1X8__NEON_DUP_LD64, qmax) {
15468 TEST_REQUIRES_ARM_NEON;
15469 GemmMicrokernelTester()
15470 .mr(1)
15471 .nr(8)
15472 .kr(1)
15473 .sr(1)
15474 .m(1)
15475 .n(8)
15476 .k(2)
15477 .qmax(128)
15478 .Test(xnn_f32_igemm_ukernel_1x8__neon_dup_ld64);
15479 }
15480
15481 TEST(F32_IGEMM_1X8__NEON_DUP_LD64, strided_cm) {
15482 TEST_REQUIRES_ARM_NEON;
15483 GemmMicrokernelTester()
15484 .mr(1)
15485 .nr(8)
15486 .kr(1)
15487 .sr(1)
15488 .m(1)
15489 .n(8)
15490 .k(2)
15491 .cm_stride(11)
15492 .Test(xnn_f32_igemm_ukernel_1x8__neon_dup_ld64);
15493 }
15494#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
15495
15496
15497#if XNN_ARCH_ARM || XNN_ARCH_ARM64
15498 TEST(F32_IGEMM_4X8__NEON_DUP_LD128, k_eq_4) {
15499 TEST_REQUIRES_ARM_NEON;
15500 GemmMicrokernelTester()
15501 .mr(4)
15502 .nr(8)
15503 .kr(1)
15504 .sr(1)
15505 .m(4)
15506 .n(8)
15507 .k(4)
15508 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld128);
15509 }
15510
15511 TEST(F32_IGEMM_4X8__NEON_DUP_LD128, strided_cn) {
15512 TEST_REQUIRES_ARM_NEON;
15513 GemmMicrokernelTester()
15514 .mr(4)
15515 .nr(8)
15516 .kr(1)
15517 .sr(1)
15518 .m(4)
15519 .n(8)
15520 .k(4)
15521 .cn_stride(11)
15522 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld128);
15523 }
15524
15525 TEST(F32_IGEMM_4X8__NEON_DUP_LD128, k_eq_4_subtile) {
15526 TEST_REQUIRES_ARM_NEON;
15527 for (uint32_t m = 1; m <= 4; m++) {
15528 for (uint32_t n = 1; n <= 8; n++) {
15529 GemmMicrokernelTester()
15530 .mr(4)
15531 .nr(8)
15532 .kr(1)
15533 .sr(1)
15534 .m(m)
15535 .n(n)
15536 .k(4)
15537 .iterations(1)
15538 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld128);
15539 }
15540 }
15541 }
15542
15543 TEST(F32_IGEMM_4X8__NEON_DUP_LD128, k_eq_4_subtile_m) {
15544 TEST_REQUIRES_ARM_NEON;
15545 for (uint32_t m = 1; m <= 4; m++) {
15546 GemmMicrokernelTester()
15547 .mr(4)
15548 .nr(8)
15549 .kr(1)
15550 .sr(1)
15551 .m(m)
15552 .n(8)
15553 .k(4)
15554 .iterations(1)
15555 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld128);
15556 }
15557 }
15558
15559 TEST(F32_IGEMM_4X8__NEON_DUP_LD128, k_eq_4_subtile_n) {
15560 TEST_REQUIRES_ARM_NEON;
15561 for (uint32_t n = 1; n <= 8; n++) {
15562 GemmMicrokernelTester()
15563 .mr(4)
15564 .nr(8)
15565 .kr(1)
15566 .sr(1)
15567 .m(4)
15568 .n(n)
15569 .k(4)
15570 .iterations(1)
15571 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld128);
15572 }
15573 }
15574
15575 TEST(F32_IGEMM_4X8__NEON_DUP_LD128, k_lt_4) {
15576 TEST_REQUIRES_ARM_NEON;
15577 for (size_t k = 1; k < 4; k++) {
15578 GemmMicrokernelTester()
15579 .mr(4)
15580 .nr(8)
15581 .kr(1)
15582 .sr(1)
15583 .m(4)
15584 .n(8)
15585 .k(k)
15586 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld128);
15587 }
15588 }
15589
15590 TEST(F32_IGEMM_4X8__NEON_DUP_LD128, k_lt_4_subtile) {
15591 TEST_REQUIRES_ARM_NEON;
15592 for (size_t k = 1; k < 4; k++) {
15593 for (uint32_t m = 1; m <= 4; m++) {
15594 for (uint32_t n = 1; n <= 8; n++) {
15595 GemmMicrokernelTester()
15596 .mr(4)
15597 .nr(8)
15598 .kr(1)
15599 .sr(1)
15600 .m(m)
15601 .n(n)
15602 .k(k)
15603 .iterations(1)
15604 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld128);
15605 }
15606 }
15607 }
15608 }
15609
15610 TEST(F32_IGEMM_4X8__NEON_DUP_LD128, k_gt_4) {
15611 TEST_REQUIRES_ARM_NEON;
15612 for (size_t k = 5; k < 8; k++) {
15613 GemmMicrokernelTester()
15614 .mr(4)
15615 .nr(8)
15616 .kr(1)
15617 .sr(1)
15618 .m(4)
15619 .n(8)
15620 .k(k)
15621 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld128);
15622 }
15623 }
15624
15625 TEST(F32_IGEMM_4X8__NEON_DUP_LD128, k_gt_4_subtile) {
15626 TEST_REQUIRES_ARM_NEON;
15627 for (size_t k = 5; k < 8; k++) {
15628 for (uint32_t m = 1; m <= 4; m++) {
15629 for (uint32_t n = 1; n <= 8; n++) {
15630 GemmMicrokernelTester()
15631 .mr(4)
15632 .nr(8)
15633 .kr(1)
15634 .sr(1)
15635 .m(m)
15636 .n(n)
15637 .k(k)
15638 .iterations(1)
15639 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld128);
15640 }
15641 }
15642 }
15643 }
15644
15645 TEST(F32_IGEMM_4X8__NEON_DUP_LD128, k_div_4) {
15646 TEST_REQUIRES_ARM_NEON;
15647 for (size_t k = 8; k <= 40; k += 4) {
15648 GemmMicrokernelTester()
15649 .mr(4)
15650 .nr(8)
15651 .kr(1)
15652 .sr(1)
15653 .m(4)
15654 .n(8)
15655 .k(k)
15656 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld128);
15657 }
15658 }
15659
15660 TEST(F32_IGEMM_4X8__NEON_DUP_LD128, k_div_4_subtile) {
15661 TEST_REQUIRES_ARM_NEON;
15662 for (size_t k = 8; k <= 40; k += 4) {
15663 for (uint32_t m = 1; m <= 4; m++) {
15664 for (uint32_t n = 1; n <= 8; n++) {
15665 GemmMicrokernelTester()
15666 .mr(4)
15667 .nr(8)
15668 .kr(1)
15669 .sr(1)
15670 .m(m)
15671 .n(n)
15672 .k(k)
15673 .iterations(1)
15674 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld128);
15675 }
15676 }
15677 }
15678 }
15679
15680 TEST(F32_IGEMM_4X8__NEON_DUP_LD128, n_gt_8) {
15681 TEST_REQUIRES_ARM_NEON;
15682 for (uint32_t n = 9; n < 16; n++) {
15683 for (size_t k = 1; k <= 20; k += 5) {
15684 GemmMicrokernelTester()
15685 .mr(4)
15686 .nr(8)
15687 .kr(1)
15688 .sr(1)
15689 .m(4)
15690 .n(8)
15691 .k(k)
15692 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld128);
15693 }
15694 }
15695 }
15696
15697 TEST(F32_IGEMM_4X8__NEON_DUP_LD128, n_gt_8_strided_cn) {
15698 TEST_REQUIRES_ARM_NEON;
15699 for (uint32_t n = 9; n < 16; n++) {
15700 for (size_t k = 1; k <= 20; k += 5) {
15701 GemmMicrokernelTester()
15702 .mr(4)
15703 .nr(8)
15704 .kr(1)
15705 .sr(1)
15706 .m(4)
15707 .n(8)
15708 .k(k)
15709 .cn_stride(11)
15710 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld128);
15711 }
15712 }
15713 }
15714
15715 TEST(F32_IGEMM_4X8__NEON_DUP_LD128, n_gt_8_subtile) {
15716 TEST_REQUIRES_ARM_NEON;
15717 for (uint32_t n = 9; n < 16; n++) {
15718 for (size_t k = 1; k <= 20; k += 5) {
15719 for (uint32_t m = 1; m <= 4; m++) {
15720 GemmMicrokernelTester()
15721 .mr(4)
15722 .nr(8)
15723 .kr(1)
15724 .sr(1)
15725 .m(m)
15726 .n(n)
15727 .k(k)
15728 .iterations(1)
15729 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld128);
15730 }
15731 }
15732 }
15733 }
15734
15735 TEST(F32_IGEMM_4X8__NEON_DUP_LD128, n_div_8) {
15736 TEST_REQUIRES_ARM_NEON;
15737 for (uint32_t n = 16; n <= 24; n += 8) {
15738 for (size_t k = 1; k <= 20; k += 5) {
15739 GemmMicrokernelTester()
15740 .mr(4)
15741 .nr(8)
15742 .kr(1)
15743 .sr(1)
15744 .m(4)
15745 .n(8)
15746 .k(k)
15747 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld128);
15748 }
15749 }
15750 }
15751
15752 TEST(F32_IGEMM_4X8__NEON_DUP_LD128, n_div_8_strided_cn) {
15753 TEST_REQUIRES_ARM_NEON;
15754 for (uint32_t n = 16; n <= 24; n += 8) {
15755 for (size_t k = 1; k <= 20; k += 5) {
15756 GemmMicrokernelTester()
15757 .mr(4)
15758 .nr(8)
15759 .kr(1)
15760 .sr(1)
15761 .m(4)
15762 .n(n)
15763 .k(k)
15764 .cn_stride(11)
15765 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld128);
15766 }
15767 }
15768 }
15769
15770 TEST(F32_IGEMM_4X8__NEON_DUP_LD128, n_div_8_subtile) {
15771 TEST_REQUIRES_ARM_NEON;
15772 for (uint32_t n = 16; n <= 24; n += 8) {
15773 for (size_t k = 1; k <= 20; k += 5) {
15774 for (uint32_t m = 1; m <= 4; m++) {
15775 GemmMicrokernelTester()
15776 .mr(4)
15777 .nr(8)
15778 .kr(1)
15779 .sr(1)
15780 .m(m)
15781 .n(n)
15782 .k(k)
15783 .iterations(1)
15784 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld128);
15785 }
15786 }
15787 }
15788 }
15789
15790 TEST(F32_IGEMM_4X8__NEON_DUP_LD128, small_kernel) {
15791 TEST_REQUIRES_ARM_NEON;
15792 for (size_t k = 1; k <= 20; k += 5) {
15793 GemmMicrokernelTester()
15794 .mr(4)
15795 .nr(8)
15796 .kr(1)
15797 .sr(1)
15798 .m(4)
15799 .n(8)
15800 .k(k)
15801 .ks(3)
15802 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld128);
15803 }
15804 }
15805
15806 TEST(F32_IGEMM_4X8__NEON_DUP_LD128, small_kernel_subtile) {
15807 TEST_REQUIRES_ARM_NEON;
15808 for (size_t k = 1; k <= 20; k += 5) {
15809 for (uint32_t m = 1; m <= 4; m++) {
15810 for (uint32_t n = 1; n <= 8; n++) {
15811 GemmMicrokernelTester()
15812 .mr(4)
15813 .nr(8)
15814 .kr(1)
15815 .sr(1)
15816 .m(m)
15817 .n(n)
15818 .k(k)
15819 .ks(3)
15820 .iterations(1)
15821 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld128);
15822 }
15823 }
15824 }
15825 }
15826
15827 TEST(F32_IGEMM_4X8__NEON_DUP_LD128, n_gt_8_small_kernel) {
15828 TEST_REQUIRES_ARM_NEON;
15829 for (uint32_t n = 9; n < 16; n++) {
15830 for (size_t k = 1; k <= 20; k += 5) {
15831 GemmMicrokernelTester()
15832 .mr(4)
15833 .nr(8)
15834 .kr(1)
15835 .sr(1)
15836 .m(4)
15837 .n(8)
15838 .k(k)
15839 .ks(3)
15840 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld128);
15841 }
15842 }
15843 }
15844
15845 TEST(F32_IGEMM_4X8__NEON_DUP_LD128, n_div_8_small_kernel) {
15846 TEST_REQUIRES_ARM_NEON;
15847 for (uint32_t n = 16; n <= 24; n += 8) {
15848 for (size_t k = 1; k <= 20; k += 5) {
15849 GemmMicrokernelTester()
15850 .mr(4)
15851 .nr(8)
15852 .kr(1)
15853 .sr(1)
15854 .m(4)
15855 .n(8)
15856 .k(k)
15857 .ks(3)
15858 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld128);
15859 }
15860 }
15861 }
15862
15863 TEST(F32_IGEMM_4X8__NEON_DUP_LD128, strided_cm_subtile) {
15864 TEST_REQUIRES_ARM_NEON;
15865 for (size_t k = 1; k <= 20; k += 5) {
15866 for (uint32_t m = 1; m <= 4; m++) {
15867 for (uint32_t n = 1; n <= 8; n++) {
15868 GemmMicrokernelTester()
15869 .mr(4)
15870 .nr(8)
15871 .kr(1)
15872 .sr(1)
15873 .m(m)
15874 .n(n)
15875 .k(k)
15876 .cm_stride(11)
15877 .iterations(1)
15878 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld128);
15879 }
15880 }
15881 }
15882 }
15883
15884 TEST(F32_IGEMM_4X8__NEON_DUP_LD128, a_offset) {
15885 TEST_REQUIRES_ARM_NEON;
15886 for (size_t k = 1; k <= 20; k += 5) {
15887 GemmMicrokernelTester()
15888 .mr(4)
15889 .nr(8)
15890 .kr(1)
15891 .sr(1)
15892 .m(4)
15893 .n(8)
15894 .k(k)
15895 .ks(3)
15896 .a_offset(83)
15897 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld128);
15898 }
15899 }
15900
15901 TEST(F32_IGEMM_4X8__NEON_DUP_LD128, zero) {
15902 TEST_REQUIRES_ARM_NEON;
15903 for (uint32_t mz = 0; mz < 4; mz++) {
15904 for (size_t k = 1; k <= 20; k += 5) {
15905 GemmMicrokernelTester()
15906 .mr(4)
15907 .nr(8)
15908 .kr(1)
15909 .sr(1)
15910 .m(4)
15911 .n(8)
15912 .k(k)
15913 .ks(3)
15914 .a_offset(83)
15915 .zero_index(mz)
15916 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld128);
15917 }
15918 }
15919 }
15920
15921 TEST(F32_IGEMM_4X8__NEON_DUP_LD128, qmin) {
15922 TEST_REQUIRES_ARM_NEON;
15923 GemmMicrokernelTester()
15924 .mr(4)
15925 .nr(8)
15926 .kr(1)
15927 .sr(1)
15928 .m(4)
15929 .n(8)
15930 .k(4)
15931 .qmin(128)
15932 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld128);
15933 }
15934
15935 TEST(F32_IGEMM_4X8__NEON_DUP_LD128, qmax) {
15936 TEST_REQUIRES_ARM_NEON;
15937 GemmMicrokernelTester()
15938 .mr(4)
15939 .nr(8)
15940 .kr(1)
15941 .sr(1)
15942 .m(4)
15943 .n(8)
15944 .k(4)
15945 .qmax(128)
15946 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld128);
15947 }
15948
15949 TEST(F32_IGEMM_4X8__NEON_DUP_LD128, strided_cm) {
15950 TEST_REQUIRES_ARM_NEON;
15951 GemmMicrokernelTester()
15952 .mr(4)
15953 .nr(8)
15954 .kr(1)
15955 .sr(1)
15956 .m(4)
15957 .n(8)
15958 .k(4)
15959 .cm_stride(11)
15960 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld128);
15961 }
15962#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
15963
15964
15965#if XNN_ARCH_ARM || XNN_ARCH_ARM64
15966 TEST(F32_IGEMM_4X8__NEON_DUP_LD64, k_eq_2) {
15967 TEST_REQUIRES_ARM_NEON;
15968 GemmMicrokernelTester()
15969 .mr(4)
15970 .nr(8)
15971 .kr(1)
15972 .sr(1)
15973 .m(4)
15974 .n(8)
15975 .k(2)
15976 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld64);
15977 }
15978
15979 TEST(F32_IGEMM_4X8__NEON_DUP_LD64, strided_cn) {
15980 TEST_REQUIRES_ARM_NEON;
15981 GemmMicrokernelTester()
15982 .mr(4)
15983 .nr(8)
15984 .kr(1)
15985 .sr(1)
15986 .m(4)
15987 .n(8)
15988 .k(2)
15989 .cn_stride(11)
15990 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld64);
15991 }
15992
15993 TEST(F32_IGEMM_4X8__NEON_DUP_LD64, k_eq_2_subtile) {
15994 TEST_REQUIRES_ARM_NEON;
15995 for (uint32_t m = 1; m <= 4; m++) {
15996 for (uint32_t n = 1; n <= 8; n++) {
15997 GemmMicrokernelTester()
15998 .mr(4)
15999 .nr(8)
16000 .kr(1)
16001 .sr(1)
16002 .m(m)
16003 .n(n)
16004 .k(2)
16005 .iterations(1)
16006 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld64);
16007 }
16008 }
16009 }
16010
16011 TEST(F32_IGEMM_4X8__NEON_DUP_LD64, k_eq_2_subtile_m) {
16012 TEST_REQUIRES_ARM_NEON;
16013 for (uint32_t m = 1; m <= 4; m++) {
16014 GemmMicrokernelTester()
16015 .mr(4)
16016 .nr(8)
16017 .kr(1)
16018 .sr(1)
16019 .m(m)
16020 .n(8)
16021 .k(2)
16022 .iterations(1)
16023 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld64);
16024 }
16025 }
16026
16027 TEST(F32_IGEMM_4X8__NEON_DUP_LD64, k_eq_2_subtile_n) {
16028 TEST_REQUIRES_ARM_NEON;
16029 for (uint32_t n = 1; n <= 8; n++) {
16030 GemmMicrokernelTester()
16031 .mr(4)
16032 .nr(8)
16033 .kr(1)
16034 .sr(1)
16035 .m(4)
16036 .n(n)
16037 .k(2)
16038 .iterations(1)
16039 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld64);
16040 }
16041 }
16042
16043 TEST(F32_IGEMM_4X8__NEON_DUP_LD64, k_lt_2) {
16044 TEST_REQUIRES_ARM_NEON;
16045 for (size_t k = 1; k < 2; k++) {
16046 GemmMicrokernelTester()
16047 .mr(4)
16048 .nr(8)
16049 .kr(1)
16050 .sr(1)
16051 .m(4)
16052 .n(8)
16053 .k(k)
16054 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld64);
16055 }
16056 }
16057
16058 TEST(F32_IGEMM_4X8__NEON_DUP_LD64, k_lt_2_subtile) {
16059 TEST_REQUIRES_ARM_NEON;
16060 for (size_t k = 1; k < 2; k++) {
16061 for (uint32_t m = 1; m <= 4; m++) {
16062 for (uint32_t n = 1; n <= 8; n++) {
16063 GemmMicrokernelTester()
16064 .mr(4)
16065 .nr(8)
16066 .kr(1)
16067 .sr(1)
16068 .m(m)
16069 .n(n)
16070 .k(k)
16071 .iterations(1)
16072 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld64);
16073 }
16074 }
16075 }
16076 }
16077
16078 TEST(F32_IGEMM_4X8__NEON_DUP_LD64, k_gt_2) {
16079 TEST_REQUIRES_ARM_NEON;
16080 for (size_t k = 3; k < 4; k++) {
16081 GemmMicrokernelTester()
16082 .mr(4)
16083 .nr(8)
16084 .kr(1)
16085 .sr(1)
16086 .m(4)
16087 .n(8)
16088 .k(k)
16089 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld64);
16090 }
16091 }
16092
16093 TEST(F32_IGEMM_4X8__NEON_DUP_LD64, k_gt_2_subtile) {
16094 TEST_REQUIRES_ARM_NEON;
16095 for (size_t k = 3; k < 4; k++) {
16096 for (uint32_t m = 1; m <= 4; m++) {
16097 for (uint32_t n = 1; n <= 8; n++) {
16098 GemmMicrokernelTester()
16099 .mr(4)
16100 .nr(8)
16101 .kr(1)
16102 .sr(1)
16103 .m(m)
16104 .n(n)
16105 .k(k)
16106 .iterations(1)
16107 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld64);
16108 }
16109 }
16110 }
16111 }
16112
16113 TEST(F32_IGEMM_4X8__NEON_DUP_LD64, k_div_2) {
16114 TEST_REQUIRES_ARM_NEON;
16115 for (size_t k = 4; k <= 20; k += 2) {
16116 GemmMicrokernelTester()
16117 .mr(4)
16118 .nr(8)
16119 .kr(1)
16120 .sr(1)
16121 .m(4)
16122 .n(8)
16123 .k(k)
16124 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld64);
16125 }
16126 }
16127
16128 TEST(F32_IGEMM_4X8__NEON_DUP_LD64, k_div_2_subtile) {
16129 TEST_REQUIRES_ARM_NEON;
16130 for (size_t k = 4; k <= 20; k += 2) {
16131 for (uint32_t m = 1; m <= 4; m++) {
16132 for (uint32_t n = 1; n <= 8; n++) {
16133 GemmMicrokernelTester()
16134 .mr(4)
16135 .nr(8)
16136 .kr(1)
16137 .sr(1)
16138 .m(m)
16139 .n(n)
16140 .k(k)
16141 .iterations(1)
16142 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld64);
16143 }
16144 }
16145 }
16146 }
16147
16148 TEST(F32_IGEMM_4X8__NEON_DUP_LD64, n_gt_8) {
16149 TEST_REQUIRES_ARM_NEON;
16150 for (uint32_t n = 9; n < 16; n++) {
16151 for (size_t k = 1; k <= 10; k += 3) {
16152 GemmMicrokernelTester()
16153 .mr(4)
16154 .nr(8)
16155 .kr(1)
16156 .sr(1)
16157 .m(4)
16158 .n(8)
16159 .k(k)
16160 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld64);
16161 }
16162 }
16163 }
16164
16165 TEST(F32_IGEMM_4X8__NEON_DUP_LD64, n_gt_8_strided_cn) {
16166 TEST_REQUIRES_ARM_NEON;
16167 for (uint32_t n = 9; n < 16; n++) {
16168 for (size_t k = 1; k <= 10; k += 3) {
16169 GemmMicrokernelTester()
16170 .mr(4)
16171 .nr(8)
16172 .kr(1)
16173 .sr(1)
16174 .m(4)
16175 .n(8)
16176 .k(k)
16177 .cn_stride(11)
16178 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld64);
16179 }
16180 }
16181 }
16182
16183 TEST(F32_IGEMM_4X8__NEON_DUP_LD64, n_gt_8_subtile) {
16184 TEST_REQUIRES_ARM_NEON;
16185 for (uint32_t n = 9; n < 16; n++) {
16186 for (size_t k = 1; k <= 10; k += 3) {
16187 for (uint32_t m = 1; m <= 4; m++) {
16188 GemmMicrokernelTester()
16189 .mr(4)
16190 .nr(8)
16191 .kr(1)
16192 .sr(1)
16193 .m(m)
16194 .n(n)
16195 .k(k)
16196 .iterations(1)
16197 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld64);
16198 }
16199 }
16200 }
16201 }
16202
16203 TEST(F32_IGEMM_4X8__NEON_DUP_LD64, n_div_8) {
16204 TEST_REQUIRES_ARM_NEON;
16205 for (uint32_t n = 16; n <= 24; n += 8) {
16206 for (size_t k = 1; k <= 10; k += 3) {
16207 GemmMicrokernelTester()
16208 .mr(4)
16209 .nr(8)
16210 .kr(1)
16211 .sr(1)
16212 .m(4)
16213 .n(8)
16214 .k(k)
16215 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld64);
16216 }
16217 }
16218 }
16219
16220 TEST(F32_IGEMM_4X8__NEON_DUP_LD64, n_div_8_strided_cn) {
16221 TEST_REQUIRES_ARM_NEON;
16222 for (uint32_t n = 16; n <= 24; n += 8) {
16223 for (size_t k = 1; k <= 10; k += 3) {
16224 GemmMicrokernelTester()
16225 .mr(4)
16226 .nr(8)
16227 .kr(1)
16228 .sr(1)
16229 .m(4)
16230 .n(n)
16231 .k(k)
16232 .cn_stride(11)
16233 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld64);
16234 }
16235 }
16236 }
16237
16238 TEST(F32_IGEMM_4X8__NEON_DUP_LD64, n_div_8_subtile) {
16239 TEST_REQUIRES_ARM_NEON;
16240 for (uint32_t n = 16; n <= 24; n += 8) {
16241 for (size_t k = 1; k <= 10; k += 3) {
16242 for (uint32_t m = 1; m <= 4; m++) {
16243 GemmMicrokernelTester()
16244 .mr(4)
16245 .nr(8)
16246 .kr(1)
16247 .sr(1)
16248 .m(m)
16249 .n(n)
16250 .k(k)
16251 .iterations(1)
16252 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld64);
16253 }
16254 }
16255 }
16256 }
16257
16258 TEST(F32_IGEMM_4X8__NEON_DUP_LD64, small_kernel) {
16259 TEST_REQUIRES_ARM_NEON;
16260 for (size_t k = 1; k <= 10; k += 3) {
16261 GemmMicrokernelTester()
16262 .mr(4)
16263 .nr(8)
16264 .kr(1)
16265 .sr(1)
16266 .m(4)
16267 .n(8)
16268 .k(k)
16269 .ks(3)
16270 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld64);
16271 }
16272 }
16273
16274 TEST(F32_IGEMM_4X8__NEON_DUP_LD64, small_kernel_subtile) {
16275 TEST_REQUIRES_ARM_NEON;
16276 for (size_t k = 1; k <= 10; k += 3) {
16277 for (uint32_t m = 1; m <= 4; m++) {
16278 for (uint32_t n = 1; n <= 8; n++) {
16279 GemmMicrokernelTester()
16280 .mr(4)
16281 .nr(8)
16282 .kr(1)
16283 .sr(1)
16284 .m(m)
16285 .n(n)
16286 .k(k)
16287 .ks(3)
16288 .iterations(1)
16289 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld64);
16290 }
16291 }
16292 }
16293 }
16294
16295 TEST(F32_IGEMM_4X8__NEON_DUP_LD64, n_gt_8_small_kernel) {
16296 TEST_REQUIRES_ARM_NEON;
16297 for (uint32_t n = 9; n < 16; n++) {
16298 for (size_t k = 1; k <= 10; k += 3) {
16299 GemmMicrokernelTester()
16300 .mr(4)
16301 .nr(8)
16302 .kr(1)
16303 .sr(1)
16304 .m(4)
16305 .n(8)
16306 .k(k)
16307 .ks(3)
16308 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld64);
16309 }
16310 }
16311 }
16312
16313 TEST(F32_IGEMM_4X8__NEON_DUP_LD64, n_div_8_small_kernel) {
16314 TEST_REQUIRES_ARM_NEON;
16315 for (uint32_t n = 16; n <= 24; n += 8) {
16316 for (size_t k = 1; k <= 10; k += 3) {
16317 GemmMicrokernelTester()
16318 .mr(4)
16319 .nr(8)
16320 .kr(1)
16321 .sr(1)
16322 .m(4)
16323 .n(8)
16324 .k(k)
16325 .ks(3)
16326 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld64);
16327 }
16328 }
16329 }
16330
16331 TEST(F32_IGEMM_4X8__NEON_DUP_LD64, strided_cm_subtile) {
16332 TEST_REQUIRES_ARM_NEON;
16333 for (size_t k = 1; k <= 10; k += 3) {
16334 for (uint32_t m = 1; m <= 4; m++) {
16335 for (uint32_t n = 1; n <= 8; n++) {
16336 GemmMicrokernelTester()
16337 .mr(4)
16338 .nr(8)
16339 .kr(1)
16340 .sr(1)
16341 .m(m)
16342 .n(n)
16343 .k(k)
16344 .cm_stride(11)
16345 .iterations(1)
16346 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld64);
16347 }
16348 }
16349 }
16350 }
16351
16352 TEST(F32_IGEMM_4X8__NEON_DUP_LD64, a_offset) {
16353 TEST_REQUIRES_ARM_NEON;
16354 for (size_t k = 1; k <= 10; k += 3) {
16355 GemmMicrokernelTester()
16356 .mr(4)
16357 .nr(8)
16358 .kr(1)
16359 .sr(1)
16360 .m(4)
16361 .n(8)
16362 .k(k)
16363 .ks(3)
16364 .a_offset(43)
16365 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld64);
16366 }
16367 }
16368
16369 TEST(F32_IGEMM_4X8__NEON_DUP_LD64, zero) {
16370 TEST_REQUIRES_ARM_NEON;
16371 for (uint32_t mz = 0; mz < 4; mz++) {
16372 for (size_t k = 1; k <= 10; k += 3) {
16373 GemmMicrokernelTester()
16374 .mr(4)
16375 .nr(8)
16376 .kr(1)
16377 .sr(1)
16378 .m(4)
16379 .n(8)
16380 .k(k)
16381 .ks(3)
16382 .a_offset(43)
16383 .zero_index(mz)
16384 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld64);
16385 }
16386 }
16387 }
16388
16389 TEST(F32_IGEMM_4X8__NEON_DUP_LD64, qmin) {
16390 TEST_REQUIRES_ARM_NEON;
16391 GemmMicrokernelTester()
16392 .mr(4)
16393 .nr(8)
16394 .kr(1)
16395 .sr(1)
16396 .m(4)
16397 .n(8)
16398 .k(2)
16399 .qmin(128)
16400 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld64);
16401 }
16402
16403 TEST(F32_IGEMM_4X8__NEON_DUP_LD64, qmax) {
16404 TEST_REQUIRES_ARM_NEON;
16405 GemmMicrokernelTester()
16406 .mr(4)
16407 .nr(8)
16408 .kr(1)
16409 .sr(1)
16410 .m(4)
16411 .n(8)
16412 .k(2)
16413 .qmax(128)
16414 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld64);
16415 }
16416
16417 TEST(F32_IGEMM_4X8__NEON_DUP_LD64, strided_cm) {
16418 TEST_REQUIRES_ARM_NEON;
16419 GemmMicrokernelTester()
16420 .mr(4)
16421 .nr(8)
16422 .kr(1)
16423 .sr(1)
16424 .m(4)
16425 .n(8)
16426 .k(2)
16427 .cm_stride(11)
16428 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld64);
16429 }
16430#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
16431
16432
16433#if XNN_ARCH_ARM || XNN_ARCH_ARM64
16434 TEST(F32_IGEMM_6X8__NEON_DUP_LD64, k_eq_2) {
16435 TEST_REQUIRES_ARM_NEON;
16436 GemmMicrokernelTester()
16437 .mr(6)
16438 .nr(8)
16439 .kr(1)
16440 .sr(1)
16441 .m(6)
16442 .n(8)
16443 .k(2)
16444 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld64);
16445 }
16446
16447 TEST(F32_IGEMM_6X8__NEON_DUP_LD64, strided_cn) {
16448 TEST_REQUIRES_ARM_NEON;
16449 GemmMicrokernelTester()
16450 .mr(6)
16451 .nr(8)
16452 .kr(1)
16453 .sr(1)
16454 .m(6)
16455 .n(8)
16456 .k(2)
16457 .cn_stride(11)
16458 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld64);
16459 }
16460
16461 TEST(F32_IGEMM_6X8__NEON_DUP_LD64, k_eq_2_subtile) {
16462 TEST_REQUIRES_ARM_NEON;
16463 for (uint32_t m = 1; m <= 6; m++) {
16464 for (uint32_t n = 1; n <= 8; n++) {
16465 GemmMicrokernelTester()
16466 .mr(6)
16467 .nr(8)
16468 .kr(1)
16469 .sr(1)
16470 .m(m)
16471 .n(n)
16472 .k(2)
16473 .iterations(1)
16474 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld64);
16475 }
16476 }
16477 }
16478
16479 TEST(F32_IGEMM_6X8__NEON_DUP_LD64, k_eq_2_subtile_m) {
16480 TEST_REQUIRES_ARM_NEON;
16481 for (uint32_t m = 1; m <= 6; m++) {
16482 GemmMicrokernelTester()
16483 .mr(6)
16484 .nr(8)
16485 .kr(1)
16486 .sr(1)
16487 .m(m)
16488 .n(8)
16489 .k(2)
16490 .iterations(1)
16491 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld64);
16492 }
16493 }
16494
16495 TEST(F32_IGEMM_6X8__NEON_DUP_LD64, k_eq_2_subtile_n) {
16496 TEST_REQUIRES_ARM_NEON;
16497 for (uint32_t n = 1; n <= 8; n++) {
16498 GemmMicrokernelTester()
16499 .mr(6)
16500 .nr(8)
16501 .kr(1)
16502 .sr(1)
16503 .m(6)
16504 .n(n)
16505 .k(2)
16506 .iterations(1)
16507 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld64);
16508 }
16509 }
16510
16511 TEST(F32_IGEMM_6X8__NEON_DUP_LD64, k_lt_2) {
16512 TEST_REQUIRES_ARM_NEON;
16513 for (size_t k = 1; k < 2; k++) {
16514 GemmMicrokernelTester()
16515 .mr(6)
16516 .nr(8)
16517 .kr(1)
16518 .sr(1)
16519 .m(6)
16520 .n(8)
16521 .k(k)
16522 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld64);
16523 }
16524 }
16525
16526 TEST(F32_IGEMM_6X8__NEON_DUP_LD64, k_lt_2_subtile) {
16527 TEST_REQUIRES_ARM_NEON;
16528 for (size_t k = 1; k < 2; k++) {
16529 for (uint32_t m = 1; m <= 6; m++) {
16530 for (uint32_t n = 1; n <= 8; n++) {
16531 GemmMicrokernelTester()
16532 .mr(6)
16533 .nr(8)
16534 .kr(1)
16535 .sr(1)
16536 .m(m)
16537 .n(n)
16538 .k(k)
16539 .iterations(1)
16540 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld64);
16541 }
16542 }
16543 }
16544 }
16545
16546 TEST(F32_IGEMM_6X8__NEON_DUP_LD64, k_gt_2) {
16547 TEST_REQUIRES_ARM_NEON;
16548 for (size_t k = 3; k < 4; k++) {
16549 GemmMicrokernelTester()
16550 .mr(6)
16551 .nr(8)
16552 .kr(1)
16553 .sr(1)
16554 .m(6)
16555 .n(8)
16556 .k(k)
16557 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld64);
16558 }
16559 }
16560
16561 TEST(F32_IGEMM_6X8__NEON_DUP_LD64, k_gt_2_subtile) {
16562 TEST_REQUIRES_ARM_NEON;
16563 for (size_t k = 3; k < 4; k++) {
16564 for (uint32_t m = 1; m <= 6; m++) {
16565 for (uint32_t n = 1; n <= 8; n++) {
16566 GemmMicrokernelTester()
16567 .mr(6)
16568 .nr(8)
16569 .kr(1)
16570 .sr(1)
16571 .m(m)
16572 .n(n)
16573 .k(k)
16574 .iterations(1)
16575 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld64);
16576 }
16577 }
16578 }
16579 }
16580
16581 TEST(F32_IGEMM_6X8__NEON_DUP_LD64, k_div_2) {
16582 TEST_REQUIRES_ARM_NEON;
16583 for (size_t k = 4; k <= 20; k += 2) {
16584 GemmMicrokernelTester()
16585 .mr(6)
16586 .nr(8)
16587 .kr(1)
16588 .sr(1)
16589 .m(6)
16590 .n(8)
16591 .k(k)
16592 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld64);
16593 }
16594 }
16595
16596 TEST(F32_IGEMM_6X8__NEON_DUP_LD64, k_div_2_subtile) {
16597 TEST_REQUIRES_ARM_NEON;
16598 for (size_t k = 4; k <= 20; k += 2) {
16599 for (uint32_t m = 1; m <= 6; m++) {
16600 for (uint32_t n = 1; n <= 8; n++) {
16601 GemmMicrokernelTester()
16602 .mr(6)
16603 .nr(8)
16604 .kr(1)
16605 .sr(1)
16606 .m(m)
16607 .n(n)
16608 .k(k)
16609 .iterations(1)
16610 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld64);
16611 }
16612 }
16613 }
16614 }
16615
16616 TEST(F32_IGEMM_6X8__NEON_DUP_LD64, n_gt_8) {
16617 TEST_REQUIRES_ARM_NEON;
16618 for (uint32_t n = 9; n < 16; n++) {
16619 for (size_t k = 1; k <= 10; k += 3) {
16620 GemmMicrokernelTester()
16621 .mr(6)
16622 .nr(8)
16623 .kr(1)
16624 .sr(1)
16625 .m(6)
16626 .n(8)
16627 .k(k)
16628 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld64);
16629 }
16630 }
16631 }
16632
16633 TEST(F32_IGEMM_6X8__NEON_DUP_LD64, n_gt_8_strided_cn) {
16634 TEST_REQUIRES_ARM_NEON;
16635 for (uint32_t n = 9; n < 16; n++) {
16636 for (size_t k = 1; k <= 10; k += 3) {
16637 GemmMicrokernelTester()
16638 .mr(6)
16639 .nr(8)
16640 .kr(1)
16641 .sr(1)
16642 .m(6)
16643 .n(8)
16644 .k(k)
16645 .cn_stride(11)
16646 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld64);
16647 }
16648 }
16649 }
16650
16651 TEST(F32_IGEMM_6X8__NEON_DUP_LD64, n_gt_8_subtile) {
16652 TEST_REQUIRES_ARM_NEON;
16653 for (uint32_t n = 9; n < 16; n++) {
16654 for (size_t k = 1; k <= 10; k += 3) {
16655 for (uint32_t m = 1; m <= 6; m++) {
16656 GemmMicrokernelTester()
16657 .mr(6)
16658 .nr(8)
16659 .kr(1)
16660 .sr(1)
16661 .m(m)
16662 .n(n)
16663 .k(k)
16664 .iterations(1)
16665 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld64);
16666 }
16667 }
16668 }
16669 }
16670
16671 TEST(F32_IGEMM_6X8__NEON_DUP_LD64, n_div_8) {
16672 TEST_REQUIRES_ARM_NEON;
16673 for (uint32_t n = 16; n <= 24; n += 8) {
16674 for (size_t k = 1; k <= 10; k += 3) {
16675 GemmMicrokernelTester()
16676 .mr(6)
16677 .nr(8)
16678 .kr(1)
16679 .sr(1)
16680 .m(6)
16681 .n(8)
16682 .k(k)
16683 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld64);
16684 }
16685 }
16686 }
16687
16688 TEST(F32_IGEMM_6X8__NEON_DUP_LD64, n_div_8_strided_cn) {
16689 TEST_REQUIRES_ARM_NEON;
16690 for (uint32_t n = 16; n <= 24; n += 8) {
16691 for (size_t k = 1; k <= 10; k += 3) {
16692 GemmMicrokernelTester()
16693 .mr(6)
16694 .nr(8)
16695 .kr(1)
16696 .sr(1)
16697 .m(6)
16698 .n(n)
16699 .k(k)
16700 .cn_stride(11)
16701 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld64);
16702 }
16703 }
16704 }
16705
16706 TEST(F32_IGEMM_6X8__NEON_DUP_LD64, n_div_8_subtile) {
16707 TEST_REQUIRES_ARM_NEON;
16708 for (uint32_t n = 16; n <= 24; n += 8) {
16709 for (size_t k = 1; k <= 10; k += 3) {
16710 for (uint32_t m = 1; m <= 6; m++) {
16711 GemmMicrokernelTester()
16712 .mr(6)
16713 .nr(8)
16714 .kr(1)
16715 .sr(1)
16716 .m(m)
16717 .n(n)
16718 .k(k)
16719 .iterations(1)
16720 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld64);
16721 }
16722 }
16723 }
16724 }
16725
16726 TEST(F32_IGEMM_6X8__NEON_DUP_LD64, small_kernel) {
16727 TEST_REQUIRES_ARM_NEON;
16728 for (size_t k = 1; k <= 10; k += 3) {
16729 GemmMicrokernelTester()
16730 .mr(6)
16731 .nr(8)
16732 .kr(1)
16733 .sr(1)
16734 .m(6)
16735 .n(8)
16736 .k(k)
16737 .ks(3)
16738 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld64);
16739 }
16740 }
16741
16742 TEST(F32_IGEMM_6X8__NEON_DUP_LD64, small_kernel_subtile) {
16743 TEST_REQUIRES_ARM_NEON;
16744 for (size_t k = 1; k <= 10; k += 3) {
16745 for (uint32_t m = 1; m <= 6; m++) {
16746 for (uint32_t n = 1; n <= 8; n++) {
16747 GemmMicrokernelTester()
16748 .mr(6)
16749 .nr(8)
16750 .kr(1)
16751 .sr(1)
16752 .m(m)
16753 .n(n)
16754 .k(k)
16755 .ks(3)
16756 .iterations(1)
16757 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld64);
16758 }
16759 }
16760 }
16761 }
16762
16763 TEST(F32_IGEMM_6X8__NEON_DUP_LD64, n_gt_8_small_kernel) {
16764 TEST_REQUIRES_ARM_NEON;
16765 for (uint32_t n = 9; n < 16; n++) {
16766 for (size_t k = 1; k <= 10; k += 3) {
16767 GemmMicrokernelTester()
16768 .mr(6)
16769 .nr(8)
16770 .kr(1)
16771 .sr(1)
16772 .m(6)
16773 .n(8)
16774 .k(k)
16775 .ks(3)
16776 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld64);
16777 }
16778 }
16779 }
16780
16781 TEST(F32_IGEMM_6X8__NEON_DUP_LD64, n_div_8_small_kernel) {
16782 TEST_REQUIRES_ARM_NEON;
16783 for (uint32_t n = 16; n <= 24; n += 8) {
16784 for (size_t k = 1; k <= 10; k += 3) {
16785 GemmMicrokernelTester()
16786 .mr(6)
16787 .nr(8)
16788 .kr(1)
16789 .sr(1)
16790 .m(6)
16791 .n(8)
16792 .k(k)
16793 .ks(3)
16794 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld64);
16795 }
16796 }
16797 }
16798
16799 TEST(F32_IGEMM_6X8__NEON_DUP_LD64, strided_cm_subtile) {
16800 TEST_REQUIRES_ARM_NEON;
16801 for (size_t k = 1; k <= 10; k += 3) {
16802 for (uint32_t m = 1; m <= 6; m++) {
16803 for (uint32_t n = 1; n <= 8; n++) {
16804 GemmMicrokernelTester()
16805 .mr(6)
16806 .nr(8)
16807 .kr(1)
16808 .sr(1)
16809 .m(m)
16810 .n(n)
16811 .k(k)
16812 .cm_stride(11)
16813 .iterations(1)
16814 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld64);
16815 }
16816 }
16817 }
16818 }
16819
16820 TEST(F32_IGEMM_6X8__NEON_DUP_LD64, a_offset) {
16821 TEST_REQUIRES_ARM_NEON;
16822 for (size_t k = 1; k <= 10; k += 3) {
16823 GemmMicrokernelTester()
16824 .mr(6)
16825 .nr(8)
16826 .kr(1)
16827 .sr(1)
16828 .m(6)
16829 .n(8)
16830 .k(k)
16831 .ks(3)
16832 .a_offset(67)
16833 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld64);
16834 }
16835 }
16836
16837 TEST(F32_IGEMM_6X8__NEON_DUP_LD64, zero) {
16838 TEST_REQUIRES_ARM_NEON;
16839 for (uint32_t mz = 0; mz < 6; mz++) {
16840 for (size_t k = 1; k <= 10; k += 3) {
16841 GemmMicrokernelTester()
16842 .mr(6)
16843 .nr(8)
16844 .kr(1)
16845 .sr(1)
16846 .m(6)
16847 .n(8)
16848 .k(k)
16849 .ks(3)
16850 .a_offset(67)
16851 .zero_index(mz)
16852 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld64);
16853 }
16854 }
16855 }
16856
16857 TEST(F32_IGEMM_6X8__NEON_DUP_LD64, qmin) {
16858 TEST_REQUIRES_ARM_NEON;
16859 GemmMicrokernelTester()
16860 .mr(6)
16861 .nr(8)
16862 .kr(1)
16863 .sr(1)
16864 .m(6)
16865 .n(8)
16866 .k(2)
16867 .qmin(128)
16868 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld64);
16869 }
16870
16871 TEST(F32_IGEMM_6X8__NEON_DUP_LD64, qmax) {
16872 TEST_REQUIRES_ARM_NEON;
16873 GemmMicrokernelTester()
16874 .mr(6)
16875 .nr(8)
16876 .kr(1)
16877 .sr(1)
16878 .m(6)
16879 .n(8)
16880 .k(2)
16881 .qmax(128)
16882 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld64);
16883 }
16884
16885 TEST(F32_IGEMM_6X8__NEON_DUP_LD64, strided_cm) {
16886 TEST_REQUIRES_ARM_NEON;
16887 GemmMicrokernelTester()
16888 .mr(6)
16889 .nr(8)
16890 .kr(1)
16891 .sr(1)
16892 .m(6)
16893 .n(8)
16894 .k(2)
16895 .cm_stride(11)
16896 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld64);
16897 }
16898#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
16899
16900
16901#if XNN_ARCH_ARM || XNN_ARCH_ARM64
Frank Barchard69172d92019-11-26 16:22:39 -080016902 TEST(F32_IGEMM_6X8__NEON_DUP_LD128, k_eq_4) {
16903 TEST_REQUIRES_ARM_NEON;
16904 GemmMicrokernelTester()
16905 .mr(6)
16906 .nr(8)
16907 .kr(1)
16908 .sr(1)
16909 .m(6)
16910 .n(8)
16911 .k(4)
16912 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld128);
16913 }
16914
16915 TEST(F32_IGEMM_6X8__NEON_DUP_LD128, strided_cn) {
16916 TEST_REQUIRES_ARM_NEON;
16917 GemmMicrokernelTester()
16918 .mr(6)
16919 .nr(8)
16920 .kr(1)
16921 .sr(1)
16922 .m(6)
16923 .n(8)
16924 .k(4)
16925 .cn_stride(11)
16926 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld128);
16927 }
16928
16929 TEST(F32_IGEMM_6X8__NEON_DUP_LD128, k_eq_4_subtile) {
16930 TEST_REQUIRES_ARM_NEON;
16931 for (uint32_t m = 1; m <= 6; m++) {
16932 for (uint32_t n = 1; n <= 8; n++) {
16933 GemmMicrokernelTester()
16934 .mr(6)
16935 .nr(8)
16936 .kr(1)
16937 .sr(1)
16938 .m(m)
16939 .n(n)
16940 .k(4)
16941 .iterations(1)
16942 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld128);
16943 }
16944 }
16945 }
16946
16947 TEST(F32_IGEMM_6X8__NEON_DUP_LD128, k_eq_4_subtile_m) {
16948 TEST_REQUIRES_ARM_NEON;
16949 for (uint32_t m = 1; m <= 6; m++) {
16950 GemmMicrokernelTester()
16951 .mr(6)
16952 .nr(8)
16953 .kr(1)
16954 .sr(1)
16955 .m(m)
16956 .n(8)
16957 .k(4)
16958 .iterations(1)
16959 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld128);
16960 }
16961 }
16962
16963 TEST(F32_IGEMM_6X8__NEON_DUP_LD128, k_eq_4_subtile_n) {
16964 TEST_REQUIRES_ARM_NEON;
16965 for (uint32_t n = 1; n <= 8; n++) {
16966 GemmMicrokernelTester()
16967 .mr(6)
16968 .nr(8)
16969 .kr(1)
16970 .sr(1)
16971 .m(6)
16972 .n(n)
16973 .k(4)
16974 .iterations(1)
16975 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld128);
16976 }
16977 }
16978
16979 TEST(F32_IGEMM_6X8__NEON_DUP_LD128, k_lt_4) {
16980 TEST_REQUIRES_ARM_NEON;
16981 for (size_t k = 1; k < 4; k++) {
16982 GemmMicrokernelTester()
16983 .mr(6)
16984 .nr(8)
16985 .kr(1)
16986 .sr(1)
16987 .m(6)
16988 .n(8)
16989 .k(k)
16990 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld128);
16991 }
16992 }
16993
16994 TEST(F32_IGEMM_6X8__NEON_DUP_LD128, k_lt_4_subtile) {
16995 TEST_REQUIRES_ARM_NEON;
16996 for (size_t k = 1; k < 4; k++) {
16997 for (uint32_t m = 1; m <= 6; m++) {
16998 for (uint32_t n = 1; n <= 8; n++) {
16999 GemmMicrokernelTester()
17000 .mr(6)
17001 .nr(8)
17002 .kr(1)
17003 .sr(1)
17004 .m(m)
17005 .n(n)
17006 .k(k)
17007 .iterations(1)
17008 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld128);
17009 }
17010 }
17011 }
17012 }
17013
17014 TEST(F32_IGEMM_6X8__NEON_DUP_LD128, k_gt_4) {
17015 TEST_REQUIRES_ARM_NEON;
17016 for (size_t k = 5; k < 8; k++) {
17017 GemmMicrokernelTester()
17018 .mr(6)
17019 .nr(8)
17020 .kr(1)
17021 .sr(1)
17022 .m(6)
17023 .n(8)
17024 .k(k)
17025 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld128);
17026 }
17027 }
17028
17029 TEST(F32_IGEMM_6X8__NEON_DUP_LD128, k_gt_4_subtile) {
17030 TEST_REQUIRES_ARM_NEON;
17031 for (size_t k = 5; k < 8; k++) {
17032 for (uint32_t m = 1; m <= 6; m++) {
17033 for (uint32_t n = 1; n <= 8; n++) {
17034 GemmMicrokernelTester()
17035 .mr(6)
17036 .nr(8)
17037 .kr(1)
17038 .sr(1)
17039 .m(m)
17040 .n(n)
17041 .k(k)
17042 .iterations(1)
17043 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld128);
17044 }
17045 }
17046 }
17047 }
17048
17049 TEST(F32_IGEMM_6X8__NEON_DUP_LD128, k_div_4) {
17050 TEST_REQUIRES_ARM_NEON;
17051 for (size_t k = 8; k <= 40; k += 4) {
17052 GemmMicrokernelTester()
17053 .mr(6)
17054 .nr(8)
17055 .kr(1)
17056 .sr(1)
17057 .m(6)
17058 .n(8)
17059 .k(k)
17060 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld128);
17061 }
17062 }
17063
17064 TEST(F32_IGEMM_6X8__NEON_DUP_LD128, k_div_4_subtile) {
17065 TEST_REQUIRES_ARM_NEON;
17066 for (size_t k = 8; k <= 40; k += 4) {
17067 for (uint32_t m = 1; m <= 6; m++) {
17068 for (uint32_t n = 1; n <= 8; n++) {
17069 GemmMicrokernelTester()
17070 .mr(6)
17071 .nr(8)
17072 .kr(1)
17073 .sr(1)
17074 .m(m)
17075 .n(n)
17076 .k(k)
17077 .iterations(1)
17078 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld128);
17079 }
17080 }
17081 }
17082 }
17083
17084 TEST(F32_IGEMM_6X8__NEON_DUP_LD128, n_gt_8) {
17085 TEST_REQUIRES_ARM_NEON;
17086 for (uint32_t n = 9; n < 16; n++) {
17087 for (size_t k = 1; k <= 20; k += 5) {
17088 GemmMicrokernelTester()
17089 .mr(6)
17090 .nr(8)
17091 .kr(1)
17092 .sr(1)
17093 .m(6)
17094 .n(8)
17095 .k(k)
17096 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld128);
17097 }
17098 }
17099 }
17100
17101 TEST(F32_IGEMM_6X8__NEON_DUP_LD128, n_gt_8_strided_cn) {
17102 TEST_REQUIRES_ARM_NEON;
17103 for (uint32_t n = 9; n < 16; n++) {
17104 for (size_t k = 1; k <= 20; k += 5) {
17105 GemmMicrokernelTester()
17106 .mr(6)
17107 .nr(8)
17108 .kr(1)
17109 .sr(1)
17110 .m(6)
17111 .n(8)
17112 .k(k)
17113 .cn_stride(11)
17114 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld128);
17115 }
17116 }
17117 }
17118
17119 TEST(F32_IGEMM_6X8__NEON_DUP_LD128, n_gt_8_subtile) {
17120 TEST_REQUIRES_ARM_NEON;
17121 for (uint32_t n = 9; n < 16; n++) {
17122 for (size_t k = 1; k <= 20; k += 5) {
17123 for (uint32_t m = 1; m <= 6; m++) {
17124 GemmMicrokernelTester()
17125 .mr(6)
17126 .nr(8)
17127 .kr(1)
17128 .sr(1)
17129 .m(m)
17130 .n(n)
17131 .k(k)
17132 .iterations(1)
17133 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld128);
17134 }
17135 }
17136 }
17137 }
17138
17139 TEST(F32_IGEMM_6X8__NEON_DUP_LD128, n_div_8) {
17140 TEST_REQUIRES_ARM_NEON;
17141 for (uint32_t n = 16; n <= 24; n += 8) {
17142 for (size_t k = 1; k <= 20; k += 5) {
17143 GemmMicrokernelTester()
17144 .mr(6)
17145 .nr(8)
17146 .kr(1)
17147 .sr(1)
17148 .m(6)
17149 .n(8)
17150 .k(k)
17151 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld128);
17152 }
17153 }
17154 }
17155
17156 TEST(F32_IGEMM_6X8__NEON_DUP_LD128, n_div_8_strided_cn) {
17157 TEST_REQUIRES_ARM_NEON;
17158 for (uint32_t n = 16; n <= 24; n += 8) {
17159 for (size_t k = 1; k <= 20; k += 5) {
17160 GemmMicrokernelTester()
17161 .mr(6)
17162 .nr(8)
17163 .kr(1)
17164 .sr(1)
17165 .m(6)
17166 .n(n)
17167 .k(k)
17168 .cn_stride(11)
17169 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld128);
17170 }
17171 }
17172 }
17173
17174 TEST(F32_IGEMM_6X8__NEON_DUP_LD128, n_div_8_subtile) {
17175 TEST_REQUIRES_ARM_NEON;
17176 for (uint32_t n = 16; n <= 24; n += 8) {
17177 for (size_t k = 1; k <= 20; k += 5) {
17178 for (uint32_t m = 1; m <= 6; m++) {
17179 GemmMicrokernelTester()
17180 .mr(6)
17181 .nr(8)
17182 .kr(1)
17183 .sr(1)
17184 .m(m)
17185 .n(n)
17186 .k(k)
17187 .iterations(1)
17188 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld128);
17189 }
17190 }
17191 }
17192 }
17193
17194 TEST(F32_IGEMM_6X8__NEON_DUP_LD128, small_kernel) {
17195 TEST_REQUIRES_ARM_NEON;
17196 for (size_t k = 1; k <= 20; k += 5) {
17197 GemmMicrokernelTester()
17198 .mr(6)
17199 .nr(8)
17200 .kr(1)
17201 .sr(1)
17202 .m(6)
17203 .n(8)
17204 .k(k)
17205 .ks(3)
17206 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld128);
17207 }
17208 }
17209
17210 TEST(F32_IGEMM_6X8__NEON_DUP_LD128, small_kernel_subtile) {
17211 TEST_REQUIRES_ARM_NEON;
17212 for (size_t k = 1; k <= 20; k += 5) {
17213 for (uint32_t m = 1; m <= 6; m++) {
17214 for (uint32_t n = 1; n <= 8; n++) {
17215 GemmMicrokernelTester()
17216 .mr(6)
17217 .nr(8)
17218 .kr(1)
17219 .sr(1)
17220 .m(m)
17221 .n(n)
17222 .k(k)
17223 .ks(3)
17224 .iterations(1)
17225 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld128);
17226 }
17227 }
17228 }
17229 }
17230
17231 TEST(F32_IGEMM_6X8__NEON_DUP_LD128, n_gt_8_small_kernel) {
17232 TEST_REQUIRES_ARM_NEON;
17233 for (uint32_t n = 9; n < 16; n++) {
17234 for (size_t k = 1; k <= 20; k += 5) {
17235 GemmMicrokernelTester()
17236 .mr(6)
17237 .nr(8)
17238 .kr(1)
17239 .sr(1)
17240 .m(6)
17241 .n(8)
17242 .k(k)
17243 .ks(3)
17244 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld128);
17245 }
17246 }
17247 }
17248
17249 TEST(F32_IGEMM_6X8__NEON_DUP_LD128, n_div_8_small_kernel) {
17250 TEST_REQUIRES_ARM_NEON;
17251 for (uint32_t n = 16; n <= 24; n += 8) {
17252 for (size_t k = 1; k <= 20; k += 5) {
17253 GemmMicrokernelTester()
17254 .mr(6)
17255 .nr(8)
17256 .kr(1)
17257 .sr(1)
17258 .m(6)
17259 .n(8)
17260 .k(k)
17261 .ks(3)
17262 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld128);
17263 }
17264 }
17265 }
17266
17267 TEST(F32_IGEMM_6X8__NEON_DUP_LD128, strided_cm_subtile) {
17268 TEST_REQUIRES_ARM_NEON;
17269 for (size_t k = 1; k <= 20; k += 5) {
17270 for (uint32_t m = 1; m <= 6; m++) {
17271 for (uint32_t n = 1; n <= 8; n++) {
17272 GemmMicrokernelTester()
17273 .mr(6)
17274 .nr(8)
17275 .kr(1)
17276 .sr(1)
17277 .m(m)
17278 .n(n)
17279 .k(k)
17280 .cm_stride(11)
17281 .iterations(1)
17282 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld128);
17283 }
17284 }
17285 }
17286 }
17287
17288 TEST(F32_IGEMM_6X8__NEON_DUP_LD128, a_offset) {
17289 TEST_REQUIRES_ARM_NEON;
17290 for (size_t k = 1; k <= 20; k += 5) {
17291 GemmMicrokernelTester()
17292 .mr(6)
17293 .nr(8)
17294 .kr(1)
17295 .sr(1)
17296 .m(6)
17297 .n(8)
17298 .k(k)
17299 .ks(3)
17300 .a_offset(127)
17301 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld128);
17302 }
17303 }
17304
17305 TEST(F32_IGEMM_6X8__NEON_DUP_LD128, zero) {
17306 TEST_REQUIRES_ARM_NEON;
17307 for (uint32_t mz = 0; mz < 6; mz++) {
17308 for (size_t k = 1; k <= 20; k += 5) {
17309 GemmMicrokernelTester()
17310 .mr(6)
17311 .nr(8)
17312 .kr(1)
17313 .sr(1)
17314 .m(6)
17315 .n(8)
17316 .k(k)
17317 .ks(3)
17318 .a_offset(127)
17319 .zero_index(mz)
17320 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld128);
17321 }
17322 }
17323 }
17324
17325 TEST(F32_IGEMM_6X8__NEON_DUP_LD128, qmin) {
17326 TEST_REQUIRES_ARM_NEON;
17327 GemmMicrokernelTester()
17328 .mr(6)
17329 .nr(8)
17330 .kr(1)
17331 .sr(1)
17332 .m(6)
17333 .n(8)
17334 .k(4)
17335 .qmin(128)
17336 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld128);
17337 }
17338
17339 TEST(F32_IGEMM_6X8__NEON_DUP_LD128, qmax) {
17340 TEST_REQUIRES_ARM_NEON;
17341 GemmMicrokernelTester()
17342 .mr(6)
17343 .nr(8)
17344 .kr(1)
17345 .sr(1)
17346 .m(6)
17347 .n(8)
17348 .k(4)
17349 .qmax(128)
17350 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld128);
17351 }
17352
17353 TEST(F32_IGEMM_6X8__NEON_DUP_LD128, strided_cm) {
17354 TEST_REQUIRES_ARM_NEON;
17355 GemmMicrokernelTester()
17356 .mr(6)
17357 .nr(8)
17358 .kr(1)
17359 .sr(1)
17360 .m(6)
17361 .n(8)
17362 .k(4)
17363 .cm_stride(11)
17364 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld128);
17365 }
17366#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
17367
17368
17369#if XNN_ARCH_ARM || XNN_ARCH_ARM64
Frank Barchard5243bb02019-11-22 16:37:50 -080017370 TEST(F32_IGEMM_1X8__NEONFMA_DUP_LD64, k_eq_2) {
17371 TEST_REQUIRES_ARM_NEON_FMA;
17372 GemmMicrokernelTester()
17373 .mr(1)
17374 .nr(8)
17375 .kr(1)
17376 .sr(1)
17377 .m(1)
17378 .n(8)
17379 .k(2)
17380 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64);
17381 }
17382
17383 TEST(F32_IGEMM_1X8__NEONFMA_DUP_LD64, strided_cn) {
17384 TEST_REQUIRES_ARM_NEON_FMA;
17385 GemmMicrokernelTester()
17386 .mr(1)
17387 .nr(8)
17388 .kr(1)
17389 .sr(1)
17390 .m(1)
17391 .n(8)
17392 .k(2)
17393 .cn_stride(11)
17394 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64);
17395 }
17396
17397 TEST(F32_IGEMM_1X8__NEONFMA_DUP_LD64, k_eq_2_subtile) {
17398 TEST_REQUIRES_ARM_NEON_FMA;
17399 for (uint32_t m = 1; m <= 1; m++) {
17400 for (uint32_t n = 1; n <= 8; n++) {
17401 GemmMicrokernelTester()
17402 .mr(1)
17403 .nr(8)
17404 .kr(1)
17405 .sr(1)
17406 .m(m)
17407 .n(n)
17408 .k(2)
17409 .iterations(1)
17410 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64);
17411 }
17412 }
17413 }
17414
17415 TEST(F32_IGEMM_1X8__NEONFMA_DUP_LD64, k_eq_2_subtile_m) {
17416 TEST_REQUIRES_ARM_NEON_FMA;
17417 for (uint32_t m = 1; m <= 1; m++) {
17418 GemmMicrokernelTester()
17419 .mr(1)
17420 .nr(8)
17421 .kr(1)
17422 .sr(1)
17423 .m(m)
17424 .n(8)
17425 .k(2)
17426 .iterations(1)
17427 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64);
17428 }
17429 }
17430
17431 TEST(F32_IGEMM_1X8__NEONFMA_DUP_LD64, k_eq_2_subtile_n) {
17432 TEST_REQUIRES_ARM_NEON_FMA;
17433 for (uint32_t n = 1; n <= 8; n++) {
17434 GemmMicrokernelTester()
17435 .mr(1)
17436 .nr(8)
17437 .kr(1)
17438 .sr(1)
17439 .m(1)
17440 .n(n)
17441 .k(2)
17442 .iterations(1)
17443 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64);
17444 }
17445 }
17446
17447 TEST(F32_IGEMM_1X8__NEONFMA_DUP_LD64, k_lt_2) {
17448 TEST_REQUIRES_ARM_NEON_FMA;
17449 for (size_t k = 1; k < 2; k++) {
17450 GemmMicrokernelTester()
17451 .mr(1)
17452 .nr(8)
17453 .kr(1)
17454 .sr(1)
17455 .m(1)
17456 .n(8)
17457 .k(k)
17458 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64);
17459 }
17460 }
17461
17462 TEST(F32_IGEMM_1X8__NEONFMA_DUP_LD64, k_lt_2_subtile) {
17463 TEST_REQUIRES_ARM_NEON_FMA;
17464 for (size_t k = 1; k < 2; k++) {
17465 for (uint32_t m = 1; m <= 1; m++) {
17466 for (uint32_t n = 1; n <= 8; n++) {
17467 GemmMicrokernelTester()
17468 .mr(1)
17469 .nr(8)
17470 .kr(1)
17471 .sr(1)
17472 .m(m)
17473 .n(n)
17474 .k(k)
17475 .iterations(1)
17476 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64);
17477 }
17478 }
17479 }
17480 }
17481
17482 TEST(F32_IGEMM_1X8__NEONFMA_DUP_LD64, k_gt_2) {
17483 TEST_REQUIRES_ARM_NEON_FMA;
17484 for (size_t k = 3; k < 4; k++) {
17485 GemmMicrokernelTester()
17486 .mr(1)
17487 .nr(8)
17488 .kr(1)
17489 .sr(1)
17490 .m(1)
17491 .n(8)
17492 .k(k)
17493 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64);
17494 }
17495 }
17496
17497 TEST(F32_IGEMM_1X8__NEONFMA_DUP_LD64, k_gt_2_subtile) {
17498 TEST_REQUIRES_ARM_NEON_FMA;
17499 for (size_t k = 3; k < 4; k++) {
17500 for (uint32_t m = 1; m <= 1; m++) {
17501 for (uint32_t n = 1; n <= 8; n++) {
17502 GemmMicrokernelTester()
17503 .mr(1)
17504 .nr(8)
17505 .kr(1)
17506 .sr(1)
17507 .m(m)
17508 .n(n)
17509 .k(k)
17510 .iterations(1)
17511 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64);
17512 }
17513 }
17514 }
17515 }
17516
17517 TEST(F32_IGEMM_1X8__NEONFMA_DUP_LD64, k_div_2) {
17518 TEST_REQUIRES_ARM_NEON_FMA;
17519 for (size_t k = 4; k <= 20; k += 2) {
17520 GemmMicrokernelTester()
17521 .mr(1)
17522 .nr(8)
17523 .kr(1)
17524 .sr(1)
17525 .m(1)
17526 .n(8)
17527 .k(k)
17528 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64);
17529 }
17530 }
17531
17532 TEST(F32_IGEMM_1X8__NEONFMA_DUP_LD64, k_div_2_subtile) {
17533 TEST_REQUIRES_ARM_NEON_FMA;
17534 for (size_t k = 4; k <= 20; k += 2) {
17535 for (uint32_t m = 1; m <= 1; m++) {
17536 for (uint32_t n = 1; n <= 8; n++) {
17537 GemmMicrokernelTester()
17538 .mr(1)
17539 .nr(8)
17540 .kr(1)
17541 .sr(1)
17542 .m(m)
17543 .n(n)
17544 .k(k)
17545 .iterations(1)
17546 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64);
17547 }
17548 }
17549 }
17550 }
17551
17552 TEST(F32_IGEMM_1X8__NEONFMA_DUP_LD64, n_gt_8) {
17553 TEST_REQUIRES_ARM_NEON_FMA;
17554 for (uint32_t n = 9; n < 16; n++) {
17555 for (size_t k = 1; k <= 10; k += 3) {
17556 GemmMicrokernelTester()
17557 .mr(1)
17558 .nr(8)
17559 .kr(1)
17560 .sr(1)
17561 .m(1)
17562 .n(8)
17563 .k(k)
17564 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64);
17565 }
17566 }
17567 }
17568
17569 TEST(F32_IGEMM_1X8__NEONFMA_DUP_LD64, n_gt_8_strided_cn) {
17570 TEST_REQUIRES_ARM_NEON_FMA;
17571 for (uint32_t n = 9; n < 16; n++) {
17572 for (size_t k = 1; k <= 10; k += 3) {
17573 GemmMicrokernelTester()
17574 .mr(1)
17575 .nr(8)
17576 .kr(1)
17577 .sr(1)
17578 .m(1)
17579 .n(8)
17580 .k(k)
17581 .cn_stride(11)
17582 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64);
17583 }
17584 }
17585 }
17586
17587 TEST(F32_IGEMM_1X8__NEONFMA_DUP_LD64, n_gt_8_subtile) {
17588 TEST_REQUIRES_ARM_NEON_FMA;
17589 for (uint32_t n = 9; n < 16; n++) {
17590 for (size_t k = 1; k <= 10; k += 3) {
17591 for (uint32_t m = 1; m <= 1; m++) {
17592 GemmMicrokernelTester()
17593 .mr(1)
17594 .nr(8)
17595 .kr(1)
17596 .sr(1)
17597 .m(m)
17598 .n(n)
17599 .k(k)
17600 .iterations(1)
17601 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64);
17602 }
17603 }
17604 }
17605 }
17606
17607 TEST(F32_IGEMM_1X8__NEONFMA_DUP_LD64, n_div_8) {
17608 TEST_REQUIRES_ARM_NEON_FMA;
17609 for (uint32_t n = 16; n <= 24; n += 8) {
17610 for (size_t k = 1; k <= 10; k += 3) {
17611 GemmMicrokernelTester()
17612 .mr(1)
17613 .nr(8)
17614 .kr(1)
17615 .sr(1)
17616 .m(1)
17617 .n(8)
17618 .k(k)
17619 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64);
17620 }
17621 }
17622 }
17623
17624 TEST(F32_IGEMM_1X8__NEONFMA_DUP_LD64, n_div_8_strided_cn) {
17625 TEST_REQUIRES_ARM_NEON_FMA;
17626 for (uint32_t n = 16; n <= 24; n += 8) {
17627 for (size_t k = 1; k <= 10; k += 3) {
17628 GemmMicrokernelTester()
17629 .mr(1)
17630 .nr(8)
17631 .kr(1)
17632 .sr(1)
17633 .m(1)
17634 .n(n)
17635 .k(k)
17636 .cn_stride(11)
17637 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64);
17638 }
17639 }
17640 }
17641
17642 TEST(F32_IGEMM_1X8__NEONFMA_DUP_LD64, n_div_8_subtile) {
17643 TEST_REQUIRES_ARM_NEON_FMA;
17644 for (uint32_t n = 16; n <= 24; n += 8) {
17645 for (size_t k = 1; k <= 10; k += 3) {
17646 for (uint32_t m = 1; m <= 1; m++) {
17647 GemmMicrokernelTester()
17648 .mr(1)
17649 .nr(8)
17650 .kr(1)
17651 .sr(1)
17652 .m(m)
17653 .n(n)
17654 .k(k)
17655 .iterations(1)
17656 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64);
17657 }
17658 }
17659 }
17660 }
17661
17662 TEST(F32_IGEMM_1X8__NEONFMA_DUP_LD64, small_kernel) {
17663 TEST_REQUIRES_ARM_NEON_FMA;
17664 for (size_t k = 1; k <= 10; k += 3) {
17665 GemmMicrokernelTester()
17666 .mr(1)
17667 .nr(8)
17668 .kr(1)
17669 .sr(1)
17670 .m(1)
17671 .n(8)
17672 .k(k)
17673 .ks(3)
17674 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64);
17675 }
17676 }
17677
17678 TEST(F32_IGEMM_1X8__NEONFMA_DUP_LD64, small_kernel_subtile) {
17679 TEST_REQUIRES_ARM_NEON_FMA;
17680 for (size_t k = 1; k <= 10; k += 3) {
17681 for (uint32_t m = 1; m <= 1; m++) {
17682 for (uint32_t n = 1; n <= 8; n++) {
17683 GemmMicrokernelTester()
17684 .mr(1)
17685 .nr(8)
17686 .kr(1)
17687 .sr(1)
17688 .m(m)
17689 .n(n)
17690 .k(k)
17691 .ks(3)
17692 .iterations(1)
17693 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64);
17694 }
17695 }
17696 }
17697 }
17698
17699 TEST(F32_IGEMM_1X8__NEONFMA_DUP_LD64, n_gt_8_small_kernel) {
17700 TEST_REQUIRES_ARM_NEON_FMA;
17701 for (uint32_t n = 9; n < 16; n++) {
17702 for (size_t k = 1; k <= 10; k += 3) {
17703 GemmMicrokernelTester()
17704 .mr(1)
17705 .nr(8)
17706 .kr(1)
17707 .sr(1)
17708 .m(1)
17709 .n(8)
17710 .k(k)
17711 .ks(3)
17712 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64);
17713 }
17714 }
17715 }
17716
17717 TEST(F32_IGEMM_1X8__NEONFMA_DUP_LD64, n_div_8_small_kernel) {
17718 TEST_REQUIRES_ARM_NEON_FMA;
17719 for (uint32_t n = 16; n <= 24; n += 8) {
17720 for (size_t k = 1; k <= 10; k += 3) {
17721 GemmMicrokernelTester()
17722 .mr(1)
17723 .nr(8)
17724 .kr(1)
17725 .sr(1)
17726 .m(1)
17727 .n(8)
17728 .k(k)
17729 .ks(3)
17730 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64);
17731 }
17732 }
17733 }
17734
17735 TEST(F32_IGEMM_1X8__NEONFMA_DUP_LD64, strided_cm_subtile) {
17736 TEST_REQUIRES_ARM_NEON_FMA;
17737 for (size_t k = 1; k <= 10; k += 3) {
17738 for (uint32_t m = 1; m <= 1; m++) {
17739 for (uint32_t n = 1; n <= 8; n++) {
17740 GemmMicrokernelTester()
17741 .mr(1)
17742 .nr(8)
17743 .kr(1)
17744 .sr(1)
17745 .m(m)
17746 .n(n)
17747 .k(k)
17748 .cm_stride(11)
17749 .iterations(1)
17750 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64);
17751 }
17752 }
17753 }
17754 }
17755
17756 TEST(F32_IGEMM_1X8__NEONFMA_DUP_LD64, a_offset) {
17757 TEST_REQUIRES_ARM_NEON_FMA;
17758 for (size_t k = 1; k <= 10; k += 3) {
17759 GemmMicrokernelTester()
17760 .mr(1)
17761 .nr(8)
17762 .kr(1)
17763 .sr(1)
17764 .m(1)
17765 .n(8)
17766 .k(k)
17767 .ks(3)
17768 .a_offset(13)
17769 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64);
17770 }
17771 }
17772
17773 TEST(F32_IGEMM_1X8__NEONFMA_DUP_LD64, zero) {
17774 TEST_REQUIRES_ARM_NEON_FMA;
17775 for (uint32_t mz = 0; mz < 1; mz++) {
17776 for (size_t k = 1; k <= 10; k += 3) {
17777 GemmMicrokernelTester()
17778 .mr(1)
17779 .nr(8)
17780 .kr(1)
17781 .sr(1)
17782 .m(1)
17783 .n(8)
17784 .k(k)
17785 .ks(3)
17786 .a_offset(13)
17787 .zero_index(mz)
17788 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64);
17789 }
17790 }
17791 }
17792
17793 TEST(F32_IGEMM_1X8__NEONFMA_DUP_LD64, qmin) {
17794 TEST_REQUIRES_ARM_NEON_FMA;
17795 GemmMicrokernelTester()
17796 .mr(1)
17797 .nr(8)
17798 .kr(1)
17799 .sr(1)
17800 .m(1)
17801 .n(8)
17802 .k(2)
17803 .qmin(128)
17804 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64);
17805 }
17806
17807 TEST(F32_IGEMM_1X8__NEONFMA_DUP_LD64, qmax) {
17808 TEST_REQUIRES_ARM_NEON_FMA;
17809 GemmMicrokernelTester()
17810 .mr(1)
17811 .nr(8)
17812 .kr(1)
17813 .sr(1)
17814 .m(1)
17815 .n(8)
17816 .k(2)
17817 .qmax(128)
17818 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64);
17819 }
17820
17821 TEST(F32_IGEMM_1X8__NEONFMA_DUP_LD64, strided_cm) {
17822 TEST_REQUIRES_ARM_NEON_FMA;
17823 GemmMicrokernelTester()
17824 .mr(1)
17825 .nr(8)
17826 .kr(1)
17827 .sr(1)
17828 .m(1)
17829 .n(8)
17830 .k(2)
17831 .cm_stride(11)
17832 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64);
17833 }
17834#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
17835
17836
17837#if XNN_ARCH_ARM || XNN_ARCH_ARM64
17838 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD128, k_eq_4) {
17839 TEST_REQUIRES_ARM_NEON_FMA;
17840 GemmMicrokernelTester()
17841 .mr(4)
17842 .nr(8)
17843 .kr(1)
17844 .sr(1)
17845 .m(4)
17846 .n(8)
17847 .k(4)
17848 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128);
17849 }
17850
17851 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD128, strided_cn) {
17852 TEST_REQUIRES_ARM_NEON_FMA;
17853 GemmMicrokernelTester()
17854 .mr(4)
17855 .nr(8)
17856 .kr(1)
17857 .sr(1)
17858 .m(4)
17859 .n(8)
17860 .k(4)
17861 .cn_stride(11)
17862 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128);
17863 }
17864
17865 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD128, k_eq_4_subtile) {
17866 TEST_REQUIRES_ARM_NEON_FMA;
17867 for (uint32_t m = 1; m <= 4; m++) {
17868 for (uint32_t n = 1; n <= 8; n++) {
17869 GemmMicrokernelTester()
17870 .mr(4)
17871 .nr(8)
17872 .kr(1)
17873 .sr(1)
17874 .m(m)
17875 .n(n)
17876 .k(4)
17877 .iterations(1)
17878 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128);
17879 }
17880 }
17881 }
17882
17883 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD128, k_eq_4_subtile_m) {
17884 TEST_REQUIRES_ARM_NEON_FMA;
17885 for (uint32_t m = 1; m <= 4; m++) {
17886 GemmMicrokernelTester()
17887 .mr(4)
17888 .nr(8)
17889 .kr(1)
17890 .sr(1)
17891 .m(m)
17892 .n(8)
17893 .k(4)
17894 .iterations(1)
17895 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128);
17896 }
17897 }
17898
17899 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD128, k_eq_4_subtile_n) {
17900 TEST_REQUIRES_ARM_NEON_FMA;
17901 for (uint32_t n = 1; n <= 8; n++) {
17902 GemmMicrokernelTester()
17903 .mr(4)
17904 .nr(8)
17905 .kr(1)
17906 .sr(1)
17907 .m(4)
17908 .n(n)
17909 .k(4)
17910 .iterations(1)
17911 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128);
17912 }
17913 }
17914
17915 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD128, k_lt_4) {
17916 TEST_REQUIRES_ARM_NEON_FMA;
17917 for (size_t k = 1; k < 4; k++) {
17918 GemmMicrokernelTester()
17919 .mr(4)
17920 .nr(8)
17921 .kr(1)
17922 .sr(1)
17923 .m(4)
17924 .n(8)
17925 .k(k)
17926 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128);
17927 }
17928 }
17929
17930 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD128, k_lt_4_subtile) {
17931 TEST_REQUIRES_ARM_NEON_FMA;
17932 for (size_t k = 1; k < 4; k++) {
17933 for (uint32_t m = 1; m <= 4; m++) {
17934 for (uint32_t n = 1; n <= 8; n++) {
17935 GemmMicrokernelTester()
17936 .mr(4)
17937 .nr(8)
17938 .kr(1)
17939 .sr(1)
17940 .m(m)
17941 .n(n)
17942 .k(k)
17943 .iterations(1)
17944 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128);
17945 }
17946 }
17947 }
17948 }
17949
17950 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD128, k_gt_4) {
17951 TEST_REQUIRES_ARM_NEON_FMA;
17952 for (size_t k = 5; k < 8; k++) {
17953 GemmMicrokernelTester()
17954 .mr(4)
17955 .nr(8)
17956 .kr(1)
17957 .sr(1)
17958 .m(4)
17959 .n(8)
17960 .k(k)
17961 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128);
17962 }
17963 }
17964
17965 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD128, k_gt_4_subtile) {
17966 TEST_REQUIRES_ARM_NEON_FMA;
17967 for (size_t k = 5; k < 8; k++) {
17968 for (uint32_t m = 1; m <= 4; m++) {
17969 for (uint32_t n = 1; n <= 8; n++) {
17970 GemmMicrokernelTester()
17971 .mr(4)
17972 .nr(8)
17973 .kr(1)
17974 .sr(1)
17975 .m(m)
17976 .n(n)
17977 .k(k)
17978 .iterations(1)
17979 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128);
17980 }
17981 }
17982 }
17983 }
17984
17985 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD128, k_div_4) {
17986 TEST_REQUIRES_ARM_NEON_FMA;
17987 for (size_t k = 8; k <= 40; k += 4) {
17988 GemmMicrokernelTester()
17989 .mr(4)
17990 .nr(8)
17991 .kr(1)
17992 .sr(1)
17993 .m(4)
17994 .n(8)
17995 .k(k)
17996 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128);
17997 }
17998 }
17999
18000 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD128, k_div_4_subtile) {
18001 TEST_REQUIRES_ARM_NEON_FMA;
18002 for (size_t k = 8; k <= 40; k += 4) {
18003 for (uint32_t m = 1; m <= 4; m++) {
18004 for (uint32_t n = 1; n <= 8; n++) {
18005 GemmMicrokernelTester()
18006 .mr(4)
18007 .nr(8)
18008 .kr(1)
18009 .sr(1)
18010 .m(m)
18011 .n(n)
18012 .k(k)
18013 .iterations(1)
18014 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128);
18015 }
18016 }
18017 }
18018 }
18019
18020 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD128, n_gt_8) {
18021 TEST_REQUIRES_ARM_NEON_FMA;
18022 for (uint32_t n = 9; n < 16; n++) {
18023 for (size_t k = 1; k <= 20; k += 5) {
18024 GemmMicrokernelTester()
18025 .mr(4)
18026 .nr(8)
18027 .kr(1)
18028 .sr(1)
18029 .m(4)
18030 .n(8)
18031 .k(k)
18032 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128);
18033 }
18034 }
18035 }
18036
18037 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD128, n_gt_8_strided_cn) {
18038 TEST_REQUIRES_ARM_NEON_FMA;
18039 for (uint32_t n = 9; n < 16; n++) {
18040 for (size_t k = 1; k <= 20; k += 5) {
18041 GemmMicrokernelTester()
18042 .mr(4)
18043 .nr(8)
18044 .kr(1)
18045 .sr(1)
18046 .m(4)
18047 .n(8)
18048 .k(k)
18049 .cn_stride(11)
18050 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128);
18051 }
18052 }
18053 }
18054
18055 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD128, n_gt_8_subtile) {
18056 TEST_REQUIRES_ARM_NEON_FMA;
18057 for (uint32_t n = 9; n < 16; n++) {
18058 for (size_t k = 1; k <= 20; k += 5) {
18059 for (uint32_t m = 1; m <= 4; m++) {
18060 GemmMicrokernelTester()
18061 .mr(4)
18062 .nr(8)
18063 .kr(1)
18064 .sr(1)
18065 .m(m)
18066 .n(n)
18067 .k(k)
18068 .iterations(1)
18069 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128);
18070 }
18071 }
18072 }
18073 }
18074
18075 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD128, n_div_8) {
18076 TEST_REQUIRES_ARM_NEON_FMA;
18077 for (uint32_t n = 16; n <= 24; n += 8) {
18078 for (size_t k = 1; k <= 20; k += 5) {
18079 GemmMicrokernelTester()
18080 .mr(4)
18081 .nr(8)
18082 .kr(1)
18083 .sr(1)
18084 .m(4)
18085 .n(8)
18086 .k(k)
18087 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128);
18088 }
18089 }
18090 }
18091
18092 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD128, n_div_8_strided_cn) {
18093 TEST_REQUIRES_ARM_NEON_FMA;
18094 for (uint32_t n = 16; n <= 24; n += 8) {
18095 for (size_t k = 1; k <= 20; k += 5) {
18096 GemmMicrokernelTester()
18097 .mr(4)
18098 .nr(8)
18099 .kr(1)
18100 .sr(1)
18101 .m(4)
18102 .n(n)
18103 .k(k)
18104 .cn_stride(11)
18105 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128);
18106 }
18107 }
18108 }
18109
18110 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD128, n_div_8_subtile) {
18111 TEST_REQUIRES_ARM_NEON_FMA;
18112 for (uint32_t n = 16; n <= 24; n += 8) {
18113 for (size_t k = 1; k <= 20; k += 5) {
18114 for (uint32_t m = 1; m <= 4; m++) {
18115 GemmMicrokernelTester()
18116 .mr(4)
18117 .nr(8)
18118 .kr(1)
18119 .sr(1)
18120 .m(m)
18121 .n(n)
18122 .k(k)
18123 .iterations(1)
18124 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128);
18125 }
18126 }
18127 }
18128 }
18129
18130 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD128, small_kernel) {
18131 TEST_REQUIRES_ARM_NEON_FMA;
18132 for (size_t k = 1; k <= 20; k += 5) {
18133 GemmMicrokernelTester()
18134 .mr(4)
18135 .nr(8)
18136 .kr(1)
18137 .sr(1)
18138 .m(4)
18139 .n(8)
18140 .k(k)
18141 .ks(3)
18142 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128);
18143 }
18144 }
18145
18146 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD128, small_kernel_subtile) {
18147 TEST_REQUIRES_ARM_NEON_FMA;
18148 for (size_t k = 1; k <= 20; k += 5) {
18149 for (uint32_t m = 1; m <= 4; m++) {
18150 for (uint32_t n = 1; n <= 8; n++) {
18151 GemmMicrokernelTester()
18152 .mr(4)
18153 .nr(8)
18154 .kr(1)
18155 .sr(1)
18156 .m(m)
18157 .n(n)
18158 .k(k)
18159 .ks(3)
18160 .iterations(1)
18161 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128);
18162 }
18163 }
18164 }
18165 }
18166
18167 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD128, n_gt_8_small_kernel) {
18168 TEST_REQUIRES_ARM_NEON_FMA;
18169 for (uint32_t n = 9; n < 16; n++) {
18170 for (size_t k = 1; k <= 20; k += 5) {
18171 GemmMicrokernelTester()
18172 .mr(4)
18173 .nr(8)
18174 .kr(1)
18175 .sr(1)
18176 .m(4)
18177 .n(8)
18178 .k(k)
18179 .ks(3)
18180 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128);
18181 }
18182 }
18183 }
18184
18185 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD128, n_div_8_small_kernel) {
18186 TEST_REQUIRES_ARM_NEON_FMA;
18187 for (uint32_t n = 16; n <= 24; n += 8) {
18188 for (size_t k = 1; k <= 20; k += 5) {
18189 GemmMicrokernelTester()
18190 .mr(4)
18191 .nr(8)
18192 .kr(1)
18193 .sr(1)
18194 .m(4)
18195 .n(8)
18196 .k(k)
18197 .ks(3)
18198 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128);
18199 }
18200 }
18201 }
18202
18203 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD128, strided_cm_subtile) {
18204 TEST_REQUIRES_ARM_NEON_FMA;
18205 for (size_t k = 1; k <= 20; k += 5) {
18206 for (uint32_t m = 1; m <= 4; m++) {
18207 for (uint32_t n = 1; n <= 8; n++) {
18208 GemmMicrokernelTester()
18209 .mr(4)
18210 .nr(8)
18211 .kr(1)
18212 .sr(1)
18213 .m(m)
18214 .n(n)
18215 .k(k)
18216 .cm_stride(11)
18217 .iterations(1)
18218 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128);
18219 }
18220 }
18221 }
18222 }
18223
18224 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD128, a_offset) {
18225 TEST_REQUIRES_ARM_NEON_FMA;
18226 for (size_t k = 1; k <= 20; k += 5) {
18227 GemmMicrokernelTester()
18228 .mr(4)
18229 .nr(8)
18230 .kr(1)
18231 .sr(1)
18232 .m(4)
18233 .n(8)
18234 .k(k)
18235 .ks(3)
18236 .a_offset(83)
18237 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128);
18238 }
18239 }
18240
18241 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD128, zero) {
18242 TEST_REQUIRES_ARM_NEON_FMA;
18243 for (uint32_t mz = 0; mz < 4; mz++) {
18244 for (size_t k = 1; k <= 20; k += 5) {
18245 GemmMicrokernelTester()
18246 .mr(4)
18247 .nr(8)
18248 .kr(1)
18249 .sr(1)
18250 .m(4)
18251 .n(8)
18252 .k(k)
18253 .ks(3)
18254 .a_offset(83)
18255 .zero_index(mz)
18256 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128);
18257 }
18258 }
18259 }
18260
18261 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD128, qmin) {
18262 TEST_REQUIRES_ARM_NEON_FMA;
18263 GemmMicrokernelTester()
18264 .mr(4)
18265 .nr(8)
18266 .kr(1)
18267 .sr(1)
18268 .m(4)
18269 .n(8)
18270 .k(4)
18271 .qmin(128)
18272 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128);
18273 }
18274
18275 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD128, qmax) {
18276 TEST_REQUIRES_ARM_NEON_FMA;
18277 GemmMicrokernelTester()
18278 .mr(4)
18279 .nr(8)
18280 .kr(1)
18281 .sr(1)
18282 .m(4)
18283 .n(8)
18284 .k(4)
18285 .qmax(128)
18286 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128);
18287 }
18288
18289 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD128, strided_cm) {
18290 TEST_REQUIRES_ARM_NEON_FMA;
18291 GemmMicrokernelTester()
18292 .mr(4)
18293 .nr(8)
18294 .kr(1)
18295 .sr(1)
18296 .m(4)
18297 .n(8)
18298 .k(4)
18299 .cm_stride(11)
18300 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128);
18301 }
18302#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
18303
18304
18305#if XNN_ARCH_ARM || XNN_ARCH_ARM64
18306 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD64, k_eq_2) {
18307 TEST_REQUIRES_ARM_NEON_FMA;
18308 GemmMicrokernelTester()
18309 .mr(4)
18310 .nr(8)
18311 .kr(1)
18312 .sr(1)
18313 .m(4)
18314 .n(8)
18315 .k(2)
18316 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64);
18317 }
18318
18319 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD64, strided_cn) {
18320 TEST_REQUIRES_ARM_NEON_FMA;
18321 GemmMicrokernelTester()
18322 .mr(4)
18323 .nr(8)
18324 .kr(1)
18325 .sr(1)
18326 .m(4)
18327 .n(8)
18328 .k(2)
18329 .cn_stride(11)
18330 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64);
18331 }
18332
18333 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD64, k_eq_2_subtile) {
18334 TEST_REQUIRES_ARM_NEON_FMA;
18335 for (uint32_t m = 1; m <= 4; m++) {
18336 for (uint32_t n = 1; n <= 8; n++) {
18337 GemmMicrokernelTester()
18338 .mr(4)
18339 .nr(8)
18340 .kr(1)
18341 .sr(1)
18342 .m(m)
18343 .n(n)
18344 .k(2)
18345 .iterations(1)
18346 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64);
18347 }
18348 }
18349 }
18350
18351 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD64, k_eq_2_subtile_m) {
18352 TEST_REQUIRES_ARM_NEON_FMA;
18353 for (uint32_t m = 1; m <= 4; m++) {
18354 GemmMicrokernelTester()
18355 .mr(4)
18356 .nr(8)
18357 .kr(1)
18358 .sr(1)
18359 .m(m)
18360 .n(8)
18361 .k(2)
18362 .iterations(1)
18363 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64);
18364 }
18365 }
18366
18367 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD64, k_eq_2_subtile_n) {
18368 TEST_REQUIRES_ARM_NEON_FMA;
18369 for (uint32_t n = 1; n <= 8; n++) {
18370 GemmMicrokernelTester()
18371 .mr(4)
18372 .nr(8)
18373 .kr(1)
18374 .sr(1)
18375 .m(4)
18376 .n(n)
18377 .k(2)
18378 .iterations(1)
18379 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64);
18380 }
18381 }
18382
18383 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD64, k_lt_2) {
18384 TEST_REQUIRES_ARM_NEON_FMA;
18385 for (size_t k = 1; k < 2; k++) {
18386 GemmMicrokernelTester()
18387 .mr(4)
18388 .nr(8)
18389 .kr(1)
18390 .sr(1)
18391 .m(4)
18392 .n(8)
18393 .k(k)
18394 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64);
18395 }
18396 }
18397
18398 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD64, k_lt_2_subtile) {
18399 TEST_REQUIRES_ARM_NEON_FMA;
18400 for (size_t k = 1; k < 2; k++) {
18401 for (uint32_t m = 1; m <= 4; m++) {
18402 for (uint32_t n = 1; n <= 8; n++) {
18403 GemmMicrokernelTester()
18404 .mr(4)
18405 .nr(8)
18406 .kr(1)
18407 .sr(1)
18408 .m(m)
18409 .n(n)
18410 .k(k)
18411 .iterations(1)
18412 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64);
18413 }
18414 }
18415 }
18416 }
18417
18418 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD64, k_gt_2) {
18419 TEST_REQUIRES_ARM_NEON_FMA;
18420 for (size_t k = 3; k < 4; k++) {
18421 GemmMicrokernelTester()
18422 .mr(4)
18423 .nr(8)
18424 .kr(1)
18425 .sr(1)
18426 .m(4)
18427 .n(8)
18428 .k(k)
18429 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64);
18430 }
18431 }
18432
18433 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD64, k_gt_2_subtile) {
18434 TEST_REQUIRES_ARM_NEON_FMA;
18435 for (size_t k = 3; k < 4; k++) {
18436 for (uint32_t m = 1; m <= 4; m++) {
18437 for (uint32_t n = 1; n <= 8; n++) {
18438 GemmMicrokernelTester()
18439 .mr(4)
18440 .nr(8)
18441 .kr(1)
18442 .sr(1)
18443 .m(m)
18444 .n(n)
18445 .k(k)
18446 .iterations(1)
18447 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64);
18448 }
18449 }
18450 }
18451 }
18452
18453 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD64, k_div_2) {
18454 TEST_REQUIRES_ARM_NEON_FMA;
18455 for (size_t k = 4; k <= 20; k += 2) {
18456 GemmMicrokernelTester()
18457 .mr(4)
18458 .nr(8)
18459 .kr(1)
18460 .sr(1)
18461 .m(4)
18462 .n(8)
18463 .k(k)
18464 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64);
18465 }
18466 }
18467
18468 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD64, k_div_2_subtile) {
18469 TEST_REQUIRES_ARM_NEON_FMA;
18470 for (size_t k = 4; k <= 20; k += 2) {
18471 for (uint32_t m = 1; m <= 4; m++) {
18472 for (uint32_t n = 1; n <= 8; n++) {
18473 GemmMicrokernelTester()
18474 .mr(4)
18475 .nr(8)
18476 .kr(1)
18477 .sr(1)
18478 .m(m)
18479 .n(n)
18480 .k(k)
18481 .iterations(1)
18482 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64);
18483 }
18484 }
18485 }
18486 }
18487
18488 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD64, n_gt_8) {
18489 TEST_REQUIRES_ARM_NEON_FMA;
18490 for (uint32_t n = 9; n < 16; n++) {
18491 for (size_t k = 1; k <= 10; k += 3) {
18492 GemmMicrokernelTester()
18493 .mr(4)
18494 .nr(8)
18495 .kr(1)
18496 .sr(1)
18497 .m(4)
18498 .n(8)
18499 .k(k)
18500 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64);
18501 }
18502 }
18503 }
18504
18505 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD64, n_gt_8_strided_cn) {
18506 TEST_REQUIRES_ARM_NEON_FMA;
18507 for (uint32_t n = 9; n < 16; n++) {
18508 for (size_t k = 1; k <= 10; k += 3) {
18509 GemmMicrokernelTester()
18510 .mr(4)
18511 .nr(8)
18512 .kr(1)
18513 .sr(1)
18514 .m(4)
18515 .n(8)
18516 .k(k)
18517 .cn_stride(11)
18518 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64);
18519 }
18520 }
18521 }
18522
18523 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD64, n_gt_8_subtile) {
18524 TEST_REQUIRES_ARM_NEON_FMA;
18525 for (uint32_t n = 9; n < 16; n++) {
18526 for (size_t k = 1; k <= 10; k += 3) {
18527 for (uint32_t m = 1; m <= 4; m++) {
18528 GemmMicrokernelTester()
18529 .mr(4)
18530 .nr(8)
18531 .kr(1)
18532 .sr(1)
18533 .m(m)
18534 .n(n)
18535 .k(k)
18536 .iterations(1)
18537 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64);
18538 }
18539 }
18540 }
18541 }
18542
18543 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD64, n_div_8) {
18544 TEST_REQUIRES_ARM_NEON_FMA;
18545 for (uint32_t n = 16; n <= 24; n += 8) {
18546 for (size_t k = 1; k <= 10; k += 3) {
18547 GemmMicrokernelTester()
18548 .mr(4)
18549 .nr(8)
18550 .kr(1)
18551 .sr(1)
18552 .m(4)
18553 .n(8)
18554 .k(k)
18555 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64);
18556 }
18557 }
18558 }
18559
18560 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD64, n_div_8_strided_cn) {
18561 TEST_REQUIRES_ARM_NEON_FMA;
18562 for (uint32_t n = 16; n <= 24; n += 8) {
18563 for (size_t k = 1; k <= 10; k += 3) {
18564 GemmMicrokernelTester()
18565 .mr(4)
18566 .nr(8)
18567 .kr(1)
18568 .sr(1)
18569 .m(4)
18570 .n(n)
18571 .k(k)
18572 .cn_stride(11)
18573 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64);
18574 }
18575 }
18576 }
18577
18578 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD64, n_div_8_subtile) {
18579 TEST_REQUIRES_ARM_NEON_FMA;
18580 for (uint32_t n = 16; n <= 24; n += 8) {
18581 for (size_t k = 1; k <= 10; k += 3) {
18582 for (uint32_t m = 1; m <= 4; m++) {
18583 GemmMicrokernelTester()
18584 .mr(4)
18585 .nr(8)
18586 .kr(1)
18587 .sr(1)
18588 .m(m)
18589 .n(n)
18590 .k(k)
18591 .iterations(1)
18592 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64);
18593 }
18594 }
18595 }
18596 }
18597
18598 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD64, small_kernel) {
18599 TEST_REQUIRES_ARM_NEON_FMA;
18600 for (size_t k = 1; k <= 10; k += 3) {
18601 GemmMicrokernelTester()
18602 .mr(4)
18603 .nr(8)
18604 .kr(1)
18605 .sr(1)
18606 .m(4)
18607 .n(8)
18608 .k(k)
18609 .ks(3)
18610 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64);
18611 }
18612 }
18613
18614 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD64, small_kernel_subtile) {
18615 TEST_REQUIRES_ARM_NEON_FMA;
18616 for (size_t k = 1; k <= 10; k += 3) {
18617 for (uint32_t m = 1; m <= 4; m++) {
18618 for (uint32_t n = 1; n <= 8; n++) {
18619 GemmMicrokernelTester()
18620 .mr(4)
18621 .nr(8)
18622 .kr(1)
18623 .sr(1)
18624 .m(m)
18625 .n(n)
18626 .k(k)
18627 .ks(3)
18628 .iterations(1)
18629 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64);
18630 }
18631 }
18632 }
18633 }
18634
18635 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD64, n_gt_8_small_kernel) {
18636 TEST_REQUIRES_ARM_NEON_FMA;
18637 for (uint32_t n = 9; n < 16; n++) {
18638 for (size_t k = 1; k <= 10; k += 3) {
18639 GemmMicrokernelTester()
18640 .mr(4)
18641 .nr(8)
18642 .kr(1)
18643 .sr(1)
18644 .m(4)
18645 .n(8)
18646 .k(k)
18647 .ks(3)
18648 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64);
18649 }
18650 }
18651 }
18652
18653 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD64, n_div_8_small_kernel) {
18654 TEST_REQUIRES_ARM_NEON_FMA;
18655 for (uint32_t n = 16; n <= 24; n += 8) {
18656 for (size_t k = 1; k <= 10; k += 3) {
18657 GemmMicrokernelTester()
18658 .mr(4)
18659 .nr(8)
18660 .kr(1)
18661 .sr(1)
18662 .m(4)
18663 .n(8)
18664 .k(k)
18665 .ks(3)
18666 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64);
18667 }
18668 }
18669 }
18670
18671 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD64, strided_cm_subtile) {
18672 TEST_REQUIRES_ARM_NEON_FMA;
18673 for (size_t k = 1; k <= 10; k += 3) {
18674 for (uint32_t m = 1; m <= 4; m++) {
18675 for (uint32_t n = 1; n <= 8; n++) {
18676 GemmMicrokernelTester()
18677 .mr(4)
18678 .nr(8)
18679 .kr(1)
18680 .sr(1)
18681 .m(m)
18682 .n(n)
18683 .k(k)
18684 .cm_stride(11)
18685 .iterations(1)
18686 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64);
18687 }
18688 }
18689 }
18690 }
18691
18692 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD64, a_offset) {
18693 TEST_REQUIRES_ARM_NEON_FMA;
18694 for (size_t k = 1; k <= 10; k += 3) {
18695 GemmMicrokernelTester()
18696 .mr(4)
18697 .nr(8)
18698 .kr(1)
18699 .sr(1)
18700 .m(4)
18701 .n(8)
18702 .k(k)
18703 .ks(3)
18704 .a_offset(43)
18705 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64);
18706 }
18707 }
18708
18709 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD64, zero) {
18710 TEST_REQUIRES_ARM_NEON_FMA;
18711 for (uint32_t mz = 0; mz < 4; mz++) {
18712 for (size_t k = 1; k <= 10; k += 3) {
18713 GemmMicrokernelTester()
18714 .mr(4)
18715 .nr(8)
18716 .kr(1)
18717 .sr(1)
18718 .m(4)
18719 .n(8)
18720 .k(k)
18721 .ks(3)
18722 .a_offset(43)
18723 .zero_index(mz)
18724 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64);
18725 }
18726 }
18727 }
18728
18729 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD64, qmin) {
18730 TEST_REQUIRES_ARM_NEON_FMA;
18731 GemmMicrokernelTester()
18732 .mr(4)
18733 .nr(8)
18734 .kr(1)
18735 .sr(1)
18736 .m(4)
18737 .n(8)
18738 .k(2)
18739 .qmin(128)
18740 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64);
18741 }
18742
18743 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD64, qmax) {
18744 TEST_REQUIRES_ARM_NEON_FMA;
18745 GemmMicrokernelTester()
18746 .mr(4)
18747 .nr(8)
18748 .kr(1)
18749 .sr(1)
18750 .m(4)
18751 .n(8)
18752 .k(2)
18753 .qmax(128)
18754 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64);
18755 }
18756
18757 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD64, strided_cm) {
18758 TEST_REQUIRES_ARM_NEON_FMA;
18759 GemmMicrokernelTester()
18760 .mr(4)
18761 .nr(8)
18762 .kr(1)
18763 .sr(1)
18764 .m(4)
18765 .n(8)
18766 .k(2)
18767 .cm_stride(11)
18768 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64);
18769 }
18770#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
18771
18772
18773#if XNN_ARCH_ARM || XNN_ARCH_ARM64
18774 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD64, k_eq_2) {
18775 TEST_REQUIRES_ARM_NEON_FMA;
18776 GemmMicrokernelTester()
18777 .mr(6)
18778 .nr(8)
18779 .kr(1)
18780 .sr(1)
18781 .m(6)
18782 .n(8)
18783 .k(2)
18784 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64);
18785 }
18786
18787 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD64, strided_cn) {
18788 TEST_REQUIRES_ARM_NEON_FMA;
18789 GemmMicrokernelTester()
18790 .mr(6)
18791 .nr(8)
18792 .kr(1)
18793 .sr(1)
18794 .m(6)
18795 .n(8)
18796 .k(2)
18797 .cn_stride(11)
18798 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64);
18799 }
18800
18801 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD64, k_eq_2_subtile) {
18802 TEST_REQUIRES_ARM_NEON_FMA;
18803 for (uint32_t m = 1; m <= 6; m++) {
18804 for (uint32_t n = 1; n <= 8; n++) {
18805 GemmMicrokernelTester()
18806 .mr(6)
18807 .nr(8)
18808 .kr(1)
18809 .sr(1)
18810 .m(m)
18811 .n(n)
18812 .k(2)
18813 .iterations(1)
18814 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64);
18815 }
18816 }
18817 }
18818
18819 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD64, k_eq_2_subtile_m) {
18820 TEST_REQUIRES_ARM_NEON_FMA;
18821 for (uint32_t m = 1; m <= 6; m++) {
18822 GemmMicrokernelTester()
18823 .mr(6)
18824 .nr(8)
18825 .kr(1)
18826 .sr(1)
18827 .m(m)
18828 .n(8)
18829 .k(2)
18830 .iterations(1)
18831 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64);
18832 }
18833 }
18834
18835 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD64, k_eq_2_subtile_n) {
18836 TEST_REQUIRES_ARM_NEON_FMA;
18837 for (uint32_t n = 1; n <= 8; n++) {
18838 GemmMicrokernelTester()
18839 .mr(6)
18840 .nr(8)
18841 .kr(1)
18842 .sr(1)
18843 .m(6)
18844 .n(n)
18845 .k(2)
18846 .iterations(1)
18847 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64);
18848 }
18849 }
18850
18851 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD64, k_lt_2) {
18852 TEST_REQUIRES_ARM_NEON_FMA;
18853 for (size_t k = 1; k < 2; k++) {
18854 GemmMicrokernelTester()
18855 .mr(6)
18856 .nr(8)
18857 .kr(1)
18858 .sr(1)
18859 .m(6)
18860 .n(8)
18861 .k(k)
18862 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64);
18863 }
18864 }
18865
18866 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD64, k_lt_2_subtile) {
18867 TEST_REQUIRES_ARM_NEON_FMA;
18868 for (size_t k = 1; k < 2; k++) {
18869 for (uint32_t m = 1; m <= 6; m++) {
18870 for (uint32_t n = 1; n <= 8; n++) {
18871 GemmMicrokernelTester()
18872 .mr(6)
18873 .nr(8)
18874 .kr(1)
18875 .sr(1)
18876 .m(m)
18877 .n(n)
18878 .k(k)
18879 .iterations(1)
18880 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64);
18881 }
18882 }
18883 }
18884 }
18885
18886 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD64, k_gt_2) {
18887 TEST_REQUIRES_ARM_NEON_FMA;
18888 for (size_t k = 3; k < 4; k++) {
18889 GemmMicrokernelTester()
18890 .mr(6)
18891 .nr(8)
18892 .kr(1)
18893 .sr(1)
18894 .m(6)
18895 .n(8)
18896 .k(k)
18897 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64);
18898 }
18899 }
18900
18901 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD64, k_gt_2_subtile) {
18902 TEST_REQUIRES_ARM_NEON_FMA;
18903 for (size_t k = 3; k < 4; k++) {
18904 for (uint32_t m = 1; m <= 6; m++) {
18905 for (uint32_t n = 1; n <= 8; n++) {
18906 GemmMicrokernelTester()
18907 .mr(6)
18908 .nr(8)
18909 .kr(1)
18910 .sr(1)
18911 .m(m)
18912 .n(n)
18913 .k(k)
18914 .iterations(1)
18915 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64);
18916 }
18917 }
18918 }
18919 }
18920
18921 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD64, k_div_2) {
18922 TEST_REQUIRES_ARM_NEON_FMA;
18923 for (size_t k = 4; k <= 20; k += 2) {
18924 GemmMicrokernelTester()
18925 .mr(6)
18926 .nr(8)
18927 .kr(1)
18928 .sr(1)
18929 .m(6)
18930 .n(8)
18931 .k(k)
18932 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64);
18933 }
18934 }
18935
18936 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD64, k_div_2_subtile) {
18937 TEST_REQUIRES_ARM_NEON_FMA;
18938 for (size_t k = 4; k <= 20; k += 2) {
18939 for (uint32_t m = 1; m <= 6; m++) {
18940 for (uint32_t n = 1; n <= 8; n++) {
18941 GemmMicrokernelTester()
18942 .mr(6)
18943 .nr(8)
18944 .kr(1)
18945 .sr(1)
18946 .m(m)
18947 .n(n)
18948 .k(k)
18949 .iterations(1)
18950 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64);
18951 }
18952 }
18953 }
18954 }
18955
18956 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD64, n_gt_8) {
18957 TEST_REQUIRES_ARM_NEON_FMA;
18958 for (uint32_t n = 9; n < 16; n++) {
18959 for (size_t k = 1; k <= 10; k += 3) {
18960 GemmMicrokernelTester()
18961 .mr(6)
18962 .nr(8)
18963 .kr(1)
18964 .sr(1)
18965 .m(6)
18966 .n(8)
18967 .k(k)
18968 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64);
18969 }
18970 }
18971 }
18972
18973 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD64, n_gt_8_strided_cn) {
18974 TEST_REQUIRES_ARM_NEON_FMA;
18975 for (uint32_t n = 9; n < 16; n++) {
18976 for (size_t k = 1; k <= 10; k += 3) {
18977 GemmMicrokernelTester()
18978 .mr(6)
18979 .nr(8)
18980 .kr(1)
18981 .sr(1)
18982 .m(6)
18983 .n(8)
18984 .k(k)
18985 .cn_stride(11)
18986 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64);
18987 }
18988 }
18989 }
18990
18991 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD64, n_gt_8_subtile) {
18992 TEST_REQUIRES_ARM_NEON_FMA;
18993 for (uint32_t n = 9; n < 16; n++) {
18994 for (size_t k = 1; k <= 10; k += 3) {
18995 for (uint32_t m = 1; m <= 6; m++) {
18996 GemmMicrokernelTester()
18997 .mr(6)
18998 .nr(8)
18999 .kr(1)
19000 .sr(1)
19001 .m(m)
19002 .n(n)
19003 .k(k)
19004 .iterations(1)
19005 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64);
19006 }
19007 }
19008 }
19009 }
19010
19011 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD64, n_div_8) {
19012 TEST_REQUIRES_ARM_NEON_FMA;
19013 for (uint32_t n = 16; n <= 24; n += 8) {
19014 for (size_t k = 1; k <= 10; k += 3) {
19015 GemmMicrokernelTester()
19016 .mr(6)
19017 .nr(8)
19018 .kr(1)
19019 .sr(1)
19020 .m(6)
19021 .n(8)
19022 .k(k)
19023 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64);
19024 }
19025 }
19026 }
19027
19028 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD64, n_div_8_strided_cn) {
19029 TEST_REQUIRES_ARM_NEON_FMA;
19030 for (uint32_t n = 16; n <= 24; n += 8) {
19031 for (size_t k = 1; k <= 10; k += 3) {
19032 GemmMicrokernelTester()
19033 .mr(6)
19034 .nr(8)
19035 .kr(1)
19036 .sr(1)
19037 .m(6)
19038 .n(n)
19039 .k(k)
19040 .cn_stride(11)
19041 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64);
19042 }
19043 }
19044 }
19045
19046 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD64, n_div_8_subtile) {
19047 TEST_REQUIRES_ARM_NEON_FMA;
19048 for (uint32_t n = 16; n <= 24; n += 8) {
19049 for (size_t k = 1; k <= 10; k += 3) {
19050 for (uint32_t m = 1; m <= 6; m++) {
19051 GemmMicrokernelTester()
19052 .mr(6)
19053 .nr(8)
19054 .kr(1)
19055 .sr(1)
19056 .m(m)
19057 .n(n)
19058 .k(k)
19059 .iterations(1)
19060 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64);
19061 }
19062 }
19063 }
19064 }
19065
19066 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD64, small_kernel) {
19067 TEST_REQUIRES_ARM_NEON_FMA;
19068 for (size_t k = 1; k <= 10; k += 3) {
19069 GemmMicrokernelTester()
19070 .mr(6)
19071 .nr(8)
19072 .kr(1)
19073 .sr(1)
19074 .m(6)
19075 .n(8)
19076 .k(k)
19077 .ks(3)
19078 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64);
19079 }
19080 }
19081
19082 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD64, small_kernel_subtile) {
19083 TEST_REQUIRES_ARM_NEON_FMA;
19084 for (size_t k = 1; k <= 10; k += 3) {
19085 for (uint32_t m = 1; m <= 6; m++) {
19086 for (uint32_t n = 1; n <= 8; n++) {
19087 GemmMicrokernelTester()
19088 .mr(6)
19089 .nr(8)
19090 .kr(1)
19091 .sr(1)
19092 .m(m)
19093 .n(n)
19094 .k(k)
19095 .ks(3)
19096 .iterations(1)
19097 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64);
19098 }
19099 }
19100 }
19101 }
19102
19103 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD64, n_gt_8_small_kernel) {
19104 TEST_REQUIRES_ARM_NEON_FMA;
19105 for (uint32_t n = 9; n < 16; n++) {
19106 for (size_t k = 1; k <= 10; k += 3) {
19107 GemmMicrokernelTester()
19108 .mr(6)
19109 .nr(8)
19110 .kr(1)
19111 .sr(1)
19112 .m(6)
19113 .n(8)
19114 .k(k)
19115 .ks(3)
19116 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64);
19117 }
19118 }
19119 }
19120
19121 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD64, n_div_8_small_kernel) {
19122 TEST_REQUIRES_ARM_NEON_FMA;
19123 for (uint32_t n = 16; n <= 24; n += 8) {
19124 for (size_t k = 1; k <= 10; k += 3) {
19125 GemmMicrokernelTester()
19126 .mr(6)
19127 .nr(8)
19128 .kr(1)
19129 .sr(1)
19130 .m(6)
19131 .n(8)
19132 .k(k)
19133 .ks(3)
19134 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64);
19135 }
19136 }
19137 }
19138
19139 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD64, strided_cm_subtile) {
19140 TEST_REQUIRES_ARM_NEON_FMA;
19141 for (size_t k = 1; k <= 10; k += 3) {
19142 for (uint32_t m = 1; m <= 6; m++) {
19143 for (uint32_t n = 1; n <= 8; n++) {
19144 GemmMicrokernelTester()
19145 .mr(6)
19146 .nr(8)
19147 .kr(1)
19148 .sr(1)
19149 .m(m)
19150 .n(n)
19151 .k(k)
19152 .cm_stride(11)
19153 .iterations(1)
19154 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64);
19155 }
19156 }
19157 }
19158 }
19159
19160 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD64, a_offset) {
19161 TEST_REQUIRES_ARM_NEON_FMA;
19162 for (size_t k = 1; k <= 10; k += 3) {
19163 GemmMicrokernelTester()
19164 .mr(6)
19165 .nr(8)
19166 .kr(1)
19167 .sr(1)
19168 .m(6)
19169 .n(8)
19170 .k(k)
19171 .ks(3)
19172 .a_offset(67)
19173 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64);
19174 }
19175 }
19176
19177 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD64, zero) {
19178 TEST_REQUIRES_ARM_NEON_FMA;
19179 for (uint32_t mz = 0; mz < 6; mz++) {
19180 for (size_t k = 1; k <= 10; k += 3) {
19181 GemmMicrokernelTester()
19182 .mr(6)
19183 .nr(8)
19184 .kr(1)
19185 .sr(1)
19186 .m(6)
19187 .n(8)
19188 .k(k)
19189 .ks(3)
19190 .a_offset(67)
19191 .zero_index(mz)
19192 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64);
19193 }
19194 }
19195 }
19196
19197 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD64, qmin) {
19198 TEST_REQUIRES_ARM_NEON_FMA;
19199 GemmMicrokernelTester()
19200 .mr(6)
19201 .nr(8)
19202 .kr(1)
19203 .sr(1)
19204 .m(6)
19205 .n(8)
19206 .k(2)
19207 .qmin(128)
19208 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64);
19209 }
19210
19211 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD64, qmax) {
19212 TEST_REQUIRES_ARM_NEON_FMA;
19213 GemmMicrokernelTester()
19214 .mr(6)
19215 .nr(8)
19216 .kr(1)
19217 .sr(1)
19218 .m(6)
19219 .n(8)
19220 .k(2)
19221 .qmax(128)
19222 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64);
19223 }
19224
19225 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD64, strided_cm) {
19226 TEST_REQUIRES_ARM_NEON_FMA;
19227 GemmMicrokernelTester()
19228 .mr(6)
19229 .nr(8)
19230 .kr(1)
19231 .sr(1)
19232 .m(6)
19233 .n(8)
19234 .k(2)
19235 .cm_stride(11)
19236 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64);
19237 }
19238#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
19239
19240
19241#if XNN_ARCH_ARM || XNN_ARCH_ARM64
Frank Barchard69172d92019-11-26 16:22:39 -080019242 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD128, k_eq_4) {
19243 TEST_REQUIRES_ARM_NEON_FMA;
19244 GemmMicrokernelTester()
19245 .mr(6)
19246 .nr(8)
19247 .kr(1)
19248 .sr(1)
19249 .m(6)
19250 .n(8)
19251 .k(4)
19252 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128);
19253 }
19254
19255 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD128, strided_cn) {
19256 TEST_REQUIRES_ARM_NEON_FMA;
19257 GemmMicrokernelTester()
19258 .mr(6)
19259 .nr(8)
19260 .kr(1)
19261 .sr(1)
19262 .m(6)
19263 .n(8)
19264 .k(4)
19265 .cn_stride(11)
19266 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128);
19267 }
19268
19269 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD128, k_eq_4_subtile) {
19270 TEST_REQUIRES_ARM_NEON_FMA;
19271 for (uint32_t m = 1; m <= 6; m++) {
19272 for (uint32_t n = 1; n <= 8; n++) {
19273 GemmMicrokernelTester()
19274 .mr(6)
19275 .nr(8)
19276 .kr(1)
19277 .sr(1)
19278 .m(m)
19279 .n(n)
19280 .k(4)
19281 .iterations(1)
19282 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128);
19283 }
19284 }
19285 }
19286
19287 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD128, k_eq_4_subtile_m) {
19288 TEST_REQUIRES_ARM_NEON_FMA;
19289 for (uint32_t m = 1; m <= 6; m++) {
19290 GemmMicrokernelTester()
19291 .mr(6)
19292 .nr(8)
19293 .kr(1)
19294 .sr(1)
19295 .m(m)
19296 .n(8)
19297 .k(4)
19298 .iterations(1)
19299 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128);
19300 }
19301 }
19302
19303 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD128, k_eq_4_subtile_n) {
19304 TEST_REQUIRES_ARM_NEON_FMA;
19305 for (uint32_t n = 1; n <= 8; n++) {
19306 GemmMicrokernelTester()
19307 .mr(6)
19308 .nr(8)
19309 .kr(1)
19310 .sr(1)
19311 .m(6)
19312 .n(n)
19313 .k(4)
19314 .iterations(1)
19315 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128);
19316 }
19317 }
19318
19319 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD128, k_lt_4) {
19320 TEST_REQUIRES_ARM_NEON_FMA;
19321 for (size_t k = 1; k < 4; k++) {
19322 GemmMicrokernelTester()
19323 .mr(6)
19324 .nr(8)
19325 .kr(1)
19326 .sr(1)
19327 .m(6)
19328 .n(8)
19329 .k(k)
19330 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128);
19331 }
19332 }
19333
19334 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD128, k_lt_4_subtile) {
19335 TEST_REQUIRES_ARM_NEON_FMA;
19336 for (size_t k = 1; k < 4; k++) {
19337 for (uint32_t m = 1; m <= 6; m++) {
19338 for (uint32_t n = 1; n <= 8; n++) {
19339 GemmMicrokernelTester()
19340 .mr(6)
19341 .nr(8)
19342 .kr(1)
19343 .sr(1)
19344 .m(m)
19345 .n(n)
19346 .k(k)
19347 .iterations(1)
19348 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128);
19349 }
19350 }
19351 }
19352 }
19353
19354 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD128, k_gt_4) {
19355 TEST_REQUIRES_ARM_NEON_FMA;
19356 for (size_t k = 5; k < 8; k++) {
19357 GemmMicrokernelTester()
19358 .mr(6)
19359 .nr(8)
19360 .kr(1)
19361 .sr(1)
19362 .m(6)
19363 .n(8)
19364 .k(k)
19365 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128);
19366 }
19367 }
19368
19369 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD128, k_gt_4_subtile) {
19370 TEST_REQUIRES_ARM_NEON_FMA;
19371 for (size_t k = 5; k < 8; k++) {
19372 for (uint32_t m = 1; m <= 6; m++) {
19373 for (uint32_t n = 1; n <= 8; n++) {
19374 GemmMicrokernelTester()
19375 .mr(6)
19376 .nr(8)
19377 .kr(1)
19378 .sr(1)
19379 .m(m)
19380 .n(n)
19381 .k(k)
19382 .iterations(1)
19383 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128);
19384 }
19385 }
19386 }
19387 }
19388
19389 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD128, k_div_4) {
19390 TEST_REQUIRES_ARM_NEON_FMA;
19391 for (size_t k = 8; k <= 40; k += 4) {
19392 GemmMicrokernelTester()
19393 .mr(6)
19394 .nr(8)
19395 .kr(1)
19396 .sr(1)
19397 .m(6)
19398 .n(8)
19399 .k(k)
19400 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128);
19401 }
19402 }
19403
19404 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD128, k_div_4_subtile) {
19405 TEST_REQUIRES_ARM_NEON_FMA;
19406 for (size_t k = 8; k <= 40; k += 4) {
19407 for (uint32_t m = 1; m <= 6; m++) {
19408 for (uint32_t n = 1; n <= 8; n++) {
19409 GemmMicrokernelTester()
19410 .mr(6)
19411 .nr(8)
19412 .kr(1)
19413 .sr(1)
19414 .m(m)
19415 .n(n)
19416 .k(k)
19417 .iterations(1)
19418 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128);
19419 }
19420 }
19421 }
19422 }
19423
19424 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD128, n_gt_8) {
19425 TEST_REQUIRES_ARM_NEON_FMA;
19426 for (uint32_t n = 9; n < 16; n++) {
19427 for (size_t k = 1; k <= 20; k += 5) {
19428 GemmMicrokernelTester()
19429 .mr(6)
19430 .nr(8)
19431 .kr(1)
19432 .sr(1)
19433 .m(6)
19434 .n(8)
19435 .k(k)
19436 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128);
19437 }
19438 }
19439 }
19440
19441 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD128, n_gt_8_strided_cn) {
19442 TEST_REQUIRES_ARM_NEON_FMA;
19443 for (uint32_t n = 9; n < 16; n++) {
19444 for (size_t k = 1; k <= 20; k += 5) {
19445 GemmMicrokernelTester()
19446 .mr(6)
19447 .nr(8)
19448 .kr(1)
19449 .sr(1)
19450 .m(6)
19451 .n(8)
19452 .k(k)
19453 .cn_stride(11)
19454 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128);
19455 }
19456 }
19457 }
19458
19459 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD128, n_gt_8_subtile) {
19460 TEST_REQUIRES_ARM_NEON_FMA;
19461 for (uint32_t n = 9; n < 16; n++) {
19462 for (size_t k = 1; k <= 20; k += 5) {
19463 for (uint32_t m = 1; m <= 6; m++) {
19464 GemmMicrokernelTester()
19465 .mr(6)
19466 .nr(8)
19467 .kr(1)
19468 .sr(1)
19469 .m(m)
19470 .n(n)
19471 .k(k)
19472 .iterations(1)
19473 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128);
19474 }
19475 }
19476 }
19477 }
19478
19479 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD128, n_div_8) {
19480 TEST_REQUIRES_ARM_NEON_FMA;
19481 for (uint32_t n = 16; n <= 24; n += 8) {
19482 for (size_t k = 1; k <= 20; k += 5) {
19483 GemmMicrokernelTester()
19484 .mr(6)
19485 .nr(8)
19486 .kr(1)
19487 .sr(1)
19488 .m(6)
19489 .n(8)
19490 .k(k)
19491 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128);
19492 }
19493 }
19494 }
19495
19496 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD128, n_div_8_strided_cn) {
19497 TEST_REQUIRES_ARM_NEON_FMA;
19498 for (uint32_t n = 16; n <= 24; n += 8) {
19499 for (size_t k = 1; k <= 20; k += 5) {
19500 GemmMicrokernelTester()
19501 .mr(6)
19502 .nr(8)
19503 .kr(1)
19504 .sr(1)
19505 .m(6)
19506 .n(n)
19507 .k(k)
19508 .cn_stride(11)
19509 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128);
19510 }
19511 }
19512 }
19513
19514 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD128, n_div_8_subtile) {
19515 TEST_REQUIRES_ARM_NEON_FMA;
19516 for (uint32_t n = 16; n <= 24; n += 8) {
19517 for (size_t k = 1; k <= 20; k += 5) {
19518 for (uint32_t m = 1; m <= 6; m++) {
19519 GemmMicrokernelTester()
19520 .mr(6)
19521 .nr(8)
19522 .kr(1)
19523 .sr(1)
19524 .m(m)
19525 .n(n)
19526 .k(k)
19527 .iterations(1)
19528 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128);
19529 }
19530 }
19531 }
19532 }
19533
19534 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD128, small_kernel) {
19535 TEST_REQUIRES_ARM_NEON_FMA;
19536 for (size_t k = 1; k <= 20; k += 5) {
19537 GemmMicrokernelTester()
19538 .mr(6)
19539 .nr(8)
19540 .kr(1)
19541 .sr(1)
19542 .m(6)
19543 .n(8)
19544 .k(k)
19545 .ks(3)
19546 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128);
19547 }
19548 }
19549
19550 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD128, small_kernel_subtile) {
19551 TEST_REQUIRES_ARM_NEON_FMA;
19552 for (size_t k = 1; k <= 20; k += 5) {
19553 for (uint32_t m = 1; m <= 6; m++) {
19554 for (uint32_t n = 1; n <= 8; n++) {
19555 GemmMicrokernelTester()
19556 .mr(6)
19557 .nr(8)
19558 .kr(1)
19559 .sr(1)
19560 .m(m)
19561 .n(n)
19562 .k(k)
19563 .ks(3)
19564 .iterations(1)
19565 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128);
19566 }
19567 }
19568 }
19569 }
19570
19571 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD128, n_gt_8_small_kernel) {
19572 TEST_REQUIRES_ARM_NEON_FMA;
19573 for (uint32_t n = 9; n < 16; n++) {
19574 for (size_t k = 1; k <= 20; k += 5) {
19575 GemmMicrokernelTester()
19576 .mr(6)
19577 .nr(8)
19578 .kr(1)
19579 .sr(1)
19580 .m(6)
19581 .n(8)
19582 .k(k)
19583 .ks(3)
19584 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128);
19585 }
19586 }
19587 }
19588
19589 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD128, n_div_8_small_kernel) {
19590 TEST_REQUIRES_ARM_NEON_FMA;
19591 for (uint32_t n = 16; n <= 24; n += 8) {
19592 for (size_t k = 1; k <= 20; k += 5) {
19593 GemmMicrokernelTester()
19594 .mr(6)
19595 .nr(8)
19596 .kr(1)
19597 .sr(1)
19598 .m(6)
19599 .n(8)
19600 .k(k)
19601 .ks(3)
19602 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128);
19603 }
19604 }
19605 }
19606
19607 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD128, strided_cm_subtile) {
19608 TEST_REQUIRES_ARM_NEON_FMA;
19609 for (size_t k = 1; k <= 20; k += 5) {
19610 for (uint32_t m = 1; m <= 6; m++) {
19611 for (uint32_t n = 1; n <= 8; n++) {
19612 GemmMicrokernelTester()
19613 .mr(6)
19614 .nr(8)
19615 .kr(1)
19616 .sr(1)
19617 .m(m)
19618 .n(n)
19619 .k(k)
19620 .cm_stride(11)
19621 .iterations(1)
19622 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128);
19623 }
19624 }
19625 }
19626 }
19627
19628 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD128, a_offset) {
19629 TEST_REQUIRES_ARM_NEON_FMA;
19630 for (size_t k = 1; k <= 20; k += 5) {
19631 GemmMicrokernelTester()
19632 .mr(6)
19633 .nr(8)
19634 .kr(1)
19635 .sr(1)
19636 .m(6)
19637 .n(8)
19638 .k(k)
19639 .ks(3)
19640 .a_offset(127)
19641 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128);
19642 }
19643 }
19644
19645 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD128, zero) {
19646 TEST_REQUIRES_ARM_NEON_FMA;
19647 for (uint32_t mz = 0; mz < 6; mz++) {
19648 for (size_t k = 1; k <= 20; k += 5) {
19649 GemmMicrokernelTester()
19650 .mr(6)
19651 .nr(8)
19652 .kr(1)
19653 .sr(1)
19654 .m(6)
19655 .n(8)
19656 .k(k)
19657 .ks(3)
19658 .a_offset(127)
19659 .zero_index(mz)
19660 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128);
19661 }
19662 }
19663 }
19664
19665 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD128, qmin) {
19666 TEST_REQUIRES_ARM_NEON_FMA;
19667 GemmMicrokernelTester()
19668 .mr(6)
19669 .nr(8)
19670 .kr(1)
19671 .sr(1)
19672 .m(6)
19673 .n(8)
19674 .k(4)
19675 .qmin(128)
19676 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128);
19677 }
19678
19679 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD128, qmax) {
19680 TEST_REQUIRES_ARM_NEON_FMA;
19681 GemmMicrokernelTester()
19682 .mr(6)
19683 .nr(8)
19684 .kr(1)
19685 .sr(1)
19686 .m(6)
19687 .n(8)
19688 .k(4)
19689 .qmax(128)
19690 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128);
19691 }
19692
19693 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD128, strided_cm) {
19694 TEST_REQUIRES_ARM_NEON_FMA;
19695 GemmMicrokernelTester()
19696 .mr(6)
19697 .nr(8)
19698 .kr(1)
19699 .sr(1)
19700 .m(6)
19701 .n(8)
19702 .k(4)
19703 .cm_stride(11)
19704 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128);
19705 }
19706#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
19707
19708
19709#if XNN_ARCH_ARM || XNN_ARCH_ARM64
Frank Barchard5243bb02019-11-22 16:37:50 -080019710 TEST(F32_IGEMM_1X8S4__NEON, k_eq_4) {
19711 TEST_REQUIRES_ARM_NEON;
19712 GemmMicrokernelTester()
19713 .mr(1)
19714 .nr(8)
19715 .kr(1)
19716 .sr(4)
19717 .m(1)
19718 .n(8)
19719 .k(4)
19720 .Test(xnn_f32_igemm_ukernel_1x8s4__neon);
19721 }
19722
19723 TEST(F32_IGEMM_1X8S4__NEON, strided_cn) {
19724 TEST_REQUIRES_ARM_NEON;
19725 GemmMicrokernelTester()
19726 .mr(1)
19727 .nr(8)
19728 .kr(1)
19729 .sr(4)
19730 .m(1)
19731 .n(8)
19732 .k(4)
19733 .cn_stride(11)
19734 .Test(xnn_f32_igemm_ukernel_1x8s4__neon);
19735 }
19736
19737 TEST(F32_IGEMM_1X8S4__NEON, k_eq_4_subtile) {
19738 TEST_REQUIRES_ARM_NEON;
19739 for (uint32_t m = 1; m <= 1; m++) {
19740 for (uint32_t n = 1; n <= 8; n++) {
19741 GemmMicrokernelTester()
19742 .mr(1)
19743 .nr(8)
19744 .kr(1)
19745 .sr(4)
19746 .m(m)
19747 .n(n)
19748 .k(4)
19749 .iterations(1)
19750 .Test(xnn_f32_igemm_ukernel_1x8s4__neon);
19751 }
19752 }
19753 }
19754
19755 TEST(F32_IGEMM_1X8S4__NEON, k_eq_4_subtile_m) {
19756 TEST_REQUIRES_ARM_NEON;
19757 for (uint32_t m = 1; m <= 1; m++) {
19758 GemmMicrokernelTester()
19759 .mr(1)
19760 .nr(8)
19761 .kr(1)
19762 .sr(4)
19763 .m(m)
19764 .n(8)
19765 .k(4)
19766 .iterations(1)
19767 .Test(xnn_f32_igemm_ukernel_1x8s4__neon);
19768 }
19769 }
19770
19771 TEST(F32_IGEMM_1X8S4__NEON, k_eq_4_subtile_n) {
19772 TEST_REQUIRES_ARM_NEON;
19773 for (uint32_t n = 1; n <= 8; n++) {
19774 GemmMicrokernelTester()
19775 .mr(1)
19776 .nr(8)
19777 .kr(1)
19778 .sr(4)
19779 .m(1)
19780 .n(n)
19781 .k(4)
19782 .iterations(1)
19783 .Test(xnn_f32_igemm_ukernel_1x8s4__neon);
19784 }
19785 }
19786
19787 TEST(F32_IGEMM_1X8S4__NEON, k_lt_4) {
19788 TEST_REQUIRES_ARM_NEON;
19789 for (size_t k = 1; k < 4; k++) {
19790 GemmMicrokernelTester()
19791 .mr(1)
19792 .nr(8)
19793 .kr(1)
19794 .sr(4)
19795 .m(1)
19796 .n(8)
19797 .k(k)
19798 .Test(xnn_f32_igemm_ukernel_1x8s4__neon);
19799 }
19800 }
19801
19802 TEST(F32_IGEMM_1X8S4__NEON, k_lt_4_subtile) {
19803 TEST_REQUIRES_ARM_NEON;
19804 for (size_t k = 1; k < 4; k++) {
19805 for (uint32_t m = 1; m <= 1; m++) {
19806 for (uint32_t n = 1; n <= 8; n++) {
19807 GemmMicrokernelTester()
19808 .mr(1)
19809 .nr(8)
19810 .kr(1)
19811 .sr(4)
19812 .m(m)
19813 .n(n)
19814 .k(k)
19815 .iterations(1)
19816 .Test(xnn_f32_igemm_ukernel_1x8s4__neon);
19817 }
19818 }
19819 }
19820 }
19821
19822 TEST(F32_IGEMM_1X8S4__NEON, k_gt_4) {
19823 TEST_REQUIRES_ARM_NEON;
19824 for (size_t k = 5; k < 8; k++) {
19825 GemmMicrokernelTester()
19826 .mr(1)
19827 .nr(8)
19828 .kr(1)
19829 .sr(4)
19830 .m(1)
19831 .n(8)
19832 .k(k)
19833 .Test(xnn_f32_igemm_ukernel_1x8s4__neon);
19834 }
19835 }
19836
19837 TEST(F32_IGEMM_1X8S4__NEON, k_gt_4_subtile) {
19838 TEST_REQUIRES_ARM_NEON;
19839 for (size_t k = 5; k < 8; k++) {
19840 for (uint32_t m = 1; m <= 1; m++) {
19841 for (uint32_t n = 1; n <= 8; n++) {
19842 GemmMicrokernelTester()
19843 .mr(1)
19844 .nr(8)
19845 .kr(1)
19846 .sr(4)
19847 .m(m)
19848 .n(n)
19849 .k(k)
19850 .iterations(1)
19851 .Test(xnn_f32_igemm_ukernel_1x8s4__neon);
19852 }
19853 }
19854 }
19855 }
19856
19857 TEST(F32_IGEMM_1X8S4__NEON, k_div_4) {
19858 TEST_REQUIRES_ARM_NEON;
19859 for (size_t k = 8; k <= 40; k += 4) {
19860 GemmMicrokernelTester()
19861 .mr(1)
19862 .nr(8)
19863 .kr(1)
19864 .sr(4)
19865 .m(1)
19866 .n(8)
19867 .k(k)
19868 .Test(xnn_f32_igemm_ukernel_1x8s4__neon);
19869 }
19870 }
19871
19872 TEST(F32_IGEMM_1X8S4__NEON, k_div_4_subtile) {
19873 TEST_REQUIRES_ARM_NEON;
19874 for (size_t k = 8; k <= 40; k += 4) {
19875 for (uint32_t m = 1; m <= 1; m++) {
19876 for (uint32_t n = 1; n <= 8; n++) {
19877 GemmMicrokernelTester()
19878 .mr(1)
19879 .nr(8)
19880 .kr(1)
19881 .sr(4)
19882 .m(m)
19883 .n(n)
19884 .k(k)
19885 .iterations(1)
19886 .Test(xnn_f32_igemm_ukernel_1x8s4__neon);
19887 }
19888 }
19889 }
19890 }
19891
19892 TEST(F32_IGEMM_1X8S4__NEON, n_gt_8) {
19893 TEST_REQUIRES_ARM_NEON;
19894 for (uint32_t n = 9; n < 16; n++) {
19895 for (size_t k = 1; k <= 20; k += 5) {
19896 GemmMicrokernelTester()
19897 .mr(1)
19898 .nr(8)
19899 .kr(1)
19900 .sr(4)
19901 .m(1)
19902 .n(8)
19903 .k(k)
19904 .Test(xnn_f32_igemm_ukernel_1x8s4__neon);
19905 }
19906 }
19907 }
19908
19909 TEST(F32_IGEMM_1X8S4__NEON, n_gt_8_strided_cn) {
19910 TEST_REQUIRES_ARM_NEON;
19911 for (uint32_t n = 9; n < 16; n++) {
19912 for (size_t k = 1; k <= 20; k += 5) {
19913 GemmMicrokernelTester()
19914 .mr(1)
19915 .nr(8)
19916 .kr(1)
19917 .sr(4)
19918 .m(1)
19919 .n(8)
19920 .k(k)
19921 .cn_stride(11)
19922 .Test(xnn_f32_igemm_ukernel_1x8s4__neon);
19923 }
19924 }
19925 }
19926
19927 TEST(F32_IGEMM_1X8S4__NEON, n_gt_8_subtile) {
19928 TEST_REQUIRES_ARM_NEON;
19929 for (uint32_t n = 9; n < 16; n++) {
19930 for (size_t k = 1; k <= 20; k += 5) {
19931 for (uint32_t m = 1; m <= 1; m++) {
19932 GemmMicrokernelTester()
19933 .mr(1)
19934 .nr(8)
19935 .kr(1)
19936 .sr(4)
19937 .m(m)
19938 .n(n)
19939 .k(k)
19940 .iterations(1)
19941 .Test(xnn_f32_igemm_ukernel_1x8s4__neon);
19942 }
19943 }
19944 }
19945 }
19946
19947 TEST(F32_IGEMM_1X8S4__NEON, n_div_8) {
19948 TEST_REQUIRES_ARM_NEON;
19949 for (uint32_t n = 16; n <= 24; n += 8) {
19950 for (size_t k = 1; k <= 20; k += 5) {
19951 GemmMicrokernelTester()
19952 .mr(1)
19953 .nr(8)
19954 .kr(1)
19955 .sr(4)
19956 .m(1)
19957 .n(8)
19958 .k(k)
19959 .Test(xnn_f32_igemm_ukernel_1x8s4__neon);
19960 }
19961 }
19962 }
19963
19964 TEST(F32_IGEMM_1X8S4__NEON, n_div_8_strided_cn) {
19965 TEST_REQUIRES_ARM_NEON;
19966 for (uint32_t n = 16; n <= 24; n += 8) {
19967 for (size_t k = 1; k <= 20; k += 5) {
19968 GemmMicrokernelTester()
19969 .mr(1)
19970 .nr(8)
19971 .kr(1)
19972 .sr(4)
19973 .m(1)
19974 .n(n)
19975 .k(k)
19976 .cn_stride(11)
19977 .Test(xnn_f32_igemm_ukernel_1x8s4__neon);
19978 }
19979 }
19980 }
19981
19982 TEST(F32_IGEMM_1X8S4__NEON, n_div_8_subtile) {
19983 TEST_REQUIRES_ARM_NEON;
19984 for (uint32_t n = 16; n <= 24; n += 8) {
19985 for (size_t k = 1; k <= 20; k += 5) {
19986 for (uint32_t m = 1; m <= 1; m++) {
19987 GemmMicrokernelTester()
19988 .mr(1)
19989 .nr(8)
19990 .kr(1)
19991 .sr(4)
19992 .m(m)
19993 .n(n)
19994 .k(k)
19995 .iterations(1)
19996 .Test(xnn_f32_igemm_ukernel_1x8s4__neon);
19997 }
19998 }
19999 }
20000 }
20001
20002 TEST(F32_IGEMM_1X8S4__NEON, small_kernel) {
20003 TEST_REQUIRES_ARM_NEON;
20004 for (size_t k = 1; k <= 20; k += 5) {
20005 GemmMicrokernelTester()
20006 .mr(1)
20007 .nr(8)
20008 .kr(1)
20009 .sr(4)
20010 .m(1)
20011 .n(8)
20012 .k(k)
20013 .ks(3)
20014 .Test(xnn_f32_igemm_ukernel_1x8s4__neon);
20015 }
20016 }
20017
20018 TEST(F32_IGEMM_1X8S4__NEON, small_kernel_subtile) {
20019 TEST_REQUIRES_ARM_NEON;
20020 for (size_t k = 1; k <= 20; k += 5) {
20021 for (uint32_t m = 1; m <= 1; m++) {
20022 for (uint32_t n = 1; n <= 8; n++) {
20023 GemmMicrokernelTester()
20024 .mr(1)
20025 .nr(8)
20026 .kr(1)
20027 .sr(4)
20028 .m(m)
20029 .n(n)
20030 .k(k)
20031 .ks(3)
20032 .iterations(1)
20033 .Test(xnn_f32_igemm_ukernel_1x8s4__neon);
20034 }
20035 }
20036 }
20037 }
20038
20039 TEST(F32_IGEMM_1X8S4__NEON, n_gt_8_small_kernel) {
20040 TEST_REQUIRES_ARM_NEON;
20041 for (uint32_t n = 9; n < 16; n++) {
20042 for (size_t k = 1; k <= 20; k += 5) {
20043 GemmMicrokernelTester()
20044 .mr(1)
20045 .nr(8)
20046 .kr(1)
20047 .sr(4)
20048 .m(1)
20049 .n(8)
20050 .k(k)
20051 .ks(3)
20052 .Test(xnn_f32_igemm_ukernel_1x8s4__neon);
20053 }
20054 }
20055 }
20056
20057 TEST(F32_IGEMM_1X8S4__NEON, n_div_8_small_kernel) {
20058 TEST_REQUIRES_ARM_NEON;
20059 for (uint32_t n = 16; n <= 24; n += 8) {
20060 for (size_t k = 1; k <= 20; k += 5) {
20061 GemmMicrokernelTester()
20062 .mr(1)
20063 .nr(8)
20064 .kr(1)
20065 .sr(4)
20066 .m(1)
20067 .n(8)
20068 .k(k)
20069 .ks(3)
20070 .Test(xnn_f32_igemm_ukernel_1x8s4__neon);
20071 }
20072 }
20073 }
20074
20075 TEST(F32_IGEMM_1X8S4__NEON, strided_cm_subtile) {
20076 TEST_REQUIRES_ARM_NEON;
20077 for (size_t k = 1; k <= 20; k += 5) {
20078 for (uint32_t m = 1; m <= 1; m++) {
20079 for (uint32_t n = 1; n <= 8; n++) {
20080 GemmMicrokernelTester()
20081 .mr(1)
20082 .nr(8)
20083 .kr(1)
20084 .sr(4)
20085 .m(m)
20086 .n(n)
20087 .k(k)
20088 .cm_stride(11)
20089 .iterations(1)
20090 .Test(xnn_f32_igemm_ukernel_1x8s4__neon);
20091 }
20092 }
20093 }
20094 }
20095
20096 TEST(F32_IGEMM_1X8S4__NEON, a_offset) {
20097 TEST_REQUIRES_ARM_NEON;
20098 for (size_t k = 1; k <= 20; k += 5) {
20099 GemmMicrokernelTester()
20100 .mr(1)
20101 .nr(8)
20102 .kr(1)
20103 .sr(4)
20104 .m(1)
20105 .n(8)
20106 .k(k)
20107 .ks(3)
20108 .a_offset(23)
20109 .Test(xnn_f32_igemm_ukernel_1x8s4__neon);
20110 }
20111 }
20112
20113 TEST(F32_IGEMM_1X8S4__NEON, zero) {
20114 TEST_REQUIRES_ARM_NEON;
20115 for (uint32_t mz = 0; mz < 1; mz++) {
20116 for (size_t k = 1; k <= 20; k += 5) {
20117 GemmMicrokernelTester()
20118 .mr(1)
20119 .nr(8)
20120 .kr(1)
20121 .sr(4)
20122 .m(1)
20123 .n(8)
20124 .k(k)
20125 .ks(3)
20126 .a_offset(23)
20127 .zero_index(mz)
20128 .Test(xnn_f32_igemm_ukernel_1x8s4__neon);
20129 }
20130 }
20131 }
20132
20133 TEST(F32_IGEMM_1X8S4__NEON, qmin) {
20134 TEST_REQUIRES_ARM_NEON;
20135 GemmMicrokernelTester()
20136 .mr(1)
20137 .nr(8)
20138 .kr(1)
20139 .sr(4)
20140 .m(1)
20141 .n(8)
20142 .k(4)
20143 .qmin(128)
20144 .Test(xnn_f32_igemm_ukernel_1x8s4__neon);
20145 }
20146
20147 TEST(F32_IGEMM_1X8S4__NEON, qmax) {
20148 TEST_REQUIRES_ARM_NEON;
20149 GemmMicrokernelTester()
20150 .mr(1)
20151 .nr(8)
20152 .kr(1)
20153 .sr(4)
20154 .m(1)
20155 .n(8)
20156 .k(4)
20157 .qmax(128)
20158 .Test(xnn_f32_igemm_ukernel_1x8s4__neon);
20159 }
20160
20161 TEST(F32_IGEMM_1X8S4__NEON, strided_cm) {
20162 TEST_REQUIRES_ARM_NEON;
20163 GemmMicrokernelTester()
20164 .mr(1)
20165 .nr(8)
20166 .kr(1)
20167 .sr(4)
20168 .m(1)
20169 .n(8)
20170 .k(4)
20171 .cm_stride(11)
20172 .Test(xnn_f32_igemm_ukernel_1x8s4__neon);
20173 }
20174#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
20175
20176
20177#if XNN_ARCH_ARM || XNN_ARCH_ARM64
20178 TEST(F32_IGEMM_4X8S4__NEON, k_eq_4) {
20179 TEST_REQUIRES_ARM_NEON;
20180 GemmMicrokernelTester()
20181 .mr(4)
20182 .nr(8)
20183 .kr(1)
20184 .sr(4)
20185 .m(4)
20186 .n(8)
20187 .k(4)
20188 .Test(xnn_f32_igemm_ukernel_4x8s4__neon);
20189 }
20190
20191 TEST(F32_IGEMM_4X8S4__NEON, strided_cn) {
20192 TEST_REQUIRES_ARM_NEON;
20193 GemmMicrokernelTester()
20194 .mr(4)
20195 .nr(8)
20196 .kr(1)
20197 .sr(4)
20198 .m(4)
20199 .n(8)
20200 .k(4)
20201 .cn_stride(11)
20202 .Test(xnn_f32_igemm_ukernel_4x8s4__neon);
20203 }
20204
20205 TEST(F32_IGEMM_4X8S4__NEON, k_eq_4_subtile) {
20206 TEST_REQUIRES_ARM_NEON;
20207 for (uint32_t m = 1; m <= 4; m++) {
20208 for (uint32_t n = 1; n <= 8; n++) {
20209 GemmMicrokernelTester()
20210 .mr(4)
20211 .nr(8)
20212 .kr(1)
20213 .sr(4)
20214 .m(m)
20215 .n(n)
20216 .k(4)
20217 .iterations(1)
20218 .Test(xnn_f32_igemm_ukernel_4x8s4__neon);
20219 }
20220 }
20221 }
20222
20223 TEST(F32_IGEMM_4X8S4__NEON, k_eq_4_subtile_m) {
20224 TEST_REQUIRES_ARM_NEON;
20225 for (uint32_t m = 1; m <= 4; m++) {
20226 GemmMicrokernelTester()
20227 .mr(4)
20228 .nr(8)
20229 .kr(1)
20230 .sr(4)
20231 .m(m)
20232 .n(8)
20233 .k(4)
20234 .iterations(1)
20235 .Test(xnn_f32_igemm_ukernel_4x8s4__neon);
20236 }
20237 }
20238
20239 TEST(F32_IGEMM_4X8S4__NEON, k_eq_4_subtile_n) {
20240 TEST_REQUIRES_ARM_NEON;
20241 for (uint32_t n = 1; n <= 8; n++) {
20242 GemmMicrokernelTester()
20243 .mr(4)
20244 .nr(8)
20245 .kr(1)
20246 .sr(4)
20247 .m(4)
20248 .n(n)
20249 .k(4)
20250 .iterations(1)
20251 .Test(xnn_f32_igemm_ukernel_4x8s4__neon);
20252 }
20253 }
20254
20255 TEST(F32_IGEMM_4X8S4__NEON, k_lt_4) {
20256 TEST_REQUIRES_ARM_NEON;
20257 for (size_t k = 1; k < 4; k++) {
20258 GemmMicrokernelTester()
20259 .mr(4)
20260 .nr(8)
20261 .kr(1)
20262 .sr(4)
20263 .m(4)
20264 .n(8)
20265 .k(k)
20266 .Test(xnn_f32_igemm_ukernel_4x8s4__neon);
20267 }
20268 }
20269
20270 TEST(F32_IGEMM_4X8S4__NEON, k_lt_4_subtile) {
20271 TEST_REQUIRES_ARM_NEON;
20272 for (size_t k = 1; k < 4; k++) {
20273 for (uint32_t m = 1; m <= 4; m++) {
20274 for (uint32_t n = 1; n <= 8; n++) {
20275 GemmMicrokernelTester()
20276 .mr(4)
20277 .nr(8)
20278 .kr(1)
20279 .sr(4)
20280 .m(m)
20281 .n(n)
20282 .k(k)
20283 .iterations(1)
20284 .Test(xnn_f32_igemm_ukernel_4x8s4__neon);
20285 }
20286 }
20287 }
20288 }
20289
20290 TEST(F32_IGEMM_4X8S4__NEON, k_gt_4) {
20291 TEST_REQUIRES_ARM_NEON;
20292 for (size_t k = 5; k < 8; k++) {
20293 GemmMicrokernelTester()
20294 .mr(4)
20295 .nr(8)
20296 .kr(1)
20297 .sr(4)
20298 .m(4)
20299 .n(8)
20300 .k(k)
20301 .Test(xnn_f32_igemm_ukernel_4x8s4__neon);
20302 }
20303 }
20304
20305 TEST(F32_IGEMM_4X8S4__NEON, k_gt_4_subtile) {
20306 TEST_REQUIRES_ARM_NEON;
20307 for (size_t k = 5; k < 8; k++) {
20308 for (uint32_t m = 1; m <= 4; m++) {
20309 for (uint32_t n = 1; n <= 8; n++) {
20310 GemmMicrokernelTester()
20311 .mr(4)
20312 .nr(8)
20313 .kr(1)
20314 .sr(4)
20315 .m(m)
20316 .n(n)
20317 .k(k)
20318 .iterations(1)
20319 .Test(xnn_f32_igemm_ukernel_4x8s4__neon);
20320 }
20321 }
20322 }
20323 }
20324
20325 TEST(F32_IGEMM_4X8S4__NEON, k_div_4) {
20326 TEST_REQUIRES_ARM_NEON;
20327 for (size_t k = 8; k <= 40; k += 4) {
20328 GemmMicrokernelTester()
20329 .mr(4)
20330 .nr(8)
20331 .kr(1)
20332 .sr(4)
20333 .m(4)
20334 .n(8)
20335 .k(k)
20336 .Test(xnn_f32_igemm_ukernel_4x8s4__neon);
20337 }
20338 }
20339
20340 TEST(F32_IGEMM_4X8S4__NEON, k_div_4_subtile) {
20341 TEST_REQUIRES_ARM_NEON;
20342 for (size_t k = 8; k <= 40; k += 4) {
20343 for (uint32_t m = 1; m <= 4; m++) {
20344 for (uint32_t n = 1; n <= 8; n++) {
20345 GemmMicrokernelTester()
20346 .mr(4)
20347 .nr(8)
20348 .kr(1)
20349 .sr(4)
20350 .m(m)
20351 .n(n)
20352 .k(k)
20353 .iterations(1)
20354 .Test(xnn_f32_igemm_ukernel_4x8s4__neon);
20355 }
20356 }
20357 }
20358 }
20359
20360 TEST(F32_IGEMM_4X8S4__NEON, n_gt_8) {
20361 TEST_REQUIRES_ARM_NEON;
20362 for (uint32_t n = 9; n < 16; n++) {
20363 for (size_t k = 1; k <= 20; k += 5) {
20364 GemmMicrokernelTester()
20365 .mr(4)
20366 .nr(8)
20367 .kr(1)
20368 .sr(4)
20369 .m(4)
20370 .n(8)
20371 .k(k)
20372 .Test(xnn_f32_igemm_ukernel_4x8s4__neon);
20373 }
20374 }
20375 }
20376
20377 TEST(F32_IGEMM_4X8S4__NEON, n_gt_8_strided_cn) {
20378 TEST_REQUIRES_ARM_NEON;
20379 for (uint32_t n = 9; n < 16; n++) {
20380 for (size_t k = 1; k <= 20; k += 5) {
20381 GemmMicrokernelTester()
20382 .mr(4)
20383 .nr(8)
20384 .kr(1)
20385 .sr(4)
20386 .m(4)
20387 .n(8)
20388 .k(k)
20389 .cn_stride(11)
20390 .Test(xnn_f32_igemm_ukernel_4x8s4__neon);
20391 }
20392 }
20393 }
20394
20395 TEST(F32_IGEMM_4X8S4__NEON, n_gt_8_subtile) {
20396 TEST_REQUIRES_ARM_NEON;
20397 for (uint32_t n = 9; n < 16; n++) {
20398 for (size_t k = 1; k <= 20; k += 5) {
20399 for (uint32_t m = 1; m <= 4; m++) {
20400 GemmMicrokernelTester()
20401 .mr(4)
20402 .nr(8)
20403 .kr(1)
20404 .sr(4)
20405 .m(m)
20406 .n(n)
20407 .k(k)
20408 .iterations(1)
20409 .Test(xnn_f32_igemm_ukernel_4x8s4__neon);
20410 }
20411 }
20412 }
20413 }
20414
20415 TEST(F32_IGEMM_4X8S4__NEON, n_div_8) {
20416 TEST_REQUIRES_ARM_NEON;
20417 for (uint32_t n = 16; n <= 24; n += 8) {
20418 for (size_t k = 1; k <= 20; k += 5) {
20419 GemmMicrokernelTester()
20420 .mr(4)
20421 .nr(8)
20422 .kr(1)
20423 .sr(4)
20424 .m(4)
20425 .n(8)
20426 .k(k)
20427 .Test(xnn_f32_igemm_ukernel_4x8s4__neon);
20428 }
20429 }
20430 }
20431
20432 TEST(F32_IGEMM_4X8S4__NEON, n_div_8_strided_cn) {
20433 TEST_REQUIRES_ARM_NEON;
20434 for (uint32_t n = 16; n <= 24; n += 8) {
20435 for (size_t k = 1; k <= 20; k += 5) {
20436 GemmMicrokernelTester()
20437 .mr(4)
20438 .nr(8)
20439 .kr(1)
20440 .sr(4)
20441 .m(4)
20442 .n(n)
20443 .k(k)
20444 .cn_stride(11)
20445 .Test(xnn_f32_igemm_ukernel_4x8s4__neon);
20446 }
20447 }
20448 }
20449
20450 TEST(F32_IGEMM_4X8S4__NEON, n_div_8_subtile) {
20451 TEST_REQUIRES_ARM_NEON;
20452 for (uint32_t n = 16; n <= 24; n += 8) {
20453 for (size_t k = 1; k <= 20; k += 5) {
20454 for (uint32_t m = 1; m <= 4; m++) {
20455 GemmMicrokernelTester()
20456 .mr(4)
20457 .nr(8)
20458 .kr(1)
20459 .sr(4)
20460 .m(m)
20461 .n(n)
20462 .k(k)
20463 .iterations(1)
20464 .Test(xnn_f32_igemm_ukernel_4x8s4__neon);
20465 }
20466 }
20467 }
20468 }
20469
20470 TEST(F32_IGEMM_4X8S4__NEON, small_kernel) {
20471 TEST_REQUIRES_ARM_NEON;
20472 for (size_t k = 1; k <= 20; k += 5) {
20473 GemmMicrokernelTester()
20474 .mr(4)
20475 .nr(8)
20476 .kr(1)
20477 .sr(4)
20478 .m(4)
20479 .n(8)
20480 .k(k)
20481 .ks(3)
20482 .Test(xnn_f32_igemm_ukernel_4x8s4__neon);
20483 }
20484 }
20485
20486 TEST(F32_IGEMM_4X8S4__NEON, small_kernel_subtile) {
20487 TEST_REQUIRES_ARM_NEON;
20488 for (size_t k = 1; k <= 20; k += 5) {
20489 for (uint32_t m = 1; m <= 4; m++) {
20490 for (uint32_t n = 1; n <= 8; n++) {
20491 GemmMicrokernelTester()
20492 .mr(4)
20493 .nr(8)
20494 .kr(1)
20495 .sr(4)
20496 .m(m)
20497 .n(n)
20498 .k(k)
20499 .ks(3)
20500 .iterations(1)
20501 .Test(xnn_f32_igemm_ukernel_4x8s4__neon);
20502 }
20503 }
20504 }
20505 }
20506
20507 TEST(F32_IGEMM_4X8S4__NEON, n_gt_8_small_kernel) {
20508 TEST_REQUIRES_ARM_NEON;
20509 for (uint32_t n = 9; n < 16; n++) {
20510 for (size_t k = 1; k <= 20; k += 5) {
20511 GemmMicrokernelTester()
20512 .mr(4)
20513 .nr(8)
20514 .kr(1)
20515 .sr(4)
20516 .m(4)
20517 .n(8)
20518 .k(k)
20519 .ks(3)
20520 .Test(xnn_f32_igemm_ukernel_4x8s4__neon);
20521 }
20522 }
20523 }
20524
20525 TEST(F32_IGEMM_4X8S4__NEON, n_div_8_small_kernel) {
20526 TEST_REQUIRES_ARM_NEON;
20527 for (uint32_t n = 16; n <= 24; n += 8) {
20528 for (size_t k = 1; k <= 20; k += 5) {
20529 GemmMicrokernelTester()
20530 .mr(4)
20531 .nr(8)
20532 .kr(1)
20533 .sr(4)
20534 .m(4)
20535 .n(8)
20536 .k(k)
20537 .ks(3)
20538 .Test(xnn_f32_igemm_ukernel_4x8s4__neon);
20539 }
20540 }
20541 }
20542
20543 TEST(F32_IGEMM_4X8S4__NEON, strided_cm_subtile) {
20544 TEST_REQUIRES_ARM_NEON;
20545 for (size_t k = 1; k <= 20; k += 5) {
20546 for (uint32_t m = 1; m <= 4; m++) {
20547 for (uint32_t n = 1; n <= 8; n++) {
20548 GemmMicrokernelTester()
20549 .mr(4)
20550 .nr(8)
20551 .kr(1)
20552 .sr(4)
20553 .m(m)
20554 .n(n)
20555 .k(k)
20556 .cm_stride(11)
20557 .iterations(1)
20558 .Test(xnn_f32_igemm_ukernel_4x8s4__neon);
20559 }
20560 }
20561 }
20562 }
20563
20564 TEST(F32_IGEMM_4X8S4__NEON, a_offset) {
20565 TEST_REQUIRES_ARM_NEON;
20566 for (size_t k = 1; k <= 20; k += 5) {
20567 GemmMicrokernelTester()
20568 .mr(4)
20569 .nr(8)
20570 .kr(1)
20571 .sr(4)
20572 .m(4)
20573 .n(8)
20574 .k(k)
20575 .ks(3)
20576 .a_offset(83)
20577 .Test(xnn_f32_igemm_ukernel_4x8s4__neon);
20578 }
20579 }
20580
20581 TEST(F32_IGEMM_4X8S4__NEON, zero) {
20582 TEST_REQUIRES_ARM_NEON;
20583 for (uint32_t mz = 0; mz < 4; mz++) {
20584 for (size_t k = 1; k <= 20; k += 5) {
20585 GemmMicrokernelTester()
20586 .mr(4)
20587 .nr(8)
20588 .kr(1)
20589 .sr(4)
20590 .m(4)
20591 .n(8)
20592 .k(k)
20593 .ks(3)
20594 .a_offset(83)
20595 .zero_index(mz)
20596 .Test(xnn_f32_igemm_ukernel_4x8s4__neon);
20597 }
20598 }
20599 }
20600
20601 TEST(F32_IGEMM_4X8S4__NEON, qmin) {
20602 TEST_REQUIRES_ARM_NEON;
20603 GemmMicrokernelTester()
20604 .mr(4)
20605 .nr(8)
20606 .kr(1)
20607 .sr(4)
20608 .m(4)
20609 .n(8)
20610 .k(4)
20611 .qmin(128)
20612 .Test(xnn_f32_igemm_ukernel_4x8s4__neon);
20613 }
20614
20615 TEST(F32_IGEMM_4X8S4__NEON, qmax) {
20616 TEST_REQUIRES_ARM_NEON;
20617 GemmMicrokernelTester()
20618 .mr(4)
20619 .nr(8)
20620 .kr(1)
20621 .sr(4)
20622 .m(4)
20623 .n(8)
20624 .k(4)
20625 .qmax(128)
20626 .Test(xnn_f32_igemm_ukernel_4x8s4__neon);
20627 }
20628
20629 TEST(F32_IGEMM_4X8S4__NEON, strided_cm) {
20630 TEST_REQUIRES_ARM_NEON;
20631 GemmMicrokernelTester()
20632 .mr(4)
20633 .nr(8)
20634 .kr(1)
20635 .sr(4)
20636 .m(4)
20637 .n(8)
20638 .k(4)
20639 .cm_stride(11)
20640 .Test(xnn_f32_igemm_ukernel_4x8s4__neon);
20641 }
20642#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
20643
20644
20645#if XNN_ARCH_ARM || XNN_ARCH_ARM64
20646 TEST(F32_IGEMM_6X8S4__NEON, k_eq_4) {
20647 TEST_REQUIRES_ARM_NEON;
20648 GemmMicrokernelTester()
20649 .mr(6)
20650 .nr(8)
20651 .kr(1)
20652 .sr(4)
20653 .m(6)
20654 .n(8)
20655 .k(4)
20656 .Test(xnn_f32_igemm_ukernel_6x8s4__neon);
20657 }
20658
20659 TEST(F32_IGEMM_6X8S4__NEON, strided_cn) {
20660 TEST_REQUIRES_ARM_NEON;
20661 GemmMicrokernelTester()
20662 .mr(6)
20663 .nr(8)
20664 .kr(1)
20665 .sr(4)
20666 .m(6)
20667 .n(8)
20668 .k(4)
20669 .cn_stride(11)
20670 .Test(xnn_f32_igemm_ukernel_6x8s4__neon);
20671 }
20672
20673 TEST(F32_IGEMM_6X8S4__NEON, k_eq_4_subtile) {
20674 TEST_REQUIRES_ARM_NEON;
20675 for (uint32_t m = 1; m <= 6; m++) {
20676 for (uint32_t n = 1; n <= 8; n++) {
20677 GemmMicrokernelTester()
20678 .mr(6)
20679 .nr(8)
20680 .kr(1)
20681 .sr(4)
20682 .m(m)
20683 .n(n)
20684 .k(4)
20685 .iterations(1)
20686 .Test(xnn_f32_igemm_ukernel_6x8s4__neon);
20687 }
20688 }
20689 }
20690
20691 TEST(F32_IGEMM_6X8S4__NEON, k_eq_4_subtile_m) {
20692 TEST_REQUIRES_ARM_NEON;
20693 for (uint32_t m = 1; m <= 6; m++) {
20694 GemmMicrokernelTester()
20695 .mr(6)
20696 .nr(8)
20697 .kr(1)
20698 .sr(4)
20699 .m(m)
20700 .n(8)
20701 .k(4)
20702 .iterations(1)
20703 .Test(xnn_f32_igemm_ukernel_6x8s4__neon);
20704 }
20705 }
20706
20707 TEST(F32_IGEMM_6X8S4__NEON, k_eq_4_subtile_n) {
20708 TEST_REQUIRES_ARM_NEON;
20709 for (uint32_t n = 1; n <= 8; n++) {
20710 GemmMicrokernelTester()
20711 .mr(6)
20712 .nr(8)
20713 .kr(1)
20714 .sr(4)
20715 .m(6)
20716 .n(n)
20717 .k(4)
20718 .iterations(1)
20719 .Test(xnn_f32_igemm_ukernel_6x8s4__neon);
20720 }
20721 }
20722
20723 TEST(F32_IGEMM_6X8S4__NEON, k_lt_4) {
20724 TEST_REQUIRES_ARM_NEON;
20725 for (size_t k = 1; k < 4; k++) {
20726 GemmMicrokernelTester()
20727 .mr(6)
20728 .nr(8)
20729 .kr(1)
20730 .sr(4)
20731 .m(6)
20732 .n(8)
20733 .k(k)
20734 .Test(xnn_f32_igemm_ukernel_6x8s4__neon);
20735 }
20736 }
20737
20738 TEST(F32_IGEMM_6X8S4__NEON, k_lt_4_subtile) {
20739 TEST_REQUIRES_ARM_NEON;
20740 for (size_t k = 1; k < 4; k++) {
20741 for (uint32_t m = 1; m <= 6; m++) {
20742 for (uint32_t n = 1; n <= 8; n++) {
20743 GemmMicrokernelTester()
20744 .mr(6)
20745 .nr(8)
20746 .kr(1)
20747 .sr(4)
20748 .m(m)
20749 .n(n)
20750 .k(k)
20751 .iterations(1)
20752 .Test(xnn_f32_igemm_ukernel_6x8s4__neon);
20753 }
20754 }
20755 }
20756 }
20757
20758 TEST(F32_IGEMM_6X8S4__NEON, k_gt_4) {
20759 TEST_REQUIRES_ARM_NEON;
20760 for (size_t k = 5; k < 8; k++) {
20761 GemmMicrokernelTester()
20762 .mr(6)
20763 .nr(8)
20764 .kr(1)
20765 .sr(4)
20766 .m(6)
20767 .n(8)
20768 .k(k)
20769 .Test(xnn_f32_igemm_ukernel_6x8s4__neon);
20770 }
20771 }
20772
20773 TEST(F32_IGEMM_6X8S4__NEON, k_gt_4_subtile) {
20774 TEST_REQUIRES_ARM_NEON;
20775 for (size_t k = 5; k < 8; k++) {
20776 for (uint32_t m = 1; m <= 6; m++) {
20777 for (uint32_t n = 1; n <= 8; n++) {
20778 GemmMicrokernelTester()
20779 .mr(6)
20780 .nr(8)
20781 .kr(1)
20782 .sr(4)
20783 .m(m)
20784 .n(n)
20785 .k(k)
20786 .iterations(1)
20787 .Test(xnn_f32_igemm_ukernel_6x8s4__neon);
20788 }
20789 }
20790 }
20791 }
20792
20793 TEST(F32_IGEMM_6X8S4__NEON, k_div_4) {
20794 TEST_REQUIRES_ARM_NEON;
20795 for (size_t k = 8; k <= 40; k += 4) {
20796 GemmMicrokernelTester()
20797 .mr(6)
20798 .nr(8)
20799 .kr(1)
20800 .sr(4)
20801 .m(6)
20802 .n(8)
20803 .k(k)
20804 .Test(xnn_f32_igemm_ukernel_6x8s4__neon);
20805 }
20806 }
20807
20808 TEST(F32_IGEMM_6X8S4__NEON, k_div_4_subtile) {
20809 TEST_REQUIRES_ARM_NEON;
20810 for (size_t k = 8; k <= 40; k += 4) {
20811 for (uint32_t m = 1; m <= 6; m++) {
20812 for (uint32_t n = 1; n <= 8; n++) {
20813 GemmMicrokernelTester()
20814 .mr(6)
20815 .nr(8)
20816 .kr(1)
20817 .sr(4)
20818 .m(m)
20819 .n(n)
20820 .k(k)
20821 .iterations(1)
20822 .Test(xnn_f32_igemm_ukernel_6x8s4__neon);
20823 }
20824 }
20825 }
20826 }
20827
20828 TEST(F32_IGEMM_6X8S4__NEON, n_gt_8) {
20829 TEST_REQUIRES_ARM_NEON;
20830 for (uint32_t n = 9; n < 16; n++) {
20831 for (size_t k = 1; k <= 20; k += 5) {
20832 GemmMicrokernelTester()
20833 .mr(6)
20834 .nr(8)
20835 .kr(1)
20836 .sr(4)
20837 .m(6)
20838 .n(8)
20839 .k(k)
20840 .Test(xnn_f32_igemm_ukernel_6x8s4__neon);
20841 }
20842 }
20843 }
20844
20845 TEST(F32_IGEMM_6X8S4__NEON, n_gt_8_strided_cn) {
20846 TEST_REQUIRES_ARM_NEON;
20847 for (uint32_t n = 9; n < 16; n++) {
20848 for (size_t k = 1; k <= 20; k += 5) {
20849 GemmMicrokernelTester()
20850 .mr(6)
20851 .nr(8)
20852 .kr(1)
20853 .sr(4)
20854 .m(6)
20855 .n(8)
20856 .k(k)
20857 .cn_stride(11)
20858 .Test(xnn_f32_igemm_ukernel_6x8s4__neon);
20859 }
20860 }
20861 }
20862
20863 TEST(F32_IGEMM_6X8S4__NEON, n_gt_8_subtile) {
20864 TEST_REQUIRES_ARM_NEON;
20865 for (uint32_t n = 9; n < 16; n++) {
20866 for (size_t k = 1; k <= 20; k += 5) {
20867 for (uint32_t m = 1; m <= 6; m++) {
20868 GemmMicrokernelTester()
20869 .mr(6)
20870 .nr(8)
20871 .kr(1)
20872 .sr(4)
20873 .m(m)
20874 .n(n)
20875 .k(k)
20876 .iterations(1)
20877 .Test(xnn_f32_igemm_ukernel_6x8s4__neon);
20878 }
20879 }
20880 }
20881 }
20882
20883 TEST(F32_IGEMM_6X8S4__NEON, n_div_8) {
20884 TEST_REQUIRES_ARM_NEON;
20885 for (uint32_t n = 16; n <= 24; n += 8) {
20886 for (size_t k = 1; k <= 20; k += 5) {
20887 GemmMicrokernelTester()
20888 .mr(6)
20889 .nr(8)
20890 .kr(1)
20891 .sr(4)
20892 .m(6)
20893 .n(8)
20894 .k(k)
20895 .Test(xnn_f32_igemm_ukernel_6x8s4__neon);
20896 }
20897 }
20898 }
20899
20900 TEST(F32_IGEMM_6X8S4__NEON, n_div_8_strided_cn) {
20901 TEST_REQUIRES_ARM_NEON;
20902 for (uint32_t n = 16; n <= 24; n += 8) {
20903 for (size_t k = 1; k <= 20; k += 5) {
20904 GemmMicrokernelTester()
20905 .mr(6)
20906 .nr(8)
20907 .kr(1)
20908 .sr(4)
20909 .m(6)
20910 .n(n)
20911 .k(k)
20912 .cn_stride(11)
20913 .Test(xnn_f32_igemm_ukernel_6x8s4__neon);
20914 }
20915 }
20916 }
20917
20918 TEST(F32_IGEMM_6X8S4__NEON, n_div_8_subtile) {
20919 TEST_REQUIRES_ARM_NEON;
20920 for (uint32_t n = 16; n <= 24; n += 8) {
20921 for (size_t k = 1; k <= 20; k += 5) {
20922 for (uint32_t m = 1; m <= 6; m++) {
20923 GemmMicrokernelTester()
20924 .mr(6)
20925 .nr(8)
20926 .kr(1)
20927 .sr(4)
20928 .m(m)
20929 .n(n)
20930 .k(k)
20931 .iterations(1)
20932 .Test(xnn_f32_igemm_ukernel_6x8s4__neon);
20933 }
20934 }
20935 }
20936 }
20937
20938 TEST(F32_IGEMM_6X8S4__NEON, small_kernel) {
20939 TEST_REQUIRES_ARM_NEON;
20940 for (size_t k = 1; k <= 20; k += 5) {
20941 GemmMicrokernelTester()
20942 .mr(6)
20943 .nr(8)
20944 .kr(1)
20945 .sr(4)
20946 .m(6)
20947 .n(8)
20948 .k(k)
20949 .ks(3)
20950 .Test(xnn_f32_igemm_ukernel_6x8s4__neon);
20951 }
20952 }
20953
20954 TEST(F32_IGEMM_6X8S4__NEON, small_kernel_subtile) {
20955 TEST_REQUIRES_ARM_NEON;
20956 for (size_t k = 1; k <= 20; k += 5) {
20957 for (uint32_t m = 1; m <= 6; m++) {
20958 for (uint32_t n = 1; n <= 8; n++) {
20959 GemmMicrokernelTester()
20960 .mr(6)
20961 .nr(8)
20962 .kr(1)
20963 .sr(4)
20964 .m(m)
20965 .n(n)
20966 .k(k)
20967 .ks(3)
20968 .iterations(1)
20969 .Test(xnn_f32_igemm_ukernel_6x8s4__neon);
20970 }
20971 }
20972 }
20973 }
20974
20975 TEST(F32_IGEMM_6X8S4__NEON, n_gt_8_small_kernel) {
20976 TEST_REQUIRES_ARM_NEON;
20977 for (uint32_t n = 9; n < 16; n++) {
20978 for (size_t k = 1; k <= 20; k += 5) {
20979 GemmMicrokernelTester()
20980 .mr(6)
20981 .nr(8)
20982 .kr(1)
20983 .sr(4)
20984 .m(6)
20985 .n(8)
20986 .k(k)
20987 .ks(3)
20988 .Test(xnn_f32_igemm_ukernel_6x8s4__neon);
20989 }
20990 }
20991 }
20992
20993 TEST(F32_IGEMM_6X8S4__NEON, n_div_8_small_kernel) {
20994 TEST_REQUIRES_ARM_NEON;
20995 for (uint32_t n = 16; n <= 24; n += 8) {
20996 for (size_t k = 1; k <= 20; k += 5) {
20997 GemmMicrokernelTester()
20998 .mr(6)
20999 .nr(8)
21000 .kr(1)
21001 .sr(4)
21002 .m(6)
21003 .n(8)
21004 .k(k)
21005 .ks(3)
21006 .Test(xnn_f32_igemm_ukernel_6x8s4__neon);
21007 }
21008 }
21009 }
21010
21011 TEST(F32_IGEMM_6X8S4__NEON, strided_cm_subtile) {
21012 TEST_REQUIRES_ARM_NEON;
21013 for (size_t k = 1; k <= 20; k += 5) {
21014 for (uint32_t m = 1; m <= 6; m++) {
21015 for (uint32_t n = 1; n <= 8; n++) {
21016 GemmMicrokernelTester()
21017 .mr(6)
21018 .nr(8)
21019 .kr(1)
21020 .sr(4)
21021 .m(m)
21022 .n(n)
21023 .k(k)
21024 .cm_stride(11)
21025 .iterations(1)
21026 .Test(xnn_f32_igemm_ukernel_6x8s4__neon);
21027 }
21028 }
21029 }
21030 }
21031
21032 TEST(F32_IGEMM_6X8S4__NEON, a_offset) {
21033 TEST_REQUIRES_ARM_NEON;
21034 for (size_t k = 1; k <= 20; k += 5) {
21035 GemmMicrokernelTester()
21036 .mr(6)
21037 .nr(8)
21038 .kr(1)
21039 .sr(4)
21040 .m(6)
21041 .n(8)
21042 .k(k)
21043 .ks(3)
21044 .a_offset(127)
21045 .Test(xnn_f32_igemm_ukernel_6x8s4__neon);
21046 }
21047 }
21048
21049 TEST(F32_IGEMM_6X8S4__NEON, zero) {
21050 TEST_REQUIRES_ARM_NEON;
21051 for (uint32_t mz = 0; mz < 6; mz++) {
21052 for (size_t k = 1; k <= 20; k += 5) {
21053 GemmMicrokernelTester()
21054 .mr(6)
21055 .nr(8)
21056 .kr(1)
21057 .sr(4)
21058 .m(6)
21059 .n(8)
21060 .k(k)
21061 .ks(3)
21062 .a_offset(127)
21063 .zero_index(mz)
21064 .Test(xnn_f32_igemm_ukernel_6x8s4__neon);
21065 }
21066 }
21067 }
21068
21069 TEST(F32_IGEMM_6X8S4__NEON, qmin) {
21070 TEST_REQUIRES_ARM_NEON;
21071 GemmMicrokernelTester()
21072 .mr(6)
21073 .nr(8)
21074 .kr(1)
21075 .sr(4)
21076 .m(6)
21077 .n(8)
21078 .k(4)
21079 .qmin(128)
21080 .Test(xnn_f32_igemm_ukernel_6x8s4__neon);
21081 }
21082
21083 TEST(F32_IGEMM_6X8S4__NEON, qmax) {
21084 TEST_REQUIRES_ARM_NEON;
21085 GemmMicrokernelTester()
21086 .mr(6)
21087 .nr(8)
21088 .kr(1)
21089 .sr(4)
21090 .m(6)
21091 .n(8)
21092 .k(4)
21093 .qmax(128)
21094 .Test(xnn_f32_igemm_ukernel_6x8s4__neon);
21095 }
21096
21097 TEST(F32_IGEMM_6X8S4__NEON, strided_cm) {
21098 TEST_REQUIRES_ARM_NEON;
21099 GemmMicrokernelTester()
21100 .mr(6)
21101 .nr(8)
21102 .kr(1)
21103 .sr(4)
21104 .m(6)
21105 .n(8)
21106 .k(4)
21107 .cm_stride(11)
21108 .Test(xnn_f32_igemm_ukernel_6x8s4__neon);
21109 }
21110#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
21111
21112
21113#if XNN_ARCH_ARM || XNN_ARCH_ARM64
21114 TEST(F32_IGEMM_8X8S4__NEON, k_eq_4) {
21115 TEST_REQUIRES_ARM_NEON;
21116 GemmMicrokernelTester()
21117 .mr(8)
21118 .nr(8)
21119 .kr(1)
21120 .sr(4)
21121 .m(8)
21122 .n(8)
21123 .k(4)
21124 .Test(xnn_f32_igemm_ukernel_8x8s4__neon);
21125 }
21126
21127 TEST(F32_IGEMM_8X8S4__NEON, strided_cn) {
21128 TEST_REQUIRES_ARM_NEON;
21129 GemmMicrokernelTester()
21130 .mr(8)
21131 .nr(8)
21132 .kr(1)
21133 .sr(4)
21134 .m(8)
21135 .n(8)
21136 .k(4)
21137 .cn_stride(11)
21138 .Test(xnn_f32_igemm_ukernel_8x8s4__neon);
21139 }
21140
21141 TEST(F32_IGEMM_8X8S4__NEON, k_eq_4_subtile) {
21142 TEST_REQUIRES_ARM_NEON;
21143 for (uint32_t m = 1; m <= 8; m++) {
21144 for (uint32_t n = 1; n <= 8; n++) {
21145 GemmMicrokernelTester()
21146 .mr(8)
21147 .nr(8)
21148 .kr(1)
21149 .sr(4)
21150 .m(m)
21151 .n(n)
21152 .k(4)
21153 .iterations(1)
21154 .Test(xnn_f32_igemm_ukernel_8x8s4__neon);
21155 }
21156 }
21157 }
21158
21159 TEST(F32_IGEMM_8X8S4__NEON, k_eq_4_subtile_m) {
21160 TEST_REQUIRES_ARM_NEON;
21161 for (uint32_t m = 1; m <= 8; m++) {
21162 GemmMicrokernelTester()
21163 .mr(8)
21164 .nr(8)
21165 .kr(1)
21166 .sr(4)
21167 .m(m)
21168 .n(8)
21169 .k(4)
21170 .iterations(1)
21171 .Test(xnn_f32_igemm_ukernel_8x8s4__neon);
21172 }
21173 }
21174
21175 TEST(F32_IGEMM_8X8S4__NEON, k_eq_4_subtile_n) {
21176 TEST_REQUIRES_ARM_NEON;
21177 for (uint32_t n = 1; n <= 8; n++) {
21178 GemmMicrokernelTester()
21179 .mr(8)
21180 .nr(8)
21181 .kr(1)
21182 .sr(4)
21183 .m(8)
21184 .n(n)
21185 .k(4)
21186 .iterations(1)
21187 .Test(xnn_f32_igemm_ukernel_8x8s4__neon);
21188 }
21189 }
21190
21191 TEST(F32_IGEMM_8X8S4__NEON, k_lt_4) {
21192 TEST_REQUIRES_ARM_NEON;
21193 for (size_t k = 1; k < 4; k++) {
21194 GemmMicrokernelTester()
21195 .mr(8)
21196 .nr(8)
21197 .kr(1)
21198 .sr(4)
21199 .m(8)
21200 .n(8)
21201 .k(k)
21202 .Test(xnn_f32_igemm_ukernel_8x8s4__neon);
21203 }
21204 }
21205
21206 TEST(F32_IGEMM_8X8S4__NEON, k_lt_4_subtile) {
21207 TEST_REQUIRES_ARM_NEON;
21208 for (size_t k = 1; k < 4; k++) {
21209 for (uint32_t m = 1; m <= 8; m++) {
21210 for (uint32_t n = 1; n <= 8; n++) {
21211 GemmMicrokernelTester()
21212 .mr(8)
21213 .nr(8)
21214 .kr(1)
21215 .sr(4)
21216 .m(m)
21217 .n(n)
21218 .k(k)
21219 .iterations(1)
21220 .Test(xnn_f32_igemm_ukernel_8x8s4__neon);
21221 }
21222 }
21223 }
21224 }
21225
21226 TEST(F32_IGEMM_8X8S4__NEON, k_gt_4) {
21227 TEST_REQUIRES_ARM_NEON;
21228 for (size_t k = 5; k < 8; k++) {
21229 GemmMicrokernelTester()
21230 .mr(8)
21231 .nr(8)
21232 .kr(1)
21233 .sr(4)
21234 .m(8)
21235 .n(8)
21236 .k(k)
21237 .Test(xnn_f32_igemm_ukernel_8x8s4__neon);
21238 }
21239 }
21240
21241 TEST(F32_IGEMM_8X8S4__NEON, k_gt_4_subtile) {
21242 TEST_REQUIRES_ARM_NEON;
21243 for (size_t k = 5; k < 8; k++) {
21244 for (uint32_t m = 1; m <= 8; m++) {
21245 for (uint32_t n = 1; n <= 8; n++) {
21246 GemmMicrokernelTester()
21247 .mr(8)
21248 .nr(8)
21249 .kr(1)
21250 .sr(4)
21251 .m(m)
21252 .n(n)
21253 .k(k)
21254 .iterations(1)
21255 .Test(xnn_f32_igemm_ukernel_8x8s4__neon);
21256 }
21257 }
21258 }
21259 }
21260
21261 TEST(F32_IGEMM_8X8S4__NEON, k_div_4) {
21262 TEST_REQUIRES_ARM_NEON;
21263 for (size_t k = 8; k <= 40; k += 4) {
21264 GemmMicrokernelTester()
21265 .mr(8)
21266 .nr(8)
21267 .kr(1)
21268 .sr(4)
21269 .m(8)
21270 .n(8)
21271 .k(k)
21272 .Test(xnn_f32_igemm_ukernel_8x8s4__neon);
21273 }
21274 }
21275
21276 TEST(F32_IGEMM_8X8S4__NEON, k_div_4_subtile) {
21277 TEST_REQUIRES_ARM_NEON;
21278 for (size_t k = 8; k <= 40; k += 4) {
21279 for (uint32_t m = 1; m <= 8; m++) {
21280 for (uint32_t n = 1; n <= 8; n++) {
21281 GemmMicrokernelTester()
21282 .mr(8)
21283 .nr(8)
21284 .kr(1)
21285 .sr(4)
21286 .m(m)
21287 .n(n)
21288 .k(k)
21289 .iterations(1)
21290 .Test(xnn_f32_igemm_ukernel_8x8s4__neon);
21291 }
21292 }
21293 }
21294 }
21295
21296 TEST(F32_IGEMM_8X8S4__NEON, n_gt_8) {
21297 TEST_REQUIRES_ARM_NEON;
21298 for (uint32_t n = 9; n < 16; n++) {
21299 for (size_t k = 1; k <= 20; k += 5) {
21300 GemmMicrokernelTester()
21301 .mr(8)
21302 .nr(8)
21303 .kr(1)
21304 .sr(4)
21305 .m(8)
21306 .n(8)
21307 .k(k)
21308 .Test(xnn_f32_igemm_ukernel_8x8s4__neon);
21309 }
21310 }
21311 }
21312
21313 TEST(F32_IGEMM_8X8S4__NEON, n_gt_8_strided_cn) {
21314 TEST_REQUIRES_ARM_NEON;
21315 for (uint32_t n = 9; n < 16; n++) {
21316 for (size_t k = 1; k <= 20; k += 5) {
21317 GemmMicrokernelTester()
21318 .mr(8)
21319 .nr(8)
21320 .kr(1)
21321 .sr(4)
21322 .m(8)
21323 .n(8)
21324 .k(k)
21325 .cn_stride(11)
21326 .Test(xnn_f32_igemm_ukernel_8x8s4__neon);
21327 }
21328 }
21329 }
21330
21331 TEST(F32_IGEMM_8X8S4__NEON, n_gt_8_subtile) {
21332 TEST_REQUIRES_ARM_NEON;
21333 for (uint32_t n = 9; n < 16; n++) {
21334 for (size_t k = 1; k <= 20; k += 5) {
21335 for (uint32_t m = 1; m <= 8; m++) {
21336 GemmMicrokernelTester()
21337 .mr(8)
21338 .nr(8)
21339 .kr(1)
21340 .sr(4)
21341 .m(m)
21342 .n(n)
21343 .k(k)
21344 .iterations(1)
21345 .Test(xnn_f32_igemm_ukernel_8x8s4__neon);
21346 }
21347 }
21348 }
21349 }
21350
21351 TEST(F32_IGEMM_8X8S4__NEON, n_div_8) {
21352 TEST_REQUIRES_ARM_NEON;
21353 for (uint32_t n = 16; n <= 24; n += 8) {
21354 for (size_t k = 1; k <= 20; k += 5) {
21355 GemmMicrokernelTester()
21356 .mr(8)
21357 .nr(8)
21358 .kr(1)
21359 .sr(4)
21360 .m(8)
21361 .n(8)
21362 .k(k)
21363 .Test(xnn_f32_igemm_ukernel_8x8s4__neon);
21364 }
21365 }
21366 }
21367
21368 TEST(F32_IGEMM_8X8S4__NEON, n_div_8_strided_cn) {
21369 TEST_REQUIRES_ARM_NEON;
21370 for (uint32_t n = 16; n <= 24; n += 8) {
21371 for (size_t k = 1; k <= 20; k += 5) {
21372 GemmMicrokernelTester()
21373 .mr(8)
21374 .nr(8)
21375 .kr(1)
21376 .sr(4)
21377 .m(8)
21378 .n(n)
21379 .k(k)
21380 .cn_stride(11)
21381 .Test(xnn_f32_igemm_ukernel_8x8s4__neon);
21382 }
21383 }
21384 }
21385
21386 TEST(F32_IGEMM_8X8S4__NEON, n_div_8_subtile) {
21387 TEST_REQUIRES_ARM_NEON;
21388 for (uint32_t n = 16; n <= 24; n += 8) {
21389 for (size_t k = 1; k <= 20; k += 5) {
21390 for (uint32_t m = 1; m <= 8; m++) {
21391 GemmMicrokernelTester()
21392 .mr(8)
21393 .nr(8)
21394 .kr(1)
21395 .sr(4)
21396 .m(m)
21397 .n(n)
21398 .k(k)
21399 .iterations(1)
21400 .Test(xnn_f32_igemm_ukernel_8x8s4__neon);
21401 }
21402 }
21403 }
21404 }
21405
21406 TEST(F32_IGEMM_8X8S4__NEON, small_kernel) {
21407 TEST_REQUIRES_ARM_NEON;
21408 for (size_t k = 1; k <= 20; k += 5) {
21409 GemmMicrokernelTester()
21410 .mr(8)
21411 .nr(8)
21412 .kr(1)
21413 .sr(4)
21414 .m(8)
21415 .n(8)
21416 .k(k)
21417 .ks(3)
21418 .Test(xnn_f32_igemm_ukernel_8x8s4__neon);
21419 }
21420 }
21421
21422 TEST(F32_IGEMM_8X8S4__NEON, small_kernel_subtile) {
21423 TEST_REQUIRES_ARM_NEON;
21424 for (size_t k = 1; k <= 20; k += 5) {
21425 for (uint32_t m = 1; m <= 8; m++) {
21426 for (uint32_t n = 1; n <= 8; n++) {
21427 GemmMicrokernelTester()
21428 .mr(8)
21429 .nr(8)
21430 .kr(1)
21431 .sr(4)
21432 .m(m)
21433 .n(n)
21434 .k(k)
21435 .ks(3)
21436 .iterations(1)
21437 .Test(xnn_f32_igemm_ukernel_8x8s4__neon);
21438 }
21439 }
21440 }
21441 }
21442
21443 TEST(F32_IGEMM_8X8S4__NEON, n_gt_8_small_kernel) {
21444 TEST_REQUIRES_ARM_NEON;
21445 for (uint32_t n = 9; n < 16; n++) {
21446 for (size_t k = 1; k <= 20; k += 5) {
21447 GemmMicrokernelTester()
21448 .mr(8)
21449 .nr(8)
21450 .kr(1)
21451 .sr(4)
21452 .m(8)
21453 .n(8)
21454 .k(k)
21455 .ks(3)
21456 .Test(xnn_f32_igemm_ukernel_8x8s4__neon);
21457 }
21458 }
21459 }
21460
21461 TEST(F32_IGEMM_8X8S4__NEON, n_div_8_small_kernel) {
21462 TEST_REQUIRES_ARM_NEON;
21463 for (uint32_t n = 16; n <= 24; n += 8) {
21464 for (size_t k = 1; k <= 20; k += 5) {
21465 GemmMicrokernelTester()
21466 .mr(8)
21467 .nr(8)
21468 .kr(1)
21469 .sr(4)
21470 .m(8)
21471 .n(8)
21472 .k(k)
21473 .ks(3)
21474 .Test(xnn_f32_igemm_ukernel_8x8s4__neon);
21475 }
21476 }
21477 }
21478
21479 TEST(F32_IGEMM_8X8S4__NEON, strided_cm_subtile) {
21480 TEST_REQUIRES_ARM_NEON;
21481 for (size_t k = 1; k <= 20; k += 5) {
21482 for (uint32_t m = 1; m <= 8; m++) {
21483 for (uint32_t n = 1; n <= 8; n++) {
21484 GemmMicrokernelTester()
21485 .mr(8)
21486 .nr(8)
21487 .kr(1)
21488 .sr(4)
21489 .m(m)
21490 .n(n)
21491 .k(k)
21492 .cm_stride(11)
21493 .iterations(1)
21494 .Test(xnn_f32_igemm_ukernel_8x8s4__neon);
21495 }
21496 }
21497 }
21498 }
21499
21500 TEST(F32_IGEMM_8X8S4__NEON, a_offset) {
21501 TEST_REQUIRES_ARM_NEON;
21502 for (size_t k = 1; k <= 20; k += 5) {
21503 GemmMicrokernelTester()
21504 .mr(8)
21505 .nr(8)
21506 .kr(1)
21507 .sr(4)
21508 .m(8)
21509 .n(8)
21510 .k(k)
21511 .ks(3)
21512 .a_offset(163)
21513 .Test(xnn_f32_igemm_ukernel_8x8s4__neon);
21514 }
21515 }
21516
21517 TEST(F32_IGEMM_8X8S4__NEON, zero) {
21518 TEST_REQUIRES_ARM_NEON;
21519 for (uint32_t mz = 0; mz < 8; mz++) {
21520 for (size_t k = 1; k <= 20; k += 5) {
21521 GemmMicrokernelTester()
21522 .mr(8)
21523 .nr(8)
21524 .kr(1)
21525 .sr(4)
21526 .m(8)
21527 .n(8)
21528 .k(k)
21529 .ks(3)
21530 .a_offset(163)
21531 .zero_index(mz)
21532 .Test(xnn_f32_igemm_ukernel_8x8s4__neon);
21533 }
21534 }
21535 }
21536
21537 TEST(F32_IGEMM_8X8S4__NEON, qmin) {
21538 TEST_REQUIRES_ARM_NEON;
21539 GemmMicrokernelTester()
21540 .mr(8)
21541 .nr(8)
21542 .kr(1)
21543 .sr(4)
21544 .m(8)
21545 .n(8)
21546 .k(4)
21547 .qmin(128)
21548 .Test(xnn_f32_igemm_ukernel_8x8s4__neon);
21549 }
21550
21551 TEST(F32_IGEMM_8X8S4__NEON, qmax) {
21552 TEST_REQUIRES_ARM_NEON;
21553 GemmMicrokernelTester()
21554 .mr(8)
21555 .nr(8)
21556 .kr(1)
21557 .sr(4)
21558 .m(8)
21559 .n(8)
21560 .k(4)
21561 .qmax(128)
21562 .Test(xnn_f32_igemm_ukernel_8x8s4__neon);
21563 }
21564
21565 TEST(F32_IGEMM_8X8S4__NEON, strided_cm) {
21566 TEST_REQUIRES_ARM_NEON;
21567 GemmMicrokernelTester()
21568 .mr(8)
21569 .nr(8)
21570 .kr(1)
21571 .sr(4)
21572 .m(8)
21573 .n(8)
21574 .k(4)
21575 .cm_stride(11)
21576 .Test(xnn_f32_igemm_ukernel_8x8s4__neon);
21577 }
21578#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
21579
21580
21581#if XNN_ARCH_ARM || XNN_ARCH_ARM64
Frank Barcharddf06d802019-11-20 15:53:46 -080021582 TEST(F32_IGEMM_1X8S4__NEONFMA, k_eq_4) {
21583 TEST_REQUIRES_ARM_NEON_FMA;
21584 GemmMicrokernelTester()
21585 .mr(1)
21586 .nr(8)
21587 .kr(1)
21588 .sr(4)
21589 .m(1)
21590 .n(8)
21591 .k(4)
21592 .Test(xnn_f32_igemm_ukernel_1x8s4__neonfma);
21593 }
21594
21595 TEST(F32_IGEMM_1X8S4__NEONFMA, strided_cn) {
21596 TEST_REQUIRES_ARM_NEON_FMA;
21597 GemmMicrokernelTester()
21598 .mr(1)
21599 .nr(8)
21600 .kr(1)
21601 .sr(4)
21602 .m(1)
21603 .n(8)
21604 .k(4)
21605 .cn_stride(11)
21606 .Test(xnn_f32_igemm_ukernel_1x8s4__neonfma);
21607 }
21608
21609 TEST(F32_IGEMM_1X8S4__NEONFMA, k_eq_4_subtile) {
21610 TEST_REQUIRES_ARM_NEON_FMA;
21611 for (uint32_t m = 1; m <= 1; m++) {
21612 for (uint32_t n = 1; n <= 8; n++) {
21613 GemmMicrokernelTester()
21614 .mr(1)
21615 .nr(8)
21616 .kr(1)
21617 .sr(4)
21618 .m(m)
21619 .n(n)
21620 .k(4)
21621 .iterations(1)
21622 .Test(xnn_f32_igemm_ukernel_1x8s4__neonfma);
21623 }
21624 }
21625 }
21626
21627 TEST(F32_IGEMM_1X8S4__NEONFMA, k_eq_4_subtile_m) {
21628 TEST_REQUIRES_ARM_NEON_FMA;
21629 for (uint32_t m = 1; m <= 1; m++) {
21630 GemmMicrokernelTester()
21631 .mr(1)
21632 .nr(8)
21633 .kr(1)
21634 .sr(4)
21635 .m(m)
21636 .n(8)
21637 .k(4)
21638 .iterations(1)
21639 .Test(xnn_f32_igemm_ukernel_1x8s4__neonfma);
21640 }
21641 }
21642
21643 TEST(F32_IGEMM_1X8S4__NEONFMA, k_eq_4_subtile_n) {
21644 TEST_REQUIRES_ARM_NEON_FMA;
21645 for (uint32_t n = 1; n <= 8; n++) {
21646 GemmMicrokernelTester()
21647 .mr(1)
21648 .nr(8)
21649 .kr(1)
21650 .sr(4)
21651 .m(1)
21652 .n(n)
21653 .k(4)
21654 .iterations(1)
21655 .Test(xnn_f32_igemm_ukernel_1x8s4__neonfma);
21656 }
21657 }
21658
21659 TEST(F32_IGEMM_1X8S4__NEONFMA, k_lt_4) {
21660 TEST_REQUIRES_ARM_NEON_FMA;
21661 for (size_t k = 1; k < 4; k++) {
21662 GemmMicrokernelTester()
21663 .mr(1)
21664 .nr(8)
21665 .kr(1)
21666 .sr(4)
21667 .m(1)
21668 .n(8)
21669 .k(k)
21670 .Test(xnn_f32_igemm_ukernel_1x8s4__neonfma);
21671 }
21672 }
21673
21674 TEST(F32_IGEMM_1X8S4__NEONFMA, k_lt_4_subtile) {
21675 TEST_REQUIRES_ARM_NEON_FMA;
21676 for (size_t k = 1; k < 4; k++) {
21677 for (uint32_t m = 1; m <= 1; m++) {
21678 for (uint32_t n = 1; n <= 8; n++) {
21679 GemmMicrokernelTester()
21680 .mr(1)
21681 .nr(8)
21682 .kr(1)
21683 .sr(4)
21684 .m(m)
21685 .n(n)
21686 .k(k)
21687 .iterations(1)
21688 .Test(xnn_f32_igemm_ukernel_1x8s4__neonfma);
21689 }
21690 }
21691 }
21692 }
21693
21694 TEST(F32_IGEMM_1X8S4__NEONFMA, k_gt_4) {
21695 TEST_REQUIRES_ARM_NEON_FMA;
21696 for (size_t k = 5; k < 8; k++) {
21697 GemmMicrokernelTester()
21698 .mr(1)
21699 .nr(8)
21700 .kr(1)
21701 .sr(4)
21702 .m(1)
21703 .n(8)
21704 .k(k)
21705 .Test(xnn_f32_igemm_ukernel_1x8s4__neonfma);
21706 }
21707 }
21708
21709 TEST(F32_IGEMM_1X8S4__NEONFMA, k_gt_4_subtile) {
21710 TEST_REQUIRES_ARM_NEON_FMA;
21711 for (size_t k = 5; k < 8; k++) {
21712 for (uint32_t m = 1; m <= 1; m++) {
21713 for (uint32_t n = 1; n <= 8; n++) {
21714 GemmMicrokernelTester()
21715 .mr(1)
21716 .nr(8)
21717 .kr(1)
21718 .sr(4)
21719 .m(m)
21720 .n(n)
21721 .k(k)
21722 .iterations(1)
21723 .Test(xnn_f32_igemm_ukernel_1x8s4__neonfma);
21724 }
21725 }
21726 }
21727 }
21728
21729 TEST(F32_IGEMM_1X8S4__NEONFMA, k_div_4) {
21730 TEST_REQUIRES_ARM_NEON_FMA;
21731 for (size_t k = 8; k <= 40; k += 4) {
21732 GemmMicrokernelTester()
21733 .mr(1)
21734 .nr(8)
21735 .kr(1)
21736 .sr(4)
21737 .m(1)
21738 .n(8)
21739 .k(k)
21740 .Test(xnn_f32_igemm_ukernel_1x8s4__neonfma);
21741 }
21742 }
21743
21744 TEST(F32_IGEMM_1X8S4__NEONFMA, k_div_4_subtile) {
21745 TEST_REQUIRES_ARM_NEON_FMA;
21746 for (size_t k = 8; k <= 40; k += 4) {
21747 for (uint32_t m = 1; m <= 1; m++) {
21748 for (uint32_t n = 1; n <= 8; n++) {
21749 GemmMicrokernelTester()
21750 .mr(1)
21751 .nr(8)
21752 .kr(1)
21753 .sr(4)
21754 .m(m)
21755 .n(n)
21756 .k(k)
21757 .iterations(1)
21758 .Test(xnn_f32_igemm_ukernel_1x8s4__neonfma);
21759 }
21760 }
21761 }
21762 }
21763
21764 TEST(F32_IGEMM_1X8S4__NEONFMA, n_gt_8) {
21765 TEST_REQUIRES_ARM_NEON_FMA;
21766 for (uint32_t n = 9; n < 16; n++) {
21767 for (size_t k = 1; k <= 20; k += 5) {
21768 GemmMicrokernelTester()
21769 .mr(1)
21770 .nr(8)
21771 .kr(1)
21772 .sr(4)
21773 .m(1)
21774 .n(8)
21775 .k(k)
21776 .Test(xnn_f32_igemm_ukernel_1x8s4__neonfma);
21777 }
21778 }
21779 }
21780
21781 TEST(F32_IGEMM_1X8S4__NEONFMA, n_gt_8_strided_cn) {
21782 TEST_REQUIRES_ARM_NEON_FMA;
21783 for (uint32_t n = 9; n < 16; n++) {
21784 for (size_t k = 1; k <= 20; k += 5) {
21785 GemmMicrokernelTester()
21786 .mr(1)
21787 .nr(8)
21788 .kr(1)
21789 .sr(4)
21790 .m(1)
21791 .n(8)
21792 .k(k)
21793 .cn_stride(11)
21794 .Test(xnn_f32_igemm_ukernel_1x8s4__neonfma);
21795 }
21796 }
21797 }
21798
21799 TEST(F32_IGEMM_1X8S4__NEONFMA, n_gt_8_subtile) {
21800 TEST_REQUIRES_ARM_NEON_FMA;
21801 for (uint32_t n = 9; n < 16; n++) {
21802 for (size_t k = 1; k <= 20; k += 5) {
21803 for (uint32_t m = 1; m <= 1; m++) {
21804 GemmMicrokernelTester()
21805 .mr(1)
21806 .nr(8)
21807 .kr(1)
21808 .sr(4)
21809 .m(m)
21810 .n(n)
21811 .k(k)
21812 .iterations(1)
21813 .Test(xnn_f32_igemm_ukernel_1x8s4__neonfma);
21814 }
21815 }
21816 }
21817 }
21818
21819 TEST(F32_IGEMM_1X8S4__NEONFMA, n_div_8) {
21820 TEST_REQUIRES_ARM_NEON_FMA;
21821 for (uint32_t n = 16; n <= 24; n += 8) {
21822 for (size_t k = 1; k <= 20; k += 5) {
21823 GemmMicrokernelTester()
21824 .mr(1)
21825 .nr(8)
21826 .kr(1)
21827 .sr(4)
21828 .m(1)
21829 .n(8)
21830 .k(k)
21831 .Test(xnn_f32_igemm_ukernel_1x8s4__neonfma);
21832 }
21833 }
21834 }
21835
21836 TEST(F32_IGEMM_1X8S4__NEONFMA, n_div_8_strided_cn) {
21837 TEST_REQUIRES_ARM_NEON_FMA;
21838 for (uint32_t n = 16; n <= 24; n += 8) {
21839 for (size_t k = 1; k <= 20; k += 5) {
21840 GemmMicrokernelTester()
21841 .mr(1)
21842 .nr(8)
21843 .kr(1)
21844 .sr(4)
21845 .m(1)
21846 .n(n)
21847 .k(k)
21848 .cn_stride(11)
21849 .Test(xnn_f32_igemm_ukernel_1x8s4__neonfma);
21850 }
21851 }
21852 }
21853
21854 TEST(F32_IGEMM_1X8S4__NEONFMA, n_div_8_subtile) {
21855 TEST_REQUIRES_ARM_NEON_FMA;
21856 for (uint32_t n = 16; n <= 24; n += 8) {
21857 for (size_t k = 1; k <= 20; k += 5) {
21858 for (uint32_t m = 1; m <= 1; m++) {
21859 GemmMicrokernelTester()
21860 .mr(1)
21861 .nr(8)
21862 .kr(1)
21863 .sr(4)
21864 .m(m)
21865 .n(n)
21866 .k(k)
21867 .iterations(1)
21868 .Test(xnn_f32_igemm_ukernel_1x8s4__neonfma);
21869 }
21870 }
21871 }
21872 }
21873
21874 TEST(F32_IGEMM_1X8S4__NEONFMA, small_kernel) {
21875 TEST_REQUIRES_ARM_NEON_FMA;
21876 for (size_t k = 1; k <= 20; k += 5) {
21877 GemmMicrokernelTester()
21878 .mr(1)
21879 .nr(8)
21880 .kr(1)
21881 .sr(4)
21882 .m(1)
21883 .n(8)
21884 .k(k)
21885 .ks(3)
21886 .Test(xnn_f32_igemm_ukernel_1x8s4__neonfma);
21887 }
21888 }
21889
21890 TEST(F32_IGEMM_1X8S4__NEONFMA, small_kernel_subtile) {
21891 TEST_REQUIRES_ARM_NEON_FMA;
21892 for (size_t k = 1; k <= 20; k += 5) {
21893 for (uint32_t m = 1; m <= 1; m++) {
21894 for (uint32_t n = 1; n <= 8; n++) {
21895 GemmMicrokernelTester()
21896 .mr(1)
21897 .nr(8)
21898 .kr(1)
21899 .sr(4)
21900 .m(m)
21901 .n(n)
21902 .k(k)
21903 .ks(3)
21904 .iterations(1)
21905 .Test(xnn_f32_igemm_ukernel_1x8s4__neonfma);
21906 }
21907 }
21908 }
21909 }
21910
21911 TEST(F32_IGEMM_1X8S4__NEONFMA, n_gt_8_small_kernel) {
21912 TEST_REQUIRES_ARM_NEON_FMA;
21913 for (uint32_t n = 9; n < 16; n++) {
21914 for (size_t k = 1; k <= 20; k += 5) {
21915 GemmMicrokernelTester()
21916 .mr(1)
21917 .nr(8)
21918 .kr(1)
21919 .sr(4)
21920 .m(1)
21921 .n(8)
21922 .k(k)
21923 .ks(3)
21924 .Test(xnn_f32_igemm_ukernel_1x8s4__neonfma);
21925 }
21926 }
21927 }
21928
21929 TEST(F32_IGEMM_1X8S4__NEONFMA, n_div_8_small_kernel) {
21930 TEST_REQUIRES_ARM_NEON_FMA;
21931 for (uint32_t n = 16; n <= 24; n += 8) {
21932 for (size_t k = 1; k <= 20; k += 5) {
21933 GemmMicrokernelTester()
21934 .mr(1)
21935 .nr(8)
21936 .kr(1)
21937 .sr(4)
21938 .m(1)
21939 .n(8)
21940 .k(k)
21941 .ks(3)
21942 .Test(xnn_f32_igemm_ukernel_1x8s4__neonfma);
21943 }
21944 }
21945 }
21946
21947 TEST(F32_IGEMM_1X8S4__NEONFMA, strided_cm_subtile) {
21948 TEST_REQUIRES_ARM_NEON_FMA;
21949 for (size_t k = 1; k <= 20; k += 5) {
21950 for (uint32_t m = 1; m <= 1; m++) {
21951 for (uint32_t n = 1; n <= 8; n++) {
21952 GemmMicrokernelTester()
21953 .mr(1)
21954 .nr(8)
21955 .kr(1)
21956 .sr(4)
21957 .m(m)
21958 .n(n)
21959 .k(k)
21960 .cm_stride(11)
21961 .iterations(1)
21962 .Test(xnn_f32_igemm_ukernel_1x8s4__neonfma);
21963 }
21964 }
21965 }
21966 }
21967
21968 TEST(F32_IGEMM_1X8S4__NEONFMA, a_offset) {
21969 TEST_REQUIRES_ARM_NEON_FMA;
21970 for (size_t k = 1; k <= 20; k += 5) {
21971 GemmMicrokernelTester()
21972 .mr(1)
21973 .nr(8)
21974 .kr(1)
21975 .sr(4)
21976 .m(1)
21977 .n(8)
21978 .k(k)
21979 .ks(3)
21980 .a_offset(23)
21981 .Test(xnn_f32_igemm_ukernel_1x8s4__neonfma);
21982 }
21983 }
21984
21985 TEST(F32_IGEMM_1X8S4__NEONFMA, zero) {
21986 TEST_REQUIRES_ARM_NEON_FMA;
21987 for (uint32_t mz = 0; mz < 1; mz++) {
21988 for (size_t k = 1; k <= 20; k += 5) {
21989 GemmMicrokernelTester()
21990 .mr(1)
21991 .nr(8)
21992 .kr(1)
21993 .sr(4)
21994 .m(1)
21995 .n(8)
21996 .k(k)
21997 .ks(3)
21998 .a_offset(23)
21999 .zero_index(mz)
22000 .Test(xnn_f32_igemm_ukernel_1x8s4__neonfma);
22001 }
22002 }
22003 }
22004
22005 TEST(F32_IGEMM_1X8S4__NEONFMA, qmin) {
22006 TEST_REQUIRES_ARM_NEON_FMA;
22007 GemmMicrokernelTester()
22008 .mr(1)
22009 .nr(8)
22010 .kr(1)
22011 .sr(4)
22012 .m(1)
22013 .n(8)
22014 .k(4)
22015 .qmin(128)
22016 .Test(xnn_f32_igemm_ukernel_1x8s4__neonfma);
22017 }
22018
22019 TEST(F32_IGEMM_1X8S4__NEONFMA, qmax) {
22020 TEST_REQUIRES_ARM_NEON_FMA;
22021 GemmMicrokernelTester()
22022 .mr(1)
22023 .nr(8)
22024 .kr(1)
22025 .sr(4)
22026 .m(1)
22027 .n(8)
22028 .k(4)
22029 .qmax(128)
22030 .Test(xnn_f32_igemm_ukernel_1x8s4__neonfma);
22031 }
22032
22033 TEST(F32_IGEMM_1X8S4__NEONFMA, strided_cm) {
22034 TEST_REQUIRES_ARM_NEON_FMA;
22035 GemmMicrokernelTester()
22036 .mr(1)
22037 .nr(8)
22038 .kr(1)
22039 .sr(4)
22040 .m(1)
22041 .n(8)
22042 .k(4)
22043 .cm_stride(11)
22044 .Test(xnn_f32_igemm_ukernel_1x8s4__neonfma);
22045 }
22046#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
22047
22048
22049#if XNN_ARCH_ARM || XNN_ARCH_ARM64
22050 TEST(F32_IGEMM_4X8S4__NEONFMA, k_eq_4) {
22051 TEST_REQUIRES_ARM_NEON_FMA;
22052 GemmMicrokernelTester()
22053 .mr(4)
22054 .nr(8)
22055 .kr(1)
22056 .sr(4)
22057 .m(4)
22058 .n(8)
22059 .k(4)
22060 .Test(xnn_f32_igemm_ukernel_4x8s4__neonfma);
22061 }
22062
22063 TEST(F32_IGEMM_4X8S4__NEONFMA, strided_cn) {
22064 TEST_REQUIRES_ARM_NEON_FMA;
22065 GemmMicrokernelTester()
22066 .mr(4)
22067 .nr(8)
22068 .kr(1)
22069 .sr(4)
22070 .m(4)
22071 .n(8)
22072 .k(4)
22073 .cn_stride(11)
22074 .Test(xnn_f32_igemm_ukernel_4x8s4__neonfma);
22075 }
22076
22077 TEST(F32_IGEMM_4X8S4__NEONFMA, k_eq_4_subtile) {
22078 TEST_REQUIRES_ARM_NEON_FMA;
22079 for (uint32_t m = 1; m <= 4; m++) {
22080 for (uint32_t n = 1; n <= 8; n++) {
22081 GemmMicrokernelTester()
22082 .mr(4)
22083 .nr(8)
22084 .kr(1)
22085 .sr(4)
22086 .m(m)
22087 .n(n)
22088 .k(4)
22089 .iterations(1)
22090 .Test(xnn_f32_igemm_ukernel_4x8s4__neonfma);
22091 }
22092 }
22093 }
22094
22095 TEST(F32_IGEMM_4X8S4__NEONFMA, k_eq_4_subtile_m) {
22096 TEST_REQUIRES_ARM_NEON_FMA;
22097 for (uint32_t m = 1; m <= 4; m++) {
22098 GemmMicrokernelTester()
22099 .mr(4)
22100 .nr(8)
22101 .kr(1)
22102 .sr(4)
22103 .m(m)
22104 .n(8)
22105 .k(4)
22106 .iterations(1)
22107 .Test(xnn_f32_igemm_ukernel_4x8s4__neonfma);
22108 }
22109 }
22110
22111 TEST(F32_IGEMM_4X8S4__NEONFMA, k_eq_4_subtile_n) {
22112 TEST_REQUIRES_ARM_NEON_FMA;
22113 for (uint32_t n = 1; n <= 8; n++) {
22114 GemmMicrokernelTester()
22115 .mr(4)
22116 .nr(8)
22117 .kr(1)
22118 .sr(4)
22119 .m(4)
22120 .n(n)
22121 .k(4)
22122 .iterations(1)
22123 .Test(xnn_f32_igemm_ukernel_4x8s4__neonfma);
22124 }
22125 }
22126
22127 TEST(F32_IGEMM_4X8S4__NEONFMA, k_lt_4) {
22128 TEST_REQUIRES_ARM_NEON_FMA;
22129 for (size_t k = 1; k < 4; k++) {
22130 GemmMicrokernelTester()
22131 .mr(4)
22132 .nr(8)
22133 .kr(1)
22134 .sr(4)
22135 .m(4)
22136 .n(8)
22137 .k(k)
22138 .Test(xnn_f32_igemm_ukernel_4x8s4__neonfma);
22139 }
22140 }
22141
22142 TEST(F32_IGEMM_4X8S4__NEONFMA, k_lt_4_subtile) {
22143 TEST_REQUIRES_ARM_NEON_FMA;
22144 for (size_t k = 1; k < 4; k++) {
22145 for (uint32_t m = 1; m <= 4; m++) {
22146 for (uint32_t n = 1; n <= 8; n++) {
22147 GemmMicrokernelTester()
22148 .mr(4)
22149 .nr(8)
22150 .kr(1)
22151 .sr(4)
22152 .m(m)
22153 .n(n)
22154 .k(k)
22155 .iterations(1)
22156 .Test(xnn_f32_igemm_ukernel_4x8s4__neonfma);
22157 }
22158 }
22159 }
22160 }
22161
22162 TEST(F32_IGEMM_4X8S4__NEONFMA, k_gt_4) {
22163 TEST_REQUIRES_ARM_NEON_FMA;
22164 for (size_t k = 5; k < 8; k++) {
22165 GemmMicrokernelTester()
22166 .mr(4)
22167 .nr(8)
22168 .kr(1)
22169 .sr(4)
22170 .m(4)
22171 .n(8)
22172 .k(k)
22173 .Test(xnn_f32_igemm_ukernel_4x8s4__neonfma);
22174 }
22175 }
22176
22177 TEST(F32_IGEMM_4X8S4__NEONFMA, k_gt_4_subtile) {
22178 TEST_REQUIRES_ARM_NEON_FMA;
22179 for (size_t k = 5; k < 8; k++) {
22180 for (uint32_t m = 1; m <= 4; m++) {
22181 for (uint32_t n = 1; n <= 8; n++) {
22182 GemmMicrokernelTester()
22183 .mr(4)
22184 .nr(8)
22185 .kr(1)
22186 .sr(4)
22187 .m(m)
22188 .n(n)
22189 .k(k)
22190 .iterations(1)
22191 .Test(xnn_f32_igemm_ukernel_4x8s4__neonfma);
22192 }
22193 }
22194 }
22195 }
22196
22197 TEST(F32_IGEMM_4X8S4__NEONFMA, k_div_4) {
22198 TEST_REQUIRES_ARM_NEON_FMA;
22199 for (size_t k = 8; k <= 40; k += 4) {
22200 GemmMicrokernelTester()
22201 .mr(4)
22202 .nr(8)
22203 .kr(1)
22204 .sr(4)
22205 .m(4)
22206 .n(8)
22207 .k(k)
22208 .Test(xnn_f32_igemm_ukernel_4x8s4__neonfma);
22209 }
22210 }
22211
22212 TEST(F32_IGEMM_4X8S4__NEONFMA, k_div_4_subtile) {
22213 TEST_REQUIRES_ARM_NEON_FMA;
22214 for (size_t k = 8; k <= 40; k += 4) {
22215 for (uint32_t m = 1; m <= 4; m++) {
22216 for (uint32_t n = 1; n <= 8; n++) {
22217 GemmMicrokernelTester()
22218 .mr(4)
22219 .nr(8)
22220 .kr(1)
22221 .sr(4)
22222 .m(m)
22223 .n(n)
22224 .k(k)
22225 .iterations(1)
22226 .Test(xnn_f32_igemm_ukernel_4x8s4__neonfma);
22227 }
22228 }
22229 }
22230 }
22231
22232 TEST(F32_IGEMM_4X8S4__NEONFMA, n_gt_8) {
22233 TEST_REQUIRES_ARM_NEON_FMA;
22234 for (uint32_t n = 9; n < 16; n++) {
22235 for (size_t k = 1; k <= 20; k += 5) {
22236 GemmMicrokernelTester()
22237 .mr(4)
22238 .nr(8)
22239 .kr(1)
22240 .sr(4)
22241 .m(4)
22242 .n(8)
22243 .k(k)
22244 .Test(xnn_f32_igemm_ukernel_4x8s4__neonfma);
22245 }
22246 }
22247 }
22248
22249 TEST(F32_IGEMM_4X8S4__NEONFMA, n_gt_8_strided_cn) {
22250 TEST_REQUIRES_ARM_NEON_FMA;
22251 for (uint32_t n = 9; n < 16; n++) {
22252 for (size_t k = 1; k <= 20; k += 5) {
22253 GemmMicrokernelTester()
22254 .mr(4)
22255 .nr(8)
22256 .kr(1)
22257 .sr(4)
22258 .m(4)
22259 .n(8)
22260 .k(k)
22261 .cn_stride(11)
22262 .Test(xnn_f32_igemm_ukernel_4x8s4__neonfma);
22263 }
22264 }
22265 }
22266
22267 TEST(F32_IGEMM_4X8S4__NEONFMA, n_gt_8_subtile) {
22268 TEST_REQUIRES_ARM_NEON_FMA;
22269 for (uint32_t n = 9; n < 16; n++) {
22270 for (size_t k = 1; k <= 20; k += 5) {
22271 for (uint32_t m = 1; m <= 4; m++) {
22272 GemmMicrokernelTester()
22273 .mr(4)
22274 .nr(8)
22275 .kr(1)
22276 .sr(4)
22277 .m(m)
22278 .n(n)
22279 .k(k)
22280 .iterations(1)
22281 .Test(xnn_f32_igemm_ukernel_4x8s4__neonfma);
22282 }
22283 }
22284 }
22285 }
22286
22287 TEST(F32_IGEMM_4X8S4__NEONFMA, n_div_8) {
22288 TEST_REQUIRES_ARM_NEON_FMA;
22289 for (uint32_t n = 16; n <= 24; n += 8) {
22290 for (size_t k = 1; k <= 20; k += 5) {
22291 GemmMicrokernelTester()
22292 .mr(4)
22293 .nr(8)
22294 .kr(1)
22295 .sr(4)
22296 .m(4)
22297 .n(8)
22298 .k(k)
22299 .Test(xnn_f32_igemm_ukernel_4x8s4__neonfma);
22300 }
22301 }
22302 }
22303
22304 TEST(F32_IGEMM_4X8S4__NEONFMA, n_div_8_strided_cn) {
22305 TEST_REQUIRES_ARM_NEON_FMA;
22306 for (uint32_t n = 16; n <= 24; n += 8) {
22307 for (size_t k = 1; k <= 20; k += 5) {
22308 GemmMicrokernelTester()
22309 .mr(4)
22310 .nr(8)
22311 .kr(1)
22312 .sr(4)
22313 .m(4)
22314 .n(n)
22315 .k(k)
22316 .cn_stride(11)
22317 .Test(xnn_f32_igemm_ukernel_4x8s4__neonfma);
22318 }
22319 }
22320 }
22321
22322 TEST(F32_IGEMM_4X8S4__NEONFMA, n_div_8_subtile) {
22323 TEST_REQUIRES_ARM_NEON_FMA;
22324 for (uint32_t n = 16; n <= 24; n += 8) {
22325 for (size_t k = 1; k <= 20; k += 5) {
22326 for (uint32_t m = 1; m <= 4; m++) {
22327 GemmMicrokernelTester()
22328 .mr(4)
22329 .nr(8)
22330 .kr(1)
22331 .sr(4)
22332 .m(m)
22333 .n(n)
22334 .k(k)
22335 .iterations(1)
22336 .Test(xnn_f32_igemm_ukernel_4x8s4__neonfma);
22337 }
22338 }
22339 }
22340 }
22341
22342 TEST(F32_IGEMM_4X8S4__NEONFMA, small_kernel) {
22343 TEST_REQUIRES_ARM_NEON_FMA;
22344 for (size_t k = 1; k <= 20; k += 5) {
22345 GemmMicrokernelTester()
22346 .mr(4)
22347 .nr(8)
22348 .kr(1)
22349 .sr(4)
22350 .m(4)
22351 .n(8)
22352 .k(k)
22353 .ks(3)
22354 .Test(xnn_f32_igemm_ukernel_4x8s4__neonfma);
22355 }
22356 }
22357
22358 TEST(F32_IGEMM_4X8S4__NEONFMA, small_kernel_subtile) {
22359 TEST_REQUIRES_ARM_NEON_FMA;
22360 for (size_t k = 1; k <= 20; k += 5) {
22361 for (uint32_t m = 1; m <= 4; m++) {
22362 for (uint32_t n = 1; n <= 8; n++) {
22363 GemmMicrokernelTester()
22364 .mr(4)
22365 .nr(8)
22366 .kr(1)
22367 .sr(4)
22368 .m(m)
22369 .n(n)
22370 .k(k)
22371 .ks(3)
22372 .iterations(1)
22373 .Test(xnn_f32_igemm_ukernel_4x8s4__neonfma);
22374 }
22375 }
22376 }
22377 }
22378
22379 TEST(F32_IGEMM_4X8S4__NEONFMA, n_gt_8_small_kernel) {
22380 TEST_REQUIRES_ARM_NEON_FMA;
22381 for (uint32_t n = 9; n < 16; n++) {
22382 for (size_t k = 1; k <= 20; k += 5) {
22383 GemmMicrokernelTester()
22384 .mr(4)
22385 .nr(8)
22386 .kr(1)
22387 .sr(4)
22388 .m(4)
22389 .n(8)
22390 .k(k)
22391 .ks(3)
22392 .Test(xnn_f32_igemm_ukernel_4x8s4__neonfma);
22393 }
22394 }
22395 }
22396
22397 TEST(F32_IGEMM_4X8S4__NEONFMA, n_div_8_small_kernel) {
22398 TEST_REQUIRES_ARM_NEON_FMA;
22399 for (uint32_t n = 16; n <= 24; n += 8) {
22400 for (size_t k = 1; k <= 20; k += 5) {
22401 GemmMicrokernelTester()
22402 .mr(4)
22403 .nr(8)
22404 .kr(1)
22405 .sr(4)
22406 .m(4)
22407 .n(8)
22408 .k(k)
22409 .ks(3)
22410 .Test(xnn_f32_igemm_ukernel_4x8s4__neonfma);
22411 }
22412 }
22413 }
22414
22415 TEST(F32_IGEMM_4X8S4__NEONFMA, strided_cm_subtile) {
22416 TEST_REQUIRES_ARM_NEON_FMA;
22417 for (size_t k = 1; k <= 20; k += 5) {
22418 for (uint32_t m = 1; m <= 4; m++) {
22419 for (uint32_t n = 1; n <= 8; n++) {
22420 GemmMicrokernelTester()
22421 .mr(4)
22422 .nr(8)
22423 .kr(1)
22424 .sr(4)
22425 .m(m)
22426 .n(n)
22427 .k(k)
22428 .cm_stride(11)
22429 .iterations(1)
22430 .Test(xnn_f32_igemm_ukernel_4x8s4__neonfma);
22431 }
22432 }
22433 }
22434 }
22435
22436 TEST(F32_IGEMM_4X8S4__NEONFMA, a_offset) {
22437 TEST_REQUIRES_ARM_NEON_FMA;
22438 for (size_t k = 1; k <= 20; k += 5) {
22439 GemmMicrokernelTester()
22440 .mr(4)
22441 .nr(8)
22442 .kr(1)
22443 .sr(4)
22444 .m(4)
22445 .n(8)
22446 .k(k)
22447 .ks(3)
22448 .a_offset(83)
22449 .Test(xnn_f32_igemm_ukernel_4x8s4__neonfma);
22450 }
22451 }
22452
22453 TEST(F32_IGEMM_4X8S4__NEONFMA, zero) {
22454 TEST_REQUIRES_ARM_NEON_FMA;
22455 for (uint32_t mz = 0; mz < 4; mz++) {
22456 for (size_t k = 1; k <= 20; k += 5) {
22457 GemmMicrokernelTester()
22458 .mr(4)
22459 .nr(8)
22460 .kr(1)
22461 .sr(4)
22462 .m(4)
22463 .n(8)
22464 .k(k)
22465 .ks(3)
22466 .a_offset(83)
22467 .zero_index(mz)
22468 .Test(xnn_f32_igemm_ukernel_4x8s4__neonfma);
22469 }
22470 }
22471 }
22472
22473 TEST(F32_IGEMM_4X8S4__NEONFMA, qmin) {
22474 TEST_REQUIRES_ARM_NEON_FMA;
22475 GemmMicrokernelTester()
22476 .mr(4)
22477 .nr(8)
22478 .kr(1)
22479 .sr(4)
22480 .m(4)
22481 .n(8)
22482 .k(4)
22483 .qmin(128)
22484 .Test(xnn_f32_igemm_ukernel_4x8s4__neonfma);
22485 }
22486
22487 TEST(F32_IGEMM_4X8S4__NEONFMA, qmax) {
22488 TEST_REQUIRES_ARM_NEON_FMA;
22489 GemmMicrokernelTester()
22490 .mr(4)
22491 .nr(8)
22492 .kr(1)
22493 .sr(4)
22494 .m(4)
22495 .n(8)
22496 .k(4)
22497 .qmax(128)
22498 .Test(xnn_f32_igemm_ukernel_4x8s4__neonfma);
22499 }
22500
22501 TEST(F32_IGEMM_4X8S4__NEONFMA, strided_cm) {
22502 TEST_REQUIRES_ARM_NEON_FMA;
22503 GemmMicrokernelTester()
22504 .mr(4)
22505 .nr(8)
22506 .kr(1)
22507 .sr(4)
22508 .m(4)
22509 .n(8)
22510 .k(4)
22511 .cm_stride(11)
22512 .Test(xnn_f32_igemm_ukernel_4x8s4__neonfma);
22513 }
22514#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
22515
22516
22517#if XNN_ARCH_ARM || XNN_ARCH_ARM64
22518 TEST(F32_IGEMM_6X8S4__NEONFMA, k_eq_4) {
22519 TEST_REQUIRES_ARM_NEON_FMA;
22520 GemmMicrokernelTester()
22521 .mr(6)
22522 .nr(8)
22523 .kr(1)
22524 .sr(4)
22525 .m(6)
22526 .n(8)
22527 .k(4)
22528 .Test(xnn_f32_igemm_ukernel_6x8s4__neonfma);
22529 }
22530
22531 TEST(F32_IGEMM_6X8S4__NEONFMA, strided_cn) {
22532 TEST_REQUIRES_ARM_NEON_FMA;
22533 GemmMicrokernelTester()
22534 .mr(6)
22535 .nr(8)
22536 .kr(1)
22537 .sr(4)
22538 .m(6)
22539 .n(8)
22540 .k(4)
22541 .cn_stride(11)
22542 .Test(xnn_f32_igemm_ukernel_6x8s4__neonfma);
22543 }
22544
22545 TEST(F32_IGEMM_6X8S4__NEONFMA, k_eq_4_subtile) {
22546 TEST_REQUIRES_ARM_NEON_FMA;
22547 for (uint32_t m = 1; m <= 6; m++) {
22548 for (uint32_t n = 1; n <= 8; n++) {
22549 GemmMicrokernelTester()
22550 .mr(6)
22551 .nr(8)
22552 .kr(1)
22553 .sr(4)
22554 .m(m)
22555 .n(n)
22556 .k(4)
22557 .iterations(1)
22558 .Test(xnn_f32_igemm_ukernel_6x8s4__neonfma);
22559 }
22560 }
22561 }
22562
22563 TEST(F32_IGEMM_6X8S4__NEONFMA, k_eq_4_subtile_m) {
22564 TEST_REQUIRES_ARM_NEON_FMA;
22565 for (uint32_t m = 1; m <= 6; m++) {
22566 GemmMicrokernelTester()
22567 .mr(6)
22568 .nr(8)
22569 .kr(1)
22570 .sr(4)
22571 .m(m)
22572 .n(8)
22573 .k(4)
22574 .iterations(1)
22575 .Test(xnn_f32_igemm_ukernel_6x8s4__neonfma);
22576 }
22577 }
22578
22579 TEST(F32_IGEMM_6X8S4__NEONFMA, k_eq_4_subtile_n) {
22580 TEST_REQUIRES_ARM_NEON_FMA;
22581 for (uint32_t n = 1; n <= 8; n++) {
22582 GemmMicrokernelTester()
22583 .mr(6)
22584 .nr(8)
22585 .kr(1)
22586 .sr(4)
22587 .m(6)
22588 .n(n)
22589 .k(4)
22590 .iterations(1)
22591 .Test(xnn_f32_igemm_ukernel_6x8s4__neonfma);
22592 }
22593 }
22594
22595 TEST(F32_IGEMM_6X8S4__NEONFMA, k_lt_4) {
22596 TEST_REQUIRES_ARM_NEON_FMA;
22597 for (size_t k = 1; k < 4; k++) {
22598 GemmMicrokernelTester()
22599 .mr(6)
22600 .nr(8)
22601 .kr(1)
22602 .sr(4)
22603 .m(6)
22604 .n(8)
22605 .k(k)
22606 .Test(xnn_f32_igemm_ukernel_6x8s4__neonfma);
22607 }
22608 }
22609
22610 TEST(F32_IGEMM_6X8S4__NEONFMA, k_lt_4_subtile) {
22611 TEST_REQUIRES_ARM_NEON_FMA;
22612 for (size_t k = 1; k < 4; k++) {
22613 for (uint32_t m = 1; m <= 6; m++) {
22614 for (uint32_t n = 1; n <= 8; n++) {
22615 GemmMicrokernelTester()
22616 .mr(6)
22617 .nr(8)
22618 .kr(1)
22619 .sr(4)
22620 .m(m)
22621 .n(n)
22622 .k(k)
22623 .iterations(1)
22624 .Test(xnn_f32_igemm_ukernel_6x8s4__neonfma);
22625 }
22626 }
22627 }
22628 }
22629
22630 TEST(F32_IGEMM_6X8S4__NEONFMA, k_gt_4) {
22631 TEST_REQUIRES_ARM_NEON_FMA;
22632 for (size_t k = 5; k < 8; k++) {
22633 GemmMicrokernelTester()
22634 .mr(6)
22635 .nr(8)
22636 .kr(1)
22637 .sr(4)
22638 .m(6)
22639 .n(8)
22640 .k(k)
22641 .Test(xnn_f32_igemm_ukernel_6x8s4__neonfma);
22642 }
22643 }
22644
22645 TEST(F32_IGEMM_6X8S4__NEONFMA, k_gt_4_subtile) {
22646 TEST_REQUIRES_ARM_NEON_FMA;
22647 for (size_t k = 5; k < 8; k++) {
22648 for (uint32_t m = 1; m <= 6; m++) {
22649 for (uint32_t n = 1; n <= 8; n++) {
22650 GemmMicrokernelTester()
22651 .mr(6)
22652 .nr(8)
22653 .kr(1)
22654 .sr(4)
22655 .m(m)
22656 .n(n)
22657 .k(k)
22658 .iterations(1)
22659 .Test(xnn_f32_igemm_ukernel_6x8s4__neonfma);
22660 }
22661 }
22662 }
22663 }
22664
22665 TEST(F32_IGEMM_6X8S4__NEONFMA, k_div_4) {
22666 TEST_REQUIRES_ARM_NEON_FMA;
22667 for (size_t k = 8; k <= 40; k += 4) {
22668 GemmMicrokernelTester()
22669 .mr(6)
22670 .nr(8)
22671 .kr(1)
22672 .sr(4)
22673 .m(6)
22674 .n(8)
22675 .k(k)
22676 .Test(xnn_f32_igemm_ukernel_6x8s4__neonfma);
22677 }
22678 }
22679
22680 TEST(F32_IGEMM_6X8S4__NEONFMA, k_div_4_subtile) {
22681 TEST_REQUIRES_ARM_NEON_FMA;
22682 for (size_t k = 8; k <= 40; k += 4) {
22683 for (uint32_t m = 1; m <= 6; m++) {
22684 for (uint32_t n = 1; n <= 8; n++) {
22685 GemmMicrokernelTester()
22686 .mr(6)
22687 .nr(8)
22688 .kr(1)
22689 .sr(4)
22690 .m(m)
22691 .n(n)
22692 .k(k)
22693 .iterations(1)
22694 .Test(xnn_f32_igemm_ukernel_6x8s4__neonfma);
22695 }
22696 }
22697 }
22698 }
22699
22700 TEST(F32_IGEMM_6X8S4__NEONFMA, n_gt_8) {
22701 TEST_REQUIRES_ARM_NEON_FMA;
22702 for (uint32_t n = 9; n < 16; n++) {
22703 for (size_t k = 1; k <= 20; k += 5) {
22704 GemmMicrokernelTester()
22705 .mr(6)
22706 .nr(8)
22707 .kr(1)
22708 .sr(4)
22709 .m(6)
22710 .n(8)
22711 .k(k)
22712 .Test(xnn_f32_igemm_ukernel_6x8s4__neonfma);
22713 }
22714 }
22715 }
22716
22717 TEST(F32_IGEMM_6X8S4__NEONFMA, n_gt_8_strided_cn) {
22718 TEST_REQUIRES_ARM_NEON_FMA;
22719 for (uint32_t n = 9; n < 16; n++) {
22720 for (size_t k = 1; k <= 20; k += 5) {
22721 GemmMicrokernelTester()
22722 .mr(6)
22723 .nr(8)
22724 .kr(1)
22725 .sr(4)
22726 .m(6)
22727 .n(8)
22728 .k(k)
22729 .cn_stride(11)
22730 .Test(xnn_f32_igemm_ukernel_6x8s4__neonfma);
22731 }
22732 }
22733 }
22734
22735 TEST(F32_IGEMM_6X8S4__NEONFMA, n_gt_8_subtile) {
22736 TEST_REQUIRES_ARM_NEON_FMA;
22737 for (uint32_t n = 9; n < 16; n++) {
22738 for (size_t k = 1; k <= 20; k += 5) {
22739 for (uint32_t m = 1; m <= 6; m++) {
22740 GemmMicrokernelTester()
22741 .mr(6)
22742 .nr(8)
22743 .kr(1)
22744 .sr(4)
22745 .m(m)
22746 .n(n)
22747 .k(k)
22748 .iterations(1)
22749 .Test(xnn_f32_igemm_ukernel_6x8s4__neonfma);
22750 }
22751 }
22752 }
22753 }
22754
22755 TEST(F32_IGEMM_6X8S4__NEONFMA, n_div_8) {
22756 TEST_REQUIRES_ARM_NEON_FMA;
22757 for (uint32_t n = 16; n <= 24; n += 8) {
22758 for (size_t k = 1; k <= 20; k += 5) {
22759 GemmMicrokernelTester()
22760 .mr(6)
22761 .nr(8)
22762 .kr(1)
22763 .sr(4)
22764 .m(6)
22765 .n(8)
22766 .k(k)
22767 .Test(xnn_f32_igemm_ukernel_6x8s4__neonfma);
22768 }
22769 }
22770 }
22771
22772 TEST(F32_IGEMM_6X8S4__NEONFMA, n_div_8_strided_cn) {
22773 TEST_REQUIRES_ARM_NEON_FMA;
22774 for (uint32_t n = 16; n <= 24; n += 8) {
22775 for (size_t k = 1; k <= 20; k += 5) {
22776 GemmMicrokernelTester()
22777 .mr(6)
22778 .nr(8)
22779 .kr(1)
22780 .sr(4)
22781 .m(6)
22782 .n(n)
22783 .k(k)
22784 .cn_stride(11)
22785 .Test(xnn_f32_igemm_ukernel_6x8s4__neonfma);
22786 }
22787 }
22788 }
22789
22790 TEST(F32_IGEMM_6X8S4__NEONFMA, n_div_8_subtile) {
22791 TEST_REQUIRES_ARM_NEON_FMA;
22792 for (uint32_t n = 16; n <= 24; n += 8) {
22793 for (size_t k = 1; k <= 20; k += 5) {
22794 for (uint32_t m = 1; m <= 6; m++) {
22795 GemmMicrokernelTester()
22796 .mr(6)
22797 .nr(8)
22798 .kr(1)
22799 .sr(4)
22800 .m(m)
22801 .n(n)
22802 .k(k)
22803 .iterations(1)
22804 .Test(xnn_f32_igemm_ukernel_6x8s4__neonfma);
22805 }
22806 }
22807 }
22808 }
22809
22810 TEST(F32_IGEMM_6X8S4__NEONFMA, small_kernel) {
22811 TEST_REQUIRES_ARM_NEON_FMA;
22812 for (size_t k = 1; k <= 20; k += 5) {
22813 GemmMicrokernelTester()
22814 .mr(6)
22815 .nr(8)
22816 .kr(1)
22817 .sr(4)
22818 .m(6)
22819 .n(8)
22820 .k(k)
22821 .ks(3)
22822 .Test(xnn_f32_igemm_ukernel_6x8s4__neonfma);
22823 }
22824 }
22825
22826 TEST(F32_IGEMM_6X8S4__NEONFMA, small_kernel_subtile) {
22827 TEST_REQUIRES_ARM_NEON_FMA;
22828 for (size_t k = 1; k <= 20; k += 5) {
22829 for (uint32_t m = 1; m <= 6; m++) {
22830 for (uint32_t n = 1; n <= 8; n++) {
22831 GemmMicrokernelTester()
22832 .mr(6)
22833 .nr(8)
22834 .kr(1)
22835 .sr(4)
22836 .m(m)
22837 .n(n)
22838 .k(k)
22839 .ks(3)
22840 .iterations(1)
22841 .Test(xnn_f32_igemm_ukernel_6x8s4__neonfma);
22842 }
22843 }
22844 }
22845 }
22846
22847 TEST(F32_IGEMM_6X8S4__NEONFMA, n_gt_8_small_kernel) {
22848 TEST_REQUIRES_ARM_NEON_FMA;
22849 for (uint32_t n = 9; n < 16; n++) {
22850 for (size_t k = 1; k <= 20; k += 5) {
22851 GemmMicrokernelTester()
22852 .mr(6)
22853 .nr(8)
22854 .kr(1)
22855 .sr(4)
22856 .m(6)
22857 .n(8)
22858 .k(k)
22859 .ks(3)
22860 .Test(xnn_f32_igemm_ukernel_6x8s4__neonfma);
22861 }
22862 }
22863 }
22864
22865 TEST(F32_IGEMM_6X8S4__NEONFMA, n_div_8_small_kernel) {
22866 TEST_REQUIRES_ARM_NEON_FMA;
22867 for (uint32_t n = 16; n <= 24; n += 8) {
22868 for (size_t k = 1; k <= 20; k += 5) {
22869 GemmMicrokernelTester()
22870 .mr(6)
22871 .nr(8)
22872 .kr(1)
22873 .sr(4)
22874 .m(6)
22875 .n(8)
22876 .k(k)
22877 .ks(3)
22878 .Test(xnn_f32_igemm_ukernel_6x8s4__neonfma);
22879 }
22880 }
22881 }
22882
22883 TEST(F32_IGEMM_6X8S4__NEONFMA, strided_cm_subtile) {
22884 TEST_REQUIRES_ARM_NEON_FMA;
22885 for (size_t k = 1; k <= 20; k += 5) {
22886 for (uint32_t m = 1; m <= 6; m++) {
22887 for (uint32_t n = 1; n <= 8; n++) {
22888 GemmMicrokernelTester()
22889 .mr(6)
22890 .nr(8)
22891 .kr(1)
22892 .sr(4)
22893 .m(m)
22894 .n(n)
22895 .k(k)
22896 .cm_stride(11)
22897 .iterations(1)
22898 .Test(xnn_f32_igemm_ukernel_6x8s4__neonfma);
22899 }
22900 }
22901 }
22902 }
22903
22904 TEST(F32_IGEMM_6X8S4__NEONFMA, a_offset) {
22905 TEST_REQUIRES_ARM_NEON_FMA;
22906 for (size_t k = 1; k <= 20; k += 5) {
22907 GemmMicrokernelTester()
22908 .mr(6)
22909 .nr(8)
22910 .kr(1)
22911 .sr(4)
22912 .m(6)
22913 .n(8)
22914 .k(k)
22915 .ks(3)
22916 .a_offset(127)
22917 .Test(xnn_f32_igemm_ukernel_6x8s4__neonfma);
22918 }
22919 }
22920
22921 TEST(F32_IGEMM_6X8S4__NEONFMA, zero) {
22922 TEST_REQUIRES_ARM_NEON_FMA;
22923 for (uint32_t mz = 0; mz < 6; mz++) {
22924 for (size_t k = 1; k <= 20; k += 5) {
22925 GemmMicrokernelTester()
22926 .mr(6)
22927 .nr(8)
22928 .kr(1)
22929 .sr(4)
22930 .m(6)
22931 .n(8)
22932 .k(k)
22933 .ks(3)
22934 .a_offset(127)
22935 .zero_index(mz)
22936 .Test(xnn_f32_igemm_ukernel_6x8s4__neonfma);
22937 }
22938 }
22939 }
22940
22941 TEST(F32_IGEMM_6X8S4__NEONFMA, qmin) {
22942 TEST_REQUIRES_ARM_NEON_FMA;
22943 GemmMicrokernelTester()
22944 .mr(6)
22945 .nr(8)
22946 .kr(1)
22947 .sr(4)
22948 .m(6)
22949 .n(8)
22950 .k(4)
22951 .qmin(128)
22952 .Test(xnn_f32_igemm_ukernel_6x8s4__neonfma);
22953 }
22954
22955 TEST(F32_IGEMM_6X8S4__NEONFMA, qmax) {
22956 TEST_REQUIRES_ARM_NEON_FMA;
22957 GemmMicrokernelTester()
22958 .mr(6)
22959 .nr(8)
22960 .kr(1)
22961 .sr(4)
22962 .m(6)
22963 .n(8)
22964 .k(4)
22965 .qmax(128)
22966 .Test(xnn_f32_igemm_ukernel_6x8s4__neonfma);
22967 }
22968
22969 TEST(F32_IGEMM_6X8S4__NEONFMA, strided_cm) {
22970 TEST_REQUIRES_ARM_NEON_FMA;
22971 GemmMicrokernelTester()
22972 .mr(6)
22973 .nr(8)
22974 .kr(1)
22975 .sr(4)
22976 .m(6)
22977 .n(8)
22978 .k(4)
22979 .cm_stride(11)
22980 .Test(xnn_f32_igemm_ukernel_6x8s4__neonfma);
22981 }
22982#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
22983
22984
22985#if XNN_ARCH_ARM || XNN_ARCH_ARM64
22986 TEST(F32_IGEMM_8X8S4__NEONFMA, k_eq_4) {
22987 TEST_REQUIRES_ARM_NEON_FMA;
22988 GemmMicrokernelTester()
22989 .mr(8)
22990 .nr(8)
22991 .kr(1)
22992 .sr(4)
22993 .m(8)
22994 .n(8)
22995 .k(4)
22996 .Test(xnn_f32_igemm_ukernel_8x8s4__neonfma);
22997 }
22998
22999 TEST(F32_IGEMM_8X8S4__NEONFMA, strided_cn) {
23000 TEST_REQUIRES_ARM_NEON_FMA;
23001 GemmMicrokernelTester()
23002 .mr(8)
23003 .nr(8)
23004 .kr(1)
23005 .sr(4)
23006 .m(8)
23007 .n(8)
23008 .k(4)
23009 .cn_stride(11)
23010 .Test(xnn_f32_igemm_ukernel_8x8s4__neonfma);
23011 }
23012
23013 TEST(F32_IGEMM_8X8S4__NEONFMA, k_eq_4_subtile) {
23014 TEST_REQUIRES_ARM_NEON_FMA;
23015 for (uint32_t m = 1; m <= 8; m++) {
23016 for (uint32_t n = 1; n <= 8; n++) {
23017 GemmMicrokernelTester()
23018 .mr(8)
23019 .nr(8)
23020 .kr(1)
23021 .sr(4)
23022 .m(m)
23023 .n(n)
23024 .k(4)
23025 .iterations(1)
23026 .Test(xnn_f32_igemm_ukernel_8x8s4__neonfma);
23027 }
23028 }
23029 }
23030
23031 TEST(F32_IGEMM_8X8S4__NEONFMA, k_eq_4_subtile_m) {
23032 TEST_REQUIRES_ARM_NEON_FMA;
23033 for (uint32_t m = 1; m <= 8; m++) {
23034 GemmMicrokernelTester()
23035 .mr(8)
23036 .nr(8)
23037 .kr(1)
23038 .sr(4)
23039 .m(m)
23040 .n(8)
23041 .k(4)
23042 .iterations(1)
23043 .Test(xnn_f32_igemm_ukernel_8x8s4__neonfma);
23044 }
23045 }
23046
23047 TEST(F32_IGEMM_8X8S4__NEONFMA, k_eq_4_subtile_n) {
23048 TEST_REQUIRES_ARM_NEON_FMA;
23049 for (uint32_t n = 1; n <= 8; n++) {
23050 GemmMicrokernelTester()
23051 .mr(8)
23052 .nr(8)
23053 .kr(1)
23054 .sr(4)
23055 .m(8)
23056 .n(n)
23057 .k(4)
23058 .iterations(1)
23059 .Test(xnn_f32_igemm_ukernel_8x8s4__neonfma);
23060 }
23061 }
23062
23063 TEST(F32_IGEMM_8X8S4__NEONFMA, k_lt_4) {
23064 TEST_REQUIRES_ARM_NEON_FMA;
23065 for (size_t k = 1; k < 4; k++) {
23066 GemmMicrokernelTester()
23067 .mr(8)
23068 .nr(8)
23069 .kr(1)
23070 .sr(4)
23071 .m(8)
23072 .n(8)
23073 .k(k)
23074 .Test(xnn_f32_igemm_ukernel_8x8s4__neonfma);
23075 }
23076 }
23077
23078 TEST(F32_IGEMM_8X8S4__NEONFMA, k_lt_4_subtile) {
23079 TEST_REQUIRES_ARM_NEON_FMA;
23080 for (size_t k = 1; k < 4; k++) {
23081 for (uint32_t m = 1; m <= 8; m++) {
23082 for (uint32_t n = 1; n <= 8; n++) {
23083 GemmMicrokernelTester()
23084 .mr(8)
23085 .nr(8)
23086 .kr(1)
23087 .sr(4)
23088 .m(m)
23089 .n(n)
23090 .k(k)
23091 .iterations(1)
23092 .Test(xnn_f32_igemm_ukernel_8x8s4__neonfma);
23093 }
23094 }
23095 }
23096 }
23097
23098 TEST(F32_IGEMM_8X8S4__NEONFMA, k_gt_4) {
23099 TEST_REQUIRES_ARM_NEON_FMA;
23100 for (size_t k = 5; k < 8; k++) {
23101 GemmMicrokernelTester()
23102 .mr(8)
23103 .nr(8)
23104 .kr(1)
23105 .sr(4)
23106 .m(8)
23107 .n(8)
23108 .k(k)
23109 .Test(xnn_f32_igemm_ukernel_8x8s4__neonfma);
23110 }
23111 }
23112
23113 TEST(F32_IGEMM_8X8S4__NEONFMA, k_gt_4_subtile) {
23114 TEST_REQUIRES_ARM_NEON_FMA;
23115 for (size_t k = 5; k < 8; k++) {
23116 for (uint32_t m = 1; m <= 8; m++) {
23117 for (uint32_t n = 1; n <= 8; n++) {
23118 GemmMicrokernelTester()
23119 .mr(8)
23120 .nr(8)
23121 .kr(1)
23122 .sr(4)
23123 .m(m)
23124 .n(n)
23125 .k(k)
23126 .iterations(1)
23127 .Test(xnn_f32_igemm_ukernel_8x8s4__neonfma);
23128 }
23129 }
23130 }
23131 }
23132
23133 TEST(F32_IGEMM_8X8S4__NEONFMA, k_div_4) {
23134 TEST_REQUIRES_ARM_NEON_FMA;
23135 for (size_t k = 8; k <= 40; k += 4) {
23136 GemmMicrokernelTester()
23137 .mr(8)
23138 .nr(8)
23139 .kr(1)
23140 .sr(4)
23141 .m(8)
23142 .n(8)
23143 .k(k)
23144 .Test(xnn_f32_igemm_ukernel_8x8s4__neonfma);
23145 }
23146 }
23147
23148 TEST(F32_IGEMM_8X8S4__NEONFMA, k_div_4_subtile) {
23149 TEST_REQUIRES_ARM_NEON_FMA;
23150 for (size_t k = 8; k <= 40; k += 4) {
23151 for (uint32_t m = 1; m <= 8; m++) {
23152 for (uint32_t n = 1; n <= 8; n++) {
23153 GemmMicrokernelTester()
23154 .mr(8)
23155 .nr(8)
23156 .kr(1)
23157 .sr(4)
23158 .m(m)
23159 .n(n)
23160 .k(k)
23161 .iterations(1)
23162 .Test(xnn_f32_igemm_ukernel_8x8s4__neonfma);
23163 }
23164 }
23165 }
23166 }
23167
23168 TEST(F32_IGEMM_8X8S4__NEONFMA, n_gt_8) {
23169 TEST_REQUIRES_ARM_NEON_FMA;
23170 for (uint32_t n = 9; n < 16; n++) {
23171 for (size_t k = 1; k <= 20; k += 5) {
23172 GemmMicrokernelTester()
23173 .mr(8)
23174 .nr(8)
23175 .kr(1)
23176 .sr(4)
23177 .m(8)
23178 .n(8)
23179 .k(k)
23180 .Test(xnn_f32_igemm_ukernel_8x8s4__neonfma);
23181 }
23182 }
23183 }
23184
23185 TEST(F32_IGEMM_8X8S4__NEONFMA, n_gt_8_strided_cn) {
23186 TEST_REQUIRES_ARM_NEON_FMA;
23187 for (uint32_t n = 9; n < 16; n++) {
23188 for (size_t k = 1; k <= 20; k += 5) {
23189 GemmMicrokernelTester()
23190 .mr(8)
23191 .nr(8)
23192 .kr(1)
23193 .sr(4)
23194 .m(8)
23195 .n(8)
23196 .k(k)
23197 .cn_stride(11)
23198 .Test(xnn_f32_igemm_ukernel_8x8s4__neonfma);
23199 }
23200 }
23201 }
23202
23203 TEST(F32_IGEMM_8X8S4__NEONFMA, n_gt_8_subtile) {
23204 TEST_REQUIRES_ARM_NEON_FMA;
23205 for (uint32_t n = 9; n < 16; n++) {
23206 for (size_t k = 1; k <= 20; k += 5) {
23207 for (uint32_t m = 1; m <= 8; m++) {
23208 GemmMicrokernelTester()
23209 .mr(8)
23210 .nr(8)
23211 .kr(1)
23212 .sr(4)
23213 .m(m)
23214 .n(n)
23215 .k(k)
23216 .iterations(1)
23217 .Test(xnn_f32_igemm_ukernel_8x8s4__neonfma);
23218 }
23219 }
23220 }
23221 }
23222
23223 TEST(F32_IGEMM_8X8S4__NEONFMA, n_div_8) {
23224 TEST_REQUIRES_ARM_NEON_FMA;
23225 for (uint32_t n = 16; n <= 24; n += 8) {
23226 for (size_t k = 1; k <= 20; k += 5) {
23227 GemmMicrokernelTester()
23228 .mr(8)
23229 .nr(8)
23230 .kr(1)
23231 .sr(4)
23232 .m(8)
23233 .n(8)
23234 .k(k)
23235 .Test(xnn_f32_igemm_ukernel_8x8s4__neonfma);
23236 }
23237 }
23238 }
23239
23240 TEST(F32_IGEMM_8X8S4__NEONFMA, n_div_8_strided_cn) {
23241 TEST_REQUIRES_ARM_NEON_FMA;
23242 for (uint32_t n = 16; n <= 24; n += 8) {
23243 for (size_t k = 1; k <= 20; k += 5) {
23244 GemmMicrokernelTester()
23245 .mr(8)
23246 .nr(8)
23247 .kr(1)
23248 .sr(4)
23249 .m(8)
23250 .n(n)
23251 .k(k)
23252 .cn_stride(11)
23253 .Test(xnn_f32_igemm_ukernel_8x8s4__neonfma);
23254 }
23255 }
23256 }
23257
23258 TEST(F32_IGEMM_8X8S4__NEONFMA, n_div_8_subtile) {
23259 TEST_REQUIRES_ARM_NEON_FMA;
23260 for (uint32_t n = 16; n <= 24; n += 8) {
23261 for (size_t k = 1; k <= 20; k += 5) {
23262 for (uint32_t m = 1; m <= 8; m++) {
23263 GemmMicrokernelTester()
23264 .mr(8)
23265 .nr(8)
23266 .kr(1)
23267 .sr(4)
23268 .m(m)
23269 .n(n)
23270 .k(k)
23271 .iterations(1)
23272 .Test(xnn_f32_igemm_ukernel_8x8s4__neonfma);
23273 }
23274 }
23275 }
23276 }
23277
23278 TEST(F32_IGEMM_8X8S4__NEONFMA, small_kernel) {
23279 TEST_REQUIRES_ARM_NEON_FMA;
23280 for (size_t k = 1; k <= 20; k += 5) {
23281 GemmMicrokernelTester()
23282 .mr(8)
23283 .nr(8)
23284 .kr(1)
23285 .sr(4)
23286 .m(8)
23287 .n(8)
23288 .k(k)
23289 .ks(3)
23290 .Test(xnn_f32_igemm_ukernel_8x8s4__neonfma);
23291 }
23292 }
23293
23294 TEST(F32_IGEMM_8X8S4__NEONFMA, small_kernel_subtile) {
23295 TEST_REQUIRES_ARM_NEON_FMA;
23296 for (size_t k = 1; k <= 20; k += 5) {
23297 for (uint32_t m = 1; m <= 8; m++) {
23298 for (uint32_t n = 1; n <= 8; n++) {
23299 GemmMicrokernelTester()
23300 .mr(8)
23301 .nr(8)
23302 .kr(1)
23303 .sr(4)
23304 .m(m)
23305 .n(n)
23306 .k(k)
23307 .ks(3)
23308 .iterations(1)
23309 .Test(xnn_f32_igemm_ukernel_8x8s4__neonfma);
23310 }
23311 }
23312 }
23313 }
23314
23315 TEST(F32_IGEMM_8X8S4__NEONFMA, n_gt_8_small_kernel) {
23316 TEST_REQUIRES_ARM_NEON_FMA;
23317 for (uint32_t n = 9; n < 16; n++) {
23318 for (size_t k = 1; k <= 20; k += 5) {
23319 GemmMicrokernelTester()
23320 .mr(8)
23321 .nr(8)
23322 .kr(1)
23323 .sr(4)
23324 .m(8)
23325 .n(8)
23326 .k(k)
23327 .ks(3)
23328 .Test(xnn_f32_igemm_ukernel_8x8s4__neonfma);
23329 }
23330 }
23331 }
23332
23333 TEST(F32_IGEMM_8X8S4__NEONFMA, n_div_8_small_kernel) {
23334 TEST_REQUIRES_ARM_NEON_FMA;
23335 for (uint32_t n = 16; n <= 24; n += 8) {
23336 for (size_t k = 1; k <= 20; k += 5) {
23337 GemmMicrokernelTester()
23338 .mr(8)
23339 .nr(8)
23340 .kr(1)
23341 .sr(4)
23342 .m(8)
23343 .n(8)
23344 .k(k)
23345 .ks(3)
23346 .Test(xnn_f32_igemm_ukernel_8x8s4__neonfma);
23347 }
23348 }
23349 }
23350
23351 TEST(F32_IGEMM_8X8S4__NEONFMA, strided_cm_subtile) {
23352 TEST_REQUIRES_ARM_NEON_FMA;
23353 for (size_t k = 1; k <= 20; k += 5) {
23354 for (uint32_t m = 1; m <= 8; m++) {
23355 for (uint32_t n = 1; n <= 8; n++) {
23356 GemmMicrokernelTester()
23357 .mr(8)
23358 .nr(8)
23359 .kr(1)
23360 .sr(4)
23361 .m(m)
23362 .n(n)
23363 .k(k)
23364 .cm_stride(11)
23365 .iterations(1)
23366 .Test(xnn_f32_igemm_ukernel_8x8s4__neonfma);
23367 }
23368 }
23369 }
23370 }
23371
23372 TEST(F32_IGEMM_8X8S4__NEONFMA, a_offset) {
23373 TEST_REQUIRES_ARM_NEON_FMA;
23374 for (size_t k = 1; k <= 20; k += 5) {
23375 GemmMicrokernelTester()
23376 .mr(8)
23377 .nr(8)
23378 .kr(1)
23379 .sr(4)
23380 .m(8)
23381 .n(8)
23382 .k(k)
23383 .ks(3)
23384 .a_offset(163)
23385 .Test(xnn_f32_igemm_ukernel_8x8s4__neonfma);
23386 }
23387 }
23388
23389 TEST(F32_IGEMM_8X8S4__NEONFMA, zero) {
23390 TEST_REQUIRES_ARM_NEON_FMA;
23391 for (uint32_t mz = 0; mz < 8; mz++) {
23392 for (size_t k = 1; k <= 20; k += 5) {
23393 GemmMicrokernelTester()
23394 .mr(8)
23395 .nr(8)
23396 .kr(1)
23397 .sr(4)
23398 .m(8)
23399 .n(8)
23400 .k(k)
23401 .ks(3)
23402 .a_offset(163)
23403 .zero_index(mz)
23404 .Test(xnn_f32_igemm_ukernel_8x8s4__neonfma);
23405 }
23406 }
23407 }
23408
23409 TEST(F32_IGEMM_8X8S4__NEONFMA, qmin) {
23410 TEST_REQUIRES_ARM_NEON_FMA;
23411 GemmMicrokernelTester()
23412 .mr(8)
23413 .nr(8)
23414 .kr(1)
23415 .sr(4)
23416 .m(8)
23417 .n(8)
23418 .k(4)
23419 .qmin(128)
23420 .Test(xnn_f32_igemm_ukernel_8x8s4__neonfma);
23421 }
23422
23423 TEST(F32_IGEMM_8X8S4__NEONFMA, qmax) {
23424 TEST_REQUIRES_ARM_NEON_FMA;
23425 GemmMicrokernelTester()
23426 .mr(8)
23427 .nr(8)
23428 .kr(1)
23429 .sr(4)
23430 .m(8)
23431 .n(8)
23432 .k(4)
23433 .qmax(128)
23434 .Test(xnn_f32_igemm_ukernel_8x8s4__neonfma);
23435 }
23436
23437 TEST(F32_IGEMM_8X8S4__NEONFMA, strided_cm) {
23438 TEST_REQUIRES_ARM_NEON_FMA;
23439 GemmMicrokernelTester()
23440 .mr(8)
23441 .nr(8)
23442 .kr(1)
23443 .sr(4)
23444 .m(8)
23445 .n(8)
23446 .k(4)
23447 .cm_stride(11)
23448 .Test(xnn_f32_igemm_ukernel_8x8s4__neonfma);
23449 }
23450#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
23451
23452
Marat Dukhan1dadbf72019-10-01 10:46:20 -070023453#if XNN_ARCH_X86 || XNN_ARCH_X86_64
XNNPACK Teamb455b122019-09-27 18:10:33 -070023454 TEST(F32_IGEMM_1X8__SSE_LOAD1, k_eq_1) {
23455 TEST_REQUIRES_X86_SSE;
23456 GemmMicrokernelTester()
23457 .mr(1)
23458 .nr(8)
23459 .kr(1)
23460 .sr(1)
23461 .m(1)
23462 .n(8)
23463 .k(1)
23464 .Test(xnn_f32_igemm_ukernel_1x8__sse_load1);
23465 }
23466
23467 TEST(F32_IGEMM_1X8__SSE_LOAD1, strided_cn) {
23468 TEST_REQUIRES_X86_SSE;
23469 GemmMicrokernelTester()
23470 .mr(1)
23471 .nr(8)
23472 .kr(1)
23473 .sr(1)
23474 .m(1)
23475 .n(8)
23476 .k(1)
23477 .cn_stride(11)
23478 .Test(xnn_f32_igemm_ukernel_1x8__sse_load1);
23479 }
23480
23481 TEST(F32_IGEMM_1X8__SSE_LOAD1, k_eq_1_subtile) {
23482 TEST_REQUIRES_X86_SSE;
23483 for (uint32_t m = 1; m <= 1; m++) {
23484 for (uint32_t n = 1; n <= 8; n++) {
23485 GemmMicrokernelTester()
23486 .mr(1)
23487 .nr(8)
23488 .kr(1)
23489 .sr(1)
23490 .m(m)
23491 .n(n)
23492 .k(1)
23493 .iterations(1)
23494 .Test(xnn_f32_igemm_ukernel_1x8__sse_load1);
23495 }
23496 }
23497 }
23498
23499 TEST(F32_IGEMM_1X8__SSE_LOAD1, k_eq_1_subtile_m) {
23500 TEST_REQUIRES_X86_SSE;
23501 for (uint32_t m = 1; m <= 1; m++) {
23502 GemmMicrokernelTester()
23503 .mr(1)
23504 .nr(8)
23505 .kr(1)
23506 .sr(1)
23507 .m(m)
23508 .n(8)
23509 .k(1)
23510 .iterations(1)
23511 .Test(xnn_f32_igemm_ukernel_1x8__sse_load1);
23512 }
23513 }
23514
23515 TEST(F32_IGEMM_1X8__SSE_LOAD1, k_eq_1_subtile_n) {
23516 TEST_REQUIRES_X86_SSE;
23517 for (uint32_t n = 1; n <= 8; n++) {
23518 GemmMicrokernelTester()
23519 .mr(1)
23520 .nr(8)
23521 .kr(1)
23522 .sr(1)
23523 .m(1)
23524 .n(n)
23525 .k(1)
23526 .iterations(1)
23527 .Test(xnn_f32_igemm_ukernel_1x8__sse_load1);
23528 }
23529 }
23530
23531 TEST(F32_IGEMM_1X8__SSE_LOAD1, k_gt_1) {
23532 TEST_REQUIRES_X86_SSE;
23533 for (size_t k = 2; k < 10; k++) {
23534 GemmMicrokernelTester()
23535 .mr(1)
23536 .nr(8)
23537 .kr(1)
23538 .sr(1)
23539 .m(1)
23540 .n(8)
23541 .k(k)
23542 .Test(xnn_f32_igemm_ukernel_1x8__sse_load1);
23543 }
23544 }
23545
23546 TEST(F32_IGEMM_1X8__SSE_LOAD1, k_gt_1_subtile) {
23547 TEST_REQUIRES_X86_SSE;
23548 for (size_t k = 2; k < 10; k++) {
23549 for (uint32_t m = 1; m <= 1; m++) {
23550 for (uint32_t n = 1; n <= 8; n++) {
23551 GemmMicrokernelTester()
23552 .mr(1)
23553 .nr(8)
23554 .kr(1)
23555 .sr(1)
23556 .m(m)
23557 .n(n)
23558 .k(k)
23559 .iterations(1)
23560 .Test(xnn_f32_igemm_ukernel_1x8__sse_load1);
23561 }
23562 }
23563 }
23564 }
23565
23566 TEST(F32_IGEMM_1X8__SSE_LOAD1, n_gt_8) {
23567 TEST_REQUIRES_X86_SSE;
23568 for (uint32_t n = 9; n < 16; n++) {
23569 for (size_t k = 1; k <= 5; k += 2) {
23570 GemmMicrokernelTester()
23571 .mr(1)
23572 .nr(8)
23573 .kr(1)
23574 .sr(1)
23575 .m(1)
23576 .n(8)
23577 .k(k)
23578 .Test(xnn_f32_igemm_ukernel_1x8__sse_load1);
23579 }
23580 }
23581 }
23582
23583 TEST(F32_IGEMM_1X8__SSE_LOAD1, n_gt_8_strided_cn) {
23584 TEST_REQUIRES_X86_SSE;
23585 for (uint32_t n = 9; n < 16; n++) {
23586 for (size_t k = 1; k <= 5; k += 2) {
23587 GemmMicrokernelTester()
23588 .mr(1)
23589 .nr(8)
23590 .kr(1)
23591 .sr(1)
23592 .m(1)
23593 .n(8)
23594 .k(k)
23595 .cn_stride(11)
23596 .Test(xnn_f32_igemm_ukernel_1x8__sse_load1);
23597 }
23598 }
23599 }
23600
23601 TEST(F32_IGEMM_1X8__SSE_LOAD1, n_gt_8_subtile) {
23602 TEST_REQUIRES_X86_SSE;
23603 for (uint32_t n = 9; n < 16; n++) {
23604 for (size_t k = 1; k <= 5; k += 2) {
23605 for (uint32_t m = 1; m <= 1; m++) {
23606 GemmMicrokernelTester()
23607 .mr(1)
23608 .nr(8)
23609 .kr(1)
23610 .sr(1)
23611 .m(m)
23612 .n(n)
23613 .k(k)
23614 .iterations(1)
23615 .Test(xnn_f32_igemm_ukernel_1x8__sse_load1);
23616 }
23617 }
23618 }
23619 }
23620
23621 TEST(F32_IGEMM_1X8__SSE_LOAD1, n_div_8) {
23622 TEST_REQUIRES_X86_SSE;
23623 for (uint32_t n = 16; n <= 24; n += 8) {
23624 for (size_t k = 1; k <= 5; k += 2) {
23625 GemmMicrokernelTester()
23626 .mr(1)
23627 .nr(8)
23628 .kr(1)
23629 .sr(1)
23630 .m(1)
23631 .n(8)
23632 .k(k)
23633 .Test(xnn_f32_igemm_ukernel_1x8__sse_load1);
23634 }
23635 }
23636 }
23637
23638 TEST(F32_IGEMM_1X8__SSE_LOAD1, n_div_8_strided_cn) {
23639 TEST_REQUIRES_X86_SSE;
23640 for (uint32_t n = 16; n <= 24; n += 8) {
23641 for (size_t k = 1; k <= 5; k += 2) {
23642 GemmMicrokernelTester()
23643 .mr(1)
23644 .nr(8)
23645 .kr(1)
23646 .sr(1)
23647 .m(1)
23648 .n(n)
23649 .k(k)
23650 .cn_stride(11)
23651 .Test(xnn_f32_igemm_ukernel_1x8__sse_load1);
23652 }
23653 }
23654 }
23655
23656 TEST(F32_IGEMM_1X8__SSE_LOAD1, n_div_8_subtile) {
23657 TEST_REQUIRES_X86_SSE;
23658 for (uint32_t n = 16; n <= 24; n += 8) {
23659 for (size_t k = 1; k <= 5; k += 2) {
23660 for (uint32_t m = 1; m <= 1; m++) {
23661 GemmMicrokernelTester()
23662 .mr(1)
23663 .nr(8)
23664 .kr(1)
23665 .sr(1)
23666 .m(m)
23667 .n(n)
23668 .k(k)
23669 .iterations(1)
23670 .Test(xnn_f32_igemm_ukernel_1x8__sse_load1);
23671 }
23672 }
23673 }
23674 }
23675
23676 TEST(F32_IGEMM_1X8__SSE_LOAD1, small_kernel) {
23677 TEST_REQUIRES_X86_SSE;
23678 for (size_t k = 1; k <= 5; k += 2) {
23679 GemmMicrokernelTester()
23680 .mr(1)
23681 .nr(8)
23682 .kr(1)
23683 .sr(1)
23684 .m(1)
23685 .n(8)
23686 .k(k)
23687 .ks(3)
23688 .Test(xnn_f32_igemm_ukernel_1x8__sse_load1);
23689 }
23690 }
23691
23692 TEST(F32_IGEMM_1X8__SSE_LOAD1, small_kernel_subtile) {
23693 TEST_REQUIRES_X86_SSE;
23694 for (size_t k = 1; k <= 5; k += 2) {
23695 for (uint32_t m = 1; m <= 1; m++) {
23696 for (uint32_t n = 1; n <= 8; n++) {
23697 GemmMicrokernelTester()
23698 .mr(1)
23699 .nr(8)
23700 .kr(1)
23701 .sr(1)
23702 .m(m)
23703 .n(n)
23704 .k(k)
23705 .ks(3)
23706 .iterations(1)
23707 .Test(xnn_f32_igemm_ukernel_1x8__sse_load1);
23708 }
23709 }
23710 }
23711 }
23712
23713 TEST(F32_IGEMM_1X8__SSE_LOAD1, n_gt_8_small_kernel) {
23714 TEST_REQUIRES_X86_SSE;
23715 for (uint32_t n = 9; n < 16; n++) {
23716 for (size_t k = 1; k <= 5; k += 2) {
23717 GemmMicrokernelTester()
23718 .mr(1)
23719 .nr(8)
23720 .kr(1)
23721 .sr(1)
23722 .m(1)
23723 .n(8)
23724 .k(k)
23725 .ks(3)
23726 .Test(xnn_f32_igemm_ukernel_1x8__sse_load1);
23727 }
23728 }
23729 }
23730
23731 TEST(F32_IGEMM_1X8__SSE_LOAD1, n_div_8_small_kernel) {
23732 TEST_REQUIRES_X86_SSE;
23733 for (uint32_t n = 16; n <= 24; n += 8) {
23734 for (size_t k = 1; k <= 5; k += 2) {
23735 GemmMicrokernelTester()
23736 .mr(1)
23737 .nr(8)
23738 .kr(1)
23739 .sr(1)
23740 .m(1)
23741 .n(8)
23742 .k(k)
23743 .ks(3)
23744 .Test(xnn_f32_igemm_ukernel_1x8__sse_load1);
23745 }
23746 }
23747 }
23748
23749 TEST(F32_IGEMM_1X8__SSE_LOAD1, strided_cm_subtile) {
23750 TEST_REQUIRES_X86_SSE;
23751 for (size_t k = 1; k <= 5; k += 2) {
23752 for (uint32_t m = 1; m <= 1; m++) {
23753 for (uint32_t n = 1; n <= 8; n++) {
23754 GemmMicrokernelTester()
23755 .mr(1)
23756 .nr(8)
23757 .kr(1)
23758 .sr(1)
23759 .m(m)
23760 .n(n)
23761 .k(k)
23762 .cm_stride(11)
23763 .iterations(1)
23764 .Test(xnn_f32_igemm_ukernel_1x8__sse_load1);
23765 }
23766 }
23767 }
23768 }
23769
23770 TEST(F32_IGEMM_1X8__SSE_LOAD1, a_offset) {
23771 TEST_REQUIRES_X86_SSE;
23772 for (size_t k = 1; k <= 5; k += 2) {
23773 GemmMicrokernelTester()
23774 .mr(1)
23775 .nr(8)
23776 .kr(1)
23777 .sr(1)
23778 .m(1)
23779 .n(8)
23780 .k(k)
23781 .ks(3)
23782 .a_offset(7)
23783 .Test(xnn_f32_igemm_ukernel_1x8__sse_load1);
23784 }
23785 }
23786
23787 TEST(F32_IGEMM_1X8__SSE_LOAD1, zero) {
23788 TEST_REQUIRES_X86_SSE;
23789 for (uint32_t mz = 0; mz < 1; mz++) {
23790 for (size_t k = 1; k <= 5; k += 2) {
23791 GemmMicrokernelTester()
23792 .mr(1)
23793 .nr(8)
23794 .kr(1)
23795 .sr(1)
23796 .m(1)
23797 .n(8)
23798 .k(k)
23799 .ks(3)
23800 .a_offset(7)
23801 .zero_index(mz)
23802 .Test(xnn_f32_igemm_ukernel_1x8__sse_load1);
23803 }
23804 }
23805 }
23806
23807 TEST(F32_IGEMM_1X8__SSE_LOAD1, qmin) {
23808 TEST_REQUIRES_X86_SSE;
23809 GemmMicrokernelTester()
23810 .mr(1)
23811 .nr(8)
23812 .kr(1)
23813 .sr(1)
23814 .m(1)
23815 .n(8)
23816 .k(1)
23817 .qmin(128)
23818 .Test(xnn_f32_igemm_ukernel_1x8__sse_load1);
23819 }
23820
23821 TEST(F32_IGEMM_1X8__SSE_LOAD1, qmax) {
23822 TEST_REQUIRES_X86_SSE;
23823 GemmMicrokernelTester()
23824 .mr(1)
23825 .nr(8)
23826 .kr(1)
23827 .sr(1)
23828 .m(1)
23829 .n(8)
23830 .k(1)
23831 .qmax(128)
23832 .Test(xnn_f32_igemm_ukernel_1x8__sse_load1);
23833 }
23834
23835 TEST(F32_IGEMM_1X8__SSE_LOAD1, strided_cm) {
23836 TEST_REQUIRES_X86_SSE;
23837 GemmMicrokernelTester()
23838 .mr(1)
23839 .nr(8)
23840 .kr(1)
23841 .sr(1)
23842 .m(1)
23843 .n(8)
23844 .k(1)
23845 .cm_stride(11)
23846 .Test(xnn_f32_igemm_ukernel_1x8__sse_load1);
23847 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -070023848#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
XNNPACK Teamb455b122019-09-27 18:10:33 -070023849
23850
Marat Dukhan1dadbf72019-10-01 10:46:20 -070023851#if XNN_ARCH_X86 || XNN_ARCH_X86_64
XNNPACK Teamb455b122019-09-27 18:10:33 -070023852 TEST(F32_IGEMM_4X8__SSE_LOAD1, k_eq_1) {
23853 TEST_REQUIRES_X86_SSE;
23854 GemmMicrokernelTester()
23855 .mr(4)
23856 .nr(8)
23857 .kr(1)
23858 .sr(1)
23859 .m(4)
23860 .n(8)
23861 .k(1)
23862 .Test(xnn_f32_igemm_ukernel_4x8__sse_load1);
23863 }
23864
23865 TEST(F32_IGEMM_4X8__SSE_LOAD1, strided_cn) {
23866 TEST_REQUIRES_X86_SSE;
23867 GemmMicrokernelTester()
23868 .mr(4)
23869 .nr(8)
23870 .kr(1)
23871 .sr(1)
23872 .m(4)
23873 .n(8)
23874 .k(1)
23875 .cn_stride(11)
23876 .Test(xnn_f32_igemm_ukernel_4x8__sse_load1);
23877 }
23878
23879 TEST(F32_IGEMM_4X8__SSE_LOAD1, k_eq_1_subtile) {
23880 TEST_REQUIRES_X86_SSE;
23881 for (uint32_t m = 1; m <= 4; m++) {
23882 for (uint32_t n = 1; n <= 8; n++) {
23883 GemmMicrokernelTester()
23884 .mr(4)
23885 .nr(8)
23886 .kr(1)
23887 .sr(1)
23888 .m(m)
23889 .n(n)
23890 .k(1)
23891 .iterations(1)
23892 .Test(xnn_f32_igemm_ukernel_4x8__sse_load1);
23893 }
23894 }
23895 }
23896
23897 TEST(F32_IGEMM_4X8__SSE_LOAD1, k_eq_1_subtile_m) {
23898 TEST_REQUIRES_X86_SSE;
23899 for (uint32_t m = 1; m <= 4; m++) {
23900 GemmMicrokernelTester()
23901 .mr(4)
23902 .nr(8)
23903 .kr(1)
23904 .sr(1)
23905 .m(m)
23906 .n(8)
23907 .k(1)
23908 .iterations(1)
23909 .Test(xnn_f32_igemm_ukernel_4x8__sse_load1);
23910 }
23911 }
23912
23913 TEST(F32_IGEMM_4X8__SSE_LOAD1, k_eq_1_subtile_n) {
23914 TEST_REQUIRES_X86_SSE;
23915 for (uint32_t n = 1; n <= 8; n++) {
23916 GemmMicrokernelTester()
23917 .mr(4)
23918 .nr(8)
23919 .kr(1)
23920 .sr(1)
23921 .m(4)
23922 .n(n)
23923 .k(1)
23924 .iterations(1)
23925 .Test(xnn_f32_igemm_ukernel_4x8__sse_load1);
23926 }
23927 }
23928
23929 TEST(F32_IGEMM_4X8__SSE_LOAD1, k_gt_1) {
23930 TEST_REQUIRES_X86_SSE;
23931 for (size_t k = 2; k < 10; k++) {
23932 GemmMicrokernelTester()
23933 .mr(4)
23934 .nr(8)
23935 .kr(1)
23936 .sr(1)
23937 .m(4)
23938 .n(8)
23939 .k(k)
23940 .Test(xnn_f32_igemm_ukernel_4x8__sse_load1);
23941 }
23942 }
23943
23944 TEST(F32_IGEMM_4X8__SSE_LOAD1, k_gt_1_subtile) {
23945 TEST_REQUIRES_X86_SSE;
23946 for (size_t k = 2; k < 10; k++) {
23947 for (uint32_t m = 1; m <= 4; m++) {
23948 for (uint32_t n = 1; n <= 8; n++) {
23949 GemmMicrokernelTester()
23950 .mr(4)
23951 .nr(8)
23952 .kr(1)
23953 .sr(1)
23954 .m(m)
23955 .n(n)
23956 .k(k)
23957 .iterations(1)
23958 .Test(xnn_f32_igemm_ukernel_4x8__sse_load1);
23959 }
23960 }
23961 }
23962 }
23963
23964 TEST(F32_IGEMM_4X8__SSE_LOAD1, n_gt_8) {
23965 TEST_REQUIRES_X86_SSE;
23966 for (uint32_t n = 9; n < 16; n++) {
23967 for (size_t k = 1; k <= 5; k += 2) {
23968 GemmMicrokernelTester()
23969 .mr(4)
23970 .nr(8)
23971 .kr(1)
23972 .sr(1)
23973 .m(4)
23974 .n(8)
23975 .k(k)
23976 .Test(xnn_f32_igemm_ukernel_4x8__sse_load1);
23977 }
23978 }
23979 }
23980
23981 TEST(F32_IGEMM_4X8__SSE_LOAD1, n_gt_8_strided_cn) {
23982 TEST_REQUIRES_X86_SSE;
23983 for (uint32_t n = 9; n < 16; n++) {
23984 for (size_t k = 1; k <= 5; k += 2) {
23985 GemmMicrokernelTester()
23986 .mr(4)
23987 .nr(8)
23988 .kr(1)
23989 .sr(1)
23990 .m(4)
23991 .n(8)
23992 .k(k)
23993 .cn_stride(11)
23994 .Test(xnn_f32_igemm_ukernel_4x8__sse_load1);
23995 }
23996 }
23997 }
23998
23999 TEST(F32_IGEMM_4X8__SSE_LOAD1, n_gt_8_subtile) {
24000 TEST_REQUIRES_X86_SSE;
24001 for (uint32_t n = 9; n < 16; n++) {
24002 for (size_t k = 1; k <= 5; k += 2) {
24003 for (uint32_t m = 1; m <= 4; m++) {
24004 GemmMicrokernelTester()
24005 .mr(4)
24006 .nr(8)
24007 .kr(1)
24008 .sr(1)
24009 .m(m)
24010 .n(n)
24011 .k(k)
24012 .iterations(1)
24013 .Test(xnn_f32_igemm_ukernel_4x8__sse_load1);
24014 }
24015 }
24016 }
24017 }
24018
24019 TEST(F32_IGEMM_4X8__SSE_LOAD1, n_div_8) {
24020 TEST_REQUIRES_X86_SSE;
24021 for (uint32_t n = 16; n <= 24; n += 8) {
24022 for (size_t k = 1; k <= 5; k += 2) {
24023 GemmMicrokernelTester()
24024 .mr(4)
24025 .nr(8)
24026 .kr(1)
24027 .sr(1)
24028 .m(4)
24029 .n(8)
24030 .k(k)
24031 .Test(xnn_f32_igemm_ukernel_4x8__sse_load1);
24032 }
24033 }
24034 }
24035
24036 TEST(F32_IGEMM_4X8__SSE_LOAD1, n_div_8_strided_cn) {
24037 TEST_REQUIRES_X86_SSE;
24038 for (uint32_t n = 16; n <= 24; n += 8) {
24039 for (size_t k = 1; k <= 5; k += 2) {
24040 GemmMicrokernelTester()
24041 .mr(4)
24042 .nr(8)
24043 .kr(1)
24044 .sr(1)
24045 .m(4)
24046 .n(n)
24047 .k(k)
24048 .cn_stride(11)
24049 .Test(xnn_f32_igemm_ukernel_4x8__sse_load1);
24050 }
24051 }
24052 }
24053
24054 TEST(F32_IGEMM_4X8__SSE_LOAD1, n_div_8_subtile) {
24055 TEST_REQUIRES_X86_SSE;
24056 for (uint32_t n = 16; n <= 24; n += 8) {
24057 for (size_t k = 1; k <= 5; k += 2) {
24058 for (uint32_t m = 1; m <= 4; m++) {
24059 GemmMicrokernelTester()
24060 .mr(4)
24061 .nr(8)
24062 .kr(1)
24063 .sr(1)
24064 .m(m)
24065 .n(n)
24066 .k(k)
24067 .iterations(1)
24068 .Test(xnn_f32_igemm_ukernel_4x8__sse_load1);
24069 }
24070 }
24071 }
24072 }
24073
24074 TEST(F32_IGEMM_4X8__SSE_LOAD1, small_kernel) {
24075 TEST_REQUIRES_X86_SSE;
24076 for (size_t k = 1; k <= 5; k += 2) {
24077 GemmMicrokernelTester()
24078 .mr(4)
24079 .nr(8)
24080 .kr(1)
24081 .sr(1)
24082 .m(4)
24083 .n(8)
24084 .k(k)
24085 .ks(3)
24086 .Test(xnn_f32_igemm_ukernel_4x8__sse_load1);
24087 }
24088 }
24089
24090 TEST(F32_IGEMM_4X8__SSE_LOAD1, small_kernel_subtile) {
24091 TEST_REQUIRES_X86_SSE;
24092 for (size_t k = 1; k <= 5; k += 2) {
24093 for (uint32_t m = 1; m <= 4; m++) {
24094 for (uint32_t n = 1; n <= 8; n++) {
24095 GemmMicrokernelTester()
24096 .mr(4)
24097 .nr(8)
24098 .kr(1)
24099 .sr(1)
24100 .m(m)
24101 .n(n)
24102 .k(k)
24103 .ks(3)
24104 .iterations(1)
24105 .Test(xnn_f32_igemm_ukernel_4x8__sse_load1);
24106 }
24107 }
24108 }
24109 }
24110
24111 TEST(F32_IGEMM_4X8__SSE_LOAD1, n_gt_8_small_kernel) {
24112 TEST_REQUIRES_X86_SSE;
24113 for (uint32_t n = 9; n < 16; n++) {
24114 for (size_t k = 1; k <= 5; k += 2) {
24115 GemmMicrokernelTester()
24116 .mr(4)
24117 .nr(8)
24118 .kr(1)
24119 .sr(1)
24120 .m(4)
24121 .n(8)
24122 .k(k)
24123 .ks(3)
24124 .Test(xnn_f32_igemm_ukernel_4x8__sse_load1);
24125 }
24126 }
24127 }
24128
24129 TEST(F32_IGEMM_4X8__SSE_LOAD1, n_div_8_small_kernel) {
24130 TEST_REQUIRES_X86_SSE;
24131 for (uint32_t n = 16; n <= 24; n += 8) {
24132 for (size_t k = 1; k <= 5; k += 2) {
24133 GemmMicrokernelTester()
24134 .mr(4)
24135 .nr(8)
24136 .kr(1)
24137 .sr(1)
24138 .m(4)
24139 .n(8)
24140 .k(k)
24141 .ks(3)
24142 .Test(xnn_f32_igemm_ukernel_4x8__sse_load1);
24143 }
24144 }
24145 }
24146
24147 TEST(F32_IGEMM_4X8__SSE_LOAD1, strided_cm_subtile) {
24148 TEST_REQUIRES_X86_SSE;
24149 for (size_t k = 1; k <= 5; k += 2) {
24150 for (uint32_t m = 1; m <= 4; m++) {
24151 for (uint32_t n = 1; n <= 8; n++) {
24152 GemmMicrokernelTester()
24153 .mr(4)
24154 .nr(8)
24155 .kr(1)
24156 .sr(1)
24157 .m(m)
24158 .n(n)
24159 .k(k)
24160 .cm_stride(11)
24161 .iterations(1)
24162 .Test(xnn_f32_igemm_ukernel_4x8__sse_load1);
24163 }
24164 }
24165 }
24166 }
24167
24168 TEST(F32_IGEMM_4X8__SSE_LOAD1, a_offset) {
24169 TEST_REQUIRES_X86_SSE;
24170 for (size_t k = 1; k <= 5; k += 2) {
24171 GemmMicrokernelTester()
24172 .mr(4)
24173 .nr(8)
24174 .kr(1)
24175 .sr(1)
24176 .m(4)
24177 .n(8)
24178 .k(k)
24179 .ks(3)
24180 .a_offset(23)
24181 .Test(xnn_f32_igemm_ukernel_4x8__sse_load1);
24182 }
24183 }
24184
24185 TEST(F32_IGEMM_4X8__SSE_LOAD1, zero) {
24186 TEST_REQUIRES_X86_SSE;
24187 for (uint32_t mz = 0; mz < 4; mz++) {
24188 for (size_t k = 1; k <= 5; k += 2) {
24189 GemmMicrokernelTester()
24190 .mr(4)
24191 .nr(8)
24192 .kr(1)
24193 .sr(1)
24194 .m(4)
24195 .n(8)
24196 .k(k)
24197 .ks(3)
24198 .a_offset(23)
24199 .zero_index(mz)
24200 .Test(xnn_f32_igemm_ukernel_4x8__sse_load1);
24201 }
24202 }
24203 }
24204
24205 TEST(F32_IGEMM_4X8__SSE_LOAD1, qmin) {
24206 TEST_REQUIRES_X86_SSE;
24207 GemmMicrokernelTester()
24208 .mr(4)
24209 .nr(8)
24210 .kr(1)
24211 .sr(1)
24212 .m(4)
24213 .n(8)
24214 .k(1)
24215 .qmin(128)
24216 .Test(xnn_f32_igemm_ukernel_4x8__sse_load1);
24217 }
24218
24219 TEST(F32_IGEMM_4X8__SSE_LOAD1, qmax) {
24220 TEST_REQUIRES_X86_SSE;
24221 GemmMicrokernelTester()
24222 .mr(4)
24223 .nr(8)
24224 .kr(1)
24225 .sr(1)
24226 .m(4)
24227 .n(8)
24228 .k(1)
24229 .qmax(128)
24230 .Test(xnn_f32_igemm_ukernel_4x8__sse_load1);
24231 }
24232
24233 TEST(F32_IGEMM_4X8__SSE_LOAD1, strided_cm) {
24234 TEST_REQUIRES_X86_SSE;
24235 GemmMicrokernelTester()
24236 .mr(4)
24237 .nr(8)
24238 .kr(1)
24239 .sr(1)
24240 .m(4)
24241 .n(8)
24242 .k(1)
24243 .cm_stride(11)
24244 .Test(xnn_f32_igemm_ukernel_4x8__sse_load1);
24245 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -070024246#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
XNNPACK Teamb455b122019-09-27 18:10:33 -070024247
24248
Marat Dukhan1dadbf72019-10-01 10:46:20 -070024249#if XNN_ARCH_X86 || XNN_ARCH_X86_64
XNNPACK Teamb455b122019-09-27 18:10:33 -070024250 TEST(F32_IGEMM_1X8__SSE_DUP, k_eq_4) {
24251 TEST_REQUIRES_X86_SSE;
24252 GemmMicrokernelTester()
24253 .mr(1)
24254 .nr(8)
24255 .kr(1)
24256 .sr(1)
24257 .m(1)
24258 .n(8)
24259 .k(4)
24260 .Test(xnn_f32_igemm_ukernel_1x8__sse_dup);
24261 }
24262
24263 TEST(F32_IGEMM_1X8__SSE_DUP, strided_cn) {
24264 TEST_REQUIRES_X86_SSE;
24265 GemmMicrokernelTester()
24266 .mr(1)
24267 .nr(8)
24268 .kr(1)
24269 .sr(1)
24270 .m(1)
24271 .n(8)
24272 .k(4)
24273 .cn_stride(11)
24274 .Test(xnn_f32_igemm_ukernel_1x8__sse_dup);
24275 }
24276
24277 TEST(F32_IGEMM_1X8__SSE_DUP, k_eq_4_subtile) {
24278 TEST_REQUIRES_X86_SSE;
24279 for (uint32_t m = 1; m <= 1; m++) {
24280 for (uint32_t n = 1; n <= 8; n++) {
24281 GemmMicrokernelTester()
24282 .mr(1)
24283 .nr(8)
24284 .kr(1)
24285 .sr(1)
24286 .m(m)
24287 .n(n)
24288 .k(4)
24289 .iterations(1)
24290 .Test(xnn_f32_igemm_ukernel_1x8__sse_dup);
24291 }
24292 }
24293 }
24294
24295 TEST(F32_IGEMM_1X8__SSE_DUP, k_eq_4_subtile_m) {
24296 TEST_REQUIRES_X86_SSE;
24297 for (uint32_t m = 1; m <= 1; m++) {
24298 GemmMicrokernelTester()
24299 .mr(1)
24300 .nr(8)
24301 .kr(1)
24302 .sr(1)
24303 .m(m)
24304 .n(8)
24305 .k(4)
24306 .iterations(1)
24307 .Test(xnn_f32_igemm_ukernel_1x8__sse_dup);
24308 }
24309 }
24310
24311 TEST(F32_IGEMM_1X8__SSE_DUP, k_eq_4_subtile_n) {
24312 TEST_REQUIRES_X86_SSE;
24313 for (uint32_t n = 1; n <= 8; n++) {
24314 GemmMicrokernelTester()
24315 .mr(1)
24316 .nr(8)
24317 .kr(1)
24318 .sr(1)
24319 .m(1)
24320 .n(n)
24321 .k(4)
24322 .iterations(1)
24323 .Test(xnn_f32_igemm_ukernel_1x8__sse_dup);
24324 }
24325 }
24326
24327 TEST(F32_IGEMM_1X8__SSE_DUP, k_lt_4) {
24328 TEST_REQUIRES_X86_SSE;
24329 for (size_t k = 1; k < 4; k++) {
24330 GemmMicrokernelTester()
24331 .mr(1)
24332 .nr(8)
24333 .kr(1)
24334 .sr(1)
24335 .m(1)
24336 .n(8)
24337 .k(k)
24338 .Test(xnn_f32_igemm_ukernel_1x8__sse_dup);
24339 }
24340 }
24341
24342 TEST(F32_IGEMM_1X8__SSE_DUP, k_lt_4_subtile) {
24343 TEST_REQUIRES_X86_SSE;
24344 for (size_t k = 1; k < 4; k++) {
24345 for (uint32_t m = 1; m <= 1; m++) {
24346 for (uint32_t n = 1; n <= 8; n++) {
24347 GemmMicrokernelTester()
24348 .mr(1)
24349 .nr(8)
24350 .kr(1)
24351 .sr(1)
24352 .m(m)
24353 .n(n)
24354 .k(k)
24355 .iterations(1)
24356 .Test(xnn_f32_igemm_ukernel_1x8__sse_dup);
24357 }
24358 }
24359 }
24360 }
24361
24362 TEST(F32_IGEMM_1X8__SSE_DUP, k_gt_4) {
24363 TEST_REQUIRES_X86_SSE;
24364 for (size_t k = 5; k < 8; k++) {
24365 GemmMicrokernelTester()
24366 .mr(1)
24367 .nr(8)
24368 .kr(1)
24369 .sr(1)
24370 .m(1)
24371 .n(8)
24372 .k(k)
24373 .Test(xnn_f32_igemm_ukernel_1x8__sse_dup);
24374 }
24375 }
24376
24377 TEST(F32_IGEMM_1X8__SSE_DUP, k_gt_4_subtile) {
24378 TEST_REQUIRES_X86_SSE;
24379 for (size_t k = 5; k < 8; k++) {
24380 for (uint32_t m = 1; m <= 1; m++) {
24381 for (uint32_t n = 1; n <= 8; n++) {
24382 GemmMicrokernelTester()
24383 .mr(1)
24384 .nr(8)
24385 .kr(1)
24386 .sr(1)
24387 .m(m)
24388 .n(n)
24389 .k(k)
24390 .iterations(1)
24391 .Test(xnn_f32_igemm_ukernel_1x8__sse_dup);
24392 }
24393 }
24394 }
24395 }
24396
24397 TEST(F32_IGEMM_1X8__SSE_DUP, k_div_4) {
24398 TEST_REQUIRES_X86_SSE;
24399 for (size_t k = 8; k <= 40; k += 4) {
24400 GemmMicrokernelTester()
24401 .mr(1)
24402 .nr(8)
24403 .kr(1)
24404 .sr(1)
24405 .m(1)
24406 .n(8)
24407 .k(k)
24408 .Test(xnn_f32_igemm_ukernel_1x8__sse_dup);
24409 }
24410 }
24411
24412 TEST(F32_IGEMM_1X8__SSE_DUP, k_div_4_subtile) {
24413 TEST_REQUIRES_X86_SSE;
24414 for (size_t k = 8; k <= 40; k += 4) {
24415 for (uint32_t m = 1; m <= 1; m++) {
24416 for (uint32_t n = 1; n <= 8; n++) {
24417 GemmMicrokernelTester()
24418 .mr(1)
24419 .nr(8)
24420 .kr(1)
24421 .sr(1)
24422 .m(m)
24423 .n(n)
24424 .k(k)
24425 .iterations(1)
24426 .Test(xnn_f32_igemm_ukernel_1x8__sse_dup);
24427 }
24428 }
24429 }
24430 }
24431
24432 TEST(F32_IGEMM_1X8__SSE_DUP, n_gt_8) {
24433 TEST_REQUIRES_X86_SSE;
24434 for (uint32_t n = 9; n < 16; n++) {
24435 for (size_t k = 1; k <= 20; k += 5) {
24436 GemmMicrokernelTester()
24437 .mr(1)
24438 .nr(8)
24439 .kr(1)
24440 .sr(1)
24441 .m(1)
24442 .n(8)
24443 .k(k)
24444 .Test(xnn_f32_igemm_ukernel_1x8__sse_dup);
24445 }
24446 }
24447 }
24448
24449 TEST(F32_IGEMM_1X8__SSE_DUP, n_gt_8_strided_cn) {
24450 TEST_REQUIRES_X86_SSE;
24451 for (uint32_t n = 9; n < 16; n++) {
24452 for (size_t k = 1; k <= 20; k += 5) {
24453 GemmMicrokernelTester()
24454 .mr(1)
24455 .nr(8)
24456 .kr(1)
24457 .sr(1)
24458 .m(1)
24459 .n(8)
24460 .k(k)
24461 .cn_stride(11)
24462 .Test(xnn_f32_igemm_ukernel_1x8__sse_dup);
24463 }
24464 }
24465 }
24466
24467 TEST(F32_IGEMM_1X8__SSE_DUP, n_gt_8_subtile) {
24468 TEST_REQUIRES_X86_SSE;
24469 for (uint32_t n = 9; n < 16; n++) {
24470 for (size_t k = 1; k <= 20; k += 5) {
24471 for (uint32_t m = 1; m <= 1; m++) {
24472 GemmMicrokernelTester()
24473 .mr(1)
24474 .nr(8)
24475 .kr(1)
24476 .sr(1)
24477 .m(m)
24478 .n(n)
24479 .k(k)
24480 .iterations(1)
24481 .Test(xnn_f32_igemm_ukernel_1x8__sse_dup);
24482 }
24483 }
24484 }
24485 }
24486
24487 TEST(F32_IGEMM_1X8__SSE_DUP, n_div_8) {
24488 TEST_REQUIRES_X86_SSE;
24489 for (uint32_t n = 16; n <= 24; n += 8) {
24490 for (size_t k = 1; k <= 20; k += 5) {
24491 GemmMicrokernelTester()
24492 .mr(1)
24493 .nr(8)
24494 .kr(1)
24495 .sr(1)
24496 .m(1)
24497 .n(8)
24498 .k(k)
24499 .Test(xnn_f32_igemm_ukernel_1x8__sse_dup);
24500 }
24501 }
24502 }
24503
24504 TEST(F32_IGEMM_1X8__SSE_DUP, n_div_8_strided_cn) {
24505 TEST_REQUIRES_X86_SSE;
24506 for (uint32_t n = 16; n <= 24; n += 8) {
24507 for (size_t k = 1; k <= 20; k += 5) {
24508 GemmMicrokernelTester()
24509 .mr(1)
24510 .nr(8)
24511 .kr(1)
24512 .sr(1)
24513 .m(1)
24514 .n(n)
24515 .k(k)
24516 .cn_stride(11)
24517 .Test(xnn_f32_igemm_ukernel_1x8__sse_dup);
24518 }
24519 }
24520 }
24521
24522 TEST(F32_IGEMM_1X8__SSE_DUP, n_div_8_subtile) {
24523 TEST_REQUIRES_X86_SSE;
24524 for (uint32_t n = 16; n <= 24; n += 8) {
24525 for (size_t k = 1; k <= 20; k += 5) {
24526 for (uint32_t m = 1; m <= 1; m++) {
24527 GemmMicrokernelTester()
24528 .mr(1)
24529 .nr(8)
24530 .kr(1)
24531 .sr(1)
24532 .m(m)
24533 .n(n)
24534 .k(k)
24535 .iterations(1)
24536 .Test(xnn_f32_igemm_ukernel_1x8__sse_dup);
24537 }
24538 }
24539 }
24540 }
24541
24542 TEST(F32_IGEMM_1X8__SSE_DUP, small_kernel) {
24543 TEST_REQUIRES_X86_SSE;
24544 for (size_t k = 1; k <= 20; k += 5) {
24545 GemmMicrokernelTester()
24546 .mr(1)
24547 .nr(8)
24548 .kr(1)
24549 .sr(1)
24550 .m(1)
24551 .n(8)
24552 .k(k)
24553 .ks(3)
24554 .Test(xnn_f32_igemm_ukernel_1x8__sse_dup);
24555 }
24556 }
24557
24558 TEST(F32_IGEMM_1X8__SSE_DUP, small_kernel_subtile) {
24559 TEST_REQUIRES_X86_SSE;
24560 for (size_t k = 1; k <= 20; k += 5) {
24561 for (uint32_t m = 1; m <= 1; m++) {
24562 for (uint32_t n = 1; n <= 8; n++) {
24563 GemmMicrokernelTester()
24564 .mr(1)
24565 .nr(8)
24566 .kr(1)
24567 .sr(1)
24568 .m(m)
24569 .n(n)
24570 .k(k)
24571 .ks(3)
24572 .iterations(1)
24573 .Test(xnn_f32_igemm_ukernel_1x8__sse_dup);
24574 }
24575 }
24576 }
24577 }
24578
24579 TEST(F32_IGEMM_1X8__SSE_DUP, n_gt_8_small_kernel) {
24580 TEST_REQUIRES_X86_SSE;
24581 for (uint32_t n = 9; n < 16; n++) {
24582 for (size_t k = 1; k <= 20; k += 5) {
24583 GemmMicrokernelTester()
24584 .mr(1)
24585 .nr(8)
24586 .kr(1)
24587 .sr(1)
24588 .m(1)
24589 .n(8)
24590 .k(k)
24591 .ks(3)
24592 .Test(xnn_f32_igemm_ukernel_1x8__sse_dup);
24593 }
24594 }
24595 }
24596
24597 TEST(F32_IGEMM_1X8__SSE_DUP, n_div_8_small_kernel) {
24598 TEST_REQUIRES_X86_SSE;
24599 for (uint32_t n = 16; n <= 24; n += 8) {
24600 for (size_t k = 1; k <= 20; k += 5) {
24601 GemmMicrokernelTester()
24602 .mr(1)
24603 .nr(8)
24604 .kr(1)
24605 .sr(1)
24606 .m(1)
24607 .n(8)
24608 .k(k)
24609 .ks(3)
24610 .Test(xnn_f32_igemm_ukernel_1x8__sse_dup);
24611 }
24612 }
24613 }
24614
24615 TEST(F32_IGEMM_1X8__SSE_DUP, strided_cm_subtile) {
24616 TEST_REQUIRES_X86_SSE;
24617 for (size_t k = 1; k <= 20; k += 5) {
24618 for (uint32_t m = 1; m <= 1; m++) {
24619 for (uint32_t n = 1; n <= 8; n++) {
24620 GemmMicrokernelTester()
24621 .mr(1)
24622 .nr(8)
24623 .kr(1)
24624 .sr(1)
24625 .m(m)
24626 .n(n)
24627 .k(k)
24628 .cm_stride(11)
24629 .iterations(1)
24630 .Test(xnn_f32_igemm_ukernel_1x8__sse_dup);
24631 }
24632 }
24633 }
24634 }
24635
24636 TEST(F32_IGEMM_1X8__SSE_DUP, a_offset) {
24637 TEST_REQUIRES_X86_SSE;
24638 for (size_t k = 1; k <= 20; k += 5) {
24639 GemmMicrokernelTester()
24640 .mr(1)
24641 .nr(8)
24642 .kr(1)
24643 .sr(1)
24644 .m(1)
24645 .n(8)
24646 .k(k)
24647 .ks(3)
24648 .a_offset(23)
24649 .Test(xnn_f32_igemm_ukernel_1x8__sse_dup);
24650 }
24651 }
24652
24653 TEST(F32_IGEMM_1X8__SSE_DUP, zero) {
24654 TEST_REQUIRES_X86_SSE;
24655 for (uint32_t mz = 0; mz < 1; mz++) {
24656 for (size_t k = 1; k <= 20; k += 5) {
24657 GemmMicrokernelTester()
24658 .mr(1)
24659 .nr(8)
24660 .kr(1)
24661 .sr(1)
24662 .m(1)
24663 .n(8)
24664 .k(k)
24665 .ks(3)
24666 .a_offset(23)
24667 .zero_index(mz)
24668 .Test(xnn_f32_igemm_ukernel_1x8__sse_dup);
24669 }
24670 }
24671 }
24672
24673 TEST(F32_IGEMM_1X8__SSE_DUP, qmin) {
24674 TEST_REQUIRES_X86_SSE;
24675 GemmMicrokernelTester()
24676 .mr(1)
24677 .nr(8)
24678 .kr(1)
24679 .sr(1)
24680 .m(1)
24681 .n(8)
24682 .k(4)
24683 .qmin(128)
24684 .Test(xnn_f32_igemm_ukernel_1x8__sse_dup);
24685 }
24686
24687 TEST(F32_IGEMM_1X8__SSE_DUP, qmax) {
24688 TEST_REQUIRES_X86_SSE;
24689 GemmMicrokernelTester()
24690 .mr(1)
24691 .nr(8)
24692 .kr(1)
24693 .sr(1)
24694 .m(1)
24695 .n(8)
24696 .k(4)
24697 .qmax(128)
24698 .Test(xnn_f32_igemm_ukernel_1x8__sse_dup);
24699 }
24700
24701 TEST(F32_IGEMM_1X8__SSE_DUP, strided_cm) {
24702 TEST_REQUIRES_X86_SSE;
24703 GemmMicrokernelTester()
24704 .mr(1)
24705 .nr(8)
24706 .kr(1)
24707 .sr(1)
24708 .m(1)
24709 .n(8)
24710 .k(4)
24711 .cm_stride(11)
24712 .Test(xnn_f32_igemm_ukernel_1x8__sse_dup);
24713 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -070024714#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
XNNPACK Teamb455b122019-09-27 18:10:33 -070024715
24716
Marat Dukhan1dadbf72019-10-01 10:46:20 -070024717#if XNN_ARCH_X86 || XNN_ARCH_X86_64
XNNPACK Teamb455b122019-09-27 18:10:33 -070024718 TEST(F32_IGEMM_4X8__SSE_DUP, k_eq_4) {
24719 TEST_REQUIRES_X86_SSE;
24720 GemmMicrokernelTester()
24721 .mr(4)
24722 .nr(8)
24723 .kr(1)
24724 .sr(1)
24725 .m(4)
24726 .n(8)
24727 .k(4)
24728 .Test(xnn_f32_igemm_ukernel_4x8__sse_dup);
24729 }
24730
24731 TEST(F32_IGEMM_4X8__SSE_DUP, strided_cn) {
24732 TEST_REQUIRES_X86_SSE;
24733 GemmMicrokernelTester()
24734 .mr(4)
24735 .nr(8)
24736 .kr(1)
24737 .sr(1)
24738 .m(4)
24739 .n(8)
24740 .k(4)
24741 .cn_stride(11)
24742 .Test(xnn_f32_igemm_ukernel_4x8__sse_dup);
24743 }
24744
24745 TEST(F32_IGEMM_4X8__SSE_DUP, k_eq_4_subtile) {
24746 TEST_REQUIRES_X86_SSE;
24747 for (uint32_t m = 1; m <= 4; m++) {
24748 for (uint32_t n = 1; n <= 8; n++) {
24749 GemmMicrokernelTester()
24750 .mr(4)
24751 .nr(8)
24752 .kr(1)
24753 .sr(1)
24754 .m(m)
24755 .n(n)
24756 .k(4)
24757 .iterations(1)
24758 .Test(xnn_f32_igemm_ukernel_4x8__sse_dup);
24759 }
24760 }
24761 }
24762
24763 TEST(F32_IGEMM_4X8__SSE_DUP, k_eq_4_subtile_m) {
24764 TEST_REQUIRES_X86_SSE;
24765 for (uint32_t m = 1; m <= 4; m++) {
24766 GemmMicrokernelTester()
24767 .mr(4)
24768 .nr(8)
24769 .kr(1)
24770 .sr(1)
24771 .m(m)
24772 .n(8)
24773 .k(4)
24774 .iterations(1)
24775 .Test(xnn_f32_igemm_ukernel_4x8__sse_dup);
24776 }
24777 }
24778
24779 TEST(F32_IGEMM_4X8__SSE_DUP, k_eq_4_subtile_n) {
24780 TEST_REQUIRES_X86_SSE;
24781 for (uint32_t n = 1; n <= 8; n++) {
24782 GemmMicrokernelTester()
24783 .mr(4)
24784 .nr(8)
24785 .kr(1)
24786 .sr(1)
24787 .m(4)
24788 .n(n)
24789 .k(4)
24790 .iterations(1)
24791 .Test(xnn_f32_igemm_ukernel_4x8__sse_dup);
24792 }
24793 }
24794
24795 TEST(F32_IGEMM_4X8__SSE_DUP, k_lt_4) {
24796 TEST_REQUIRES_X86_SSE;
24797 for (size_t k = 1; k < 4; k++) {
24798 GemmMicrokernelTester()
24799 .mr(4)
24800 .nr(8)
24801 .kr(1)
24802 .sr(1)
24803 .m(4)
24804 .n(8)
24805 .k(k)
24806 .Test(xnn_f32_igemm_ukernel_4x8__sse_dup);
24807 }
24808 }
24809
24810 TEST(F32_IGEMM_4X8__SSE_DUP, k_lt_4_subtile) {
24811 TEST_REQUIRES_X86_SSE;
24812 for (size_t k = 1; k < 4; k++) {
24813 for (uint32_t m = 1; m <= 4; m++) {
24814 for (uint32_t n = 1; n <= 8; n++) {
24815 GemmMicrokernelTester()
24816 .mr(4)
24817 .nr(8)
24818 .kr(1)
24819 .sr(1)
24820 .m(m)
24821 .n(n)
24822 .k(k)
24823 .iterations(1)
24824 .Test(xnn_f32_igemm_ukernel_4x8__sse_dup);
24825 }
24826 }
24827 }
24828 }
24829
24830 TEST(F32_IGEMM_4X8__SSE_DUP, k_gt_4) {
24831 TEST_REQUIRES_X86_SSE;
24832 for (size_t k = 5; k < 8; k++) {
24833 GemmMicrokernelTester()
24834 .mr(4)
24835 .nr(8)
24836 .kr(1)
24837 .sr(1)
24838 .m(4)
24839 .n(8)
24840 .k(k)
24841 .Test(xnn_f32_igemm_ukernel_4x8__sse_dup);
24842 }
24843 }
24844
24845 TEST(F32_IGEMM_4X8__SSE_DUP, k_gt_4_subtile) {
24846 TEST_REQUIRES_X86_SSE;
24847 for (size_t k = 5; k < 8; k++) {
24848 for (uint32_t m = 1; m <= 4; m++) {
24849 for (uint32_t n = 1; n <= 8; n++) {
24850 GemmMicrokernelTester()
24851 .mr(4)
24852 .nr(8)
24853 .kr(1)
24854 .sr(1)
24855 .m(m)
24856 .n(n)
24857 .k(k)
24858 .iterations(1)
24859 .Test(xnn_f32_igemm_ukernel_4x8__sse_dup);
24860 }
24861 }
24862 }
24863 }
24864
24865 TEST(F32_IGEMM_4X8__SSE_DUP, k_div_4) {
24866 TEST_REQUIRES_X86_SSE;
24867 for (size_t k = 8; k <= 40; k += 4) {
24868 GemmMicrokernelTester()
24869 .mr(4)
24870 .nr(8)
24871 .kr(1)
24872 .sr(1)
24873 .m(4)
24874 .n(8)
24875 .k(k)
24876 .Test(xnn_f32_igemm_ukernel_4x8__sse_dup);
24877 }
24878 }
24879
24880 TEST(F32_IGEMM_4X8__SSE_DUP, k_div_4_subtile) {
24881 TEST_REQUIRES_X86_SSE;
24882 for (size_t k = 8; k <= 40; k += 4) {
24883 for (uint32_t m = 1; m <= 4; m++) {
24884 for (uint32_t n = 1; n <= 8; n++) {
24885 GemmMicrokernelTester()
24886 .mr(4)
24887 .nr(8)
24888 .kr(1)
24889 .sr(1)
24890 .m(m)
24891 .n(n)
24892 .k(k)
24893 .iterations(1)
24894 .Test(xnn_f32_igemm_ukernel_4x8__sse_dup);
24895 }
24896 }
24897 }
24898 }
24899
24900 TEST(F32_IGEMM_4X8__SSE_DUP, n_gt_8) {
24901 TEST_REQUIRES_X86_SSE;
24902 for (uint32_t n = 9; n < 16; n++) {
24903 for (size_t k = 1; k <= 20; k += 5) {
24904 GemmMicrokernelTester()
24905 .mr(4)
24906 .nr(8)
24907 .kr(1)
24908 .sr(1)
24909 .m(4)
24910 .n(8)
24911 .k(k)
24912 .Test(xnn_f32_igemm_ukernel_4x8__sse_dup);
24913 }
24914 }
24915 }
24916
24917 TEST(F32_IGEMM_4X8__SSE_DUP, n_gt_8_strided_cn) {
24918 TEST_REQUIRES_X86_SSE;
24919 for (uint32_t n = 9; n < 16; n++) {
24920 for (size_t k = 1; k <= 20; k += 5) {
24921 GemmMicrokernelTester()
24922 .mr(4)
24923 .nr(8)
24924 .kr(1)
24925 .sr(1)
24926 .m(4)
24927 .n(8)
24928 .k(k)
24929 .cn_stride(11)
24930 .Test(xnn_f32_igemm_ukernel_4x8__sse_dup);
24931 }
24932 }
24933 }
24934
24935 TEST(F32_IGEMM_4X8__SSE_DUP, n_gt_8_subtile) {
24936 TEST_REQUIRES_X86_SSE;
24937 for (uint32_t n = 9; n < 16; n++) {
24938 for (size_t k = 1; k <= 20; k += 5) {
24939 for (uint32_t m = 1; m <= 4; m++) {
24940 GemmMicrokernelTester()
24941 .mr(4)
24942 .nr(8)
24943 .kr(1)
24944 .sr(1)
24945 .m(m)
24946 .n(n)
24947 .k(k)
24948 .iterations(1)
24949 .Test(xnn_f32_igemm_ukernel_4x8__sse_dup);
24950 }
24951 }
24952 }
24953 }
24954
24955 TEST(F32_IGEMM_4X8__SSE_DUP, n_div_8) {
24956 TEST_REQUIRES_X86_SSE;
24957 for (uint32_t n = 16; n <= 24; n += 8) {
24958 for (size_t k = 1; k <= 20; k += 5) {
24959 GemmMicrokernelTester()
24960 .mr(4)
24961 .nr(8)
24962 .kr(1)
24963 .sr(1)
24964 .m(4)
24965 .n(8)
24966 .k(k)
24967 .Test(xnn_f32_igemm_ukernel_4x8__sse_dup);
24968 }
24969 }
24970 }
24971
24972 TEST(F32_IGEMM_4X8__SSE_DUP, n_div_8_strided_cn) {
24973 TEST_REQUIRES_X86_SSE;
24974 for (uint32_t n = 16; n <= 24; n += 8) {
24975 for (size_t k = 1; k <= 20; k += 5) {
24976 GemmMicrokernelTester()
24977 .mr(4)
24978 .nr(8)
24979 .kr(1)
24980 .sr(1)
24981 .m(4)
24982 .n(n)
24983 .k(k)
24984 .cn_stride(11)
24985 .Test(xnn_f32_igemm_ukernel_4x8__sse_dup);
24986 }
24987 }
24988 }
24989
24990 TEST(F32_IGEMM_4X8__SSE_DUP, n_div_8_subtile) {
24991 TEST_REQUIRES_X86_SSE;
24992 for (uint32_t n = 16; n <= 24; n += 8) {
24993 for (size_t k = 1; k <= 20; k += 5) {
24994 for (uint32_t m = 1; m <= 4; m++) {
24995 GemmMicrokernelTester()
24996 .mr(4)
24997 .nr(8)
24998 .kr(1)
24999 .sr(1)
25000 .m(m)
25001 .n(n)
25002 .k(k)
25003 .iterations(1)
25004 .Test(xnn_f32_igemm_ukernel_4x8__sse_dup);
25005 }
25006 }
25007 }
25008 }
25009
25010 TEST(F32_IGEMM_4X8__SSE_DUP, small_kernel) {
25011 TEST_REQUIRES_X86_SSE;
25012 for (size_t k = 1; k <= 20; k += 5) {
25013 GemmMicrokernelTester()
25014 .mr(4)
25015 .nr(8)
25016 .kr(1)
25017 .sr(1)
25018 .m(4)
25019 .n(8)
25020 .k(k)
25021 .ks(3)
25022 .Test(xnn_f32_igemm_ukernel_4x8__sse_dup);
25023 }
25024 }
25025
25026 TEST(F32_IGEMM_4X8__SSE_DUP, small_kernel_subtile) {
25027 TEST_REQUIRES_X86_SSE;
25028 for (size_t k = 1; k <= 20; k += 5) {
25029 for (uint32_t m = 1; m <= 4; m++) {
25030 for (uint32_t n = 1; n <= 8; n++) {
25031 GemmMicrokernelTester()
25032 .mr(4)
25033 .nr(8)
25034 .kr(1)
25035 .sr(1)
25036 .m(m)
25037 .n(n)
25038 .k(k)
25039 .ks(3)
25040 .iterations(1)
25041 .Test(xnn_f32_igemm_ukernel_4x8__sse_dup);
25042 }
25043 }
25044 }
25045 }
25046
25047 TEST(F32_IGEMM_4X8__SSE_DUP, n_gt_8_small_kernel) {
25048 TEST_REQUIRES_X86_SSE;
25049 for (uint32_t n = 9; n < 16; n++) {
25050 for (size_t k = 1; k <= 20; k += 5) {
25051 GemmMicrokernelTester()
25052 .mr(4)
25053 .nr(8)
25054 .kr(1)
25055 .sr(1)
25056 .m(4)
25057 .n(8)
25058 .k(k)
25059 .ks(3)
25060 .Test(xnn_f32_igemm_ukernel_4x8__sse_dup);
25061 }
25062 }
25063 }
25064
25065 TEST(F32_IGEMM_4X8__SSE_DUP, n_div_8_small_kernel) {
25066 TEST_REQUIRES_X86_SSE;
25067 for (uint32_t n = 16; n <= 24; n += 8) {
25068 for (size_t k = 1; k <= 20; k += 5) {
25069 GemmMicrokernelTester()
25070 .mr(4)
25071 .nr(8)
25072 .kr(1)
25073 .sr(1)
25074 .m(4)
25075 .n(8)
25076 .k(k)
25077 .ks(3)
25078 .Test(xnn_f32_igemm_ukernel_4x8__sse_dup);
25079 }
25080 }
25081 }
25082
25083 TEST(F32_IGEMM_4X8__SSE_DUP, strided_cm_subtile) {
25084 TEST_REQUIRES_X86_SSE;
25085 for (size_t k = 1; k <= 20; k += 5) {
25086 for (uint32_t m = 1; m <= 4; m++) {
25087 for (uint32_t n = 1; n <= 8; n++) {
25088 GemmMicrokernelTester()
25089 .mr(4)
25090 .nr(8)
25091 .kr(1)
25092 .sr(1)
25093 .m(m)
25094 .n(n)
25095 .k(k)
25096 .cm_stride(11)
25097 .iterations(1)
25098 .Test(xnn_f32_igemm_ukernel_4x8__sse_dup);
25099 }
25100 }
25101 }
25102 }
25103
25104 TEST(F32_IGEMM_4X8__SSE_DUP, a_offset) {
25105 TEST_REQUIRES_X86_SSE;
25106 for (size_t k = 1; k <= 20; k += 5) {
25107 GemmMicrokernelTester()
25108 .mr(4)
25109 .nr(8)
25110 .kr(1)
25111 .sr(1)
25112 .m(4)
25113 .n(8)
25114 .k(k)
25115 .ks(3)
25116 .a_offset(83)
25117 .Test(xnn_f32_igemm_ukernel_4x8__sse_dup);
25118 }
25119 }
25120
25121 TEST(F32_IGEMM_4X8__SSE_DUP, zero) {
25122 TEST_REQUIRES_X86_SSE;
25123 for (uint32_t mz = 0; mz < 4; mz++) {
25124 for (size_t k = 1; k <= 20; k += 5) {
25125 GemmMicrokernelTester()
25126 .mr(4)
25127 .nr(8)
25128 .kr(1)
25129 .sr(1)
25130 .m(4)
25131 .n(8)
25132 .k(k)
25133 .ks(3)
25134 .a_offset(83)
25135 .zero_index(mz)
25136 .Test(xnn_f32_igemm_ukernel_4x8__sse_dup);
25137 }
25138 }
25139 }
25140
25141 TEST(F32_IGEMM_4X8__SSE_DUP, qmin) {
25142 TEST_REQUIRES_X86_SSE;
25143 GemmMicrokernelTester()
25144 .mr(4)
25145 .nr(8)
25146 .kr(1)
25147 .sr(1)
25148 .m(4)
25149 .n(8)
25150 .k(4)
25151 .qmin(128)
25152 .Test(xnn_f32_igemm_ukernel_4x8__sse_dup);
25153 }
25154
25155 TEST(F32_IGEMM_4X8__SSE_DUP, qmax) {
25156 TEST_REQUIRES_X86_SSE;
25157 GemmMicrokernelTester()
25158 .mr(4)
25159 .nr(8)
25160 .kr(1)
25161 .sr(1)
25162 .m(4)
25163 .n(8)
25164 .k(4)
25165 .qmax(128)
25166 .Test(xnn_f32_igemm_ukernel_4x8__sse_dup);
25167 }
25168
25169 TEST(F32_IGEMM_4X8__SSE_DUP, strided_cm) {
25170 TEST_REQUIRES_X86_SSE;
25171 GemmMicrokernelTester()
25172 .mr(4)
25173 .nr(8)
25174 .kr(1)
25175 .sr(1)
25176 .m(4)
25177 .n(8)
25178 .k(4)
25179 .cm_stride(11)
25180 .Test(xnn_f32_igemm_ukernel_4x8__sse_dup);
25181 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -070025182#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
XNNPACK Teamb455b122019-09-27 18:10:33 -070025183
25184
Marat Dukhan1dadbf72019-10-01 10:46:20 -070025185#if XNN_ARCH_X86 || XNN_ARCH_X86_64
Frank Barchardd42bdf72019-11-20 16:39:43 -080025186 TEST(F32_IGEMM_1X8S4__SSE, k_eq_4) {
25187 TEST_REQUIRES_X86_SSE;
25188 GemmMicrokernelTester()
25189 .mr(1)
25190 .nr(8)
25191 .kr(1)
25192 .sr(4)
25193 .m(1)
25194 .n(8)
25195 .k(4)
25196 .Test(xnn_f32_igemm_ukernel_1x8s4__sse);
25197 }
25198
25199 TEST(F32_IGEMM_1X8S4__SSE, strided_cn) {
25200 TEST_REQUIRES_X86_SSE;
25201 GemmMicrokernelTester()
25202 .mr(1)
25203 .nr(8)
25204 .kr(1)
25205 .sr(4)
25206 .m(1)
25207 .n(8)
25208 .k(4)
25209 .cn_stride(11)
25210 .Test(xnn_f32_igemm_ukernel_1x8s4__sse);
25211 }
25212
25213 TEST(F32_IGEMM_1X8S4__SSE, k_eq_4_subtile) {
25214 TEST_REQUIRES_X86_SSE;
25215 for (uint32_t m = 1; m <= 1; m++) {
25216 for (uint32_t n = 1; n <= 8; n++) {
25217 GemmMicrokernelTester()
25218 .mr(1)
25219 .nr(8)
25220 .kr(1)
25221 .sr(4)
25222 .m(m)
25223 .n(n)
25224 .k(4)
25225 .iterations(1)
25226 .Test(xnn_f32_igemm_ukernel_1x8s4__sse);
25227 }
25228 }
25229 }
25230
25231 TEST(F32_IGEMM_1X8S4__SSE, k_eq_4_subtile_m) {
25232 TEST_REQUIRES_X86_SSE;
25233 for (uint32_t m = 1; m <= 1; m++) {
25234 GemmMicrokernelTester()
25235 .mr(1)
25236 .nr(8)
25237 .kr(1)
25238 .sr(4)
25239 .m(m)
25240 .n(8)
25241 .k(4)
25242 .iterations(1)
25243 .Test(xnn_f32_igemm_ukernel_1x8s4__sse);
25244 }
25245 }
25246
25247 TEST(F32_IGEMM_1X8S4__SSE, k_eq_4_subtile_n) {
25248 TEST_REQUIRES_X86_SSE;
25249 for (uint32_t n = 1; n <= 8; n++) {
25250 GemmMicrokernelTester()
25251 .mr(1)
25252 .nr(8)
25253 .kr(1)
25254 .sr(4)
25255 .m(1)
25256 .n(n)
25257 .k(4)
25258 .iterations(1)
25259 .Test(xnn_f32_igemm_ukernel_1x8s4__sse);
25260 }
25261 }
25262
25263 TEST(F32_IGEMM_1X8S4__SSE, k_lt_4) {
25264 TEST_REQUIRES_X86_SSE;
25265 for (size_t k = 1; k < 4; k++) {
25266 GemmMicrokernelTester()
25267 .mr(1)
25268 .nr(8)
25269 .kr(1)
25270 .sr(4)
25271 .m(1)
25272 .n(8)
25273 .k(k)
25274 .Test(xnn_f32_igemm_ukernel_1x8s4__sse);
25275 }
25276 }
25277
25278 TEST(F32_IGEMM_1X8S4__SSE, k_lt_4_subtile) {
25279 TEST_REQUIRES_X86_SSE;
25280 for (size_t k = 1; k < 4; k++) {
25281 for (uint32_t m = 1; m <= 1; m++) {
25282 for (uint32_t n = 1; n <= 8; n++) {
25283 GemmMicrokernelTester()
25284 .mr(1)
25285 .nr(8)
25286 .kr(1)
25287 .sr(4)
25288 .m(m)
25289 .n(n)
25290 .k(k)
25291 .iterations(1)
25292 .Test(xnn_f32_igemm_ukernel_1x8s4__sse);
25293 }
25294 }
25295 }
25296 }
25297
25298 TEST(F32_IGEMM_1X8S4__SSE, k_gt_4) {
25299 TEST_REQUIRES_X86_SSE;
25300 for (size_t k = 5; k < 8; k++) {
25301 GemmMicrokernelTester()
25302 .mr(1)
25303 .nr(8)
25304 .kr(1)
25305 .sr(4)
25306 .m(1)
25307 .n(8)
25308 .k(k)
25309 .Test(xnn_f32_igemm_ukernel_1x8s4__sse);
25310 }
25311 }
25312
25313 TEST(F32_IGEMM_1X8S4__SSE, k_gt_4_subtile) {
25314 TEST_REQUIRES_X86_SSE;
25315 for (size_t k = 5; k < 8; k++) {
25316 for (uint32_t m = 1; m <= 1; m++) {
25317 for (uint32_t n = 1; n <= 8; n++) {
25318 GemmMicrokernelTester()
25319 .mr(1)
25320 .nr(8)
25321 .kr(1)
25322 .sr(4)
25323 .m(m)
25324 .n(n)
25325 .k(k)
25326 .iterations(1)
25327 .Test(xnn_f32_igemm_ukernel_1x8s4__sse);
25328 }
25329 }
25330 }
25331 }
25332
25333 TEST(F32_IGEMM_1X8S4__SSE, k_div_4) {
25334 TEST_REQUIRES_X86_SSE;
25335 for (size_t k = 8; k <= 40; k += 4) {
25336 GemmMicrokernelTester()
25337 .mr(1)
25338 .nr(8)
25339 .kr(1)
25340 .sr(4)
25341 .m(1)
25342 .n(8)
25343 .k(k)
25344 .Test(xnn_f32_igemm_ukernel_1x8s4__sse);
25345 }
25346 }
25347
25348 TEST(F32_IGEMM_1X8S4__SSE, k_div_4_subtile) {
25349 TEST_REQUIRES_X86_SSE;
25350 for (size_t k = 8; k <= 40; k += 4) {
25351 for (uint32_t m = 1; m <= 1; m++) {
25352 for (uint32_t n = 1; n <= 8; n++) {
25353 GemmMicrokernelTester()
25354 .mr(1)
25355 .nr(8)
25356 .kr(1)
25357 .sr(4)
25358 .m(m)
25359 .n(n)
25360 .k(k)
25361 .iterations(1)
25362 .Test(xnn_f32_igemm_ukernel_1x8s4__sse);
25363 }
25364 }
25365 }
25366 }
25367
25368 TEST(F32_IGEMM_1X8S4__SSE, n_gt_8) {
25369 TEST_REQUIRES_X86_SSE;
25370 for (uint32_t n = 9; n < 16; n++) {
25371 for (size_t k = 1; k <= 20; k += 5) {
25372 GemmMicrokernelTester()
25373 .mr(1)
25374 .nr(8)
25375 .kr(1)
25376 .sr(4)
25377 .m(1)
25378 .n(8)
25379 .k(k)
25380 .Test(xnn_f32_igemm_ukernel_1x8s4__sse);
25381 }
25382 }
25383 }
25384
25385 TEST(F32_IGEMM_1X8S4__SSE, n_gt_8_strided_cn) {
25386 TEST_REQUIRES_X86_SSE;
25387 for (uint32_t n = 9; n < 16; n++) {
25388 for (size_t k = 1; k <= 20; k += 5) {
25389 GemmMicrokernelTester()
25390 .mr(1)
25391 .nr(8)
25392 .kr(1)
25393 .sr(4)
25394 .m(1)
25395 .n(8)
25396 .k(k)
25397 .cn_stride(11)
25398 .Test(xnn_f32_igemm_ukernel_1x8s4__sse);
25399 }
25400 }
25401 }
25402
25403 TEST(F32_IGEMM_1X8S4__SSE, n_gt_8_subtile) {
25404 TEST_REQUIRES_X86_SSE;
25405 for (uint32_t n = 9; n < 16; n++) {
25406 for (size_t k = 1; k <= 20; k += 5) {
25407 for (uint32_t m = 1; m <= 1; m++) {
25408 GemmMicrokernelTester()
25409 .mr(1)
25410 .nr(8)
25411 .kr(1)
25412 .sr(4)
25413 .m(m)
25414 .n(n)
25415 .k(k)
25416 .iterations(1)
25417 .Test(xnn_f32_igemm_ukernel_1x8s4__sse);
25418 }
25419 }
25420 }
25421 }
25422
25423 TEST(F32_IGEMM_1X8S4__SSE, n_div_8) {
25424 TEST_REQUIRES_X86_SSE;
25425 for (uint32_t n = 16; n <= 24; n += 8) {
25426 for (size_t k = 1; k <= 20; k += 5) {
25427 GemmMicrokernelTester()
25428 .mr(1)
25429 .nr(8)
25430 .kr(1)
25431 .sr(4)
25432 .m(1)
25433 .n(8)
25434 .k(k)
25435 .Test(xnn_f32_igemm_ukernel_1x8s4__sse);
25436 }
25437 }
25438 }
25439
25440 TEST(F32_IGEMM_1X8S4__SSE, n_div_8_strided_cn) {
25441 TEST_REQUIRES_X86_SSE;
25442 for (uint32_t n = 16; n <= 24; n += 8) {
25443 for (size_t k = 1; k <= 20; k += 5) {
25444 GemmMicrokernelTester()
25445 .mr(1)
25446 .nr(8)
25447 .kr(1)
25448 .sr(4)
25449 .m(1)
25450 .n(n)
25451 .k(k)
25452 .cn_stride(11)
25453 .Test(xnn_f32_igemm_ukernel_1x8s4__sse);
25454 }
25455 }
25456 }
25457
25458 TEST(F32_IGEMM_1X8S4__SSE, n_div_8_subtile) {
25459 TEST_REQUIRES_X86_SSE;
25460 for (uint32_t n = 16; n <= 24; n += 8) {
25461 for (size_t k = 1; k <= 20; k += 5) {
25462 for (uint32_t m = 1; m <= 1; m++) {
25463 GemmMicrokernelTester()
25464 .mr(1)
25465 .nr(8)
25466 .kr(1)
25467 .sr(4)
25468 .m(m)
25469 .n(n)
25470 .k(k)
25471 .iterations(1)
25472 .Test(xnn_f32_igemm_ukernel_1x8s4__sse);
25473 }
25474 }
25475 }
25476 }
25477
25478 TEST(F32_IGEMM_1X8S4__SSE, small_kernel) {
25479 TEST_REQUIRES_X86_SSE;
25480 for (size_t k = 1; k <= 20; k += 5) {
25481 GemmMicrokernelTester()
25482 .mr(1)
25483 .nr(8)
25484 .kr(1)
25485 .sr(4)
25486 .m(1)
25487 .n(8)
25488 .k(k)
25489 .ks(3)
25490 .Test(xnn_f32_igemm_ukernel_1x8s4__sse);
25491 }
25492 }
25493
25494 TEST(F32_IGEMM_1X8S4__SSE, small_kernel_subtile) {
25495 TEST_REQUIRES_X86_SSE;
25496 for (size_t k = 1; k <= 20; k += 5) {
25497 for (uint32_t m = 1; m <= 1; m++) {
25498 for (uint32_t n = 1; n <= 8; n++) {
25499 GemmMicrokernelTester()
25500 .mr(1)
25501 .nr(8)
25502 .kr(1)
25503 .sr(4)
25504 .m(m)
25505 .n(n)
25506 .k(k)
25507 .ks(3)
25508 .iterations(1)
25509 .Test(xnn_f32_igemm_ukernel_1x8s4__sse);
25510 }
25511 }
25512 }
25513 }
25514
25515 TEST(F32_IGEMM_1X8S4__SSE, n_gt_8_small_kernel) {
25516 TEST_REQUIRES_X86_SSE;
25517 for (uint32_t n = 9; n < 16; n++) {
25518 for (size_t k = 1; k <= 20; k += 5) {
25519 GemmMicrokernelTester()
25520 .mr(1)
25521 .nr(8)
25522 .kr(1)
25523 .sr(4)
25524 .m(1)
25525 .n(8)
25526 .k(k)
25527 .ks(3)
25528 .Test(xnn_f32_igemm_ukernel_1x8s4__sse);
25529 }
25530 }
25531 }
25532
25533 TEST(F32_IGEMM_1X8S4__SSE, n_div_8_small_kernel) {
25534 TEST_REQUIRES_X86_SSE;
25535 for (uint32_t n = 16; n <= 24; n += 8) {
25536 for (size_t k = 1; k <= 20; k += 5) {
25537 GemmMicrokernelTester()
25538 .mr(1)
25539 .nr(8)
25540 .kr(1)
25541 .sr(4)
25542 .m(1)
25543 .n(8)
25544 .k(k)
25545 .ks(3)
25546 .Test(xnn_f32_igemm_ukernel_1x8s4__sse);
25547 }
25548 }
25549 }
25550
25551 TEST(F32_IGEMM_1X8S4__SSE, strided_cm_subtile) {
25552 TEST_REQUIRES_X86_SSE;
25553 for (size_t k = 1; k <= 20; k += 5) {
25554 for (uint32_t m = 1; m <= 1; m++) {
25555 for (uint32_t n = 1; n <= 8; n++) {
25556 GemmMicrokernelTester()
25557 .mr(1)
25558 .nr(8)
25559 .kr(1)
25560 .sr(4)
25561 .m(m)
25562 .n(n)
25563 .k(k)
25564 .cm_stride(11)
25565 .iterations(1)
25566 .Test(xnn_f32_igemm_ukernel_1x8s4__sse);
25567 }
25568 }
25569 }
25570 }
25571
25572 TEST(F32_IGEMM_1X8S4__SSE, a_offset) {
25573 TEST_REQUIRES_X86_SSE;
25574 for (size_t k = 1; k <= 20; k += 5) {
25575 GemmMicrokernelTester()
25576 .mr(1)
25577 .nr(8)
25578 .kr(1)
25579 .sr(4)
25580 .m(1)
25581 .n(8)
25582 .k(k)
25583 .ks(3)
25584 .a_offset(23)
25585 .Test(xnn_f32_igemm_ukernel_1x8s4__sse);
25586 }
25587 }
25588
25589 TEST(F32_IGEMM_1X8S4__SSE, zero) {
25590 TEST_REQUIRES_X86_SSE;
25591 for (uint32_t mz = 0; mz < 1; mz++) {
25592 for (size_t k = 1; k <= 20; k += 5) {
25593 GemmMicrokernelTester()
25594 .mr(1)
25595 .nr(8)
25596 .kr(1)
25597 .sr(4)
25598 .m(1)
25599 .n(8)
25600 .k(k)
25601 .ks(3)
25602 .a_offset(23)
25603 .zero_index(mz)
25604 .Test(xnn_f32_igemm_ukernel_1x8s4__sse);
25605 }
25606 }
25607 }
25608
25609 TEST(F32_IGEMM_1X8S4__SSE, qmin) {
25610 TEST_REQUIRES_X86_SSE;
25611 GemmMicrokernelTester()
25612 .mr(1)
25613 .nr(8)
25614 .kr(1)
25615 .sr(4)
25616 .m(1)
25617 .n(8)
25618 .k(4)
25619 .qmin(128)
25620 .Test(xnn_f32_igemm_ukernel_1x8s4__sse);
25621 }
25622
25623 TEST(F32_IGEMM_1X8S4__SSE, qmax) {
25624 TEST_REQUIRES_X86_SSE;
25625 GemmMicrokernelTester()
25626 .mr(1)
25627 .nr(8)
25628 .kr(1)
25629 .sr(4)
25630 .m(1)
25631 .n(8)
25632 .k(4)
25633 .qmax(128)
25634 .Test(xnn_f32_igemm_ukernel_1x8s4__sse);
25635 }
25636
25637 TEST(F32_IGEMM_1X8S4__SSE, strided_cm) {
25638 TEST_REQUIRES_X86_SSE;
25639 GemmMicrokernelTester()
25640 .mr(1)
25641 .nr(8)
25642 .kr(1)
25643 .sr(4)
25644 .m(1)
25645 .n(8)
25646 .k(4)
25647 .cm_stride(11)
25648 .Test(xnn_f32_igemm_ukernel_1x8s4__sse);
25649 }
25650#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
25651
25652
25653#if XNN_ARCH_X86 || XNN_ARCH_X86_64
25654 TEST(F32_IGEMM_4X8S4__SSE, k_eq_4) {
25655 TEST_REQUIRES_X86_SSE;
25656 GemmMicrokernelTester()
25657 .mr(4)
25658 .nr(8)
25659 .kr(1)
25660 .sr(4)
25661 .m(4)
25662 .n(8)
25663 .k(4)
25664 .Test(xnn_f32_igemm_ukernel_4x8s4__sse);
25665 }
25666
25667 TEST(F32_IGEMM_4X8S4__SSE, strided_cn) {
25668 TEST_REQUIRES_X86_SSE;
25669 GemmMicrokernelTester()
25670 .mr(4)
25671 .nr(8)
25672 .kr(1)
25673 .sr(4)
25674 .m(4)
25675 .n(8)
25676 .k(4)
25677 .cn_stride(11)
25678 .Test(xnn_f32_igemm_ukernel_4x8s4__sse);
25679 }
25680
25681 TEST(F32_IGEMM_4X8S4__SSE, k_eq_4_subtile) {
25682 TEST_REQUIRES_X86_SSE;
25683 for (uint32_t m = 1; m <= 4; m++) {
25684 for (uint32_t n = 1; n <= 8; n++) {
25685 GemmMicrokernelTester()
25686 .mr(4)
25687 .nr(8)
25688 .kr(1)
25689 .sr(4)
25690 .m(m)
25691 .n(n)
25692 .k(4)
25693 .iterations(1)
25694 .Test(xnn_f32_igemm_ukernel_4x8s4__sse);
25695 }
25696 }
25697 }
25698
25699 TEST(F32_IGEMM_4X8S4__SSE, k_eq_4_subtile_m) {
25700 TEST_REQUIRES_X86_SSE;
25701 for (uint32_t m = 1; m <= 4; m++) {
25702 GemmMicrokernelTester()
25703 .mr(4)
25704 .nr(8)
25705 .kr(1)
25706 .sr(4)
25707 .m(m)
25708 .n(8)
25709 .k(4)
25710 .iterations(1)
25711 .Test(xnn_f32_igemm_ukernel_4x8s4__sse);
25712 }
25713 }
25714
25715 TEST(F32_IGEMM_4X8S4__SSE, k_eq_4_subtile_n) {
25716 TEST_REQUIRES_X86_SSE;
25717 for (uint32_t n = 1; n <= 8; n++) {
25718 GemmMicrokernelTester()
25719 .mr(4)
25720 .nr(8)
25721 .kr(1)
25722 .sr(4)
25723 .m(4)
25724 .n(n)
25725 .k(4)
25726 .iterations(1)
25727 .Test(xnn_f32_igemm_ukernel_4x8s4__sse);
25728 }
25729 }
25730
25731 TEST(F32_IGEMM_4X8S4__SSE, k_lt_4) {
25732 TEST_REQUIRES_X86_SSE;
25733 for (size_t k = 1; k < 4; k++) {
25734 GemmMicrokernelTester()
25735 .mr(4)
25736 .nr(8)
25737 .kr(1)
25738 .sr(4)
25739 .m(4)
25740 .n(8)
25741 .k(k)
25742 .Test(xnn_f32_igemm_ukernel_4x8s4__sse);
25743 }
25744 }
25745
25746 TEST(F32_IGEMM_4X8S4__SSE, k_lt_4_subtile) {
25747 TEST_REQUIRES_X86_SSE;
25748 for (size_t k = 1; k < 4; k++) {
25749 for (uint32_t m = 1; m <= 4; m++) {
25750 for (uint32_t n = 1; n <= 8; n++) {
25751 GemmMicrokernelTester()
25752 .mr(4)
25753 .nr(8)
25754 .kr(1)
25755 .sr(4)
25756 .m(m)
25757 .n(n)
25758 .k(k)
25759 .iterations(1)
25760 .Test(xnn_f32_igemm_ukernel_4x8s4__sse);
25761 }
25762 }
25763 }
25764 }
25765
25766 TEST(F32_IGEMM_4X8S4__SSE, k_gt_4) {
25767 TEST_REQUIRES_X86_SSE;
25768 for (size_t k = 5; k < 8; k++) {
25769 GemmMicrokernelTester()
25770 .mr(4)
25771 .nr(8)
25772 .kr(1)
25773 .sr(4)
25774 .m(4)
25775 .n(8)
25776 .k(k)
25777 .Test(xnn_f32_igemm_ukernel_4x8s4__sse);
25778 }
25779 }
25780
25781 TEST(F32_IGEMM_4X8S4__SSE, k_gt_4_subtile) {
25782 TEST_REQUIRES_X86_SSE;
25783 for (size_t k = 5; k < 8; k++) {
25784 for (uint32_t m = 1; m <= 4; m++) {
25785 for (uint32_t n = 1; n <= 8; n++) {
25786 GemmMicrokernelTester()
25787 .mr(4)
25788 .nr(8)
25789 .kr(1)
25790 .sr(4)
25791 .m(m)
25792 .n(n)
25793 .k(k)
25794 .iterations(1)
25795 .Test(xnn_f32_igemm_ukernel_4x8s4__sse);
25796 }
25797 }
25798 }
25799 }
25800
25801 TEST(F32_IGEMM_4X8S4__SSE, k_div_4) {
25802 TEST_REQUIRES_X86_SSE;
25803 for (size_t k = 8; k <= 40; k += 4) {
25804 GemmMicrokernelTester()
25805 .mr(4)
25806 .nr(8)
25807 .kr(1)
25808 .sr(4)
25809 .m(4)
25810 .n(8)
25811 .k(k)
25812 .Test(xnn_f32_igemm_ukernel_4x8s4__sse);
25813 }
25814 }
25815
25816 TEST(F32_IGEMM_4X8S4__SSE, k_div_4_subtile) {
25817 TEST_REQUIRES_X86_SSE;
25818 for (size_t k = 8; k <= 40; k += 4) {
25819 for (uint32_t m = 1; m <= 4; m++) {
25820 for (uint32_t n = 1; n <= 8; n++) {
25821 GemmMicrokernelTester()
25822 .mr(4)
25823 .nr(8)
25824 .kr(1)
25825 .sr(4)
25826 .m(m)
25827 .n(n)
25828 .k(k)
25829 .iterations(1)
25830 .Test(xnn_f32_igemm_ukernel_4x8s4__sse);
25831 }
25832 }
25833 }
25834 }
25835
25836 TEST(F32_IGEMM_4X8S4__SSE, n_gt_8) {
25837 TEST_REQUIRES_X86_SSE;
25838 for (uint32_t n = 9; n < 16; n++) {
25839 for (size_t k = 1; k <= 20; k += 5) {
25840 GemmMicrokernelTester()
25841 .mr(4)
25842 .nr(8)
25843 .kr(1)
25844 .sr(4)
25845 .m(4)
25846 .n(8)
25847 .k(k)
25848 .Test(xnn_f32_igemm_ukernel_4x8s4__sse);
25849 }
25850 }
25851 }
25852
25853 TEST(F32_IGEMM_4X8S4__SSE, n_gt_8_strided_cn) {
25854 TEST_REQUIRES_X86_SSE;
25855 for (uint32_t n = 9; n < 16; n++) {
25856 for (size_t k = 1; k <= 20; k += 5) {
25857 GemmMicrokernelTester()
25858 .mr(4)
25859 .nr(8)
25860 .kr(1)
25861 .sr(4)
25862 .m(4)
25863 .n(8)
25864 .k(k)
25865 .cn_stride(11)
25866 .Test(xnn_f32_igemm_ukernel_4x8s4__sse);
25867 }
25868 }
25869 }
25870
25871 TEST(F32_IGEMM_4X8S4__SSE, n_gt_8_subtile) {
25872 TEST_REQUIRES_X86_SSE;
25873 for (uint32_t n = 9; n < 16; n++) {
25874 for (size_t k = 1; k <= 20; k += 5) {
25875 for (uint32_t m = 1; m <= 4; m++) {
25876 GemmMicrokernelTester()
25877 .mr(4)
25878 .nr(8)
25879 .kr(1)
25880 .sr(4)
25881 .m(m)
25882 .n(n)
25883 .k(k)
25884 .iterations(1)
25885 .Test(xnn_f32_igemm_ukernel_4x8s4__sse);
25886 }
25887 }
25888 }
25889 }
25890
25891 TEST(F32_IGEMM_4X8S4__SSE, n_div_8) {
25892 TEST_REQUIRES_X86_SSE;
25893 for (uint32_t n = 16; n <= 24; n += 8) {
25894 for (size_t k = 1; k <= 20; k += 5) {
25895 GemmMicrokernelTester()
25896 .mr(4)
25897 .nr(8)
25898 .kr(1)
25899 .sr(4)
25900 .m(4)
25901 .n(8)
25902 .k(k)
25903 .Test(xnn_f32_igemm_ukernel_4x8s4__sse);
25904 }
25905 }
25906 }
25907
25908 TEST(F32_IGEMM_4X8S4__SSE, n_div_8_strided_cn) {
25909 TEST_REQUIRES_X86_SSE;
25910 for (uint32_t n = 16; n <= 24; n += 8) {
25911 for (size_t k = 1; k <= 20; k += 5) {
25912 GemmMicrokernelTester()
25913 .mr(4)
25914 .nr(8)
25915 .kr(1)
25916 .sr(4)
25917 .m(4)
25918 .n(n)
25919 .k(k)
25920 .cn_stride(11)
25921 .Test(xnn_f32_igemm_ukernel_4x8s4__sse);
25922 }
25923 }
25924 }
25925
25926 TEST(F32_IGEMM_4X8S4__SSE, n_div_8_subtile) {
25927 TEST_REQUIRES_X86_SSE;
25928 for (uint32_t n = 16; n <= 24; n += 8) {
25929 for (size_t k = 1; k <= 20; k += 5) {
25930 for (uint32_t m = 1; m <= 4; m++) {
25931 GemmMicrokernelTester()
25932 .mr(4)
25933 .nr(8)
25934 .kr(1)
25935 .sr(4)
25936 .m(m)
25937 .n(n)
25938 .k(k)
25939 .iterations(1)
25940 .Test(xnn_f32_igemm_ukernel_4x8s4__sse);
25941 }
25942 }
25943 }
25944 }
25945
25946 TEST(F32_IGEMM_4X8S4__SSE, small_kernel) {
25947 TEST_REQUIRES_X86_SSE;
25948 for (size_t k = 1; k <= 20; k += 5) {
25949 GemmMicrokernelTester()
25950 .mr(4)
25951 .nr(8)
25952 .kr(1)
25953 .sr(4)
25954 .m(4)
25955 .n(8)
25956 .k(k)
25957 .ks(3)
25958 .Test(xnn_f32_igemm_ukernel_4x8s4__sse);
25959 }
25960 }
25961
25962 TEST(F32_IGEMM_4X8S4__SSE, small_kernel_subtile) {
25963 TEST_REQUIRES_X86_SSE;
25964 for (size_t k = 1; k <= 20; k += 5) {
25965 for (uint32_t m = 1; m <= 4; m++) {
25966 for (uint32_t n = 1; n <= 8; n++) {
25967 GemmMicrokernelTester()
25968 .mr(4)
25969 .nr(8)
25970 .kr(1)
25971 .sr(4)
25972 .m(m)
25973 .n(n)
25974 .k(k)
25975 .ks(3)
25976 .iterations(1)
25977 .Test(xnn_f32_igemm_ukernel_4x8s4__sse);
25978 }
25979 }
25980 }
25981 }
25982
25983 TEST(F32_IGEMM_4X8S4__SSE, n_gt_8_small_kernel) {
25984 TEST_REQUIRES_X86_SSE;
25985 for (uint32_t n = 9; n < 16; n++) {
25986 for (size_t k = 1; k <= 20; k += 5) {
25987 GemmMicrokernelTester()
25988 .mr(4)
25989 .nr(8)
25990 .kr(1)
25991 .sr(4)
25992 .m(4)
25993 .n(8)
25994 .k(k)
25995 .ks(3)
25996 .Test(xnn_f32_igemm_ukernel_4x8s4__sse);
25997 }
25998 }
25999 }
26000
26001 TEST(F32_IGEMM_4X8S4__SSE, n_div_8_small_kernel) {
26002 TEST_REQUIRES_X86_SSE;
26003 for (uint32_t n = 16; n <= 24; n += 8) {
26004 for (size_t k = 1; k <= 20; k += 5) {
26005 GemmMicrokernelTester()
26006 .mr(4)
26007 .nr(8)
26008 .kr(1)
26009 .sr(4)
26010 .m(4)
26011 .n(8)
26012 .k(k)
26013 .ks(3)
26014 .Test(xnn_f32_igemm_ukernel_4x8s4__sse);
26015 }
26016 }
26017 }
26018
26019 TEST(F32_IGEMM_4X8S4__SSE, strided_cm_subtile) {
26020 TEST_REQUIRES_X86_SSE;
26021 for (size_t k = 1; k <= 20; k += 5) {
26022 for (uint32_t m = 1; m <= 4; m++) {
26023 for (uint32_t n = 1; n <= 8; n++) {
26024 GemmMicrokernelTester()
26025 .mr(4)
26026 .nr(8)
26027 .kr(1)
26028 .sr(4)
26029 .m(m)
26030 .n(n)
26031 .k(k)
26032 .cm_stride(11)
26033 .iterations(1)
26034 .Test(xnn_f32_igemm_ukernel_4x8s4__sse);
26035 }
26036 }
26037 }
26038 }
26039
26040 TEST(F32_IGEMM_4X8S4__SSE, a_offset) {
26041 TEST_REQUIRES_X86_SSE;
26042 for (size_t k = 1; k <= 20; k += 5) {
26043 GemmMicrokernelTester()
26044 .mr(4)
26045 .nr(8)
26046 .kr(1)
26047 .sr(4)
26048 .m(4)
26049 .n(8)
26050 .k(k)
26051 .ks(3)
26052 .a_offset(83)
26053 .Test(xnn_f32_igemm_ukernel_4x8s4__sse);
26054 }
26055 }
26056
26057 TEST(F32_IGEMM_4X8S4__SSE, zero) {
26058 TEST_REQUIRES_X86_SSE;
26059 for (uint32_t mz = 0; mz < 4; mz++) {
26060 for (size_t k = 1; k <= 20; k += 5) {
26061 GemmMicrokernelTester()
26062 .mr(4)
26063 .nr(8)
26064 .kr(1)
26065 .sr(4)
26066 .m(4)
26067 .n(8)
26068 .k(k)
26069 .ks(3)
26070 .a_offset(83)
26071 .zero_index(mz)
26072 .Test(xnn_f32_igemm_ukernel_4x8s4__sse);
26073 }
26074 }
26075 }
26076
26077 TEST(F32_IGEMM_4X8S4__SSE, qmin) {
26078 TEST_REQUIRES_X86_SSE;
26079 GemmMicrokernelTester()
26080 .mr(4)
26081 .nr(8)
26082 .kr(1)
26083 .sr(4)
26084 .m(4)
26085 .n(8)
26086 .k(4)
26087 .qmin(128)
26088 .Test(xnn_f32_igemm_ukernel_4x8s4__sse);
26089 }
26090
26091 TEST(F32_IGEMM_4X8S4__SSE, qmax) {
26092 TEST_REQUIRES_X86_SSE;
26093 GemmMicrokernelTester()
26094 .mr(4)
26095 .nr(8)
26096 .kr(1)
26097 .sr(4)
26098 .m(4)
26099 .n(8)
26100 .k(4)
26101 .qmax(128)
26102 .Test(xnn_f32_igemm_ukernel_4x8s4__sse);
26103 }
26104
26105 TEST(F32_IGEMM_4X8S4__SSE, strided_cm) {
26106 TEST_REQUIRES_X86_SSE;
26107 GemmMicrokernelTester()
26108 .mr(4)
26109 .nr(8)
26110 .kr(1)
26111 .sr(4)
26112 .m(4)
26113 .n(8)
26114 .k(4)
26115 .cm_stride(11)
26116 .Test(xnn_f32_igemm_ukernel_4x8s4__sse);
26117 }
26118#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
26119
26120
26121#if XNN_ARCH_X86 || XNN_ARCH_X86_64
XNNPACK Teamb455b122019-09-27 18:10:33 -070026122 TEST(F32_IGEMM_4X2C4__SSE, k_eq_4) {
26123 TEST_REQUIRES_X86_SSE;
26124 GemmMicrokernelTester()
26125 .mr(4)
26126 .nr(2)
26127 .kr(4)
26128 .sr(1)
26129 .m(4)
26130 .n(2)
26131 .k(4)
26132 .Test(xnn_f32_igemm_ukernel_4x2c4__sse);
26133 }
26134
26135 TEST(F32_IGEMM_4X2C4__SSE, strided_cn) {
26136 TEST_REQUIRES_X86_SSE;
26137 GemmMicrokernelTester()
26138 .mr(4)
26139 .nr(2)
26140 .kr(4)
26141 .sr(1)
26142 .m(4)
26143 .n(2)
26144 .k(4)
26145 .cn_stride(5)
26146 .Test(xnn_f32_igemm_ukernel_4x2c4__sse);
26147 }
26148
26149 TEST(F32_IGEMM_4X2C4__SSE, k_eq_4_subtile) {
26150 TEST_REQUIRES_X86_SSE;
26151 for (uint32_t m = 1; m <= 4; m++) {
26152 for (uint32_t n = 1; n <= 2; n++) {
26153 GemmMicrokernelTester()
26154 .mr(4)
26155 .nr(2)
26156 .kr(4)
26157 .sr(1)
26158 .m(m)
26159 .n(n)
26160 .k(4)
26161 .iterations(1)
26162 .Test(xnn_f32_igemm_ukernel_4x2c4__sse);
26163 }
26164 }
26165 }
26166
26167 TEST(F32_IGEMM_4X2C4__SSE, k_eq_4_subtile_m) {
26168 TEST_REQUIRES_X86_SSE;
26169 for (uint32_t m = 1; m <= 4; m++) {
26170 GemmMicrokernelTester()
26171 .mr(4)
26172 .nr(2)
26173 .kr(4)
26174 .sr(1)
26175 .m(m)
26176 .n(2)
26177 .k(4)
26178 .iterations(1)
26179 .Test(xnn_f32_igemm_ukernel_4x2c4__sse);
26180 }
26181 }
26182
26183 TEST(F32_IGEMM_4X2C4__SSE, k_eq_4_subtile_n) {
26184 TEST_REQUIRES_X86_SSE;
26185 for (uint32_t n = 1; n <= 2; n++) {
26186 GemmMicrokernelTester()
26187 .mr(4)
26188 .nr(2)
26189 .kr(4)
26190 .sr(1)
26191 .m(4)
26192 .n(n)
26193 .k(4)
26194 .iterations(1)
26195 .Test(xnn_f32_igemm_ukernel_4x2c4__sse);
26196 }
26197 }
26198
26199 TEST(F32_IGEMM_4X2C4__SSE, k_lt_4) {
26200 TEST_REQUIRES_X86_SSE;
26201 for (size_t k = 1; k < 4; k++) {
26202 GemmMicrokernelTester()
26203 .mr(4)
26204 .nr(2)
26205 .kr(4)
26206 .sr(1)
26207 .m(4)
26208 .n(2)
26209 .k(k)
26210 .Test(xnn_f32_igemm_ukernel_4x2c4__sse);
26211 }
26212 }
26213
26214 TEST(F32_IGEMM_4X2C4__SSE, k_lt_4_subtile) {
26215 TEST_REQUIRES_X86_SSE;
26216 for (size_t k = 1; k < 4; k++) {
26217 for (uint32_t m = 1; m <= 4; m++) {
26218 for (uint32_t n = 1; n <= 2; n++) {
26219 GemmMicrokernelTester()
26220 .mr(4)
26221 .nr(2)
26222 .kr(4)
26223 .sr(1)
26224 .m(m)
26225 .n(n)
26226 .k(k)
26227 .iterations(1)
26228 .Test(xnn_f32_igemm_ukernel_4x2c4__sse);
26229 }
26230 }
26231 }
26232 }
26233
26234 TEST(F32_IGEMM_4X2C4__SSE, k_gt_4) {
26235 TEST_REQUIRES_X86_SSE;
26236 for (size_t k = 5; k < 8; k++) {
26237 GemmMicrokernelTester()
26238 .mr(4)
26239 .nr(2)
26240 .kr(4)
26241 .sr(1)
26242 .m(4)
26243 .n(2)
26244 .k(k)
26245 .Test(xnn_f32_igemm_ukernel_4x2c4__sse);
26246 }
26247 }
26248
26249 TEST(F32_IGEMM_4X2C4__SSE, k_gt_4_subtile) {
26250 TEST_REQUIRES_X86_SSE;
26251 for (size_t k = 5; k < 8; k++) {
26252 for (uint32_t m = 1; m <= 4; m++) {
26253 for (uint32_t n = 1; n <= 2; n++) {
26254 GemmMicrokernelTester()
26255 .mr(4)
26256 .nr(2)
26257 .kr(4)
26258 .sr(1)
26259 .m(m)
26260 .n(n)
26261 .k(k)
26262 .iterations(1)
26263 .Test(xnn_f32_igemm_ukernel_4x2c4__sse);
26264 }
26265 }
26266 }
26267 }
26268
26269 TEST(F32_IGEMM_4X2C4__SSE, k_div_4) {
26270 TEST_REQUIRES_X86_SSE;
26271 for (size_t k = 8; k <= 40; k += 4) {
26272 GemmMicrokernelTester()
26273 .mr(4)
26274 .nr(2)
26275 .kr(4)
26276 .sr(1)
26277 .m(4)
26278 .n(2)
26279 .k(k)
26280 .Test(xnn_f32_igemm_ukernel_4x2c4__sse);
26281 }
26282 }
26283
26284 TEST(F32_IGEMM_4X2C4__SSE, k_div_4_subtile) {
26285 TEST_REQUIRES_X86_SSE;
26286 for (size_t k = 8; k <= 40; k += 4) {
26287 for (uint32_t m = 1; m <= 4; m++) {
26288 for (uint32_t n = 1; n <= 2; n++) {
26289 GemmMicrokernelTester()
26290 .mr(4)
26291 .nr(2)
26292 .kr(4)
26293 .sr(1)
26294 .m(m)
26295 .n(n)
26296 .k(k)
26297 .iterations(1)
26298 .Test(xnn_f32_igemm_ukernel_4x2c4__sse);
26299 }
26300 }
26301 }
26302 }
26303
26304 TEST(F32_IGEMM_4X2C4__SSE, n_gt_2) {
26305 TEST_REQUIRES_X86_SSE;
26306 for (uint32_t n = 3; n < 4; n++) {
26307 for (size_t k = 1; k <= 20; k += 5) {
26308 GemmMicrokernelTester()
26309 .mr(4)
26310 .nr(2)
26311 .kr(4)
26312 .sr(1)
26313 .m(4)
26314 .n(2)
26315 .k(k)
26316 .Test(xnn_f32_igemm_ukernel_4x2c4__sse);
26317 }
26318 }
26319 }
26320
26321 TEST(F32_IGEMM_4X2C4__SSE, n_gt_2_strided_cn) {
26322 TEST_REQUIRES_X86_SSE;
26323 for (uint32_t n = 3; n < 4; n++) {
26324 for (size_t k = 1; k <= 20; k += 5) {
26325 GemmMicrokernelTester()
26326 .mr(4)
26327 .nr(2)
26328 .kr(4)
26329 .sr(1)
26330 .m(4)
26331 .n(2)
26332 .k(k)
26333 .cn_stride(5)
26334 .Test(xnn_f32_igemm_ukernel_4x2c4__sse);
26335 }
26336 }
26337 }
26338
26339 TEST(F32_IGEMM_4X2C4__SSE, n_gt_2_subtile) {
26340 TEST_REQUIRES_X86_SSE;
26341 for (uint32_t n = 3; n < 4; n++) {
26342 for (size_t k = 1; k <= 20; k += 5) {
26343 for (uint32_t m = 1; m <= 4; m++) {
26344 GemmMicrokernelTester()
26345 .mr(4)
26346 .nr(2)
26347 .kr(4)
26348 .sr(1)
26349 .m(m)
26350 .n(n)
26351 .k(k)
26352 .iterations(1)
26353 .Test(xnn_f32_igemm_ukernel_4x2c4__sse);
26354 }
26355 }
26356 }
26357 }
26358
26359 TEST(F32_IGEMM_4X2C4__SSE, n_div_2) {
26360 TEST_REQUIRES_X86_SSE;
26361 for (uint32_t n = 4; n <= 6; n += 2) {
26362 for (size_t k = 1; k <= 20; k += 5) {
26363 GemmMicrokernelTester()
26364 .mr(4)
26365 .nr(2)
26366 .kr(4)
26367 .sr(1)
26368 .m(4)
26369 .n(2)
26370 .k(k)
26371 .Test(xnn_f32_igemm_ukernel_4x2c4__sse);
26372 }
26373 }
26374 }
26375
26376 TEST(F32_IGEMM_4X2C4__SSE, n_div_2_strided_cn) {
26377 TEST_REQUIRES_X86_SSE;
26378 for (uint32_t n = 4; n <= 6; n += 2) {
26379 for (size_t k = 1; k <= 20; k += 5) {
26380 GemmMicrokernelTester()
26381 .mr(4)
26382 .nr(2)
26383 .kr(4)
26384 .sr(1)
26385 .m(4)
26386 .n(n)
26387 .k(k)
26388 .cn_stride(5)
26389 .Test(xnn_f32_igemm_ukernel_4x2c4__sse);
26390 }
26391 }
26392 }
26393
26394 TEST(F32_IGEMM_4X2C4__SSE, n_div_2_subtile) {
26395 TEST_REQUIRES_X86_SSE;
26396 for (uint32_t n = 4; n <= 6; n += 2) {
26397 for (size_t k = 1; k <= 20; k += 5) {
26398 for (uint32_t m = 1; m <= 4; m++) {
26399 GemmMicrokernelTester()
26400 .mr(4)
26401 .nr(2)
26402 .kr(4)
26403 .sr(1)
26404 .m(m)
26405 .n(n)
26406 .k(k)
26407 .iterations(1)
26408 .Test(xnn_f32_igemm_ukernel_4x2c4__sse);
26409 }
26410 }
26411 }
26412 }
26413
26414 TEST(F32_IGEMM_4X2C4__SSE, small_kernel) {
26415 TEST_REQUIRES_X86_SSE;
26416 for (size_t k = 1; k <= 20; k += 5) {
26417 GemmMicrokernelTester()
26418 .mr(4)
26419 .nr(2)
26420 .kr(4)
26421 .sr(1)
26422 .m(4)
26423 .n(2)
26424 .k(k)
26425 .ks(3)
26426 .Test(xnn_f32_igemm_ukernel_4x2c4__sse);
26427 }
26428 }
26429
26430 TEST(F32_IGEMM_4X2C4__SSE, small_kernel_subtile) {
26431 TEST_REQUIRES_X86_SSE;
26432 for (size_t k = 1; k <= 20; k += 5) {
26433 for (uint32_t m = 1; m <= 4; m++) {
26434 for (uint32_t n = 1; n <= 2; n++) {
26435 GemmMicrokernelTester()
26436 .mr(4)
26437 .nr(2)
26438 .kr(4)
26439 .sr(1)
26440 .m(m)
26441 .n(n)
26442 .k(k)
26443 .ks(3)
26444 .iterations(1)
26445 .Test(xnn_f32_igemm_ukernel_4x2c4__sse);
26446 }
26447 }
26448 }
26449 }
26450
26451 TEST(F32_IGEMM_4X2C4__SSE, n_gt_2_small_kernel) {
26452 TEST_REQUIRES_X86_SSE;
26453 for (uint32_t n = 3; n < 4; n++) {
26454 for (size_t k = 1; k <= 20; k += 5) {
26455 GemmMicrokernelTester()
26456 .mr(4)
26457 .nr(2)
26458 .kr(4)
26459 .sr(1)
26460 .m(4)
26461 .n(2)
26462 .k(k)
26463 .ks(3)
26464 .Test(xnn_f32_igemm_ukernel_4x2c4__sse);
26465 }
26466 }
26467 }
26468
26469 TEST(F32_IGEMM_4X2C4__SSE, n_div_2_small_kernel) {
26470 TEST_REQUIRES_X86_SSE;
26471 for (uint32_t n = 4; n <= 6; n += 2) {
26472 for (size_t k = 1; k <= 20; k += 5) {
26473 GemmMicrokernelTester()
26474 .mr(4)
26475 .nr(2)
26476 .kr(4)
26477 .sr(1)
26478 .m(4)
26479 .n(2)
26480 .k(k)
26481 .ks(3)
26482 .Test(xnn_f32_igemm_ukernel_4x2c4__sse);
26483 }
26484 }
26485 }
26486
26487 TEST(F32_IGEMM_4X2C4__SSE, strided_cm_subtile) {
26488 TEST_REQUIRES_X86_SSE;
26489 for (size_t k = 1; k <= 20; k += 5) {
26490 for (uint32_t m = 1; m <= 4; m++) {
26491 for (uint32_t n = 1; n <= 2; n++) {
26492 GemmMicrokernelTester()
26493 .mr(4)
26494 .nr(2)
26495 .kr(4)
26496 .sr(1)
26497 .m(m)
26498 .n(n)
26499 .k(k)
26500 .cm_stride(5)
26501 .iterations(1)
26502 .Test(xnn_f32_igemm_ukernel_4x2c4__sse);
26503 }
26504 }
26505 }
26506 }
26507
26508 TEST(F32_IGEMM_4X2C4__SSE, a_offset) {
26509 TEST_REQUIRES_X86_SSE;
26510 for (size_t k = 1; k <= 20; k += 5) {
26511 GemmMicrokernelTester()
26512 .mr(4)
26513 .nr(2)
26514 .kr(4)
26515 .sr(1)
26516 .m(4)
26517 .n(2)
26518 .k(k)
26519 .ks(3)
26520 .a_offset(83)
26521 .Test(xnn_f32_igemm_ukernel_4x2c4__sse);
26522 }
26523 }
26524
26525 TEST(F32_IGEMM_4X2C4__SSE, zero) {
26526 TEST_REQUIRES_X86_SSE;
26527 for (uint32_t mz = 0; mz < 4; mz++) {
26528 for (size_t k = 1; k <= 20; k += 5) {
26529 GemmMicrokernelTester()
26530 .mr(4)
26531 .nr(2)
26532 .kr(4)
26533 .sr(1)
26534 .m(4)
26535 .n(2)
26536 .k(k)
26537 .ks(3)
26538 .a_offset(83)
26539 .zero_index(mz)
26540 .Test(xnn_f32_igemm_ukernel_4x2c4__sse);
26541 }
26542 }
26543 }
26544
26545 TEST(F32_IGEMM_4X2C4__SSE, qmin) {
26546 TEST_REQUIRES_X86_SSE;
26547 GemmMicrokernelTester()
26548 .mr(4)
26549 .nr(2)
26550 .kr(4)
26551 .sr(1)
26552 .m(4)
26553 .n(2)
26554 .k(4)
26555 .qmin(128)
26556 .Test(xnn_f32_igemm_ukernel_4x2c4__sse);
26557 }
26558
26559 TEST(F32_IGEMM_4X2C4__SSE, qmax) {
26560 TEST_REQUIRES_X86_SSE;
26561 GemmMicrokernelTester()
26562 .mr(4)
26563 .nr(2)
26564 .kr(4)
26565 .sr(1)
26566 .m(4)
26567 .n(2)
26568 .k(4)
26569 .qmax(128)
26570 .Test(xnn_f32_igemm_ukernel_4x2c4__sse);
26571 }
26572
26573 TEST(F32_IGEMM_4X2C4__SSE, strided_cm) {
26574 TEST_REQUIRES_X86_SSE;
26575 GemmMicrokernelTester()
26576 .mr(4)
26577 .nr(2)
26578 .kr(4)
26579 .sr(1)
26580 .m(4)
26581 .n(2)
26582 .k(4)
26583 .cm_stride(5)
26584 .Test(xnn_f32_igemm_ukernel_4x2c4__sse);
26585 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -070026586#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
XNNPACK Teamb455b122019-09-27 18:10:33 -070026587
26588
Marat Dukhan1dadbf72019-10-01 10:46:20 -070026589#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
XNNPACK Teamb455b122019-09-27 18:10:33 -070026590 TEST(F32_IGEMM_4X2C4__PSIMD, k_eq_4) {
26591 TEST_REQUIRES_PSIMD;
26592 GemmMicrokernelTester()
26593 .mr(4)
26594 .nr(2)
26595 .kr(4)
26596 .sr(1)
26597 .m(4)
26598 .n(2)
26599 .k(4)
26600 .Test(xnn_f32_igemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
26601 }
26602
26603 TEST(F32_IGEMM_4X2C4__PSIMD, strided_cn) {
26604 TEST_REQUIRES_PSIMD;
26605 GemmMicrokernelTester()
26606 .mr(4)
26607 .nr(2)
26608 .kr(4)
26609 .sr(1)
26610 .m(4)
26611 .n(2)
26612 .k(4)
26613 .cn_stride(5)
26614 .Test(xnn_f32_igemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
26615 }
26616
26617 TEST(F32_IGEMM_4X2C4__PSIMD, k_eq_4_subtile) {
26618 TEST_REQUIRES_PSIMD;
26619 for (uint32_t m = 1; m <= 4; m++) {
26620 for (uint32_t n = 1; n <= 2; n++) {
26621 GemmMicrokernelTester()
26622 .mr(4)
26623 .nr(2)
26624 .kr(4)
26625 .sr(1)
26626 .m(m)
26627 .n(n)
26628 .k(4)
26629 .iterations(1)
26630 .Test(xnn_f32_igemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
26631 }
26632 }
26633 }
26634
26635 TEST(F32_IGEMM_4X2C4__PSIMD, k_eq_4_subtile_m) {
26636 TEST_REQUIRES_PSIMD;
26637 for (uint32_t m = 1; m <= 4; m++) {
26638 GemmMicrokernelTester()
26639 .mr(4)
26640 .nr(2)
26641 .kr(4)
26642 .sr(1)
26643 .m(m)
26644 .n(2)
26645 .k(4)
26646 .iterations(1)
26647 .Test(xnn_f32_igemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
26648 }
26649 }
26650
26651 TEST(F32_IGEMM_4X2C4__PSIMD, k_eq_4_subtile_n) {
26652 TEST_REQUIRES_PSIMD;
26653 for (uint32_t n = 1; n <= 2; n++) {
26654 GemmMicrokernelTester()
26655 .mr(4)
26656 .nr(2)
26657 .kr(4)
26658 .sr(1)
26659 .m(4)
26660 .n(n)
26661 .k(4)
26662 .iterations(1)
26663 .Test(xnn_f32_igemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
26664 }
26665 }
26666
26667 TEST(F32_IGEMM_4X2C4__PSIMD, k_lt_4) {
26668 TEST_REQUIRES_PSIMD;
26669 for (size_t k = 1; k < 4; k++) {
26670 GemmMicrokernelTester()
26671 .mr(4)
26672 .nr(2)
26673 .kr(4)
26674 .sr(1)
26675 .m(4)
26676 .n(2)
26677 .k(k)
26678 .Test(xnn_f32_igemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
26679 }
26680 }
26681
26682 TEST(F32_IGEMM_4X2C4__PSIMD, k_lt_4_subtile) {
26683 TEST_REQUIRES_PSIMD;
26684 for (size_t k = 1; k < 4; k++) {
26685 for (uint32_t m = 1; m <= 4; m++) {
26686 for (uint32_t n = 1; n <= 2; n++) {
26687 GemmMicrokernelTester()
26688 .mr(4)
26689 .nr(2)
26690 .kr(4)
26691 .sr(1)
26692 .m(m)
26693 .n(n)
26694 .k(k)
26695 .iterations(1)
26696 .Test(xnn_f32_igemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
26697 }
26698 }
26699 }
26700 }
26701
26702 TEST(F32_IGEMM_4X2C4__PSIMD, k_gt_4) {
26703 TEST_REQUIRES_PSIMD;
26704 for (size_t k = 5; k < 8; k++) {
26705 GemmMicrokernelTester()
26706 .mr(4)
26707 .nr(2)
26708 .kr(4)
26709 .sr(1)
26710 .m(4)
26711 .n(2)
26712 .k(k)
26713 .Test(xnn_f32_igemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
26714 }
26715 }
26716
26717 TEST(F32_IGEMM_4X2C4__PSIMD, k_gt_4_subtile) {
26718 TEST_REQUIRES_PSIMD;
26719 for (size_t k = 5; k < 8; k++) {
26720 for (uint32_t m = 1; m <= 4; m++) {
26721 for (uint32_t n = 1; n <= 2; n++) {
26722 GemmMicrokernelTester()
26723 .mr(4)
26724 .nr(2)
26725 .kr(4)
26726 .sr(1)
26727 .m(m)
26728 .n(n)
26729 .k(k)
26730 .iterations(1)
26731 .Test(xnn_f32_igemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
26732 }
26733 }
26734 }
26735 }
26736
26737 TEST(F32_IGEMM_4X2C4__PSIMD, k_div_4) {
26738 TEST_REQUIRES_PSIMD;
26739 for (size_t k = 8; k <= 40; k += 4) {
26740 GemmMicrokernelTester()
26741 .mr(4)
26742 .nr(2)
26743 .kr(4)
26744 .sr(1)
26745 .m(4)
26746 .n(2)
26747 .k(k)
26748 .Test(xnn_f32_igemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
26749 }
26750 }
26751
26752 TEST(F32_IGEMM_4X2C4__PSIMD, k_div_4_subtile) {
26753 TEST_REQUIRES_PSIMD;
26754 for (size_t k = 8; k <= 40; k += 4) {
26755 for (uint32_t m = 1; m <= 4; m++) {
26756 for (uint32_t n = 1; n <= 2; n++) {
26757 GemmMicrokernelTester()
26758 .mr(4)
26759 .nr(2)
26760 .kr(4)
26761 .sr(1)
26762 .m(m)
26763 .n(n)
26764 .k(k)
26765 .iterations(1)
26766 .Test(xnn_f32_igemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
26767 }
26768 }
26769 }
26770 }
26771
26772 TEST(F32_IGEMM_4X2C4__PSIMD, n_gt_2) {
26773 TEST_REQUIRES_PSIMD;
26774 for (uint32_t n = 3; n < 4; n++) {
26775 for (size_t k = 1; k <= 20; k += 5) {
26776 GemmMicrokernelTester()
26777 .mr(4)
26778 .nr(2)
26779 .kr(4)
26780 .sr(1)
26781 .m(4)
26782 .n(2)
26783 .k(k)
26784 .Test(xnn_f32_igemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
26785 }
26786 }
26787 }
26788
26789 TEST(F32_IGEMM_4X2C4__PSIMD, n_gt_2_strided_cn) {
26790 TEST_REQUIRES_PSIMD;
26791 for (uint32_t n = 3; n < 4; n++) {
26792 for (size_t k = 1; k <= 20; k += 5) {
26793 GemmMicrokernelTester()
26794 .mr(4)
26795 .nr(2)
26796 .kr(4)
26797 .sr(1)
26798 .m(4)
26799 .n(2)
26800 .k(k)
26801 .cn_stride(5)
26802 .Test(xnn_f32_igemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
26803 }
26804 }
26805 }
26806
26807 TEST(F32_IGEMM_4X2C4__PSIMD, n_gt_2_subtile) {
26808 TEST_REQUIRES_PSIMD;
26809 for (uint32_t n = 3; n < 4; n++) {
26810 for (size_t k = 1; k <= 20; k += 5) {
26811 for (uint32_t m = 1; m <= 4; m++) {
26812 GemmMicrokernelTester()
26813 .mr(4)
26814 .nr(2)
26815 .kr(4)
26816 .sr(1)
26817 .m(m)
26818 .n(n)
26819 .k(k)
26820 .iterations(1)
26821 .Test(xnn_f32_igemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
26822 }
26823 }
26824 }
26825 }
26826
26827 TEST(F32_IGEMM_4X2C4__PSIMD, n_div_2) {
26828 TEST_REQUIRES_PSIMD;
26829 for (uint32_t n = 4; n <= 6; n += 2) {
26830 for (size_t k = 1; k <= 20; k += 5) {
26831 GemmMicrokernelTester()
26832 .mr(4)
26833 .nr(2)
26834 .kr(4)
26835 .sr(1)
26836 .m(4)
26837 .n(2)
26838 .k(k)
26839 .Test(xnn_f32_igemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
26840 }
26841 }
26842 }
26843
26844 TEST(F32_IGEMM_4X2C4__PSIMD, n_div_2_strided_cn) {
26845 TEST_REQUIRES_PSIMD;
26846 for (uint32_t n = 4; n <= 6; n += 2) {
26847 for (size_t k = 1; k <= 20; k += 5) {
26848 GemmMicrokernelTester()
26849 .mr(4)
26850 .nr(2)
26851 .kr(4)
26852 .sr(1)
26853 .m(4)
26854 .n(n)
26855 .k(k)
26856 .cn_stride(5)
26857 .Test(xnn_f32_igemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
26858 }
26859 }
26860 }
26861
26862 TEST(F32_IGEMM_4X2C4__PSIMD, n_div_2_subtile) {
26863 TEST_REQUIRES_PSIMD;
26864 for (uint32_t n = 4; n <= 6; n += 2) {
26865 for (size_t k = 1; k <= 20; k += 5) {
26866 for (uint32_t m = 1; m <= 4; m++) {
26867 GemmMicrokernelTester()
26868 .mr(4)
26869 .nr(2)
26870 .kr(4)
26871 .sr(1)
26872 .m(m)
26873 .n(n)
26874 .k(k)
26875 .iterations(1)
26876 .Test(xnn_f32_igemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
26877 }
26878 }
26879 }
26880 }
26881
26882 TEST(F32_IGEMM_4X2C4__PSIMD, small_kernel) {
26883 TEST_REQUIRES_PSIMD;
26884 for (size_t k = 1; k <= 20; k += 5) {
26885 GemmMicrokernelTester()
26886 .mr(4)
26887 .nr(2)
26888 .kr(4)
26889 .sr(1)
26890 .m(4)
26891 .n(2)
26892 .k(k)
26893 .ks(3)
26894 .Test(xnn_f32_igemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
26895 }
26896 }
26897
26898 TEST(F32_IGEMM_4X2C4__PSIMD, small_kernel_subtile) {
26899 TEST_REQUIRES_PSIMD;
26900 for (size_t k = 1; k <= 20; k += 5) {
26901 for (uint32_t m = 1; m <= 4; m++) {
26902 for (uint32_t n = 1; n <= 2; n++) {
26903 GemmMicrokernelTester()
26904 .mr(4)
26905 .nr(2)
26906 .kr(4)
26907 .sr(1)
26908 .m(m)
26909 .n(n)
26910 .k(k)
26911 .ks(3)
26912 .iterations(1)
26913 .Test(xnn_f32_igemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
26914 }
26915 }
26916 }
26917 }
26918
26919 TEST(F32_IGEMM_4X2C4__PSIMD, n_gt_2_small_kernel) {
26920 TEST_REQUIRES_PSIMD;
26921 for (uint32_t n = 3; n < 4; n++) {
26922 for (size_t k = 1; k <= 20; k += 5) {
26923 GemmMicrokernelTester()
26924 .mr(4)
26925 .nr(2)
26926 .kr(4)
26927 .sr(1)
26928 .m(4)
26929 .n(2)
26930 .k(k)
26931 .ks(3)
26932 .Test(xnn_f32_igemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
26933 }
26934 }
26935 }
26936
26937 TEST(F32_IGEMM_4X2C4__PSIMD, n_div_2_small_kernel) {
26938 TEST_REQUIRES_PSIMD;
26939 for (uint32_t n = 4; n <= 6; n += 2) {
26940 for (size_t k = 1; k <= 20; k += 5) {
26941 GemmMicrokernelTester()
26942 .mr(4)
26943 .nr(2)
26944 .kr(4)
26945 .sr(1)
26946 .m(4)
26947 .n(2)
26948 .k(k)
26949 .ks(3)
26950 .Test(xnn_f32_igemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
26951 }
26952 }
26953 }
26954
26955 TEST(F32_IGEMM_4X2C4__PSIMD, strided_cm_subtile) {
26956 TEST_REQUIRES_PSIMD;
26957 for (size_t k = 1; k <= 20; k += 5) {
26958 for (uint32_t m = 1; m <= 4; m++) {
26959 for (uint32_t n = 1; n <= 2; n++) {
26960 GemmMicrokernelTester()
26961 .mr(4)
26962 .nr(2)
26963 .kr(4)
26964 .sr(1)
26965 .m(m)
26966 .n(n)
26967 .k(k)
26968 .cm_stride(5)
26969 .iterations(1)
26970 .Test(xnn_f32_igemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
26971 }
26972 }
26973 }
26974 }
26975
26976 TEST(F32_IGEMM_4X2C4__PSIMD, a_offset) {
26977 TEST_REQUIRES_PSIMD;
26978 for (size_t k = 1; k <= 20; k += 5) {
26979 GemmMicrokernelTester()
26980 .mr(4)
26981 .nr(2)
26982 .kr(4)
26983 .sr(1)
26984 .m(4)
26985 .n(2)
26986 .k(k)
26987 .ks(3)
26988 .a_offset(83)
26989 .Test(xnn_f32_igemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
26990 }
26991 }
26992
26993 TEST(F32_IGEMM_4X2C4__PSIMD, zero) {
26994 TEST_REQUIRES_PSIMD;
26995 for (uint32_t mz = 0; mz < 4; mz++) {
26996 for (size_t k = 1; k <= 20; k += 5) {
26997 GemmMicrokernelTester()
26998 .mr(4)
26999 .nr(2)
27000 .kr(4)
27001 .sr(1)
27002 .m(4)
27003 .n(2)
27004 .k(k)
27005 .ks(3)
27006 .a_offset(83)
27007 .zero_index(mz)
27008 .Test(xnn_f32_igemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
27009 }
27010 }
27011 }
27012
27013 TEST(F32_IGEMM_4X2C4__PSIMD, qmin) {
27014 TEST_REQUIRES_PSIMD;
27015 GemmMicrokernelTester()
27016 .mr(4)
27017 .nr(2)
27018 .kr(4)
27019 .sr(1)
27020 .m(4)
27021 .n(2)
27022 .k(4)
27023 .qmin(128)
27024 .Test(xnn_f32_igemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
27025 }
27026
27027 TEST(F32_IGEMM_4X2C4__PSIMD, qmax) {
27028 TEST_REQUIRES_PSIMD;
27029 GemmMicrokernelTester()
27030 .mr(4)
27031 .nr(2)
27032 .kr(4)
27033 .sr(1)
27034 .m(4)
27035 .n(2)
27036 .k(4)
27037 .qmax(128)
27038 .Test(xnn_f32_igemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
27039 }
27040
27041 TEST(F32_IGEMM_4X2C4__PSIMD, strided_cm) {
27042 TEST_REQUIRES_PSIMD;
27043 GemmMicrokernelTester()
27044 .mr(4)
27045 .nr(2)
27046 .kr(4)
27047 .sr(1)
27048 .m(4)
27049 .n(2)
27050 .k(4)
27051 .cm_stride(5)
27052 .Test(xnn_f32_igemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
27053 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -070027054#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
XNNPACK Teamb455b122019-09-27 18:10:33 -070027055
27056
Marat Dukhanfda12b82019-11-21 12:27:59 -080027057#if XNN_ARCH_X86 || XNN_ARCH_X86_64
27058 TEST(F32_IGEMM_1X8__AVX_BROADCAST, k_eq_1) {
27059 TEST_REQUIRES_X86_AVX;
27060 GemmMicrokernelTester()
27061 .mr(1)
27062 .nr(8)
27063 .kr(1)
27064 .sr(1)
27065 .m(1)
27066 .n(8)
27067 .k(1)
27068 .Test(xnn_f32_igemm_ukernel_1x8__avx_broadcast);
27069 }
27070
27071 TEST(F32_IGEMM_1X8__AVX_BROADCAST, strided_cn) {
27072 TEST_REQUIRES_X86_AVX;
27073 GemmMicrokernelTester()
27074 .mr(1)
27075 .nr(8)
27076 .kr(1)
27077 .sr(1)
27078 .m(1)
27079 .n(8)
27080 .k(1)
27081 .cn_stride(11)
27082 .Test(xnn_f32_igemm_ukernel_1x8__avx_broadcast);
27083 }
27084
27085 TEST(F32_IGEMM_1X8__AVX_BROADCAST, k_eq_1_subtile) {
27086 TEST_REQUIRES_X86_AVX;
27087 for (uint32_t m = 1; m <= 1; m++) {
27088 for (uint32_t n = 1; n <= 8; n++) {
27089 GemmMicrokernelTester()
27090 .mr(1)
27091 .nr(8)
27092 .kr(1)
27093 .sr(1)
27094 .m(m)
27095 .n(n)
27096 .k(1)
27097 .iterations(1)
27098 .Test(xnn_f32_igemm_ukernel_1x8__avx_broadcast);
27099 }
27100 }
27101 }
27102
27103 TEST(F32_IGEMM_1X8__AVX_BROADCAST, k_eq_1_subtile_m) {
27104 TEST_REQUIRES_X86_AVX;
27105 for (uint32_t m = 1; m <= 1; m++) {
27106 GemmMicrokernelTester()
27107 .mr(1)
27108 .nr(8)
27109 .kr(1)
27110 .sr(1)
27111 .m(m)
27112 .n(8)
27113 .k(1)
27114 .iterations(1)
27115 .Test(xnn_f32_igemm_ukernel_1x8__avx_broadcast);
27116 }
27117 }
27118
27119 TEST(F32_IGEMM_1X8__AVX_BROADCAST, k_eq_1_subtile_n) {
27120 TEST_REQUIRES_X86_AVX;
27121 for (uint32_t n = 1; n <= 8; n++) {
27122 GemmMicrokernelTester()
27123 .mr(1)
27124 .nr(8)
27125 .kr(1)
27126 .sr(1)
27127 .m(1)
27128 .n(n)
27129 .k(1)
27130 .iterations(1)
27131 .Test(xnn_f32_igemm_ukernel_1x8__avx_broadcast);
27132 }
27133 }
27134
27135 TEST(F32_IGEMM_1X8__AVX_BROADCAST, k_gt_1) {
27136 TEST_REQUIRES_X86_AVX;
27137 for (size_t k = 2; k < 10; k++) {
27138 GemmMicrokernelTester()
27139 .mr(1)
27140 .nr(8)
27141 .kr(1)
27142 .sr(1)
27143 .m(1)
27144 .n(8)
27145 .k(k)
27146 .Test(xnn_f32_igemm_ukernel_1x8__avx_broadcast);
27147 }
27148 }
27149
27150 TEST(F32_IGEMM_1X8__AVX_BROADCAST, k_gt_1_subtile) {
27151 TEST_REQUIRES_X86_AVX;
27152 for (size_t k = 2; k < 10; k++) {
27153 for (uint32_t m = 1; m <= 1; m++) {
27154 for (uint32_t n = 1; n <= 8; n++) {
27155 GemmMicrokernelTester()
27156 .mr(1)
27157 .nr(8)
27158 .kr(1)
27159 .sr(1)
27160 .m(m)
27161 .n(n)
27162 .k(k)
27163 .iterations(1)
27164 .Test(xnn_f32_igemm_ukernel_1x8__avx_broadcast);
27165 }
27166 }
27167 }
27168 }
27169
27170 TEST(F32_IGEMM_1X8__AVX_BROADCAST, n_gt_8) {
27171 TEST_REQUIRES_X86_AVX;
27172 for (uint32_t n = 9; n < 16; n++) {
27173 for (size_t k = 1; k <= 5; k += 2) {
27174 GemmMicrokernelTester()
27175 .mr(1)
27176 .nr(8)
27177 .kr(1)
27178 .sr(1)
27179 .m(1)
27180 .n(8)
27181 .k(k)
27182 .Test(xnn_f32_igemm_ukernel_1x8__avx_broadcast);
27183 }
27184 }
27185 }
27186
27187 TEST(F32_IGEMM_1X8__AVX_BROADCAST, n_gt_8_strided_cn) {
27188 TEST_REQUIRES_X86_AVX;
27189 for (uint32_t n = 9; n < 16; n++) {
27190 for (size_t k = 1; k <= 5; k += 2) {
27191 GemmMicrokernelTester()
27192 .mr(1)
27193 .nr(8)
27194 .kr(1)
27195 .sr(1)
27196 .m(1)
27197 .n(8)
27198 .k(k)
27199 .cn_stride(11)
27200 .Test(xnn_f32_igemm_ukernel_1x8__avx_broadcast);
27201 }
27202 }
27203 }
27204
27205 TEST(F32_IGEMM_1X8__AVX_BROADCAST, n_gt_8_subtile) {
27206 TEST_REQUIRES_X86_AVX;
27207 for (uint32_t n = 9; n < 16; n++) {
27208 for (size_t k = 1; k <= 5; k += 2) {
27209 for (uint32_t m = 1; m <= 1; m++) {
27210 GemmMicrokernelTester()
27211 .mr(1)
27212 .nr(8)
27213 .kr(1)
27214 .sr(1)
27215 .m(m)
27216 .n(n)
27217 .k(k)
27218 .iterations(1)
27219 .Test(xnn_f32_igemm_ukernel_1x8__avx_broadcast);
27220 }
27221 }
27222 }
27223 }
27224
27225 TEST(F32_IGEMM_1X8__AVX_BROADCAST, n_div_8) {
27226 TEST_REQUIRES_X86_AVX;
27227 for (uint32_t n = 16; n <= 24; n += 8) {
27228 for (size_t k = 1; k <= 5; k += 2) {
27229 GemmMicrokernelTester()
27230 .mr(1)
27231 .nr(8)
27232 .kr(1)
27233 .sr(1)
27234 .m(1)
27235 .n(8)
27236 .k(k)
27237 .Test(xnn_f32_igemm_ukernel_1x8__avx_broadcast);
27238 }
27239 }
27240 }
27241
27242 TEST(F32_IGEMM_1X8__AVX_BROADCAST, n_div_8_strided_cn) {
27243 TEST_REQUIRES_X86_AVX;
27244 for (uint32_t n = 16; n <= 24; n += 8) {
27245 for (size_t k = 1; k <= 5; k += 2) {
27246 GemmMicrokernelTester()
27247 .mr(1)
27248 .nr(8)
27249 .kr(1)
27250 .sr(1)
27251 .m(1)
27252 .n(n)
27253 .k(k)
27254 .cn_stride(11)
27255 .Test(xnn_f32_igemm_ukernel_1x8__avx_broadcast);
27256 }
27257 }
27258 }
27259
27260 TEST(F32_IGEMM_1X8__AVX_BROADCAST, n_div_8_subtile) {
27261 TEST_REQUIRES_X86_AVX;
27262 for (uint32_t n = 16; n <= 24; n += 8) {
27263 for (size_t k = 1; k <= 5; k += 2) {
27264 for (uint32_t m = 1; m <= 1; m++) {
27265 GemmMicrokernelTester()
27266 .mr(1)
27267 .nr(8)
27268 .kr(1)
27269 .sr(1)
27270 .m(m)
27271 .n(n)
27272 .k(k)
27273 .iterations(1)
27274 .Test(xnn_f32_igemm_ukernel_1x8__avx_broadcast);
27275 }
27276 }
27277 }
27278 }
27279
27280 TEST(F32_IGEMM_1X8__AVX_BROADCAST, small_kernel) {
27281 TEST_REQUIRES_X86_AVX;
27282 for (size_t k = 1; k <= 5; k += 2) {
27283 GemmMicrokernelTester()
27284 .mr(1)
27285 .nr(8)
27286 .kr(1)
27287 .sr(1)
27288 .m(1)
27289 .n(8)
27290 .k(k)
27291 .ks(3)
27292 .Test(xnn_f32_igemm_ukernel_1x8__avx_broadcast);
27293 }
27294 }
27295
27296 TEST(F32_IGEMM_1X8__AVX_BROADCAST, small_kernel_subtile) {
27297 TEST_REQUIRES_X86_AVX;
27298 for (size_t k = 1; k <= 5; k += 2) {
27299 for (uint32_t m = 1; m <= 1; m++) {
27300 for (uint32_t n = 1; n <= 8; n++) {
27301 GemmMicrokernelTester()
27302 .mr(1)
27303 .nr(8)
27304 .kr(1)
27305 .sr(1)
27306 .m(m)
27307 .n(n)
27308 .k(k)
27309 .ks(3)
27310 .iterations(1)
27311 .Test(xnn_f32_igemm_ukernel_1x8__avx_broadcast);
27312 }
27313 }
27314 }
27315 }
27316
27317 TEST(F32_IGEMM_1X8__AVX_BROADCAST, n_gt_8_small_kernel) {
27318 TEST_REQUIRES_X86_AVX;
27319 for (uint32_t n = 9; n < 16; n++) {
27320 for (size_t k = 1; k <= 5; k += 2) {
27321 GemmMicrokernelTester()
27322 .mr(1)
27323 .nr(8)
27324 .kr(1)
27325 .sr(1)
27326 .m(1)
27327 .n(8)
27328 .k(k)
27329 .ks(3)
27330 .Test(xnn_f32_igemm_ukernel_1x8__avx_broadcast);
27331 }
27332 }
27333 }
27334
27335 TEST(F32_IGEMM_1X8__AVX_BROADCAST, n_div_8_small_kernel) {
27336 TEST_REQUIRES_X86_AVX;
27337 for (uint32_t n = 16; n <= 24; n += 8) {
27338 for (size_t k = 1; k <= 5; k += 2) {
27339 GemmMicrokernelTester()
27340 .mr(1)
27341 .nr(8)
27342 .kr(1)
27343 .sr(1)
27344 .m(1)
27345 .n(8)
27346 .k(k)
27347 .ks(3)
27348 .Test(xnn_f32_igemm_ukernel_1x8__avx_broadcast);
27349 }
27350 }
27351 }
27352
27353 TEST(F32_IGEMM_1X8__AVX_BROADCAST, strided_cm_subtile) {
27354 TEST_REQUIRES_X86_AVX;
27355 for (size_t k = 1; k <= 5; k += 2) {
27356 for (uint32_t m = 1; m <= 1; m++) {
27357 for (uint32_t n = 1; n <= 8; n++) {
27358 GemmMicrokernelTester()
27359 .mr(1)
27360 .nr(8)
27361 .kr(1)
27362 .sr(1)
27363 .m(m)
27364 .n(n)
27365 .k(k)
27366 .cm_stride(11)
27367 .iterations(1)
27368 .Test(xnn_f32_igemm_ukernel_1x8__avx_broadcast);
27369 }
27370 }
27371 }
27372 }
27373
27374 TEST(F32_IGEMM_1X8__AVX_BROADCAST, a_offset) {
27375 TEST_REQUIRES_X86_AVX;
27376 for (size_t k = 1; k <= 5; k += 2) {
27377 GemmMicrokernelTester()
27378 .mr(1)
27379 .nr(8)
27380 .kr(1)
27381 .sr(1)
27382 .m(1)
27383 .n(8)
27384 .k(k)
27385 .ks(3)
27386 .a_offset(7)
27387 .Test(xnn_f32_igemm_ukernel_1x8__avx_broadcast);
27388 }
27389 }
27390
27391 TEST(F32_IGEMM_1X8__AVX_BROADCAST, zero) {
27392 TEST_REQUIRES_X86_AVX;
27393 for (uint32_t mz = 0; mz < 1; mz++) {
27394 for (size_t k = 1; k <= 5; k += 2) {
27395 GemmMicrokernelTester()
27396 .mr(1)
27397 .nr(8)
27398 .kr(1)
27399 .sr(1)
27400 .m(1)
27401 .n(8)
27402 .k(k)
27403 .ks(3)
27404 .a_offset(7)
27405 .zero_index(mz)
27406 .Test(xnn_f32_igemm_ukernel_1x8__avx_broadcast);
27407 }
27408 }
27409 }
27410
27411 TEST(F32_IGEMM_1X8__AVX_BROADCAST, qmin) {
27412 TEST_REQUIRES_X86_AVX;
27413 GemmMicrokernelTester()
27414 .mr(1)
27415 .nr(8)
27416 .kr(1)
27417 .sr(1)
27418 .m(1)
27419 .n(8)
27420 .k(1)
27421 .qmin(128)
27422 .Test(xnn_f32_igemm_ukernel_1x8__avx_broadcast);
27423 }
27424
27425 TEST(F32_IGEMM_1X8__AVX_BROADCAST, qmax) {
27426 TEST_REQUIRES_X86_AVX;
27427 GemmMicrokernelTester()
27428 .mr(1)
27429 .nr(8)
27430 .kr(1)
27431 .sr(1)
27432 .m(1)
27433 .n(8)
27434 .k(1)
27435 .qmax(128)
27436 .Test(xnn_f32_igemm_ukernel_1x8__avx_broadcast);
27437 }
27438
27439 TEST(F32_IGEMM_1X8__AVX_BROADCAST, strided_cm) {
27440 TEST_REQUIRES_X86_AVX;
27441 GemmMicrokernelTester()
27442 .mr(1)
27443 .nr(8)
27444 .kr(1)
27445 .sr(1)
27446 .m(1)
27447 .n(8)
27448 .k(1)
27449 .cm_stride(11)
27450 .Test(xnn_f32_igemm_ukernel_1x8__avx_broadcast);
27451 }
27452#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
27453
27454
27455#if XNN_ARCH_X86 || XNN_ARCH_X86_64
27456 TEST(F32_IGEMM_4X8__AVX_BROADCAST, k_eq_1) {
27457 TEST_REQUIRES_X86_AVX;
27458 GemmMicrokernelTester()
27459 .mr(4)
27460 .nr(8)
27461 .kr(1)
27462 .sr(1)
27463 .m(4)
27464 .n(8)
27465 .k(1)
27466 .Test(xnn_f32_igemm_ukernel_4x8__avx_broadcast);
27467 }
27468
27469 TEST(F32_IGEMM_4X8__AVX_BROADCAST, strided_cn) {
27470 TEST_REQUIRES_X86_AVX;
27471 GemmMicrokernelTester()
27472 .mr(4)
27473 .nr(8)
27474 .kr(1)
27475 .sr(1)
27476 .m(4)
27477 .n(8)
27478 .k(1)
27479 .cn_stride(11)
27480 .Test(xnn_f32_igemm_ukernel_4x8__avx_broadcast);
27481 }
27482
27483 TEST(F32_IGEMM_4X8__AVX_BROADCAST, k_eq_1_subtile) {
27484 TEST_REQUIRES_X86_AVX;
27485 for (uint32_t m = 1; m <= 4; m++) {
27486 for (uint32_t n = 1; n <= 8; n++) {
27487 GemmMicrokernelTester()
27488 .mr(4)
27489 .nr(8)
27490 .kr(1)
27491 .sr(1)
27492 .m(m)
27493 .n(n)
27494 .k(1)
27495 .iterations(1)
27496 .Test(xnn_f32_igemm_ukernel_4x8__avx_broadcast);
27497 }
27498 }
27499 }
27500
27501 TEST(F32_IGEMM_4X8__AVX_BROADCAST, k_eq_1_subtile_m) {
27502 TEST_REQUIRES_X86_AVX;
27503 for (uint32_t m = 1; m <= 4; m++) {
27504 GemmMicrokernelTester()
27505 .mr(4)
27506 .nr(8)
27507 .kr(1)
27508 .sr(1)
27509 .m(m)
27510 .n(8)
27511 .k(1)
27512 .iterations(1)
27513 .Test(xnn_f32_igemm_ukernel_4x8__avx_broadcast);
27514 }
27515 }
27516
27517 TEST(F32_IGEMM_4X8__AVX_BROADCAST, k_eq_1_subtile_n) {
27518 TEST_REQUIRES_X86_AVX;
27519 for (uint32_t n = 1; n <= 8; n++) {
27520 GemmMicrokernelTester()
27521 .mr(4)
27522 .nr(8)
27523 .kr(1)
27524 .sr(1)
27525 .m(4)
27526 .n(n)
27527 .k(1)
27528 .iterations(1)
27529 .Test(xnn_f32_igemm_ukernel_4x8__avx_broadcast);
27530 }
27531 }
27532
27533 TEST(F32_IGEMM_4X8__AVX_BROADCAST, k_gt_1) {
27534 TEST_REQUIRES_X86_AVX;
27535 for (size_t k = 2; k < 10; k++) {
27536 GemmMicrokernelTester()
27537 .mr(4)
27538 .nr(8)
27539 .kr(1)
27540 .sr(1)
27541 .m(4)
27542 .n(8)
27543 .k(k)
27544 .Test(xnn_f32_igemm_ukernel_4x8__avx_broadcast);
27545 }
27546 }
27547
27548 TEST(F32_IGEMM_4X8__AVX_BROADCAST, k_gt_1_subtile) {
27549 TEST_REQUIRES_X86_AVX;
27550 for (size_t k = 2; k < 10; k++) {
27551 for (uint32_t m = 1; m <= 4; m++) {
27552 for (uint32_t n = 1; n <= 8; n++) {
27553 GemmMicrokernelTester()
27554 .mr(4)
27555 .nr(8)
27556 .kr(1)
27557 .sr(1)
27558 .m(m)
27559 .n(n)
27560 .k(k)
27561 .iterations(1)
27562 .Test(xnn_f32_igemm_ukernel_4x8__avx_broadcast);
27563 }
27564 }
27565 }
27566 }
27567
27568 TEST(F32_IGEMM_4X8__AVX_BROADCAST, n_gt_8) {
27569 TEST_REQUIRES_X86_AVX;
27570 for (uint32_t n = 9; n < 16; n++) {
27571 for (size_t k = 1; k <= 5; k += 2) {
27572 GemmMicrokernelTester()
27573 .mr(4)
27574 .nr(8)
27575 .kr(1)
27576 .sr(1)
27577 .m(4)
27578 .n(8)
27579 .k(k)
27580 .Test(xnn_f32_igemm_ukernel_4x8__avx_broadcast);
27581 }
27582 }
27583 }
27584
27585 TEST(F32_IGEMM_4X8__AVX_BROADCAST, n_gt_8_strided_cn) {
27586 TEST_REQUIRES_X86_AVX;
27587 for (uint32_t n = 9; n < 16; n++) {
27588 for (size_t k = 1; k <= 5; k += 2) {
27589 GemmMicrokernelTester()
27590 .mr(4)
27591 .nr(8)
27592 .kr(1)
27593 .sr(1)
27594 .m(4)
27595 .n(8)
27596 .k(k)
27597 .cn_stride(11)
27598 .Test(xnn_f32_igemm_ukernel_4x8__avx_broadcast);
27599 }
27600 }
27601 }
27602
27603 TEST(F32_IGEMM_4X8__AVX_BROADCAST, n_gt_8_subtile) {
27604 TEST_REQUIRES_X86_AVX;
27605 for (uint32_t n = 9; n < 16; n++) {
27606 for (size_t k = 1; k <= 5; k += 2) {
27607 for (uint32_t m = 1; m <= 4; m++) {
27608 GemmMicrokernelTester()
27609 .mr(4)
27610 .nr(8)
27611 .kr(1)
27612 .sr(1)
27613 .m(m)
27614 .n(n)
27615 .k(k)
27616 .iterations(1)
27617 .Test(xnn_f32_igemm_ukernel_4x8__avx_broadcast);
27618 }
27619 }
27620 }
27621 }
27622
27623 TEST(F32_IGEMM_4X8__AVX_BROADCAST, n_div_8) {
27624 TEST_REQUIRES_X86_AVX;
27625 for (uint32_t n = 16; n <= 24; n += 8) {
27626 for (size_t k = 1; k <= 5; k += 2) {
27627 GemmMicrokernelTester()
27628 .mr(4)
27629 .nr(8)
27630 .kr(1)
27631 .sr(1)
27632 .m(4)
27633 .n(8)
27634 .k(k)
27635 .Test(xnn_f32_igemm_ukernel_4x8__avx_broadcast);
27636 }
27637 }
27638 }
27639
27640 TEST(F32_IGEMM_4X8__AVX_BROADCAST, n_div_8_strided_cn) {
27641 TEST_REQUIRES_X86_AVX;
27642 for (uint32_t n = 16; n <= 24; n += 8) {
27643 for (size_t k = 1; k <= 5; k += 2) {
27644 GemmMicrokernelTester()
27645 .mr(4)
27646 .nr(8)
27647 .kr(1)
27648 .sr(1)
27649 .m(4)
27650 .n(n)
27651 .k(k)
27652 .cn_stride(11)
27653 .Test(xnn_f32_igemm_ukernel_4x8__avx_broadcast);
27654 }
27655 }
27656 }
27657
27658 TEST(F32_IGEMM_4X8__AVX_BROADCAST, n_div_8_subtile) {
27659 TEST_REQUIRES_X86_AVX;
27660 for (uint32_t n = 16; n <= 24; n += 8) {
27661 for (size_t k = 1; k <= 5; k += 2) {
27662 for (uint32_t m = 1; m <= 4; m++) {
27663 GemmMicrokernelTester()
27664 .mr(4)
27665 .nr(8)
27666 .kr(1)
27667 .sr(1)
27668 .m(m)
27669 .n(n)
27670 .k(k)
27671 .iterations(1)
27672 .Test(xnn_f32_igemm_ukernel_4x8__avx_broadcast);
27673 }
27674 }
27675 }
27676 }
27677
27678 TEST(F32_IGEMM_4X8__AVX_BROADCAST, small_kernel) {
27679 TEST_REQUIRES_X86_AVX;
27680 for (size_t k = 1; k <= 5; k += 2) {
27681 GemmMicrokernelTester()
27682 .mr(4)
27683 .nr(8)
27684 .kr(1)
27685 .sr(1)
27686 .m(4)
27687 .n(8)
27688 .k(k)
27689 .ks(3)
27690 .Test(xnn_f32_igemm_ukernel_4x8__avx_broadcast);
27691 }
27692 }
27693
27694 TEST(F32_IGEMM_4X8__AVX_BROADCAST, small_kernel_subtile) {
27695 TEST_REQUIRES_X86_AVX;
27696 for (size_t k = 1; k <= 5; k += 2) {
27697 for (uint32_t m = 1; m <= 4; m++) {
27698 for (uint32_t n = 1; n <= 8; n++) {
27699 GemmMicrokernelTester()
27700 .mr(4)
27701 .nr(8)
27702 .kr(1)
27703 .sr(1)
27704 .m(m)
27705 .n(n)
27706 .k(k)
27707 .ks(3)
27708 .iterations(1)
27709 .Test(xnn_f32_igemm_ukernel_4x8__avx_broadcast);
27710 }
27711 }
27712 }
27713 }
27714
27715 TEST(F32_IGEMM_4X8__AVX_BROADCAST, n_gt_8_small_kernel) {
27716 TEST_REQUIRES_X86_AVX;
27717 for (uint32_t n = 9; n < 16; n++) {
27718 for (size_t k = 1; k <= 5; k += 2) {
27719 GemmMicrokernelTester()
27720 .mr(4)
27721 .nr(8)
27722 .kr(1)
27723 .sr(1)
27724 .m(4)
27725 .n(8)
27726 .k(k)
27727 .ks(3)
27728 .Test(xnn_f32_igemm_ukernel_4x8__avx_broadcast);
27729 }
27730 }
27731 }
27732
27733 TEST(F32_IGEMM_4X8__AVX_BROADCAST, n_div_8_small_kernel) {
27734 TEST_REQUIRES_X86_AVX;
27735 for (uint32_t n = 16; n <= 24; n += 8) {
27736 for (size_t k = 1; k <= 5; k += 2) {
27737 GemmMicrokernelTester()
27738 .mr(4)
27739 .nr(8)
27740 .kr(1)
27741 .sr(1)
27742 .m(4)
27743 .n(8)
27744 .k(k)
27745 .ks(3)
27746 .Test(xnn_f32_igemm_ukernel_4x8__avx_broadcast);
27747 }
27748 }
27749 }
27750
27751 TEST(F32_IGEMM_4X8__AVX_BROADCAST, strided_cm_subtile) {
27752 TEST_REQUIRES_X86_AVX;
27753 for (size_t k = 1; k <= 5; k += 2) {
27754 for (uint32_t m = 1; m <= 4; m++) {
27755 for (uint32_t n = 1; n <= 8; n++) {
27756 GemmMicrokernelTester()
27757 .mr(4)
27758 .nr(8)
27759 .kr(1)
27760 .sr(1)
27761 .m(m)
27762 .n(n)
27763 .k(k)
27764 .cm_stride(11)
27765 .iterations(1)
27766 .Test(xnn_f32_igemm_ukernel_4x8__avx_broadcast);
27767 }
27768 }
27769 }
27770 }
27771
27772 TEST(F32_IGEMM_4X8__AVX_BROADCAST, a_offset) {
27773 TEST_REQUIRES_X86_AVX;
27774 for (size_t k = 1; k <= 5; k += 2) {
27775 GemmMicrokernelTester()
27776 .mr(4)
27777 .nr(8)
27778 .kr(1)
27779 .sr(1)
27780 .m(4)
27781 .n(8)
27782 .k(k)
27783 .ks(3)
27784 .a_offset(23)
27785 .Test(xnn_f32_igemm_ukernel_4x8__avx_broadcast);
27786 }
27787 }
27788
27789 TEST(F32_IGEMM_4X8__AVX_BROADCAST, zero) {
27790 TEST_REQUIRES_X86_AVX;
27791 for (uint32_t mz = 0; mz < 4; mz++) {
27792 for (size_t k = 1; k <= 5; k += 2) {
27793 GemmMicrokernelTester()
27794 .mr(4)
27795 .nr(8)
27796 .kr(1)
27797 .sr(1)
27798 .m(4)
27799 .n(8)
27800 .k(k)
27801 .ks(3)
27802 .a_offset(23)
27803 .zero_index(mz)
27804 .Test(xnn_f32_igemm_ukernel_4x8__avx_broadcast);
27805 }
27806 }
27807 }
27808
27809 TEST(F32_IGEMM_4X8__AVX_BROADCAST, qmin) {
27810 TEST_REQUIRES_X86_AVX;
27811 GemmMicrokernelTester()
27812 .mr(4)
27813 .nr(8)
27814 .kr(1)
27815 .sr(1)
27816 .m(4)
27817 .n(8)
27818 .k(1)
27819 .qmin(128)
27820 .Test(xnn_f32_igemm_ukernel_4x8__avx_broadcast);
27821 }
27822
27823 TEST(F32_IGEMM_4X8__AVX_BROADCAST, qmax) {
27824 TEST_REQUIRES_X86_AVX;
27825 GemmMicrokernelTester()
27826 .mr(4)
27827 .nr(8)
27828 .kr(1)
27829 .sr(1)
27830 .m(4)
27831 .n(8)
27832 .k(1)
27833 .qmax(128)
27834 .Test(xnn_f32_igemm_ukernel_4x8__avx_broadcast);
27835 }
27836
27837 TEST(F32_IGEMM_4X8__AVX_BROADCAST, strided_cm) {
27838 TEST_REQUIRES_X86_AVX;
27839 GemmMicrokernelTester()
27840 .mr(4)
27841 .nr(8)
27842 .kr(1)
27843 .sr(1)
27844 .m(4)
27845 .n(8)
27846 .k(1)
27847 .cm_stride(11)
27848 .Test(xnn_f32_igemm_ukernel_4x8__avx_broadcast);
27849 }
27850#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
27851
27852
27853#if XNN_ARCH_X86 || XNN_ARCH_X86_64
27854 TEST(F32_IGEMM_5X8__AVX_BROADCAST, k_eq_1) {
27855 TEST_REQUIRES_X86_AVX;
27856 GemmMicrokernelTester()
27857 .mr(5)
27858 .nr(8)
27859 .kr(1)
27860 .sr(1)
27861 .m(5)
27862 .n(8)
27863 .k(1)
27864 .Test(xnn_f32_igemm_ukernel_5x8__avx_broadcast);
27865 }
27866
27867 TEST(F32_IGEMM_5X8__AVX_BROADCAST, strided_cn) {
27868 TEST_REQUIRES_X86_AVX;
27869 GemmMicrokernelTester()
27870 .mr(5)
27871 .nr(8)
27872 .kr(1)
27873 .sr(1)
27874 .m(5)
27875 .n(8)
27876 .k(1)
27877 .cn_stride(11)
27878 .Test(xnn_f32_igemm_ukernel_5x8__avx_broadcast);
27879 }
27880
27881 TEST(F32_IGEMM_5X8__AVX_BROADCAST, k_eq_1_subtile) {
27882 TEST_REQUIRES_X86_AVX;
27883 for (uint32_t m = 1; m <= 5; m++) {
27884 for (uint32_t n = 1; n <= 8; n++) {
27885 GemmMicrokernelTester()
27886 .mr(5)
27887 .nr(8)
27888 .kr(1)
27889 .sr(1)
27890 .m(m)
27891 .n(n)
27892 .k(1)
27893 .iterations(1)
27894 .Test(xnn_f32_igemm_ukernel_5x8__avx_broadcast);
27895 }
27896 }
27897 }
27898
27899 TEST(F32_IGEMM_5X8__AVX_BROADCAST, k_eq_1_subtile_m) {
27900 TEST_REQUIRES_X86_AVX;
27901 for (uint32_t m = 1; m <= 5; m++) {
27902 GemmMicrokernelTester()
27903 .mr(5)
27904 .nr(8)
27905 .kr(1)
27906 .sr(1)
27907 .m(m)
27908 .n(8)
27909 .k(1)
27910 .iterations(1)
27911 .Test(xnn_f32_igemm_ukernel_5x8__avx_broadcast);
27912 }
27913 }
27914
27915 TEST(F32_IGEMM_5X8__AVX_BROADCAST, k_eq_1_subtile_n) {
27916 TEST_REQUIRES_X86_AVX;
27917 for (uint32_t n = 1; n <= 8; n++) {
27918 GemmMicrokernelTester()
27919 .mr(5)
27920 .nr(8)
27921 .kr(1)
27922 .sr(1)
27923 .m(5)
27924 .n(n)
27925 .k(1)
27926 .iterations(1)
27927 .Test(xnn_f32_igemm_ukernel_5x8__avx_broadcast);
27928 }
27929 }
27930
27931 TEST(F32_IGEMM_5X8__AVX_BROADCAST, k_gt_1) {
27932 TEST_REQUIRES_X86_AVX;
27933 for (size_t k = 2; k < 10; k++) {
27934 GemmMicrokernelTester()
27935 .mr(5)
27936 .nr(8)
27937 .kr(1)
27938 .sr(1)
27939 .m(5)
27940 .n(8)
27941 .k(k)
27942 .Test(xnn_f32_igemm_ukernel_5x8__avx_broadcast);
27943 }
27944 }
27945
27946 TEST(F32_IGEMM_5X8__AVX_BROADCAST, k_gt_1_subtile) {
27947 TEST_REQUIRES_X86_AVX;
27948 for (size_t k = 2; k < 10; k++) {
27949 for (uint32_t m = 1; m <= 5; m++) {
27950 for (uint32_t n = 1; n <= 8; n++) {
27951 GemmMicrokernelTester()
27952 .mr(5)
27953 .nr(8)
27954 .kr(1)
27955 .sr(1)
27956 .m(m)
27957 .n(n)
27958 .k(k)
27959 .iterations(1)
27960 .Test(xnn_f32_igemm_ukernel_5x8__avx_broadcast);
27961 }
27962 }
27963 }
27964 }
27965
27966 TEST(F32_IGEMM_5X8__AVX_BROADCAST, n_gt_8) {
27967 TEST_REQUIRES_X86_AVX;
27968 for (uint32_t n = 9; n < 16; n++) {
27969 for (size_t k = 1; k <= 5; k += 2) {
27970 GemmMicrokernelTester()
27971 .mr(5)
27972 .nr(8)
27973 .kr(1)
27974 .sr(1)
27975 .m(5)
27976 .n(8)
27977 .k(k)
27978 .Test(xnn_f32_igemm_ukernel_5x8__avx_broadcast);
27979 }
27980 }
27981 }
27982
27983 TEST(F32_IGEMM_5X8__AVX_BROADCAST, n_gt_8_strided_cn) {
27984 TEST_REQUIRES_X86_AVX;
27985 for (uint32_t n = 9; n < 16; n++) {
27986 for (size_t k = 1; k <= 5; k += 2) {
27987 GemmMicrokernelTester()
27988 .mr(5)
27989 .nr(8)
27990 .kr(1)
27991 .sr(1)
27992 .m(5)
27993 .n(8)
27994 .k(k)
27995 .cn_stride(11)
27996 .Test(xnn_f32_igemm_ukernel_5x8__avx_broadcast);
27997 }
27998 }
27999 }
28000
28001 TEST(F32_IGEMM_5X8__AVX_BROADCAST, n_gt_8_subtile) {
28002 TEST_REQUIRES_X86_AVX;
28003 for (uint32_t n = 9; n < 16; n++) {
28004 for (size_t k = 1; k <= 5; k += 2) {
28005 for (uint32_t m = 1; m <= 5; m++) {
28006 GemmMicrokernelTester()
28007 .mr(5)
28008 .nr(8)
28009 .kr(1)
28010 .sr(1)
28011 .m(m)
28012 .n(n)
28013 .k(k)
28014 .iterations(1)
28015 .Test(xnn_f32_igemm_ukernel_5x8__avx_broadcast);
28016 }
28017 }
28018 }
28019 }
28020
28021 TEST(F32_IGEMM_5X8__AVX_BROADCAST, n_div_8) {
28022 TEST_REQUIRES_X86_AVX;
28023 for (uint32_t n = 16; n <= 24; n += 8) {
28024 for (size_t k = 1; k <= 5; k += 2) {
28025 GemmMicrokernelTester()
28026 .mr(5)
28027 .nr(8)
28028 .kr(1)
28029 .sr(1)
28030 .m(5)
28031 .n(8)
28032 .k(k)
28033 .Test(xnn_f32_igemm_ukernel_5x8__avx_broadcast);
28034 }
28035 }
28036 }
28037
28038 TEST(F32_IGEMM_5X8__AVX_BROADCAST, n_div_8_strided_cn) {
28039 TEST_REQUIRES_X86_AVX;
28040 for (uint32_t n = 16; n <= 24; n += 8) {
28041 for (size_t k = 1; k <= 5; k += 2) {
28042 GemmMicrokernelTester()
28043 .mr(5)
28044 .nr(8)
28045 .kr(1)
28046 .sr(1)
28047 .m(5)
28048 .n(n)
28049 .k(k)
28050 .cn_stride(11)
28051 .Test(xnn_f32_igemm_ukernel_5x8__avx_broadcast);
28052 }
28053 }
28054 }
28055
28056 TEST(F32_IGEMM_5X8__AVX_BROADCAST, n_div_8_subtile) {
28057 TEST_REQUIRES_X86_AVX;
28058 for (uint32_t n = 16; n <= 24; n += 8) {
28059 for (size_t k = 1; k <= 5; k += 2) {
28060 for (uint32_t m = 1; m <= 5; m++) {
28061 GemmMicrokernelTester()
28062 .mr(5)
28063 .nr(8)
28064 .kr(1)
28065 .sr(1)
28066 .m(m)
28067 .n(n)
28068 .k(k)
28069 .iterations(1)
28070 .Test(xnn_f32_igemm_ukernel_5x8__avx_broadcast);
28071 }
28072 }
28073 }
28074 }
28075
28076 TEST(F32_IGEMM_5X8__AVX_BROADCAST, small_kernel) {
28077 TEST_REQUIRES_X86_AVX;
28078 for (size_t k = 1; k <= 5; k += 2) {
28079 GemmMicrokernelTester()
28080 .mr(5)
28081 .nr(8)
28082 .kr(1)
28083 .sr(1)
28084 .m(5)
28085 .n(8)
28086 .k(k)
28087 .ks(3)
28088 .Test(xnn_f32_igemm_ukernel_5x8__avx_broadcast);
28089 }
28090 }
28091
28092 TEST(F32_IGEMM_5X8__AVX_BROADCAST, small_kernel_subtile) {
28093 TEST_REQUIRES_X86_AVX;
28094 for (size_t k = 1; k <= 5; k += 2) {
28095 for (uint32_t m = 1; m <= 5; m++) {
28096 for (uint32_t n = 1; n <= 8; n++) {
28097 GemmMicrokernelTester()
28098 .mr(5)
28099 .nr(8)
28100 .kr(1)
28101 .sr(1)
28102 .m(m)
28103 .n(n)
28104 .k(k)
28105 .ks(3)
28106 .iterations(1)
28107 .Test(xnn_f32_igemm_ukernel_5x8__avx_broadcast);
28108 }
28109 }
28110 }
28111 }
28112
28113 TEST(F32_IGEMM_5X8__AVX_BROADCAST, n_gt_8_small_kernel) {
28114 TEST_REQUIRES_X86_AVX;
28115 for (uint32_t n = 9; n < 16; n++) {
28116 for (size_t k = 1; k <= 5; k += 2) {
28117 GemmMicrokernelTester()
28118 .mr(5)
28119 .nr(8)
28120 .kr(1)
28121 .sr(1)
28122 .m(5)
28123 .n(8)
28124 .k(k)
28125 .ks(3)
28126 .Test(xnn_f32_igemm_ukernel_5x8__avx_broadcast);
28127 }
28128 }
28129 }
28130
28131 TEST(F32_IGEMM_5X8__AVX_BROADCAST, n_div_8_small_kernel) {
28132 TEST_REQUIRES_X86_AVX;
28133 for (uint32_t n = 16; n <= 24; n += 8) {
28134 for (size_t k = 1; k <= 5; k += 2) {
28135 GemmMicrokernelTester()
28136 .mr(5)
28137 .nr(8)
28138 .kr(1)
28139 .sr(1)
28140 .m(5)
28141 .n(8)
28142 .k(k)
28143 .ks(3)
28144 .Test(xnn_f32_igemm_ukernel_5x8__avx_broadcast);
28145 }
28146 }
28147 }
28148
28149 TEST(F32_IGEMM_5X8__AVX_BROADCAST, strided_cm_subtile) {
28150 TEST_REQUIRES_X86_AVX;
28151 for (size_t k = 1; k <= 5; k += 2) {
28152 for (uint32_t m = 1; m <= 5; m++) {
28153 for (uint32_t n = 1; n <= 8; n++) {
28154 GemmMicrokernelTester()
28155 .mr(5)
28156 .nr(8)
28157 .kr(1)
28158 .sr(1)
28159 .m(m)
28160 .n(n)
28161 .k(k)
28162 .cm_stride(11)
28163 .iterations(1)
28164 .Test(xnn_f32_igemm_ukernel_5x8__avx_broadcast);
28165 }
28166 }
28167 }
28168 }
28169
28170 TEST(F32_IGEMM_5X8__AVX_BROADCAST, a_offset) {
28171 TEST_REQUIRES_X86_AVX;
28172 for (size_t k = 1; k <= 5; k += 2) {
28173 GemmMicrokernelTester()
28174 .mr(5)
28175 .nr(8)
28176 .kr(1)
28177 .sr(1)
28178 .m(5)
28179 .n(8)
28180 .k(k)
28181 .ks(3)
28182 .a_offset(29)
28183 .Test(xnn_f32_igemm_ukernel_5x8__avx_broadcast);
28184 }
28185 }
28186
28187 TEST(F32_IGEMM_5X8__AVX_BROADCAST, zero) {
28188 TEST_REQUIRES_X86_AVX;
28189 for (uint32_t mz = 0; mz < 5; mz++) {
28190 for (size_t k = 1; k <= 5; k += 2) {
28191 GemmMicrokernelTester()
28192 .mr(5)
28193 .nr(8)
28194 .kr(1)
28195 .sr(1)
28196 .m(5)
28197 .n(8)
28198 .k(k)
28199 .ks(3)
28200 .a_offset(29)
28201 .zero_index(mz)
28202 .Test(xnn_f32_igemm_ukernel_5x8__avx_broadcast);
28203 }
28204 }
28205 }
28206
28207 TEST(F32_IGEMM_5X8__AVX_BROADCAST, qmin) {
28208 TEST_REQUIRES_X86_AVX;
28209 GemmMicrokernelTester()
28210 .mr(5)
28211 .nr(8)
28212 .kr(1)
28213 .sr(1)
28214 .m(5)
28215 .n(8)
28216 .k(1)
28217 .qmin(128)
28218 .Test(xnn_f32_igemm_ukernel_5x8__avx_broadcast);
28219 }
28220
28221 TEST(F32_IGEMM_5X8__AVX_BROADCAST, qmax) {
28222 TEST_REQUIRES_X86_AVX;
28223 GemmMicrokernelTester()
28224 .mr(5)
28225 .nr(8)
28226 .kr(1)
28227 .sr(1)
28228 .m(5)
28229 .n(8)
28230 .k(1)
28231 .qmax(128)
28232 .Test(xnn_f32_igemm_ukernel_5x8__avx_broadcast);
28233 }
28234
28235 TEST(F32_IGEMM_5X8__AVX_BROADCAST, strided_cm) {
28236 TEST_REQUIRES_X86_AVX;
28237 GemmMicrokernelTester()
28238 .mr(5)
28239 .nr(8)
28240 .kr(1)
28241 .sr(1)
28242 .m(5)
28243 .n(8)
28244 .k(1)
28245 .cm_stride(11)
28246 .Test(xnn_f32_igemm_ukernel_5x8__avx_broadcast);
28247 }
28248#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
28249
28250
28251#if XNN_ARCH_X86 || XNN_ARCH_X86_64
28252 TEST(F32_IGEMM_6X8__AVX_BROADCAST, k_eq_1) {
28253 TEST_REQUIRES_X86_AVX;
28254 GemmMicrokernelTester()
28255 .mr(6)
28256 .nr(8)
28257 .kr(1)
28258 .sr(1)
28259 .m(6)
28260 .n(8)
28261 .k(1)
28262 .Test(xnn_f32_igemm_ukernel_6x8__avx_broadcast);
28263 }
28264
28265 TEST(F32_IGEMM_6X8__AVX_BROADCAST, strided_cn) {
28266 TEST_REQUIRES_X86_AVX;
28267 GemmMicrokernelTester()
28268 .mr(6)
28269 .nr(8)
28270 .kr(1)
28271 .sr(1)
28272 .m(6)
28273 .n(8)
28274 .k(1)
28275 .cn_stride(11)
28276 .Test(xnn_f32_igemm_ukernel_6x8__avx_broadcast);
28277 }
28278
28279 TEST(F32_IGEMM_6X8__AVX_BROADCAST, k_eq_1_subtile) {
28280 TEST_REQUIRES_X86_AVX;
28281 for (uint32_t m = 1; m <= 6; m++) {
28282 for (uint32_t n = 1; n <= 8; n++) {
28283 GemmMicrokernelTester()
28284 .mr(6)
28285 .nr(8)
28286 .kr(1)
28287 .sr(1)
28288 .m(m)
28289 .n(n)
28290 .k(1)
28291 .iterations(1)
28292 .Test(xnn_f32_igemm_ukernel_6x8__avx_broadcast);
28293 }
28294 }
28295 }
28296
28297 TEST(F32_IGEMM_6X8__AVX_BROADCAST, k_eq_1_subtile_m) {
28298 TEST_REQUIRES_X86_AVX;
28299 for (uint32_t m = 1; m <= 6; m++) {
28300 GemmMicrokernelTester()
28301 .mr(6)
28302 .nr(8)
28303 .kr(1)
28304 .sr(1)
28305 .m(m)
28306 .n(8)
28307 .k(1)
28308 .iterations(1)
28309 .Test(xnn_f32_igemm_ukernel_6x8__avx_broadcast);
28310 }
28311 }
28312
28313 TEST(F32_IGEMM_6X8__AVX_BROADCAST, k_eq_1_subtile_n) {
28314 TEST_REQUIRES_X86_AVX;
28315 for (uint32_t n = 1; n <= 8; n++) {
28316 GemmMicrokernelTester()
28317 .mr(6)
28318 .nr(8)
28319 .kr(1)
28320 .sr(1)
28321 .m(6)
28322 .n(n)
28323 .k(1)
28324 .iterations(1)
28325 .Test(xnn_f32_igemm_ukernel_6x8__avx_broadcast);
28326 }
28327 }
28328
28329 TEST(F32_IGEMM_6X8__AVX_BROADCAST, k_gt_1) {
28330 TEST_REQUIRES_X86_AVX;
28331 for (size_t k = 2; k < 10; k++) {
28332 GemmMicrokernelTester()
28333 .mr(6)
28334 .nr(8)
28335 .kr(1)
28336 .sr(1)
28337 .m(6)
28338 .n(8)
28339 .k(k)
28340 .Test(xnn_f32_igemm_ukernel_6x8__avx_broadcast);
28341 }
28342 }
28343
28344 TEST(F32_IGEMM_6X8__AVX_BROADCAST, k_gt_1_subtile) {
28345 TEST_REQUIRES_X86_AVX;
28346 for (size_t k = 2; k < 10; k++) {
28347 for (uint32_t m = 1; m <= 6; m++) {
28348 for (uint32_t n = 1; n <= 8; n++) {
28349 GemmMicrokernelTester()
28350 .mr(6)
28351 .nr(8)
28352 .kr(1)
28353 .sr(1)
28354 .m(m)
28355 .n(n)
28356 .k(k)
28357 .iterations(1)
28358 .Test(xnn_f32_igemm_ukernel_6x8__avx_broadcast);
28359 }
28360 }
28361 }
28362 }
28363
28364 TEST(F32_IGEMM_6X8__AVX_BROADCAST, n_gt_8) {
28365 TEST_REQUIRES_X86_AVX;
28366 for (uint32_t n = 9; n < 16; n++) {
28367 for (size_t k = 1; k <= 5; k += 2) {
28368 GemmMicrokernelTester()
28369 .mr(6)
28370 .nr(8)
28371 .kr(1)
28372 .sr(1)
28373 .m(6)
28374 .n(8)
28375 .k(k)
28376 .Test(xnn_f32_igemm_ukernel_6x8__avx_broadcast);
28377 }
28378 }
28379 }
28380
28381 TEST(F32_IGEMM_6X8__AVX_BROADCAST, n_gt_8_strided_cn) {
28382 TEST_REQUIRES_X86_AVX;
28383 for (uint32_t n = 9; n < 16; n++) {
28384 for (size_t k = 1; k <= 5; k += 2) {
28385 GemmMicrokernelTester()
28386 .mr(6)
28387 .nr(8)
28388 .kr(1)
28389 .sr(1)
28390 .m(6)
28391 .n(8)
28392 .k(k)
28393 .cn_stride(11)
28394 .Test(xnn_f32_igemm_ukernel_6x8__avx_broadcast);
28395 }
28396 }
28397 }
28398
28399 TEST(F32_IGEMM_6X8__AVX_BROADCAST, n_gt_8_subtile) {
28400 TEST_REQUIRES_X86_AVX;
28401 for (uint32_t n = 9; n < 16; n++) {
28402 for (size_t k = 1; k <= 5; k += 2) {
28403 for (uint32_t m = 1; m <= 6; m++) {
28404 GemmMicrokernelTester()
28405 .mr(6)
28406 .nr(8)
28407 .kr(1)
28408 .sr(1)
28409 .m(m)
28410 .n(n)
28411 .k(k)
28412 .iterations(1)
28413 .Test(xnn_f32_igemm_ukernel_6x8__avx_broadcast);
28414 }
28415 }
28416 }
28417 }
28418
28419 TEST(F32_IGEMM_6X8__AVX_BROADCAST, n_div_8) {
28420 TEST_REQUIRES_X86_AVX;
28421 for (uint32_t n = 16; n <= 24; n += 8) {
28422 for (size_t k = 1; k <= 5; k += 2) {
28423 GemmMicrokernelTester()
28424 .mr(6)
28425 .nr(8)
28426 .kr(1)
28427 .sr(1)
28428 .m(6)
28429 .n(8)
28430 .k(k)
28431 .Test(xnn_f32_igemm_ukernel_6x8__avx_broadcast);
28432 }
28433 }
28434 }
28435
28436 TEST(F32_IGEMM_6X8__AVX_BROADCAST, n_div_8_strided_cn) {
28437 TEST_REQUIRES_X86_AVX;
28438 for (uint32_t n = 16; n <= 24; n += 8) {
28439 for (size_t k = 1; k <= 5; k += 2) {
28440 GemmMicrokernelTester()
28441 .mr(6)
28442 .nr(8)
28443 .kr(1)
28444 .sr(1)
28445 .m(6)
28446 .n(n)
28447 .k(k)
28448 .cn_stride(11)
28449 .Test(xnn_f32_igemm_ukernel_6x8__avx_broadcast);
28450 }
28451 }
28452 }
28453
28454 TEST(F32_IGEMM_6X8__AVX_BROADCAST, n_div_8_subtile) {
28455 TEST_REQUIRES_X86_AVX;
28456 for (uint32_t n = 16; n <= 24; n += 8) {
28457 for (size_t k = 1; k <= 5; k += 2) {
28458 for (uint32_t m = 1; m <= 6; m++) {
28459 GemmMicrokernelTester()
28460 .mr(6)
28461 .nr(8)
28462 .kr(1)
28463 .sr(1)
28464 .m(m)
28465 .n(n)
28466 .k(k)
28467 .iterations(1)
28468 .Test(xnn_f32_igemm_ukernel_6x8__avx_broadcast);
28469 }
28470 }
28471 }
28472 }
28473
28474 TEST(F32_IGEMM_6X8__AVX_BROADCAST, small_kernel) {
28475 TEST_REQUIRES_X86_AVX;
28476 for (size_t k = 1; k <= 5; k += 2) {
28477 GemmMicrokernelTester()
28478 .mr(6)
28479 .nr(8)
28480 .kr(1)
28481 .sr(1)
28482 .m(6)
28483 .n(8)
28484 .k(k)
28485 .ks(3)
28486 .Test(xnn_f32_igemm_ukernel_6x8__avx_broadcast);
28487 }
28488 }
28489
28490 TEST(F32_IGEMM_6X8__AVX_BROADCAST, small_kernel_subtile) {
28491 TEST_REQUIRES_X86_AVX;
28492 for (size_t k = 1; k <= 5; k += 2) {
28493 for (uint32_t m = 1; m <= 6; m++) {
28494 for (uint32_t n = 1; n <= 8; n++) {
28495 GemmMicrokernelTester()
28496 .mr(6)
28497 .nr(8)
28498 .kr(1)
28499 .sr(1)
28500 .m(m)
28501 .n(n)
28502 .k(k)
28503 .ks(3)
28504 .iterations(1)
28505 .Test(xnn_f32_igemm_ukernel_6x8__avx_broadcast);
28506 }
28507 }
28508 }
28509 }
28510
28511 TEST(F32_IGEMM_6X8__AVX_BROADCAST, n_gt_8_small_kernel) {
28512 TEST_REQUIRES_X86_AVX;
28513 for (uint32_t n = 9; n < 16; n++) {
28514 for (size_t k = 1; k <= 5; k += 2) {
28515 GemmMicrokernelTester()
28516 .mr(6)
28517 .nr(8)
28518 .kr(1)
28519 .sr(1)
28520 .m(6)
28521 .n(8)
28522 .k(k)
28523 .ks(3)
28524 .Test(xnn_f32_igemm_ukernel_6x8__avx_broadcast);
28525 }
28526 }
28527 }
28528
28529 TEST(F32_IGEMM_6X8__AVX_BROADCAST, n_div_8_small_kernel) {
28530 TEST_REQUIRES_X86_AVX;
28531 for (uint32_t n = 16; n <= 24; n += 8) {
28532 for (size_t k = 1; k <= 5; k += 2) {
28533 GemmMicrokernelTester()
28534 .mr(6)
28535 .nr(8)
28536 .kr(1)
28537 .sr(1)
28538 .m(6)
28539 .n(8)
28540 .k(k)
28541 .ks(3)
28542 .Test(xnn_f32_igemm_ukernel_6x8__avx_broadcast);
28543 }
28544 }
28545 }
28546
28547 TEST(F32_IGEMM_6X8__AVX_BROADCAST, strided_cm_subtile) {
28548 TEST_REQUIRES_X86_AVX;
28549 for (size_t k = 1; k <= 5; k += 2) {
28550 for (uint32_t m = 1; m <= 6; m++) {
28551 for (uint32_t n = 1; n <= 8; n++) {
28552 GemmMicrokernelTester()
28553 .mr(6)
28554 .nr(8)
28555 .kr(1)
28556 .sr(1)
28557 .m(m)
28558 .n(n)
28559 .k(k)
28560 .cm_stride(11)
28561 .iterations(1)
28562 .Test(xnn_f32_igemm_ukernel_6x8__avx_broadcast);
28563 }
28564 }
28565 }
28566 }
28567
28568 TEST(F32_IGEMM_6X8__AVX_BROADCAST, a_offset) {
28569 TEST_REQUIRES_X86_AVX;
28570 for (size_t k = 1; k <= 5; k += 2) {
28571 GemmMicrokernelTester()
28572 .mr(6)
28573 .nr(8)
28574 .kr(1)
28575 .sr(1)
28576 .m(6)
28577 .n(8)
28578 .k(k)
28579 .ks(3)
28580 .a_offset(37)
28581 .Test(xnn_f32_igemm_ukernel_6x8__avx_broadcast);
28582 }
28583 }
28584
28585 TEST(F32_IGEMM_6X8__AVX_BROADCAST, zero) {
28586 TEST_REQUIRES_X86_AVX;
28587 for (uint32_t mz = 0; mz < 6; mz++) {
28588 for (size_t k = 1; k <= 5; k += 2) {
28589 GemmMicrokernelTester()
28590 .mr(6)
28591 .nr(8)
28592 .kr(1)
28593 .sr(1)
28594 .m(6)
28595 .n(8)
28596 .k(k)
28597 .ks(3)
28598 .a_offset(37)
28599 .zero_index(mz)
28600 .Test(xnn_f32_igemm_ukernel_6x8__avx_broadcast);
28601 }
28602 }
28603 }
28604
28605 TEST(F32_IGEMM_6X8__AVX_BROADCAST, qmin) {
28606 TEST_REQUIRES_X86_AVX;
28607 GemmMicrokernelTester()
28608 .mr(6)
28609 .nr(8)
28610 .kr(1)
28611 .sr(1)
28612 .m(6)
28613 .n(8)
28614 .k(1)
28615 .qmin(128)
28616 .Test(xnn_f32_igemm_ukernel_6x8__avx_broadcast);
28617 }
28618
28619 TEST(F32_IGEMM_6X8__AVX_BROADCAST, qmax) {
28620 TEST_REQUIRES_X86_AVX;
28621 GemmMicrokernelTester()
28622 .mr(6)
28623 .nr(8)
28624 .kr(1)
28625 .sr(1)
28626 .m(6)
28627 .n(8)
28628 .k(1)
28629 .qmax(128)
28630 .Test(xnn_f32_igemm_ukernel_6x8__avx_broadcast);
28631 }
28632
28633 TEST(F32_IGEMM_6X8__AVX_BROADCAST, strided_cm) {
28634 TEST_REQUIRES_X86_AVX;
28635 GemmMicrokernelTester()
28636 .mr(6)
28637 .nr(8)
28638 .kr(1)
28639 .sr(1)
28640 .m(6)
28641 .n(8)
28642 .k(1)
28643 .cm_stride(11)
28644 .Test(xnn_f32_igemm_ukernel_6x8__avx_broadcast);
28645 }
28646#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
28647
28648
28649#if XNN_ARCH_X86 || XNN_ARCH_X86_64
28650 TEST(F32_IGEMM_7X8__AVX_BROADCAST, k_eq_1) {
28651 TEST_REQUIRES_X86_AVX;
28652 GemmMicrokernelTester()
28653 .mr(7)
28654 .nr(8)
28655 .kr(1)
28656 .sr(1)
28657 .m(7)
28658 .n(8)
28659 .k(1)
28660 .Test(xnn_f32_igemm_ukernel_7x8__avx_broadcast);
28661 }
28662
28663 TEST(F32_IGEMM_7X8__AVX_BROADCAST, strided_cn) {
28664 TEST_REQUIRES_X86_AVX;
28665 GemmMicrokernelTester()
28666 .mr(7)
28667 .nr(8)
28668 .kr(1)
28669 .sr(1)
28670 .m(7)
28671 .n(8)
28672 .k(1)
28673 .cn_stride(11)
28674 .Test(xnn_f32_igemm_ukernel_7x8__avx_broadcast);
28675 }
28676
28677 TEST(F32_IGEMM_7X8__AVX_BROADCAST, k_eq_1_subtile) {
28678 TEST_REQUIRES_X86_AVX;
28679 for (uint32_t m = 1; m <= 7; m++) {
28680 for (uint32_t n = 1; n <= 8; n++) {
28681 GemmMicrokernelTester()
28682 .mr(7)
28683 .nr(8)
28684 .kr(1)
28685 .sr(1)
28686 .m(m)
28687 .n(n)
28688 .k(1)
28689 .iterations(1)
28690 .Test(xnn_f32_igemm_ukernel_7x8__avx_broadcast);
28691 }
28692 }
28693 }
28694
28695 TEST(F32_IGEMM_7X8__AVX_BROADCAST, k_eq_1_subtile_m) {
28696 TEST_REQUIRES_X86_AVX;
28697 for (uint32_t m = 1; m <= 7; m++) {
28698 GemmMicrokernelTester()
28699 .mr(7)
28700 .nr(8)
28701 .kr(1)
28702 .sr(1)
28703 .m(m)
28704 .n(8)
28705 .k(1)
28706 .iterations(1)
28707 .Test(xnn_f32_igemm_ukernel_7x8__avx_broadcast);
28708 }
28709 }
28710
28711 TEST(F32_IGEMM_7X8__AVX_BROADCAST, k_eq_1_subtile_n) {
28712 TEST_REQUIRES_X86_AVX;
28713 for (uint32_t n = 1; n <= 8; n++) {
28714 GemmMicrokernelTester()
28715 .mr(7)
28716 .nr(8)
28717 .kr(1)
28718 .sr(1)
28719 .m(7)
28720 .n(n)
28721 .k(1)
28722 .iterations(1)
28723 .Test(xnn_f32_igemm_ukernel_7x8__avx_broadcast);
28724 }
28725 }
28726
28727 TEST(F32_IGEMM_7X8__AVX_BROADCAST, k_gt_1) {
28728 TEST_REQUIRES_X86_AVX;
28729 for (size_t k = 2; k < 10; k++) {
28730 GemmMicrokernelTester()
28731 .mr(7)
28732 .nr(8)
28733 .kr(1)
28734 .sr(1)
28735 .m(7)
28736 .n(8)
28737 .k(k)
28738 .Test(xnn_f32_igemm_ukernel_7x8__avx_broadcast);
28739 }
28740 }
28741
28742 TEST(F32_IGEMM_7X8__AVX_BROADCAST, k_gt_1_subtile) {
28743 TEST_REQUIRES_X86_AVX;
28744 for (size_t k = 2; k < 10; k++) {
28745 for (uint32_t m = 1; m <= 7; m++) {
28746 for (uint32_t n = 1; n <= 8; n++) {
28747 GemmMicrokernelTester()
28748 .mr(7)
28749 .nr(8)
28750 .kr(1)
28751 .sr(1)
28752 .m(m)
28753 .n(n)
28754 .k(k)
28755 .iterations(1)
28756 .Test(xnn_f32_igemm_ukernel_7x8__avx_broadcast);
28757 }
28758 }
28759 }
28760 }
28761
28762 TEST(F32_IGEMM_7X8__AVX_BROADCAST, n_gt_8) {
28763 TEST_REQUIRES_X86_AVX;
28764 for (uint32_t n = 9; n < 16; n++) {
28765 for (size_t k = 1; k <= 5; k += 2) {
28766 GemmMicrokernelTester()
28767 .mr(7)
28768 .nr(8)
28769 .kr(1)
28770 .sr(1)
28771 .m(7)
28772 .n(8)
28773 .k(k)
28774 .Test(xnn_f32_igemm_ukernel_7x8__avx_broadcast);
28775 }
28776 }
28777 }
28778
28779 TEST(F32_IGEMM_7X8__AVX_BROADCAST, n_gt_8_strided_cn) {
28780 TEST_REQUIRES_X86_AVX;
28781 for (uint32_t n = 9; n < 16; n++) {
28782 for (size_t k = 1; k <= 5; k += 2) {
28783 GemmMicrokernelTester()
28784 .mr(7)
28785 .nr(8)
28786 .kr(1)
28787 .sr(1)
28788 .m(7)
28789 .n(8)
28790 .k(k)
28791 .cn_stride(11)
28792 .Test(xnn_f32_igemm_ukernel_7x8__avx_broadcast);
28793 }
28794 }
28795 }
28796
28797 TEST(F32_IGEMM_7X8__AVX_BROADCAST, n_gt_8_subtile) {
28798 TEST_REQUIRES_X86_AVX;
28799 for (uint32_t n = 9; n < 16; n++) {
28800 for (size_t k = 1; k <= 5; k += 2) {
28801 for (uint32_t m = 1; m <= 7; m++) {
28802 GemmMicrokernelTester()
28803 .mr(7)
28804 .nr(8)
28805 .kr(1)
28806 .sr(1)
28807 .m(m)
28808 .n(n)
28809 .k(k)
28810 .iterations(1)
28811 .Test(xnn_f32_igemm_ukernel_7x8__avx_broadcast);
28812 }
28813 }
28814 }
28815 }
28816
28817 TEST(F32_IGEMM_7X8__AVX_BROADCAST, n_div_8) {
28818 TEST_REQUIRES_X86_AVX;
28819 for (uint32_t n = 16; n <= 24; n += 8) {
28820 for (size_t k = 1; k <= 5; k += 2) {
28821 GemmMicrokernelTester()
28822 .mr(7)
28823 .nr(8)
28824 .kr(1)
28825 .sr(1)
28826 .m(7)
28827 .n(8)
28828 .k(k)
28829 .Test(xnn_f32_igemm_ukernel_7x8__avx_broadcast);
28830 }
28831 }
28832 }
28833
28834 TEST(F32_IGEMM_7X8__AVX_BROADCAST, n_div_8_strided_cn) {
28835 TEST_REQUIRES_X86_AVX;
28836 for (uint32_t n = 16; n <= 24; n += 8) {
28837 for (size_t k = 1; k <= 5; k += 2) {
28838 GemmMicrokernelTester()
28839 .mr(7)
28840 .nr(8)
28841 .kr(1)
28842 .sr(1)
28843 .m(7)
28844 .n(n)
28845 .k(k)
28846 .cn_stride(11)
28847 .Test(xnn_f32_igemm_ukernel_7x8__avx_broadcast);
28848 }
28849 }
28850 }
28851
28852 TEST(F32_IGEMM_7X8__AVX_BROADCAST, n_div_8_subtile) {
28853 TEST_REQUIRES_X86_AVX;
28854 for (uint32_t n = 16; n <= 24; n += 8) {
28855 for (size_t k = 1; k <= 5; k += 2) {
28856 for (uint32_t m = 1; m <= 7; m++) {
28857 GemmMicrokernelTester()
28858 .mr(7)
28859 .nr(8)
28860 .kr(1)
28861 .sr(1)
28862 .m(m)
28863 .n(n)
28864 .k(k)
28865 .iterations(1)
28866 .Test(xnn_f32_igemm_ukernel_7x8__avx_broadcast);
28867 }
28868 }
28869 }
28870 }
28871
28872 TEST(F32_IGEMM_7X8__AVX_BROADCAST, small_kernel) {
28873 TEST_REQUIRES_X86_AVX;
28874 for (size_t k = 1; k <= 5; k += 2) {
28875 GemmMicrokernelTester()
28876 .mr(7)
28877 .nr(8)
28878 .kr(1)
28879 .sr(1)
28880 .m(7)
28881 .n(8)
28882 .k(k)
28883 .ks(3)
28884 .Test(xnn_f32_igemm_ukernel_7x8__avx_broadcast);
28885 }
28886 }
28887
28888 TEST(F32_IGEMM_7X8__AVX_BROADCAST, small_kernel_subtile) {
28889 TEST_REQUIRES_X86_AVX;
28890 for (size_t k = 1; k <= 5; k += 2) {
28891 for (uint32_t m = 1; m <= 7; m++) {
28892 for (uint32_t n = 1; n <= 8; n++) {
28893 GemmMicrokernelTester()
28894 .mr(7)
28895 .nr(8)
28896 .kr(1)
28897 .sr(1)
28898 .m(m)
28899 .n(n)
28900 .k(k)
28901 .ks(3)
28902 .iterations(1)
28903 .Test(xnn_f32_igemm_ukernel_7x8__avx_broadcast);
28904 }
28905 }
28906 }
28907 }
28908
28909 TEST(F32_IGEMM_7X8__AVX_BROADCAST, n_gt_8_small_kernel) {
28910 TEST_REQUIRES_X86_AVX;
28911 for (uint32_t n = 9; n < 16; n++) {
28912 for (size_t k = 1; k <= 5; k += 2) {
28913 GemmMicrokernelTester()
28914 .mr(7)
28915 .nr(8)
28916 .kr(1)
28917 .sr(1)
28918 .m(7)
28919 .n(8)
28920 .k(k)
28921 .ks(3)
28922 .Test(xnn_f32_igemm_ukernel_7x8__avx_broadcast);
28923 }
28924 }
28925 }
28926
28927 TEST(F32_IGEMM_7X8__AVX_BROADCAST, n_div_8_small_kernel) {
28928 TEST_REQUIRES_X86_AVX;
28929 for (uint32_t n = 16; n <= 24; n += 8) {
28930 for (size_t k = 1; k <= 5; k += 2) {
28931 GemmMicrokernelTester()
28932 .mr(7)
28933 .nr(8)
28934 .kr(1)
28935 .sr(1)
28936 .m(7)
28937 .n(8)
28938 .k(k)
28939 .ks(3)
28940 .Test(xnn_f32_igemm_ukernel_7x8__avx_broadcast);
28941 }
28942 }
28943 }
28944
28945 TEST(F32_IGEMM_7X8__AVX_BROADCAST, strided_cm_subtile) {
28946 TEST_REQUIRES_X86_AVX;
28947 for (size_t k = 1; k <= 5; k += 2) {
28948 for (uint32_t m = 1; m <= 7; m++) {
28949 for (uint32_t n = 1; n <= 8; n++) {
28950 GemmMicrokernelTester()
28951 .mr(7)
28952 .nr(8)
28953 .kr(1)
28954 .sr(1)
28955 .m(m)
28956 .n(n)
28957 .k(k)
28958 .cm_stride(11)
28959 .iterations(1)
28960 .Test(xnn_f32_igemm_ukernel_7x8__avx_broadcast);
28961 }
28962 }
28963 }
28964 }
28965
28966 TEST(F32_IGEMM_7X8__AVX_BROADCAST, a_offset) {
28967 TEST_REQUIRES_X86_AVX;
28968 for (size_t k = 1; k <= 5; k += 2) {
28969 GemmMicrokernelTester()
28970 .mr(7)
28971 .nr(8)
28972 .kr(1)
28973 .sr(1)
28974 .m(7)
28975 .n(8)
28976 .k(k)
28977 .ks(3)
28978 .a_offset(37)
28979 .Test(xnn_f32_igemm_ukernel_7x8__avx_broadcast);
28980 }
28981 }
28982
28983 TEST(F32_IGEMM_7X8__AVX_BROADCAST, zero) {
28984 TEST_REQUIRES_X86_AVX;
28985 for (uint32_t mz = 0; mz < 7; mz++) {
28986 for (size_t k = 1; k <= 5; k += 2) {
28987 GemmMicrokernelTester()
28988 .mr(7)
28989 .nr(8)
28990 .kr(1)
28991 .sr(1)
28992 .m(7)
28993 .n(8)
28994 .k(k)
28995 .ks(3)
28996 .a_offset(37)
28997 .zero_index(mz)
28998 .Test(xnn_f32_igemm_ukernel_7x8__avx_broadcast);
28999 }
29000 }
29001 }
29002
29003 TEST(F32_IGEMM_7X8__AVX_BROADCAST, qmin) {
29004 TEST_REQUIRES_X86_AVX;
29005 GemmMicrokernelTester()
29006 .mr(7)
29007 .nr(8)
29008 .kr(1)
29009 .sr(1)
29010 .m(7)
29011 .n(8)
29012 .k(1)
29013 .qmin(128)
29014 .Test(xnn_f32_igemm_ukernel_7x8__avx_broadcast);
29015 }
29016
29017 TEST(F32_IGEMM_7X8__AVX_BROADCAST, qmax) {
29018 TEST_REQUIRES_X86_AVX;
29019 GemmMicrokernelTester()
29020 .mr(7)
29021 .nr(8)
29022 .kr(1)
29023 .sr(1)
29024 .m(7)
29025 .n(8)
29026 .k(1)
29027 .qmax(128)
29028 .Test(xnn_f32_igemm_ukernel_7x8__avx_broadcast);
29029 }
29030
29031 TEST(F32_IGEMM_7X8__AVX_BROADCAST, strided_cm) {
29032 TEST_REQUIRES_X86_AVX;
29033 GemmMicrokernelTester()
29034 .mr(7)
29035 .nr(8)
29036 .kr(1)
29037 .sr(1)
29038 .m(7)
29039 .n(8)
29040 .k(1)
29041 .cm_stride(11)
29042 .Test(xnn_f32_igemm_ukernel_7x8__avx_broadcast);
29043 }
29044#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
29045
29046
29047#if XNN_ARCH_X86 || XNN_ARCH_X86_64
Marat Dukhaneccfd712019-12-08 16:49:27 -080029048 TEST(F32_IGEMM_1X16__AVX_BROADCAST, k_eq_1) {
29049 TEST_REQUIRES_X86_AVX;
29050 GemmMicrokernelTester()
29051 .mr(1)
29052 .nr(16)
29053 .kr(1)
29054 .sr(1)
29055 .m(1)
29056 .n(16)
29057 .k(1)
29058 .Test(xnn_f32_igemm_ukernel_1x16__avx_broadcast);
29059 }
29060
29061 TEST(F32_IGEMM_1X16__AVX_BROADCAST, strided_cn) {
29062 TEST_REQUIRES_X86_AVX;
29063 GemmMicrokernelTester()
29064 .mr(1)
29065 .nr(16)
29066 .kr(1)
29067 .sr(1)
29068 .m(1)
29069 .n(16)
29070 .k(1)
29071 .cn_stride(19)
29072 .Test(xnn_f32_igemm_ukernel_1x16__avx_broadcast);
29073 }
29074
29075 TEST(F32_IGEMM_1X16__AVX_BROADCAST, k_eq_1_subtile) {
29076 TEST_REQUIRES_X86_AVX;
29077 for (uint32_t m = 1; m <= 1; m++) {
29078 for (uint32_t n = 1; n <= 16; n++) {
29079 GemmMicrokernelTester()
29080 .mr(1)
29081 .nr(16)
29082 .kr(1)
29083 .sr(1)
29084 .m(m)
29085 .n(n)
29086 .k(1)
29087 .iterations(1)
29088 .Test(xnn_f32_igemm_ukernel_1x16__avx_broadcast);
29089 }
29090 }
29091 }
29092
29093 TEST(F32_IGEMM_1X16__AVX_BROADCAST, k_eq_1_subtile_m) {
29094 TEST_REQUIRES_X86_AVX;
29095 for (uint32_t m = 1; m <= 1; m++) {
29096 GemmMicrokernelTester()
29097 .mr(1)
29098 .nr(16)
29099 .kr(1)
29100 .sr(1)
29101 .m(m)
29102 .n(16)
29103 .k(1)
29104 .iterations(1)
29105 .Test(xnn_f32_igemm_ukernel_1x16__avx_broadcast);
29106 }
29107 }
29108
29109 TEST(F32_IGEMM_1X16__AVX_BROADCAST, k_eq_1_subtile_n) {
29110 TEST_REQUIRES_X86_AVX;
29111 for (uint32_t n = 1; n <= 16; n++) {
29112 GemmMicrokernelTester()
29113 .mr(1)
29114 .nr(16)
29115 .kr(1)
29116 .sr(1)
29117 .m(1)
29118 .n(n)
29119 .k(1)
29120 .iterations(1)
29121 .Test(xnn_f32_igemm_ukernel_1x16__avx_broadcast);
29122 }
29123 }
29124
29125 TEST(F32_IGEMM_1X16__AVX_BROADCAST, k_gt_1) {
29126 TEST_REQUIRES_X86_AVX;
29127 for (size_t k = 2; k < 10; k++) {
29128 GemmMicrokernelTester()
29129 .mr(1)
29130 .nr(16)
29131 .kr(1)
29132 .sr(1)
29133 .m(1)
29134 .n(16)
29135 .k(k)
29136 .Test(xnn_f32_igemm_ukernel_1x16__avx_broadcast);
29137 }
29138 }
29139
29140 TEST(F32_IGEMM_1X16__AVX_BROADCAST, k_gt_1_subtile) {
29141 TEST_REQUIRES_X86_AVX;
29142 for (size_t k = 2; k < 10; k++) {
29143 for (uint32_t m = 1; m <= 1; m++) {
29144 for (uint32_t n = 1; n <= 16; n++) {
29145 GemmMicrokernelTester()
29146 .mr(1)
29147 .nr(16)
29148 .kr(1)
29149 .sr(1)
29150 .m(m)
29151 .n(n)
29152 .k(k)
29153 .iterations(1)
29154 .Test(xnn_f32_igemm_ukernel_1x16__avx_broadcast);
29155 }
29156 }
29157 }
29158 }
29159
29160 TEST(F32_IGEMM_1X16__AVX_BROADCAST, n_gt_16) {
29161 TEST_REQUIRES_X86_AVX;
29162 for (uint32_t n = 17; n < 32; n++) {
29163 for (size_t k = 1; k <= 5; k += 2) {
29164 GemmMicrokernelTester()
29165 .mr(1)
29166 .nr(16)
29167 .kr(1)
29168 .sr(1)
29169 .m(1)
29170 .n(16)
29171 .k(k)
29172 .Test(xnn_f32_igemm_ukernel_1x16__avx_broadcast);
29173 }
29174 }
29175 }
29176
29177 TEST(F32_IGEMM_1X16__AVX_BROADCAST, n_gt_16_strided_cn) {
29178 TEST_REQUIRES_X86_AVX;
29179 for (uint32_t n = 17; n < 32; n++) {
29180 for (size_t k = 1; k <= 5; k += 2) {
29181 GemmMicrokernelTester()
29182 .mr(1)
29183 .nr(16)
29184 .kr(1)
29185 .sr(1)
29186 .m(1)
29187 .n(16)
29188 .k(k)
29189 .cn_stride(19)
29190 .Test(xnn_f32_igemm_ukernel_1x16__avx_broadcast);
29191 }
29192 }
29193 }
29194
29195 TEST(F32_IGEMM_1X16__AVX_BROADCAST, n_gt_16_subtile) {
29196 TEST_REQUIRES_X86_AVX;
29197 for (uint32_t n = 17; n < 32; n++) {
29198 for (size_t k = 1; k <= 5; k += 2) {
29199 for (uint32_t m = 1; m <= 1; m++) {
29200 GemmMicrokernelTester()
29201 .mr(1)
29202 .nr(16)
29203 .kr(1)
29204 .sr(1)
29205 .m(m)
29206 .n(n)
29207 .k(k)
29208 .iterations(1)
29209 .Test(xnn_f32_igemm_ukernel_1x16__avx_broadcast);
29210 }
29211 }
29212 }
29213 }
29214
29215 TEST(F32_IGEMM_1X16__AVX_BROADCAST, n_div_16) {
29216 TEST_REQUIRES_X86_AVX;
29217 for (uint32_t n = 32; n <= 48; n += 16) {
29218 for (size_t k = 1; k <= 5; k += 2) {
29219 GemmMicrokernelTester()
29220 .mr(1)
29221 .nr(16)
29222 .kr(1)
29223 .sr(1)
29224 .m(1)
29225 .n(16)
29226 .k(k)
29227 .Test(xnn_f32_igemm_ukernel_1x16__avx_broadcast);
29228 }
29229 }
29230 }
29231
29232 TEST(F32_IGEMM_1X16__AVX_BROADCAST, n_div_16_strided_cn) {
29233 TEST_REQUIRES_X86_AVX;
29234 for (uint32_t n = 32; n <= 48; n += 16) {
29235 for (size_t k = 1; k <= 5; k += 2) {
29236 GemmMicrokernelTester()
29237 .mr(1)
29238 .nr(16)
29239 .kr(1)
29240 .sr(1)
29241 .m(1)
29242 .n(n)
29243 .k(k)
29244 .cn_stride(19)
29245 .Test(xnn_f32_igemm_ukernel_1x16__avx_broadcast);
29246 }
29247 }
29248 }
29249
29250 TEST(F32_IGEMM_1X16__AVX_BROADCAST, n_div_16_subtile) {
29251 TEST_REQUIRES_X86_AVX;
29252 for (uint32_t n = 32; n <= 48; n += 16) {
29253 for (size_t k = 1; k <= 5; k += 2) {
29254 for (uint32_t m = 1; m <= 1; m++) {
29255 GemmMicrokernelTester()
29256 .mr(1)
29257 .nr(16)
29258 .kr(1)
29259 .sr(1)
29260 .m(m)
29261 .n(n)
29262 .k(k)
29263 .iterations(1)
29264 .Test(xnn_f32_igemm_ukernel_1x16__avx_broadcast);
29265 }
29266 }
29267 }
29268 }
29269
29270 TEST(F32_IGEMM_1X16__AVX_BROADCAST, small_kernel) {
29271 TEST_REQUIRES_X86_AVX;
29272 for (size_t k = 1; k <= 5; k += 2) {
29273 GemmMicrokernelTester()
29274 .mr(1)
29275 .nr(16)
29276 .kr(1)
29277 .sr(1)
29278 .m(1)
29279 .n(16)
29280 .k(k)
29281 .ks(3)
29282 .Test(xnn_f32_igemm_ukernel_1x16__avx_broadcast);
29283 }
29284 }
29285
29286 TEST(F32_IGEMM_1X16__AVX_BROADCAST, small_kernel_subtile) {
29287 TEST_REQUIRES_X86_AVX;
29288 for (size_t k = 1; k <= 5; k += 2) {
29289 for (uint32_t m = 1; m <= 1; m++) {
29290 for (uint32_t n = 1; n <= 16; n++) {
29291 GemmMicrokernelTester()
29292 .mr(1)
29293 .nr(16)
29294 .kr(1)
29295 .sr(1)
29296 .m(m)
29297 .n(n)
29298 .k(k)
29299 .ks(3)
29300 .iterations(1)
29301 .Test(xnn_f32_igemm_ukernel_1x16__avx_broadcast);
29302 }
29303 }
29304 }
29305 }
29306
29307 TEST(F32_IGEMM_1X16__AVX_BROADCAST, n_gt_16_small_kernel) {
29308 TEST_REQUIRES_X86_AVX;
29309 for (uint32_t n = 17; n < 32; n++) {
29310 for (size_t k = 1; k <= 5; k += 2) {
29311 GemmMicrokernelTester()
29312 .mr(1)
29313 .nr(16)
29314 .kr(1)
29315 .sr(1)
29316 .m(1)
29317 .n(16)
29318 .k(k)
29319 .ks(3)
29320 .Test(xnn_f32_igemm_ukernel_1x16__avx_broadcast);
29321 }
29322 }
29323 }
29324
29325 TEST(F32_IGEMM_1X16__AVX_BROADCAST, n_div_16_small_kernel) {
29326 TEST_REQUIRES_X86_AVX;
29327 for (uint32_t n = 32; n <= 48; n += 16) {
29328 for (size_t k = 1; k <= 5; k += 2) {
29329 GemmMicrokernelTester()
29330 .mr(1)
29331 .nr(16)
29332 .kr(1)
29333 .sr(1)
29334 .m(1)
29335 .n(16)
29336 .k(k)
29337 .ks(3)
29338 .Test(xnn_f32_igemm_ukernel_1x16__avx_broadcast);
29339 }
29340 }
29341 }
29342
29343 TEST(F32_IGEMM_1X16__AVX_BROADCAST, strided_cm_subtile) {
29344 TEST_REQUIRES_X86_AVX;
29345 for (size_t k = 1; k <= 5; k += 2) {
29346 for (uint32_t m = 1; m <= 1; m++) {
29347 for (uint32_t n = 1; n <= 16; n++) {
29348 GemmMicrokernelTester()
29349 .mr(1)
29350 .nr(16)
29351 .kr(1)
29352 .sr(1)
29353 .m(m)
29354 .n(n)
29355 .k(k)
29356 .cm_stride(19)
29357 .iterations(1)
29358 .Test(xnn_f32_igemm_ukernel_1x16__avx_broadcast);
29359 }
29360 }
29361 }
29362 }
29363
29364 TEST(F32_IGEMM_1X16__AVX_BROADCAST, a_offset) {
29365 TEST_REQUIRES_X86_AVX;
29366 for (size_t k = 1; k <= 5; k += 2) {
29367 GemmMicrokernelTester()
29368 .mr(1)
29369 .nr(16)
29370 .kr(1)
29371 .sr(1)
29372 .m(1)
29373 .n(16)
29374 .k(k)
29375 .ks(3)
29376 .a_offset(7)
29377 .Test(xnn_f32_igemm_ukernel_1x16__avx_broadcast);
29378 }
29379 }
29380
29381 TEST(F32_IGEMM_1X16__AVX_BROADCAST, zero) {
29382 TEST_REQUIRES_X86_AVX;
29383 for (uint32_t mz = 0; mz < 1; mz++) {
29384 for (size_t k = 1; k <= 5; k += 2) {
29385 GemmMicrokernelTester()
29386 .mr(1)
29387 .nr(16)
29388 .kr(1)
29389 .sr(1)
29390 .m(1)
29391 .n(16)
29392 .k(k)
29393 .ks(3)
29394 .a_offset(7)
29395 .zero_index(mz)
29396 .Test(xnn_f32_igemm_ukernel_1x16__avx_broadcast);
29397 }
29398 }
29399 }
29400
29401 TEST(F32_IGEMM_1X16__AVX_BROADCAST, qmin) {
29402 TEST_REQUIRES_X86_AVX;
29403 GemmMicrokernelTester()
29404 .mr(1)
29405 .nr(16)
29406 .kr(1)
29407 .sr(1)
29408 .m(1)
29409 .n(16)
29410 .k(1)
29411 .qmin(128)
29412 .Test(xnn_f32_igemm_ukernel_1x16__avx_broadcast);
29413 }
29414
29415 TEST(F32_IGEMM_1X16__AVX_BROADCAST, qmax) {
29416 TEST_REQUIRES_X86_AVX;
29417 GemmMicrokernelTester()
29418 .mr(1)
29419 .nr(16)
29420 .kr(1)
29421 .sr(1)
29422 .m(1)
29423 .n(16)
29424 .k(1)
29425 .qmax(128)
29426 .Test(xnn_f32_igemm_ukernel_1x16__avx_broadcast);
29427 }
29428
29429 TEST(F32_IGEMM_1X16__AVX_BROADCAST, strided_cm) {
29430 TEST_REQUIRES_X86_AVX;
29431 GemmMicrokernelTester()
29432 .mr(1)
29433 .nr(16)
29434 .kr(1)
29435 .sr(1)
29436 .m(1)
29437 .n(16)
29438 .k(1)
29439 .cm_stride(19)
29440 .Test(xnn_f32_igemm_ukernel_1x16__avx_broadcast);
29441 }
29442#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
29443
29444
29445#if XNN_ARCH_X86 || XNN_ARCH_X86_64
29446 TEST(F32_IGEMM_3X16__AVX_BROADCAST, k_eq_1) {
29447 TEST_REQUIRES_X86_AVX;
29448 GemmMicrokernelTester()
29449 .mr(3)
29450 .nr(16)
29451 .kr(1)
29452 .sr(1)
29453 .m(3)
29454 .n(16)
29455 .k(1)
29456 .Test(xnn_f32_igemm_ukernel_3x16__avx_broadcast);
29457 }
29458
29459 TEST(F32_IGEMM_3X16__AVX_BROADCAST, strided_cn) {
29460 TEST_REQUIRES_X86_AVX;
29461 GemmMicrokernelTester()
29462 .mr(3)
29463 .nr(16)
29464 .kr(1)
29465 .sr(1)
29466 .m(3)
29467 .n(16)
29468 .k(1)
29469 .cn_stride(19)
29470 .Test(xnn_f32_igemm_ukernel_3x16__avx_broadcast);
29471 }
29472
29473 TEST(F32_IGEMM_3X16__AVX_BROADCAST, k_eq_1_subtile) {
29474 TEST_REQUIRES_X86_AVX;
29475 for (uint32_t m = 1; m <= 3; m++) {
29476 for (uint32_t n = 1; n <= 16; n++) {
29477 GemmMicrokernelTester()
29478 .mr(3)
29479 .nr(16)
29480 .kr(1)
29481 .sr(1)
29482 .m(m)
29483 .n(n)
29484 .k(1)
29485 .iterations(1)
29486 .Test(xnn_f32_igemm_ukernel_3x16__avx_broadcast);
29487 }
29488 }
29489 }
29490
29491 TEST(F32_IGEMM_3X16__AVX_BROADCAST, k_eq_1_subtile_m) {
29492 TEST_REQUIRES_X86_AVX;
29493 for (uint32_t m = 1; m <= 3; m++) {
29494 GemmMicrokernelTester()
29495 .mr(3)
29496 .nr(16)
29497 .kr(1)
29498 .sr(1)
29499 .m(m)
29500 .n(16)
29501 .k(1)
29502 .iterations(1)
29503 .Test(xnn_f32_igemm_ukernel_3x16__avx_broadcast);
29504 }
29505 }
29506
29507 TEST(F32_IGEMM_3X16__AVX_BROADCAST, k_eq_1_subtile_n) {
29508 TEST_REQUIRES_X86_AVX;
29509 for (uint32_t n = 1; n <= 16; n++) {
29510 GemmMicrokernelTester()
29511 .mr(3)
29512 .nr(16)
29513 .kr(1)
29514 .sr(1)
29515 .m(3)
29516 .n(n)
29517 .k(1)
29518 .iterations(1)
29519 .Test(xnn_f32_igemm_ukernel_3x16__avx_broadcast);
29520 }
29521 }
29522
29523 TEST(F32_IGEMM_3X16__AVX_BROADCAST, k_gt_1) {
29524 TEST_REQUIRES_X86_AVX;
29525 for (size_t k = 2; k < 10; k++) {
29526 GemmMicrokernelTester()
29527 .mr(3)
29528 .nr(16)
29529 .kr(1)
29530 .sr(1)
29531 .m(3)
29532 .n(16)
29533 .k(k)
29534 .Test(xnn_f32_igemm_ukernel_3x16__avx_broadcast);
29535 }
29536 }
29537
29538 TEST(F32_IGEMM_3X16__AVX_BROADCAST, k_gt_1_subtile) {
29539 TEST_REQUIRES_X86_AVX;
29540 for (size_t k = 2; k < 10; k++) {
29541 for (uint32_t m = 1; m <= 3; m++) {
29542 for (uint32_t n = 1; n <= 16; n++) {
29543 GemmMicrokernelTester()
29544 .mr(3)
29545 .nr(16)
29546 .kr(1)
29547 .sr(1)
29548 .m(m)
29549 .n(n)
29550 .k(k)
29551 .iterations(1)
29552 .Test(xnn_f32_igemm_ukernel_3x16__avx_broadcast);
29553 }
29554 }
29555 }
29556 }
29557
29558 TEST(F32_IGEMM_3X16__AVX_BROADCAST, n_gt_16) {
29559 TEST_REQUIRES_X86_AVX;
29560 for (uint32_t n = 17; n < 32; n++) {
29561 for (size_t k = 1; k <= 5; k += 2) {
29562 GemmMicrokernelTester()
29563 .mr(3)
29564 .nr(16)
29565 .kr(1)
29566 .sr(1)
29567 .m(3)
29568 .n(16)
29569 .k(k)
29570 .Test(xnn_f32_igemm_ukernel_3x16__avx_broadcast);
29571 }
29572 }
29573 }
29574
29575 TEST(F32_IGEMM_3X16__AVX_BROADCAST, n_gt_16_strided_cn) {
29576 TEST_REQUIRES_X86_AVX;
29577 for (uint32_t n = 17; n < 32; n++) {
29578 for (size_t k = 1; k <= 5; k += 2) {
29579 GemmMicrokernelTester()
29580 .mr(3)
29581 .nr(16)
29582 .kr(1)
29583 .sr(1)
29584 .m(3)
29585 .n(16)
29586 .k(k)
29587 .cn_stride(19)
29588 .Test(xnn_f32_igemm_ukernel_3x16__avx_broadcast);
29589 }
29590 }
29591 }
29592
29593 TEST(F32_IGEMM_3X16__AVX_BROADCAST, n_gt_16_subtile) {
29594 TEST_REQUIRES_X86_AVX;
29595 for (uint32_t n = 17; n < 32; n++) {
29596 for (size_t k = 1; k <= 5; k += 2) {
29597 for (uint32_t m = 1; m <= 3; m++) {
29598 GemmMicrokernelTester()
29599 .mr(3)
29600 .nr(16)
29601 .kr(1)
29602 .sr(1)
29603 .m(m)
29604 .n(n)
29605 .k(k)
29606 .iterations(1)
29607 .Test(xnn_f32_igemm_ukernel_3x16__avx_broadcast);
29608 }
29609 }
29610 }
29611 }
29612
29613 TEST(F32_IGEMM_3X16__AVX_BROADCAST, n_div_16) {
29614 TEST_REQUIRES_X86_AVX;
29615 for (uint32_t n = 32; n <= 48; n += 16) {
29616 for (size_t k = 1; k <= 5; k += 2) {
29617 GemmMicrokernelTester()
29618 .mr(3)
29619 .nr(16)
29620 .kr(1)
29621 .sr(1)
29622 .m(3)
29623 .n(16)
29624 .k(k)
29625 .Test(xnn_f32_igemm_ukernel_3x16__avx_broadcast);
29626 }
29627 }
29628 }
29629
29630 TEST(F32_IGEMM_3X16__AVX_BROADCAST, n_div_16_strided_cn) {
29631 TEST_REQUIRES_X86_AVX;
29632 for (uint32_t n = 32; n <= 48; n += 16) {
29633 for (size_t k = 1; k <= 5; k += 2) {
29634 GemmMicrokernelTester()
29635 .mr(3)
29636 .nr(16)
29637 .kr(1)
29638 .sr(1)
29639 .m(3)
29640 .n(n)
29641 .k(k)
29642 .cn_stride(19)
29643 .Test(xnn_f32_igemm_ukernel_3x16__avx_broadcast);
29644 }
29645 }
29646 }
29647
29648 TEST(F32_IGEMM_3X16__AVX_BROADCAST, n_div_16_subtile) {
29649 TEST_REQUIRES_X86_AVX;
29650 for (uint32_t n = 32; n <= 48; n += 16) {
29651 for (size_t k = 1; k <= 5; k += 2) {
29652 for (uint32_t m = 1; m <= 3; m++) {
29653 GemmMicrokernelTester()
29654 .mr(3)
29655 .nr(16)
29656 .kr(1)
29657 .sr(1)
29658 .m(m)
29659 .n(n)
29660 .k(k)
29661 .iterations(1)
29662 .Test(xnn_f32_igemm_ukernel_3x16__avx_broadcast);
29663 }
29664 }
29665 }
29666 }
29667
29668 TEST(F32_IGEMM_3X16__AVX_BROADCAST, small_kernel) {
29669 TEST_REQUIRES_X86_AVX;
29670 for (size_t k = 1; k <= 5; k += 2) {
29671 GemmMicrokernelTester()
29672 .mr(3)
29673 .nr(16)
29674 .kr(1)
29675 .sr(1)
29676 .m(3)
29677 .n(16)
29678 .k(k)
29679 .ks(3)
29680 .Test(xnn_f32_igemm_ukernel_3x16__avx_broadcast);
29681 }
29682 }
29683
29684 TEST(F32_IGEMM_3X16__AVX_BROADCAST, small_kernel_subtile) {
29685 TEST_REQUIRES_X86_AVX;
29686 for (size_t k = 1; k <= 5; k += 2) {
29687 for (uint32_t m = 1; m <= 3; m++) {
29688 for (uint32_t n = 1; n <= 16; n++) {
29689 GemmMicrokernelTester()
29690 .mr(3)
29691 .nr(16)
29692 .kr(1)
29693 .sr(1)
29694 .m(m)
29695 .n(n)
29696 .k(k)
29697 .ks(3)
29698 .iterations(1)
29699 .Test(xnn_f32_igemm_ukernel_3x16__avx_broadcast);
29700 }
29701 }
29702 }
29703 }
29704
29705 TEST(F32_IGEMM_3X16__AVX_BROADCAST, n_gt_16_small_kernel) {
29706 TEST_REQUIRES_X86_AVX;
29707 for (uint32_t n = 17; n < 32; n++) {
29708 for (size_t k = 1; k <= 5; k += 2) {
29709 GemmMicrokernelTester()
29710 .mr(3)
29711 .nr(16)
29712 .kr(1)
29713 .sr(1)
29714 .m(3)
29715 .n(16)
29716 .k(k)
29717 .ks(3)
29718 .Test(xnn_f32_igemm_ukernel_3x16__avx_broadcast);
29719 }
29720 }
29721 }
29722
29723 TEST(F32_IGEMM_3X16__AVX_BROADCAST, n_div_16_small_kernel) {
29724 TEST_REQUIRES_X86_AVX;
29725 for (uint32_t n = 32; n <= 48; n += 16) {
29726 for (size_t k = 1; k <= 5; k += 2) {
29727 GemmMicrokernelTester()
29728 .mr(3)
29729 .nr(16)
29730 .kr(1)
29731 .sr(1)
29732 .m(3)
29733 .n(16)
29734 .k(k)
29735 .ks(3)
29736 .Test(xnn_f32_igemm_ukernel_3x16__avx_broadcast);
29737 }
29738 }
29739 }
29740
29741 TEST(F32_IGEMM_3X16__AVX_BROADCAST, strided_cm_subtile) {
29742 TEST_REQUIRES_X86_AVX;
29743 for (size_t k = 1; k <= 5; k += 2) {
29744 for (uint32_t m = 1; m <= 3; m++) {
29745 for (uint32_t n = 1; n <= 16; n++) {
29746 GemmMicrokernelTester()
29747 .mr(3)
29748 .nr(16)
29749 .kr(1)
29750 .sr(1)
29751 .m(m)
29752 .n(n)
29753 .k(k)
29754 .cm_stride(19)
29755 .iterations(1)
29756 .Test(xnn_f32_igemm_ukernel_3x16__avx_broadcast);
29757 }
29758 }
29759 }
29760 }
29761
29762 TEST(F32_IGEMM_3X16__AVX_BROADCAST, a_offset) {
29763 TEST_REQUIRES_X86_AVX;
29764 for (size_t k = 1; k <= 5; k += 2) {
29765 GemmMicrokernelTester()
29766 .mr(3)
29767 .nr(16)
29768 .kr(1)
29769 .sr(1)
29770 .m(3)
29771 .n(16)
29772 .k(k)
29773 .ks(3)
29774 .a_offset(17)
29775 .Test(xnn_f32_igemm_ukernel_3x16__avx_broadcast);
29776 }
29777 }
29778
29779 TEST(F32_IGEMM_3X16__AVX_BROADCAST, zero) {
29780 TEST_REQUIRES_X86_AVX;
29781 for (uint32_t mz = 0; mz < 3; mz++) {
29782 for (size_t k = 1; k <= 5; k += 2) {
29783 GemmMicrokernelTester()
29784 .mr(3)
29785 .nr(16)
29786 .kr(1)
29787 .sr(1)
29788 .m(3)
29789 .n(16)
29790 .k(k)
29791 .ks(3)
29792 .a_offset(17)
29793 .zero_index(mz)
29794 .Test(xnn_f32_igemm_ukernel_3x16__avx_broadcast);
29795 }
29796 }
29797 }
29798
29799 TEST(F32_IGEMM_3X16__AVX_BROADCAST, qmin) {
29800 TEST_REQUIRES_X86_AVX;
29801 GemmMicrokernelTester()
29802 .mr(3)
29803 .nr(16)
29804 .kr(1)
29805 .sr(1)
29806 .m(3)
29807 .n(16)
29808 .k(1)
29809 .qmin(128)
29810 .Test(xnn_f32_igemm_ukernel_3x16__avx_broadcast);
29811 }
29812
29813 TEST(F32_IGEMM_3X16__AVX_BROADCAST, qmax) {
29814 TEST_REQUIRES_X86_AVX;
29815 GemmMicrokernelTester()
29816 .mr(3)
29817 .nr(16)
29818 .kr(1)
29819 .sr(1)
29820 .m(3)
29821 .n(16)
29822 .k(1)
29823 .qmax(128)
29824 .Test(xnn_f32_igemm_ukernel_3x16__avx_broadcast);
29825 }
29826
29827 TEST(F32_IGEMM_3X16__AVX_BROADCAST, strided_cm) {
29828 TEST_REQUIRES_X86_AVX;
29829 GemmMicrokernelTester()
29830 .mr(3)
29831 .nr(16)
29832 .kr(1)
29833 .sr(1)
29834 .m(3)
29835 .n(16)
29836 .k(1)
29837 .cm_stride(19)
29838 .Test(xnn_f32_igemm_ukernel_3x16__avx_broadcast);
29839 }
29840#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
29841
29842
29843#if XNN_ARCH_X86 || XNN_ARCH_X86_64
29844 TEST(F32_IGEMM_4X16__AVX_BROADCAST, k_eq_1) {
29845 TEST_REQUIRES_X86_AVX;
29846 GemmMicrokernelTester()
29847 .mr(4)
29848 .nr(16)
29849 .kr(1)
29850 .sr(1)
29851 .m(4)
29852 .n(16)
29853 .k(1)
29854 .Test(xnn_f32_igemm_ukernel_4x16__avx_broadcast);
29855 }
29856
29857 TEST(F32_IGEMM_4X16__AVX_BROADCAST, strided_cn) {
29858 TEST_REQUIRES_X86_AVX;
29859 GemmMicrokernelTester()
29860 .mr(4)
29861 .nr(16)
29862 .kr(1)
29863 .sr(1)
29864 .m(4)
29865 .n(16)
29866 .k(1)
29867 .cn_stride(19)
29868 .Test(xnn_f32_igemm_ukernel_4x16__avx_broadcast);
29869 }
29870
29871 TEST(F32_IGEMM_4X16__AVX_BROADCAST, k_eq_1_subtile) {
29872 TEST_REQUIRES_X86_AVX;
29873 for (uint32_t m = 1; m <= 4; m++) {
29874 for (uint32_t n = 1; n <= 16; n++) {
29875 GemmMicrokernelTester()
29876 .mr(4)
29877 .nr(16)
29878 .kr(1)
29879 .sr(1)
29880 .m(m)
29881 .n(n)
29882 .k(1)
29883 .iterations(1)
29884 .Test(xnn_f32_igemm_ukernel_4x16__avx_broadcast);
29885 }
29886 }
29887 }
29888
29889 TEST(F32_IGEMM_4X16__AVX_BROADCAST, k_eq_1_subtile_m) {
29890 TEST_REQUIRES_X86_AVX;
29891 for (uint32_t m = 1; m <= 4; m++) {
29892 GemmMicrokernelTester()
29893 .mr(4)
29894 .nr(16)
29895 .kr(1)
29896 .sr(1)
29897 .m(m)
29898 .n(16)
29899 .k(1)
29900 .iterations(1)
29901 .Test(xnn_f32_igemm_ukernel_4x16__avx_broadcast);
29902 }
29903 }
29904
29905 TEST(F32_IGEMM_4X16__AVX_BROADCAST, k_eq_1_subtile_n) {
29906 TEST_REQUIRES_X86_AVX;
29907 for (uint32_t n = 1; n <= 16; n++) {
29908 GemmMicrokernelTester()
29909 .mr(4)
29910 .nr(16)
29911 .kr(1)
29912 .sr(1)
29913 .m(4)
29914 .n(n)
29915 .k(1)
29916 .iterations(1)
29917 .Test(xnn_f32_igemm_ukernel_4x16__avx_broadcast);
29918 }
29919 }
29920
29921 TEST(F32_IGEMM_4X16__AVX_BROADCAST, k_gt_1) {
29922 TEST_REQUIRES_X86_AVX;
29923 for (size_t k = 2; k < 10; k++) {
29924 GemmMicrokernelTester()
29925 .mr(4)
29926 .nr(16)
29927 .kr(1)
29928 .sr(1)
29929 .m(4)
29930 .n(16)
29931 .k(k)
29932 .Test(xnn_f32_igemm_ukernel_4x16__avx_broadcast);
29933 }
29934 }
29935
29936 TEST(F32_IGEMM_4X16__AVX_BROADCAST, k_gt_1_subtile) {
29937 TEST_REQUIRES_X86_AVX;
29938 for (size_t k = 2; k < 10; k++) {
29939 for (uint32_t m = 1; m <= 4; m++) {
29940 for (uint32_t n = 1; n <= 16; n++) {
29941 GemmMicrokernelTester()
29942 .mr(4)
29943 .nr(16)
29944 .kr(1)
29945 .sr(1)
29946 .m(m)
29947 .n(n)
29948 .k(k)
29949 .iterations(1)
29950 .Test(xnn_f32_igemm_ukernel_4x16__avx_broadcast);
29951 }
29952 }
29953 }
29954 }
29955
29956 TEST(F32_IGEMM_4X16__AVX_BROADCAST, n_gt_16) {
29957 TEST_REQUIRES_X86_AVX;
29958 for (uint32_t n = 17; n < 32; n++) {
29959 for (size_t k = 1; k <= 5; k += 2) {
29960 GemmMicrokernelTester()
29961 .mr(4)
29962 .nr(16)
29963 .kr(1)
29964 .sr(1)
29965 .m(4)
29966 .n(16)
29967 .k(k)
29968 .Test(xnn_f32_igemm_ukernel_4x16__avx_broadcast);
29969 }
29970 }
29971 }
29972
29973 TEST(F32_IGEMM_4X16__AVX_BROADCAST, n_gt_16_strided_cn) {
29974 TEST_REQUIRES_X86_AVX;
29975 for (uint32_t n = 17; n < 32; n++) {
29976 for (size_t k = 1; k <= 5; k += 2) {
29977 GemmMicrokernelTester()
29978 .mr(4)
29979 .nr(16)
29980 .kr(1)
29981 .sr(1)
29982 .m(4)
29983 .n(16)
29984 .k(k)
29985 .cn_stride(19)
29986 .Test(xnn_f32_igemm_ukernel_4x16__avx_broadcast);
29987 }
29988 }
29989 }
29990
29991 TEST(F32_IGEMM_4X16__AVX_BROADCAST, n_gt_16_subtile) {
29992 TEST_REQUIRES_X86_AVX;
29993 for (uint32_t n = 17; n < 32; n++) {
29994 for (size_t k = 1; k <= 5; k += 2) {
29995 for (uint32_t m = 1; m <= 4; m++) {
29996 GemmMicrokernelTester()
29997 .mr(4)
29998 .nr(16)
29999 .kr(1)
30000 .sr(1)
30001 .m(m)
30002 .n(n)
30003 .k(k)
30004 .iterations(1)
30005 .Test(xnn_f32_igemm_ukernel_4x16__avx_broadcast);
30006 }
30007 }
30008 }
30009 }
30010
30011 TEST(F32_IGEMM_4X16__AVX_BROADCAST, n_div_16) {
30012 TEST_REQUIRES_X86_AVX;
30013 for (uint32_t n = 32; n <= 48; n += 16) {
30014 for (size_t k = 1; k <= 5; k += 2) {
30015 GemmMicrokernelTester()
30016 .mr(4)
30017 .nr(16)
30018 .kr(1)
30019 .sr(1)
30020 .m(4)
30021 .n(16)
30022 .k(k)
30023 .Test(xnn_f32_igemm_ukernel_4x16__avx_broadcast);
30024 }
30025 }
30026 }
30027
30028 TEST(F32_IGEMM_4X16__AVX_BROADCAST, n_div_16_strided_cn) {
30029 TEST_REQUIRES_X86_AVX;
30030 for (uint32_t n = 32; n <= 48; n += 16) {
30031 for (size_t k = 1; k <= 5; k += 2) {
30032 GemmMicrokernelTester()
30033 .mr(4)
30034 .nr(16)
30035 .kr(1)
30036 .sr(1)
30037 .m(4)
30038 .n(n)
30039 .k(k)
30040 .cn_stride(19)
30041 .Test(xnn_f32_igemm_ukernel_4x16__avx_broadcast);
30042 }
30043 }
30044 }
30045
30046 TEST(F32_IGEMM_4X16__AVX_BROADCAST, n_div_16_subtile) {
30047 TEST_REQUIRES_X86_AVX;
30048 for (uint32_t n = 32; n <= 48; n += 16) {
30049 for (size_t k = 1; k <= 5; k += 2) {
30050 for (uint32_t m = 1; m <= 4; m++) {
30051 GemmMicrokernelTester()
30052 .mr(4)
30053 .nr(16)
30054 .kr(1)
30055 .sr(1)
30056 .m(m)
30057 .n(n)
30058 .k(k)
30059 .iterations(1)
30060 .Test(xnn_f32_igemm_ukernel_4x16__avx_broadcast);
30061 }
30062 }
30063 }
30064 }
30065
30066 TEST(F32_IGEMM_4X16__AVX_BROADCAST, small_kernel) {
30067 TEST_REQUIRES_X86_AVX;
30068 for (size_t k = 1; k <= 5; k += 2) {
30069 GemmMicrokernelTester()
30070 .mr(4)
30071 .nr(16)
30072 .kr(1)
30073 .sr(1)
30074 .m(4)
30075 .n(16)
30076 .k(k)
30077 .ks(3)
30078 .Test(xnn_f32_igemm_ukernel_4x16__avx_broadcast);
30079 }
30080 }
30081
30082 TEST(F32_IGEMM_4X16__AVX_BROADCAST, small_kernel_subtile) {
30083 TEST_REQUIRES_X86_AVX;
30084 for (size_t k = 1; k <= 5; k += 2) {
30085 for (uint32_t m = 1; m <= 4; m++) {
30086 for (uint32_t n = 1; n <= 16; n++) {
30087 GemmMicrokernelTester()
30088 .mr(4)
30089 .nr(16)
30090 .kr(1)
30091 .sr(1)
30092 .m(m)
30093 .n(n)
30094 .k(k)
30095 .ks(3)
30096 .iterations(1)
30097 .Test(xnn_f32_igemm_ukernel_4x16__avx_broadcast);
30098 }
30099 }
30100 }
30101 }
30102
30103 TEST(F32_IGEMM_4X16__AVX_BROADCAST, n_gt_16_small_kernel) {
30104 TEST_REQUIRES_X86_AVX;
30105 for (uint32_t n = 17; n < 32; n++) {
30106 for (size_t k = 1; k <= 5; k += 2) {
30107 GemmMicrokernelTester()
30108 .mr(4)
30109 .nr(16)
30110 .kr(1)
30111 .sr(1)
30112 .m(4)
30113 .n(16)
30114 .k(k)
30115 .ks(3)
30116 .Test(xnn_f32_igemm_ukernel_4x16__avx_broadcast);
30117 }
30118 }
30119 }
30120
30121 TEST(F32_IGEMM_4X16__AVX_BROADCAST, n_div_16_small_kernel) {
30122 TEST_REQUIRES_X86_AVX;
30123 for (uint32_t n = 32; n <= 48; n += 16) {
30124 for (size_t k = 1; k <= 5; k += 2) {
30125 GemmMicrokernelTester()
30126 .mr(4)
30127 .nr(16)
30128 .kr(1)
30129 .sr(1)
30130 .m(4)
30131 .n(16)
30132 .k(k)
30133 .ks(3)
30134 .Test(xnn_f32_igemm_ukernel_4x16__avx_broadcast);
30135 }
30136 }
30137 }
30138
30139 TEST(F32_IGEMM_4X16__AVX_BROADCAST, strided_cm_subtile) {
30140 TEST_REQUIRES_X86_AVX;
30141 for (size_t k = 1; k <= 5; k += 2) {
30142 for (uint32_t m = 1; m <= 4; m++) {
30143 for (uint32_t n = 1; n <= 16; n++) {
30144 GemmMicrokernelTester()
30145 .mr(4)
30146 .nr(16)
30147 .kr(1)
30148 .sr(1)
30149 .m(m)
30150 .n(n)
30151 .k(k)
30152 .cm_stride(19)
30153 .iterations(1)
30154 .Test(xnn_f32_igemm_ukernel_4x16__avx_broadcast);
30155 }
30156 }
30157 }
30158 }
30159
30160 TEST(F32_IGEMM_4X16__AVX_BROADCAST, a_offset) {
30161 TEST_REQUIRES_X86_AVX;
30162 for (size_t k = 1; k <= 5; k += 2) {
30163 GemmMicrokernelTester()
30164 .mr(4)
30165 .nr(16)
30166 .kr(1)
30167 .sr(1)
30168 .m(4)
30169 .n(16)
30170 .k(k)
30171 .ks(3)
30172 .a_offset(23)
30173 .Test(xnn_f32_igemm_ukernel_4x16__avx_broadcast);
30174 }
30175 }
30176
30177 TEST(F32_IGEMM_4X16__AVX_BROADCAST, zero) {
30178 TEST_REQUIRES_X86_AVX;
30179 for (uint32_t mz = 0; mz < 4; mz++) {
30180 for (size_t k = 1; k <= 5; k += 2) {
30181 GemmMicrokernelTester()
30182 .mr(4)
30183 .nr(16)
30184 .kr(1)
30185 .sr(1)
30186 .m(4)
30187 .n(16)
30188 .k(k)
30189 .ks(3)
30190 .a_offset(23)
30191 .zero_index(mz)
30192 .Test(xnn_f32_igemm_ukernel_4x16__avx_broadcast);
30193 }
30194 }
30195 }
30196
30197 TEST(F32_IGEMM_4X16__AVX_BROADCAST, qmin) {
30198 TEST_REQUIRES_X86_AVX;
30199 GemmMicrokernelTester()
30200 .mr(4)
30201 .nr(16)
30202 .kr(1)
30203 .sr(1)
30204 .m(4)
30205 .n(16)
30206 .k(1)
30207 .qmin(128)
30208 .Test(xnn_f32_igemm_ukernel_4x16__avx_broadcast);
30209 }
30210
30211 TEST(F32_IGEMM_4X16__AVX_BROADCAST, qmax) {
30212 TEST_REQUIRES_X86_AVX;
30213 GemmMicrokernelTester()
30214 .mr(4)
30215 .nr(16)
30216 .kr(1)
30217 .sr(1)
30218 .m(4)
30219 .n(16)
30220 .k(1)
30221 .qmax(128)
30222 .Test(xnn_f32_igemm_ukernel_4x16__avx_broadcast);
30223 }
30224
30225 TEST(F32_IGEMM_4X16__AVX_BROADCAST, strided_cm) {
30226 TEST_REQUIRES_X86_AVX;
30227 GemmMicrokernelTester()
30228 .mr(4)
30229 .nr(16)
30230 .kr(1)
30231 .sr(1)
30232 .m(4)
30233 .n(16)
30234 .k(1)
30235 .cm_stride(19)
30236 .Test(xnn_f32_igemm_ukernel_4x16__avx_broadcast);
30237 }
30238#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
30239
30240
30241#if XNN_ARCH_X86 || XNN_ARCH_X86_64
30242 TEST(F32_IGEMM_5X16__AVX_BROADCAST, k_eq_1) {
30243 TEST_REQUIRES_X86_AVX;
30244 GemmMicrokernelTester()
30245 .mr(5)
30246 .nr(16)
30247 .kr(1)
30248 .sr(1)
30249 .m(5)
30250 .n(16)
30251 .k(1)
30252 .Test(xnn_f32_igemm_ukernel_5x16__avx_broadcast);
30253 }
30254
30255 TEST(F32_IGEMM_5X16__AVX_BROADCAST, strided_cn) {
30256 TEST_REQUIRES_X86_AVX;
30257 GemmMicrokernelTester()
30258 .mr(5)
30259 .nr(16)
30260 .kr(1)
30261 .sr(1)
30262 .m(5)
30263 .n(16)
30264 .k(1)
30265 .cn_stride(19)
30266 .Test(xnn_f32_igemm_ukernel_5x16__avx_broadcast);
30267 }
30268
30269 TEST(F32_IGEMM_5X16__AVX_BROADCAST, k_eq_1_subtile) {
30270 TEST_REQUIRES_X86_AVX;
30271 for (uint32_t m = 1; m <= 5; m++) {
30272 for (uint32_t n = 1; n <= 16; n++) {
30273 GemmMicrokernelTester()
30274 .mr(5)
30275 .nr(16)
30276 .kr(1)
30277 .sr(1)
30278 .m(m)
30279 .n(n)
30280 .k(1)
30281 .iterations(1)
30282 .Test(xnn_f32_igemm_ukernel_5x16__avx_broadcast);
30283 }
30284 }
30285 }
30286
30287 TEST(F32_IGEMM_5X16__AVX_BROADCAST, k_eq_1_subtile_m) {
30288 TEST_REQUIRES_X86_AVX;
30289 for (uint32_t m = 1; m <= 5; m++) {
30290 GemmMicrokernelTester()
30291 .mr(5)
30292 .nr(16)
30293 .kr(1)
30294 .sr(1)
30295 .m(m)
30296 .n(16)
30297 .k(1)
30298 .iterations(1)
30299 .Test(xnn_f32_igemm_ukernel_5x16__avx_broadcast);
30300 }
30301 }
30302
30303 TEST(F32_IGEMM_5X16__AVX_BROADCAST, k_eq_1_subtile_n) {
30304 TEST_REQUIRES_X86_AVX;
30305 for (uint32_t n = 1; n <= 16; n++) {
30306 GemmMicrokernelTester()
30307 .mr(5)
30308 .nr(16)
30309 .kr(1)
30310 .sr(1)
30311 .m(5)
30312 .n(n)
30313 .k(1)
30314 .iterations(1)
30315 .Test(xnn_f32_igemm_ukernel_5x16__avx_broadcast);
30316 }
30317 }
30318
30319 TEST(F32_IGEMM_5X16__AVX_BROADCAST, k_gt_1) {
30320 TEST_REQUIRES_X86_AVX;
30321 for (size_t k = 2; k < 10; k++) {
30322 GemmMicrokernelTester()
30323 .mr(5)
30324 .nr(16)
30325 .kr(1)
30326 .sr(1)
30327 .m(5)
30328 .n(16)
30329 .k(k)
30330 .Test(xnn_f32_igemm_ukernel_5x16__avx_broadcast);
30331 }
30332 }
30333
30334 TEST(F32_IGEMM_5X16__AVX_BROADCAST, k_gt_1_subtile) {
30335 TEST_REQUIRES_X86_AVX;
30336 for (size_t k = 2; k < 10; k++) {
30337 for (uint32_t m = 1; m <= 5; m++) {
30338 for (uint32_t n = 1; n <= 16; n++) {
30339 GemmMicrokernelTester()
30340 .mr(5)
30341 .nr(16)
30342 .kr(1)
30343 .sr(1)
30344 .m(m)
30345 .n(n)
30346 .k(k)
30347 .iterations(1)
30348 .Test(xnn_f32_igemm_ukernel_5x16__avx_broadcast);
30349 }
30350 }
30351 }
30352 }
30353
30354 TEST(F32_IGEMM_5X16__AVX_BROADCAST, n_gt_16) {
30355 TEST_REQUIRES_X86_AVX;
30356 for (uint32_t n = 17; n < 32; n++) {
30357 for (size_t k = 1; k <= 5; k += 2) {
30358 GemmMicrokernelTester()
30359 .mr(5)
30360 .nr(16)
30361 .kr(1)
30362 .sr(1)
30363 .m(5)
30364 .n(16)
30365 .k(k)
30366 .Test(xnn_f32_igemm_ukernel_5x16__avx_broadcast);
30367 }
30368 }
30369 }
30370
30371 TEST(F32_IGEMM_5X16__AVX_BROADCAST, n_gt_16_strided_cn) {
30372 TEST_REQUIRES_X86_AVX;
30373 for (uint32_t n = 17; n < 32; n++) {
30374 for (size_t k = 1; k <= 5; k += 2) {
30375 GemmMicrokernelTester()
30376 .mr(5)
30377 .nr(16)
30378 .kr(1)
30379 .sr(1)
30380 .m(5)
30381 .n(16)
30382 .k(k)
30383 .cn_stride(19)
30384 .Test(xnn_f32_igemm_ukernel_5x16__avx_broadcast);
30385 }
30386 }
30387 }
30388
30389 TEST(F32_IGEMM_5X16__AVX_BROADCAST, n_gt_16_subtile) {
30390 TEST_REQUIRES_X86_AVX;
30391 for (uint32_t n = 17; n < 32; n++) {
30392 for (size_t k = 1; k <= 5; k += 2) {
30393 for (uint32_t m = 1; m <= 5; m++) {
30394 GemmMicrokernelTester()
30395 .mr(5)
30396 .nr(16)
30397 .kr(1)
30398 .sr(1)
30399 .m(m)
30400 .n(n)
30401 .k(k)
30402 .iterations(1)
30403 .Test(xnn_f32_igemm_ukernel_5x16__avx_broadcast);
30404 }
30405 }
30406 }
30407 }
30408
30409 TEST(F32_IGEMM_5X16__AVX_BROADCAST, n_div_16) {
30410 TEST_REQUIRES_X86_AVX;
30411 for (uint32_t n = 32; n <= 48; n += 16) {
30412 for (size_t k = 1; k <= 5; k += 2) {
30413 GemmMicrokernelTester()
30414 .mr(5)
30415 .nr(16)
30416 .kr(1)
30417 .sr(1)
30418 .m(5)
30419 .n(16)
30420 .k(k)
30421 .Test(xnn_f32_igemm_ukernel_5x16__avx_broadcast);
30422 }
30423 }
30424 }
30425
30426 TEST(F32_IGEMM_5X16__AVX_BROADCAST, n_div_16_strided_cn) {
30427 TEST_REQUIRES_X86_AVX;
30428 for (uint32_t n = 32; n <= 48; n += 16) {
30429 for (size_t k = 1; k <= 5; k += 2) {
30430 GemmMicrokernelTester()
30431 .mr(5)
30432 .nr(16)
30433 .kr(1)
30434 .sr(1)
30435 .m(5)
30436 .n(n)
30437 .k(k)
30438 .cn_stride(19)
30439 .Test(xnn_f32_igemm_ukernel_5x16__avx_broadcast);
30440 }
30441 }
30442 }
30443
30444 TEST(F32_IGEMM_5X16__AVX_BROADCAST, n_div_16_subtile) {
30445 TEST_REQUIRES_X86_AVX;
30446 for (uint32_t n = 32; n <= 48; n += 16) {
30447 for (size_t k = 1; k <= 5; k += 2) {
30448 for (uint32_t m = 1; m <= 5; m++) {
30449 GemmMicrokernelTester()
30450 .mr(5)
30451 .nr(16)
30452 .kr(1)
30453 .sr(1)
30454 .m(m)
30455 .n(n)
30456 .k(k)
30457 .iterations(1)
30458 .Test(xnn_f32_igemm_ukernel_5x16__avx_broadcast);
30459 }
30460 }
30461 }
30462 }
30463
30464 TEST(F32_IGEMM_5X16__AVX_BROADCAST, small_kernel) {
30465 TEST_REQUIRES_X86_AVX;
30466 for (size_t k = 1; k <= 5; k += 2) {
30467 GemmMicrokernelTester()
30468 .mr(5)
30469 .nr(16)
30470 .kr(1)
30471 .sr(1)
30472 .m(5)
30473 .n(16)
30474 .k(k)
30475 .ks(3)
30476 .Test(xnn_f32_igemm_ukernel_5x16__avx_broadcast);
30477 }
30478 }
30479
30480 TEST(F32_IGEMM_5X16__AVX_BROADCAST, small_kernel_subtile) {
30481 TEST_REQUIRES_X86_AVX;
30482 for (size_t k = 1; k <= 5; k += 2) {
30483 for (uint32_t m = 1; m <= 5; m++) {
30484 for (uint32_t n = 1; n <= 16; n++) {
30485 GemmMicrokernelTester()
30486 .mr(5)
30487 .nr(16)
30488 .kr(1)
30489 .sr(1)
30490 .m(m)
30491 .n(n)
30492 .k(k)
30493 .ks(3)
30494 .iterations(1)
30495 .Test(xnn_f32_igemm_ukernel_5x16__avx_broadcast);
30496 }
30497 }
30498 }
30499 }
30500
30501 TEST(F32_IGEMM_5X16__AVX_BROADCAST, n_gt_16_small_kernel) {
30502 TEST_REQUIRES_X86_AVX;
30503 for (uint32_t n = 17; n < 32; n++) {
30504 for (size_t k = 1; k <= 5; k += 2) {
30505 GemmMicrokernelTester()
30506 .mr(5)
30507 .nr(16)
30508 .kr(1)
30509 .sr(1)
30510 .m(5)
30511 .n(16)
30512 .k(k)
30513 .ks(3)
30514 .Test(xnn_f32_igemm_ukernel_5x16__avx_broadcast);
30515 }
30516 }
30517 }
30518
30519 TEST(F32_IGEMM_5X16__AVX_BROADCAST, n_div_16_small_kernel) {
30520 TEST_REQUIRES_X86_AVX;
30521 for (uint32_t n = 32; n <= 48; n += 16) {
30522 for (size_t k = 1; k <= 5; k += 2) {
30523 GemmMicrokernelTester()
30524 .mr(5)
30525 .nr(16)
30526 .kr(1)
30527 .sr(1)
30528 .m(5)
30529 .n(16)
30530 .k(k)
30531 .ks(3)
30532 .Test(xnn_f32_igemm_ukernel_5x16__avx_broadcast);
30533 }
30534 }
30535 }
30536
30537 TEST(F32_IGEMM_5X16__AVX_BROADCAST, strided_cm_subtile) {
30538 TEST_REQUIRES_X86_AVX;
30539 for (size_t k = 1; k <= 5; k += 2) {
30540 for (uint32_t m = 1; m <= 5; m++) {
30541 for (uint32_t n = 1; n <= 16; n++) {
30542 GemmMicrokernelTester()
30543 .mr(5)
30544 .nr(16)
30545 .kr(1)
30546 .sr(1)
30547 .m(m)
30548 .n(n)
30549 .k(k)
30550 .cm_stride(19)
30551 .iterations(1)
30552 .Test(xnn_f32_igemm_ukernel_5x16__avx_broadcast);
30553 }
30554 }
30555 }
30556 }
30557
30558 TEST(F32_IGEMM_5X16__AVX_BROADCAST, a_offset) {
30559 TEST_REQUIRES_X86_AVX;
30560 for (size_t k = 1; k <= 5; k += 2) {
30561 GemmMicrokernelTester()
30562 .mr(5)
30563 .nr(16)
30564 .kr(1)
30565 .sr(1)
30566 .m(5)
30567 .n(16)
30568 .k(k)
30569 .ks(3)
30570 .a_offset(29)
30571 .Test(xnn_f32_igemm_ukernel_5x16__avx_broadcast);
30572 }
30573 }
30574
30575 TEST(F32_IGEMM_5X16__AVX_BROADCAST, zero) {
30576 TEST_REQUIRES_X86_AVX;
30577 for (uint32_t mz = 0; mz < 5; mz++) {
30578 for (size_t k = 1; k <= 5; k += 2) {
30579 GemmMicrokernelTester()
30580 .mr(5)
30581 .nr(16)
30582 .kr(1)
30583 .sr(1)
30584 .m(5)
30585 .n(16)
30586 .k(k)
30587 .ks(3)
30588 .a_offset(29)
30589 .zero_index(mz)
30590 .Test(xnn_f32_igemm_ukernel_5x16__avx_broadcast);
30591 }
30592 }
30593 }
30594
30595 TEST(F32_IGEMM_5X16__AVX_BROADCAST, qmin) {
30596 TEST_REQUIRES_X86_AVX;
30597 GemmMicrokernelTester()
30598 .mr(5)
30599 .nr(16)
30600 .kr(1)
30601 .sr(1)
30602 .m(5)
30603 .n(16)
30604 .k(1)
30605 .qmin(128)
30606 .Test(xnn_f32_igemm_ukernel_5x16__avx_broadcast);
30607 }
30608
30609 TEST(F32_IGEMM_5X16__AVX_BROADCAST, qmax) {
30610 TEST_REQUIRES_X86_AVX;
30611 GemmMicrokernelTester()
30612 .mr(5)
30613 .nr(16)
30614 .kr(1)
30615 .sr(1)
30616 .m(5)
30617 .n(16)
30618 .k(1)
30619 .qmax(128)
30620 .Test(xnn_f32_igemm_ukernel_5x16__avx_broadcast);
30621 }
30622
30623 TEST(F32_IGEMM_5X16__AVX_BROADCAST, strided_cm) {
30624 TEST_REQUIRES_X86_AVX;
30625 GemmMicrokernelTester()
30626 .mr(5)
30627 .nr(16)
30628 .kr(1)
30629 .sr(1)
30630 .m(5)
30631 .n(16)
30632 .k(1)
30633 .cm_stride(19)
30634 .Test(xnn_f32_igemm_ukernel_5x16__avx_broadcast);
30635 }
30636#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
30637
30638
30639#if XNN_ARCH_X86 || XNN_ARCH_X86_64
Marat Dukhanfda12b82019-11-21 12:27:59 -080030640 TEST(F32_IGEMM_1X8__FMA3_BROADCAST, k_eq_1) {
30641 TEST_REQUIRES_X86_FMA3;
30642 GemmMicrokernelTester()
30643 .mr(1)
30644 .nr(8)
30645 .kr(1)
30646 .sr(1)
30647 .m(1)
30648 .n(8)
30649 .k(1)
30650 .Test(xnn_f32_igemm_ukernel_1x8__fma3_broadcast);
30651 }
30652
30653 TEST(F32_IGEMM_1X8__FMA3_BROADCAST, strided_cn) {
30654 TEST_REQUIRES_X86_FMA3;
30655 GemmMicrokernelTester()
30656 .mr(1)
30657 .nr(8)
30658 .kr(1)
30659 .sr(1)
30660 .m(1)
30661 .n(8)
30662 .k(1)
30663 .cn_stride(11)
30664 .Test(xnn_f32_igemm_ukernel_1x8__fma3_broadcast);
30665 }
30666
30667 TEST(F32_IGEMM_1X8__FMA3_BROADCAST, k_eq_1_subtile) {
30668 TEST_REQUIRES_X86_FMA3;
30669 for (uint32_t m = 1; m <= 1; m++) {
30670 for (uint32_t n = 1; n <= 8; n++) {
30671 GemmMicrokernelTester()
30672 .mr(1)
30673 .nr(8)
30674 .kr(1)
30675 .sr(1)
30676 .m(m)
30677 .n(n)
30678 .k(1)
30679 .iterations(1)
30680 .Test(xnn_f32_igemm_ukernel_1x8__fma3_broadcast);
30681 }
30682 }
30683 }
30684
30685 TEST(F32_IGEMM_1X8__FMA3_BROADCAST, k_eq_1_subtile_m) {
30686 TEST_REQUIRES_X86_FMA3;
30687 for (uint32_t m = 1; m <= 1; m++) {
30688 GemmMicrokernelTester()
30689 .mr(1)
30690 .nr(8)
30691 .kr(1)
30692 .sr(1)
30693 .m(m)
30694 .n(8)
30695 .k(1)
30696 .iterations(1)
30697 .Test(xnn_f32_igemm_ukernel_1x8__fma3_broadcast);
30698 }
30699 }
30700
30701 TEST(F32_IGEMM_1X8__FMA3_BROADCAST, k_eq_1_subtile_n) {
30702 TEST_REQUIRES_X86_FMA3;
30703 for (uint32_t n = 1; n <= 8; n++) {
30704 GemmMicrokernelTester()
30705 .mr(1)
30706 .nr(8)
30707 .kr(1)
30708 .sr(1)
30709 .m(1)
30710 .n(n)
30711 .k(1)
30712 .iterations(1)
30713 .Test(xnn_f32_igemm_ukernel_1x8__fma3_broadcast);
30714 }
30715 }
30716
30717 TEST(F32_IGEMM_1X8__FMA3_BROADCAST, k_gt_1) {
30718 TEST_REQUIRES_X86_FMA3;
30719 for (size_t k = 2; k < 10; k++) {
30720 GemmMicrokernelTester()
30721 .mr(1)
30722 .nr(8)
30723 .kr(1)
30724 .sr(1)
30725 .m(1)
30726 .n(8)
30727 .k(k)
30728 .Test(xnn_f32_igemm_ukernel_1x8__fma3_broadcast);
30729 }
30730 }
30731
30732 TEST(F32_IGEMM_1X8__FMA3_BROADCAST, k_gt_1_subtile) {
30733 TEST_REQUIRES_X86_FMA3;
30734 for (size_t k = 2; k < 10; k++) {
30735 for (uint32_t m = 1; m <= 1; m++) {
30736 for (uint32_t n = 1; n <= 8; n++) {
30737 GemmMicrokernelTester()
30738 .mr(1)
30739 .nr(8)
30740 .kr(1)
30741 .sr(1)
30742 .m(m)
30743 .n(n)
30744 .k(k)
30745 .iterations(1)
30746 .Test(xnn_f32_igemm_ukernel_1x8__fma3_broadcast);
30747 }
30748 }
30749 }
30750 }
30751
30752 TEST(F32_IGEMM_1X8__FMA3_BROADCAST, n_gt_8) {
30753 TEST_REQUIRES_X86_FMA3;
30754 for (uint32_t n = 9; n < 16; n++) {
30755 for (size_t k = 1; k <= 5; k += 2) {
30756 GemmMicrokernelTester()
30757 .mr(1)
30758 .nr(8)
30759 .kr(1)
30760 .sr(1)
30761 .m(1)
30762 .n(8)
30763 .k(k)
30764 .Test(xnn_f32_igemm_ukernel_1x8__fma3_broadcast);
30765 }
30766 }
30767 }
30768
30769 TEST(F32_IGEMM_1X8__FMA3_BROADCAST, n_gt_8_strided_cn) {
30770 TEST_REQUIRES_X86_FMA3;
30771 for (uint32_t n = 9; n < 16; n++) {
30772 for (size_t k = 1; k <= 5; k += 2) {
30773 GemmMicrokernelTester()
30774 .mr(1)
30775 .nr(8)
30776 .kr(1)
30777 .sr(1)
30778 .m(1)
30779 .n(8)
30780 .k(k)
30781 .cn_stride(11)
30782 .Test(xnn_f32_igemm_ukernel_1x8__fma3_broadcast);
30783 }
30784 }
30785 }
30786
30787 TEST(F32_IGEMM_1X8__FMA3_BROADCAST, n_gt_8_subtile) {
30788 TEST_REQUIRES_X86_FMA3;
30789 for (uint32_t n = 9; n < 16; n++) {
30790 for (size_t k = 1; k <= 5; k += 2) {
30791 for (uint32_t m = 1; m <= 1; m++) {
30792 GemmMicrokernelTester()
30793 .mr(1)
30794 .nr(8)
30795 .kr(1)
30796 .sr(1)
30797 .m(m)
30798 .n(n)
30799 .k(k)
30800 .iterations(1)
30801 .Test(xnn_f32_igemm_ukernel_1x8__fma3_broadcast);
30802 }
30803 }
30804 }
30805 }
30806
30807 TEST(F32_IGEMM_1X8__FMA3_BROADCAST, n_div_8) {
30808 TEST_REQUIRES_X86_FMA3;
30809 for (uint32_t n = 16; n <= 24; n += 8) {
30810 for (size_t k = 1; k <= 5; k += 2) {
30811 GemmMicrokernelTester()
30812 .mr(1)
30813 .nr(8)
30814 .kr(1)
30815 .sr(1)
30816 .m(1)
30817 .n(8)
30818 .k(k)
30819 .Test(xnn_f32_igemm_ukernel_1x8__fma3_broadcast);
30820 }
30821 }
30822 }
30823
30824 TEST(F32_IGEMM_1X8__FMA3_BROADCAST, n_div_8_strided_cn) {
30825 TEST_REQUIRES_X86_FMA3;
30826 for (uint32_t n = 16; n <= 24; n += 8) {
30827 for (size_t k = 1; k <= 5; k += 2) {
30828 GemmMicrokernelTester()
30829 .mr(1)
30830 .nr(8)
30831 .kr(1)
30832 .sr(1)
30833 .m(1)
30834 .n(n)
30835 .k(k)
30836 .cn_stride(11)
30837 .Test(xnn_f32_igemm_ukernel_1x8__fma3_broadcast);
30838 }
30839 }
30840 }
30841
30842 TEST(F32_IGEMM_1X8__FMA3_BROADCAST, n_div_8_subtile) {
30843 TEST_REQUIRES_X86_FMA3;
30844 for (uint32_t n = 16; n <= 24; n += 8) {
30845 for (size_t k = 1; k <= 5; k += 2) {
30846 for (uint32_t m = 1; m <= 1; m++) {
30847 GemmMicrokernelTester()
30848 .mr(1)
30849 .nr(8)
30850 .kr(1)
30851 .sr(1)
30852 .m(m)
30853 .n(n)
30854 .k(k)
30855 .iterations(1)
30856 .Test(xnn_f32_igemm_ukernel_1x8__fma3_broadcast);
30857 }
30858 }
30859 }
30860 }
30861
30862 TEST(F32_IGEMM_1X8__FMA3_BROADCAST, small_kernel) {
30863 TEST_REQUIRES_X86_FMA3;
30864 for (size_t k = 1; k <= 5; k += 2) {
30865 GemmMicrokernelTester()
30866 .mr(1)
30867 .nr(8)
30868 .kr(1)
30869 .sr(1)
30870 .m(1)
30871 .n(8)
30872 .k(k)
30873 .ks(3)
30874 .Test(xnn_f32_igemm_ukernel_1x8__fma3_broadcast);
30875 }
30876 }
30877
30878 TEST(F32_IGEMM_1X8__FMA3_BROADCAST, small_kernel_subtile) {
30879 TEST_REQUIRES_X86_FMA3;
30880 for (size_t k = 1; k <= 5; k += 2) {
30881 for (uint32_t m = 1; m <= 1; m++) {
30882 for (uint32_t n = 1; n <= 8; n++) {
30883 GemmMicrokernelTester()
30884 .mr(1)
30885 .nr(8)
30886 .kr(1)
30887 .sr(1)
30888 .m(m)
30889 .n(n)
30890 .k(k)
30891 .ks(3)
30892 .iterations(1)
30893 .Test(xnn_f32_igemm_ukernel_1x8__fma3_broadcast);
30894 }
30895 }
30896 }
30897 }
30898
30899 TEST(F32_IGEMM_1X8__FMA3_BROADCAST, n_gt_8_small_kernel) {
30900 TEST_REQUIRES_X86_FMA3;
30901 for (uint32_t n = 9; n < 16; n++) {
30902 for (size_t k = 1; k <= 5; k += 2) {
30903 GemmMicrokernelTester()
30904 .mr(1)
30905 .nr(8)
30906 .kr(1)
30907 .sr(1)
30908 .m(1)
30909 .n(8)
30910 .k(k)
30911 .ks(3)
30912 .Test(xnn_f32_igemm_ukernel_1x8__fma3_broadcast);
30913 }
30914 }
30915 }
30916
30917 TEST(F32_IGEMM_1X8__FMA3_BROADCAST, n_div_8_small_kernel) {
30918 TEST_REQUIRES_X86_FMA3;
30919 for (uint32_t n = 16; n <= 24; n += 8) {
30920 for (size_t k = 1; k <= 5; k += 2) {
30921 GemmMicrokernelTester()
30922 .mr(1)
30923 .nr(8)
30924 .kr(1)
30925 .sr(1)
30926 .m(1)
30927 .n(8)
30928 .k(k)
30929 .ks(3)
30930 .Test(xnn_f32_igemm_ukernel_1x8__fma3_broadcast);
30931 }
30932 }
30933 }
30934
30935 TEST(F32_IGEMM_1X8__FMA3_BROADCAST, strided_cm_subtile) {
30936 TEST_REQUIRES_X86_FMA3;
30937 for (size_t k = 1; k <= 5; k += 2) {
30938 for (uint32_t m = 1; m <= 1; m++) {
30939 for (uint32_t n = 1; n <= 8; n++) {
30940 GemmMicrokernelTester()
30941 .mr(1)
30942 .nr(8)
30943 .kr(1)
30944 .sr(1)
30945 .m(m)
30946 .n(n)
30947 .k(k)
30948 .cm_stride(11)
30949 .iterations(1)
30950 .Test(xnn_f32_igemm_ukernel_1x8__fma3_broadcast);
30951 }
30952 }
30953 }
30954 }
30955
30956 TEST(F32_IGEMM_1X8__FMA3_BROADCAST, a_offset) {
30957 TEST_REQUIRES_X86_FMA3;
30958 for (size_t k = 1; k <= 5; k += 2) {
30959 GemmMicrokernelTester()
30960 .mr(1)
30961 .nr(8)
30962 .kr(1)
30963 .sr(1)
30964 .m(1)
30965 .n(8)
30966 .k(k)
30967 .ks(3)
30968 .a_offset(7)
30969 .Test(xnn_f32_igemm_ukernel_1x8__fma3_broadcast);
30970 }
30971 }
30972
30973 TEST(F32_IGEMM_1X8__FMA3_BROADCAST, zero) {
30974 TEST_REQUIRES_X86_FMA3;
30975 for (uint32_t mz = 0; mz < 1; mz++) {
30976 for (size_t k = 1; k <= 5; k += 2) {
30977 GemmMicrokernelTester()
30978 .mr(1)
30979 .nr(8)
30980 .kr(1)
30981 .sr(1)
30982 .m(1)
30983 .n(8)
30984 .k(k)
30985 .ks(3)
30986 .a_offset(7)
30987 .zero_index(mz)
30988 .Test(xnn_f32_igemm_ukernel_1x8__fma3_broadcast);
30989 }
30990 }
30991 }
30992
30993 TEST(F32_IGEMM_1X8__FMA3_BROADCAST, qmin) {
30994 TEST_REQUIRES_X86_FMA3;
30995 GemmMicrokernelTester()
30996 .mr(1)
30997 .nr(8)
30998 .kr(1)
30999 .sr(1)
31000 .m(1)
31001 .n(8)
31002 .k(1)
31003 .qmin(128)
31004 .Test(xnn_f32_igemm_ukernel_1x8__fma3_broadcast);
31005 }
31006
31007 TEST(F32_IGEMM_1X8__FMA3_BROADCAST, qmax) {
31008 TEST_REQUIRES_X86_FMA3;
31009 GemmMicrokernelTester()
31010 .mr(1)
31011 .nr(8)
31012 .kr(1)
31013 .sr(1)
31014 .m(1)
31015 .n(8)
31016 .k(1)
31017 .qmax(128)
31018 .Test(xnn_f32_igemm_ukernel_1x8__fma3_broadcast);
31019 }
31020
31021 TEST(F32_IGEMM_1X8__FMA3_BROADCAST, strided_cm) {
31022 TEST_REQUIRES_X86_FMA3;
31023 GemmMicrokernelTester()
31024 .mr(1)
31025 .nr(8)
31026 .kr(1)
31027 .sr(1)
31028 .m(1)
31029 .n(8)
31030 .k(1)
31031 .cm_stride(11)
31032 .Test(xnn_f32_igemm_ukernel_1x8__fma3_broadcast);
31033 }
31034#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
31035
31036
31037#if XNN_ARCH_X86 || XNN_ARCH_X86_64
31038 TEST(F32_IGEMM_4X8__FMA3_BROADCAST, k_eq_1) {
31039 TEST_REQUIRES_X86_FMA3;
31040 GemmMicrokernelTester()
31041 .mr(4)
31042 .nr(8)
31043 .kr(1)
31044 .sr(1)
31045 .m(4)
31046 .n(8)
31047 .k(1)
31048 .Test(xnn_f32_igemm_ukernel_4x8__fma3_broadcast);
31049 }
31050
31051 TEST(F32_IGEMM_4X8__FMA3_BROADCAST, strided_cn) {
31052 TEST_REQUIRES_X86_FMA3;
31053 GemmMicrokernelTester()
31054 .mr(4)
31055 .nr(8)
31056 .kr(1)
31057 .sr(1)
31058 .m(4)
31059 .n(8)
31060 .k(1)
31061 .cn_stride(11)
31062 .Test(xnn_f32_igemm_ukernel_4x8__fma3_broadcast);
31063 }
31064
31065 TEST(F32_IGEMM_4X8__FMA3_BROADCAST, k_eq_1_subtile) {
31066 TEST_REQUIRES_X86_FMA3;
31067 for (uint32_t m = 1; m <= 4; m++) {
31068 for (uint32_t n = 1; n <= 8; n++) {
31069 GemmMicrokernelTester()
31070 .mr(4)
31071 .nr(8)
31072 .kr(1)
31073 .sr(1)
31074 .m(m)
31075 .n(n)
31076 .k(1)
31077 .iterations(1)
31078 .Test(xnn_f32_igemm_ukernel_4x8__fma3_broadcast);
31079 }
31080 }
31081 }
31082
31083 TEST(F32_IGEMM_4X8__FMA3_BROADCAST, k_eq_1_subtile_m) {
31084 TEST_REQUIRES_X86_FMA3;
31085 for (uint32_t m = 1; m <= 4; m++) {
31086 GemmMicrokernelTester()
31087 .mr(4)
31088 .nr(8)
31089 .kr(1)
31090 .sr(1)
31091 .m(m)
31092 .n(8)
31093 .k(1)
31094 .iterations(1)
31095 .Test(xnn_f32_igemm_ukernel_4x8__fma3_broadcast);
31096 }
31097 }
31098
31099 TEST(F32_IGEMM_4X8__FMA3_BROADCAST, k_eq_1_subtile_n) {
31100 TEST_REQUIRES_X86_FMA3;
31101 for (uint32_t n = 1; n <= 8; n++) {
31102 GemmMicrokernelTester()
31103 .mr(4)
31104 .nr(8)
31105 .kr(1)
31106 .sr(1)
31107 .m(4)
31108 .n(n)
31109 .k(1)
31110 .iterations(1)
31111 .Test(xnn_f32_igemm_ukernel_4x8__fma3_broadcast);
31112 }
31113 }
31114
31115 TEST(F32_IGEMM_4X8__FMA3_BROADCAST, k_gt_1) {
31116 TEST_REQUIRES_X86_FMA3;
31117 for (size_t k = 2; k < 10; k++) {
31118 GemmMicrokernelTester()
31119 .mr(4)
31120 .nr(8)
31121 .kr(1)
31122 .sr(1)
31123 .m(4)
31124 .n(8)
31125 .k(k)
31126 .Test(xnn_f32_igemm_ukernel_4x8__fma3_broadcast);
31127 }
31128 }
31129
31130 TEST(F32_IGEMM_4X8__FMA3_BROADCAST, k_gt_1_subtile) {
31131 TEST_REQUIRES_X86_FMA3;
31132 for (size_t k = 2; k < 10; k++) {
31133 for (uint32_t m = 1; m <= 4; m++) {
31134 for (uint32_t n = 1; n <= 8; n++) {
31135 GemmMicrokernelTester()
31136 .mr(4)
31137 .nr(8)
31138 .kr(1)
31139 .sr(1)
31140 .m(m)
31141 .n(n)
31142 .k(k)
31143 .iterations(1)
31144 .Test(xnn_f32_igemm_ukernel_4x8__fma3_broadcast);
31145 }
31146 }
31147 }
31148 }
31149
31150 TEST(F32_IGEMM_4X8__FMA3_BROADCAST, n_gt_8) {
31151 TEST_REQUIRES_X86_FMA3;
31152 for (uint32_t n = 9; n < 16; n++) {
31153 for (size_t k = 1; k <= 5; k += 2) {
31154 GemmMicrokernelTester()
31155 .mr(4)
31156 .nr(8)
31157 .kr(1)
31158 .sr(1)
31159 .m(4)
31160 .n(8)
31161 .k(k)
31162 .Test(xnn_f32_igemm_ukernel_4x8__fma3_broadcast);
31163 }
31164 }
31165 }
31166
31167 TEST(F32_IGEMM_4X8__FMA3_BROADCAST, n_gt_8_strided_cn) {
31168 TEST_REQUIRES_X86_FMA3;
31169 for (uint32_t n = 9; n < 16; n++) {
31170 for (size_t k = 1; k <= 5; k += 2) {
31171 GemmMicrokernelTester()
31172 .mr(4)
31173 .nr(8)
31174 .kr(1)
31175 .sr(1)
31176 .m(4)
31177 .n(8)
31178 .k(k)
31179 .cn_stride(11)
31180 .Test(xnn_f32_igemm_ukernel_4x8__fma3_broadcast);
31181 }
31182 }
31183 }
31184
31185 TEST(F32_IGEMM_4X8__FMA3_BROADCAST, n_gt_8_subtile) {
31186 TEST_REQUIRES_X86_FMA3;
31187 for (uint32_t n = 9; n < 16; n++) {
31188 for (size_t k = 1; k <= 5; k += 2) {
31189 for (uint32_t m = 1; m <= 4; m++) {
31190 GemmMicrokernelTester()
31191 .mr(4)
31192 .nr(8)
31193 .kr(1)
31194 .sr(1)
31195 .m(m)
31196 .n(n)
31197 .k(k)
31198 .iterations(1)
31199 .Test(xnn_f32_igemm_ukernel_4x8__fma3_broadcast);
31200 }
31201 }
31202 }
31203 }
31204
31205 TEST(F32_IGEMM_4X8__FMA3_BROADCAST, n_div_8) {
31206 TEST_REQUIRES_X86_FMA3;
31207 for (uint32_t n = 16; n <= 24; n += 8) {
31208 for (size_t k = 1; k <= 5; k += 2) {
31209 GemmMicrokernelTester()
31210 .mr(4)
31211 .nr(8)
31212 .kr(1)
31213 .sr(1)
31214 .m(4)
31215 .n(8)
31216 .k(k)
31217 .Test(xnn_f32_igemm_ukernel_4x8__fma3_broadcast);
31218 }
31219 }
31220 }
31221
31222 TEST(F32_IGEMM_4X8__FMA3_BROADCAST, n_div_8_strided_cn) {
31223 TEST_REQUIRES_X86_FMA3;
31224 for (uint32_t n = 16; n <= 24; n += 8) {
31225 for (size_t k = 1; k <= 5; k += 2) {
31226 GemmMicrokernelTester()
31227 .mr(4)
31228 .nr(8)
31229 .kr(1)
31230 .sr(1)
31231 .m(4)
31232 .n(n)
31233 .k(k)
31234 .cn_stride(11)
31235 .Test(xnn_f32_igemm_ukernel_4x8__fma3_broadcast);
31236 }
31237 }
31238 }
31239
31240 TEST(F32_IGEMM_4X8__FMA3_BROADCAST, n_div_8_subtile) {
31241 TEST_REQUIRES_X86_FMA3;
31242 for (uint32_t n = 16; n <= 24; n += 8) {
31243 for (size_t k = 1; k <= 5; k += 2) {
31244 for (uint32_t m = 1; m <= 4; m++) {
31245 GemmMicrokernelTester()
31246 .mr(4)
31247 .nr(8)
31248 .kr(1)
31249 .sr(1)
31250 .m(m)
31251 .n(n)
31252 .k(k)
31253 .iterations(1)
31254 .Test(xnn_f32_igemm_ukernel_4x8__fma3_broadcast);
31255 }
31256 }
31257 }
31258 }
31259
31260 TEST(F32_IGEMM_4X8__FMA3_BROADCAST, small_kernel) {
31261 TEST_REQUIRES_X86_FMA3;
31262 for (size_t k = 1; k <= 5; k += 2) {
31263 GemmMicrokernelTester()
31264 .mr(4)
31265 .nr(8)
31266 .kr(1)
31267 .sr(1)
31268 .m(4)
31269 .n(8)
31270 .k(k)
31271 .ks(3)
31272 .Test(xnn_f32_igemm_ukernel_4x8__fma3_broadcast);
31273 }
31274 }
31275
31276 TEST(F32_IGEMM_4X8__FMA3_BROADCAST, small_kernel_subtile) {
31277 TEST_REQUIRES_X86_FMA3;
31278 for (size_t k = 1; k <= 5; k += 2) {
31279 for (uint32_t m = 1; m <= 4; m++) {
31280 for (uint32_t n = 1; n <= 8; n++) {
31281 GemmMicrokernelTester()
31282 .mr(4)
31283 .nr(8)
31284 .kr(1)
31285 .sr(1)
31286 .m(m)
31287 .n(n)
31288 .k(k)
31289 .ks(3)
31290 .iterations(1)
31291 .Test(xnn_f32_igemm_ukernel_4x8__fma3_broadcast);
31292 }
31293 }
31294 }
31295 }
31296
31297 TEST(F32_IGEMM_4X8__FMA3_BROADCAST, n_gt_8_small_kernel) {
31298 TEST_REQUIRES_X86_FMA3;
31299 for (uint32_t n = 9; n < 16; n++) {
31300 for (size_t k = 1; k <= 5; k += 2) {
31301 GemmMicrokernelTester()
31302 .mr(4)
31303 .nr(8)
31304 .kr(1)
31305 .sr(1)
31306 .m(4)
31307 .n(8)
31308 .k(k)
31309 .ks(3)
31310 .Test(xnn_f32_igemm_ukernel_4x8__fma3_broadcast);
31311 }
31312 }
31313 }
31314
31315 TEST(F32_IGEMM_4X8__FMA3_BROADCAST, n_div_8_small_kernel) {
31316 TEST_REQUIRES_X86_FMA3;
31317 for (uint32_t n = 16; n <= 24; n += 8) {
31318 for (size_t k = 1; k <= 5; k += 2) {
31319 GemmMicrokernelTester()
31320 .mr(4)
31321 .nr(8)
31322 .kr(1)
31323 .sr(1)
31324 .m(4)
31325 .n(8)
31326 .k(k)
31327 .ks(3)
31328 .Test(xnn_f32_igemm_ukernel_4x8__fma3_broadcast);
31329 }
31330 }
31331 }
31332
31333 TEST(F32_IGEMM_4X8__FMA3_BROADCAST, strided_cm_subtile) {
31334 TEST_REQUIRES_X86_FMA3;
31335 for (size_t k = 1; k <= 5; k += 2) {
31336 for (uint32_t m = 1; m <= 4; m++) {
31337 for (uint32_t n = 1; n <= 8; n++) {
31338 GemmMicrokernelTester()
31339 .mr(4)
31340 .nr(8)
31341 .kr(1)
31342 .sr(1)
31343 .m(m)
31344 .n(n)
31345 .k(k)
31346 .cm_stride(11)
31347 .iterations(1)
31348 .Test(xnn_f32_igemm_ukernel_4x8__fma3_broadcast);
31349 }
31350 }
31351 }
31352 }
31353
31354 TEST(F32_IGEMM_4X8__FMA3_BROADCAST, a_offset) {
31355 TEST_REQUIRES_X86_FMA3;
31356 for (size_t k = 1; k <= 5; k += 2) {
31357 GemmMicrokernelTester()
31358 .mr(4)
31359 .nr(8)
31360 .kr(1)
31361 .sr(1)
31362 .m(4)
31363 .n(8)
31364 .k(k)
31365 .ks(3)
31366 .a_offset(23)
31367 .Test(xnn_f32_igemm_ukernel_4x8__fma3_broadcast);
31368 }
31369 }
31370
31371 TEST(F32_IGEMM_4X8__FMA3_BROADCAST, zero) {
31372 TEST_REQUIRES_X86_FMA3;
31373 for (uint32_t mz = 0; mz < 4; mz++) {
31374 for (size_t k = 1; k <= 5; k += 2) {
31375 GemmMicrokernelTester()
31376 .mr(4)
31377 .nr(8)
31378 .kr(1)
31379 .sr(1)
31380 .m(4)
31381 .n(8)
31382 .k(k)
31383 .ks(3)
31384 .a_offset(23)
31385 .zero_index(mz)
31386 .Test(xnn_f32_igemm_ukernel_4x8__fma3_broadcast);
31387 }
31388 }
31389 }
31390
31391 TEST(F32_IGEMM_4X8__FMA3_BROADCAST, qmin) {
31392 TEST_REQUIRES_X86_FMA3;
31393 GemmMicrokernelTester()
31394 .mr(4)
31395 .nr(8)
31396 .kr(1)
31397 .sr(1)
31398 .m(4)
31399 .n(8)
31400 .k(1)
31401 .qmin(128)
31402 .Test(xnn_f32_igemm_ukernel_4x8__fma3_broadcast);
31403 }
31404
31405 TEST(F32_IGEMM_4X8__FMA3_BROADCAST, qmax) {
31406 TEST_REQUIRES_X86_FMA3;
31407 GemmMicrokernelTester()
31408 .mr(4)
31409 .nr(8)
31410 .kr(1)
31411 .sr(1)
31412 .m(4)
31413 .n(8)
31414 .k(1)
31415 .qmax(128)
31416 .Test(xnn_f32_igemm_ukernel_4x8__fma3_broadcast);
31417 }
31418
31419 TEST(F32_IGEMM_4X8__FMA3_BROADCAST, strided_cm) {
31420 TEST_REQUIRES_X86_FMA3;
31421 GemmMicrokernelTester()
31422 .mr(4)
31423 .nr(8)
31424 .kr(1)
31425 .sr(1)
31426 .m(4)
31427 .n(8)
31428 .k(1)
31429 .cm_stride(11)
31430 .Test(xnn_f32_igemm_ukernel_4x8__fma3_broadcast);
31431 }
31432#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
31433
31434
31435#if XNN_ARCH_X86 || XNN_ARCH_X86_64
31436 TEST(F32_IGEMM_5X8__FMA3_BROADCAST, k_eq_1) {
31437 TEST_REQUIRES_X86_FMA3;
31438 GemmMicrokernelTester()
31439 .mr(5)
31440 .nr(8)
31441 .kr(1)
31442 .sr(1)
31443 .m(5)
31444 .n(8)
31445 .k(1)
31446 .Test(xnn_f32_igemm_ukernel_5x8__fma3_broadcast);
31447 }
31448
31449 TEST(F32_IGEMM_5X8__FMA3_BROADCAST, strided_cn) {
31450 TEST_REQUIRES_X86_FMA3;
31451 GemmMicrokernelTester()
31452 .mr(5)
31453 .nr(8)
31454 .kr(1)
31455 .sr(1)
31456 .m(5)
31457 .n(8)
31458 .k(1)
31459 .cn_stride(11)
31460 .Test(xnn_f32_igemm_ukernel_5x8__fma3_broadcast);
31461 }
31462
31463 TEST(F32_IGEMM_5X8__FMA3_BROADCAST, k_eq_1_subtile) {
31464 TEST_REQUIRES_X86_FMA3;
31465 for (uint32_t m = 1; m <= 5; m++) {
31466 for (uint32_t n = 1; n <= 8; n++) {
31467 GemmMicrokernelTester()
31468 .mr(5)
31469 .nr(8)
31470 .kr(1)
31471 .sr(1)
31472 .m(m)
31473 .n(n)
31474 .k(1)
31475 .iterations(1)
31476 .Test(xnn_f32_igemm_ukernel_5x8__fma3_broadcast);
31477 }
31478 }
31479 }
31480
31481 TEST(F32_IGEMM_5X8__FMA3_BROADCAST, k_eq_1_subtile_m) {
31482 TEST_REQUIRES_X86_FMA3;
31483 for (uint32_t m = 1; m <= 5; m++) {
31484 GemmMicrokernelTester()
31485 .mr(5)
31486 .nr(8)
31487 .kr(1)
31488 .sr(1)
31489 .m(m)
31490 .n(8)
31491 .k(1)
31492 .iterations(1)
31493 .Test(xnn_f32_igemm_ukernel_5x8__fma3_broadcast);
31494 }
31495 }
31496
31497 TEST(F32_IGEMM_5X8__FMA3_BROADCAST, k_eq_1_subtile_n) {
31498 TEST_REQUIRES_X86_FMA3;
31499 for (uint32_t n = 1; n <= 8; n++) {
31500 GemmMicrokernelTester()
31501 .mr(5)
31502 .nr(8)
31503 .kr(1)
31504 .sr(1)
31505 .m(5)
31506 .n(n)
31507 .k(1)
31508 .iterations(1)
31509 .Test(xnn_f32_igemm_ukernel_5x8__fma3_broadcast);
31510 }
31511 }
31512
31513 TEST(F32_IGEMM_5X8__FMA3_BROADCAST, k_gt_1) {
31514 TEST_REQUIRES_X86_FMA3;
31515 for (size_t k = 2; k < 10; k++) {
31516 GemmMicrokernelTester()
31517 .mr(5)
31518 .nr(8)
31519 .kr(1)
31520 .sr(1)
31521 .m(5)
31522 .n(8)
31523 .k(k)
31524 .Test(xnn_f32_igemm_ukernel_5x8__fma3_broadcast);
31525 }
31526 }
31527
31528 TEST(F32_IGEMM_5X8__FMA3_BROADCAST, k_gt_1_subtile) {
31529 TEST_REQUIRES_X86_FMA3;
31530 for (size_t k = 2; k < 10; k++) {
31531 for (uint32_t m = 1; m <= 5; m++) {
31532 for (uint32_t n = 1; n <= 8; n++) {
31533 GemmMicrokernelTester()
31534 .mr(5)
31535 .nr(8)
31536 .kr(1)
31537 .sr(1)
31538 .m(m)
31539 .n(n)
31540 .k(k)
31541 .iterations(1)
31542 .Test(xnn_f32_igemm_ukernel_5x8__fma3_broadcast);
31543 }
31544 }
31545 }
31546 }
31547
31548 TEST(F32_IGEMM_5X8__FMA3_BROADCAST, n_gt_8) {
31549 TEST_REQUIRES_X86_FMA3;
31550 for (uint32_t n = 9; n < 16; n++) {
31551 for (size_t k = 1; k <= 5; k += 2) {
31552 GemmMicrokernelTester()
31553 .mr(5)
31554 .nr(8)
31555 .kr(1)
31556 .sr(1)
31557 .m(5)
31558 .n(8)
31559 .k(k)
31560 .Test(xnn_f32_igemm_ukernel_5x8__fma3_broadcast);
31561 }
31562 }
31563 }
31564
31565 TEST(F32_IGEMM_5X8__FMA3_BROADCAST, n_gt_8_strided_cn) {
31566 TEST_REQUIRES_X86_FMA3;
31567 for (uint32_t n = 9; n < 16; n++) {
31568 for (size_t k = 1; k <= 5; k += 2) {
31569 GemmMicrokernelTester()
31570 .mr(5)
31571 .nr(8)
31572 .kr(1)
31573 .sr(1)
31574 .m(5)
31575 .n(8)
31576 .k(k)
31577 .cn_stride(11)
31578 .Test(xnn_f32_igemm_ukernel_5x8__fma3_broadcast);
31579 }
31580 }
31581 }
31582
31583 TEST(F32_IGEMM_5X8__FMA3_BROADCAST, n_gt_8_subtile) {
31584 TEST_REQUIRES_X86_FMA3;
31585 for (uint32_t n = 9; n < 16; n++) {
31586 for (size_t k = 1; k <= 5; k += 2) {
31587 for (uint32_t m = 1; m <= 5; m++) {
31588 GemmMicrokernelTester()
31589 .mr(5)
31590 .nr(8)
31591 .kr(1)
31592 .sr(1)
31593 .m(m)
31594 .n(n)
31595 .k(k)
31596 .iterations(1)
31597 .Test(xnn_f32_igemm_ukernel_5x8__fma3_broadcast);
31598 }
31599 }
31600 }
31601 }
31602
31603 TEST(F32_IGEMM_5X8__FMA3_BROADCAST, n_div_8) {
31604 TEST_REQUIRES_X86_FMA3;
31605 for (uint32_t n = 16; n <= 24; n += 8) {
31606 for (size_t k = 1; k <= 5; k += 2) {
31607 GemmMicrokernelTester()
31608 .mr(5)
31609 .nr(8)
31610 .kr(1)
31611 .sr(1)
31612 .m(5)
31613 .n(8)
31614 .k(k)
31615 .Test(xnn_f32_igemm_ukernel_5x8__fma3_broadcast);
31616 }
31617 }
31618 }
31619
31620 TEST(F32_IGEMM_5X8__FMA3_BROADCAST, n_div_8_strided_cn) {
31621 TEST_REQUIRES_X86_FMA3;
31622 for (uint32_t n = 16; n <= 24; n += 8) {
31623 for (size_t k = 1; k <= 5; k += 2) {
31624 GemmMicrokernelTester()
31625 .mr(5)
31626 .nr(8)
31627 .kr(1)
31628 .sr(1)
31629 .m(5)
31630 .n(n)
31631 .k(k)
31632 .cn_stride(11)
31633 .Test(xnn_f32_igemm_ukernel_5x8__fma3_broadcast);
31634 }
31635 }
31636 }
31637
31638 TEST(F32_IGEMM_5X8__FMA3_BROADCAST, n_div_8_subtile) {
31639 TEST_REQUIRES_X86_FMA3;
31640 for (uint32_t n = 16; n <= 24; n += 8) {
31641 for (size_t k = 1; k <= 5; k += 2) {
31642 for (uint32_t m = 1; m <= 5; m++) {
31643 GemmMicrokernelTester()
31644 .mr(5)
31645 .nr(8)
31646 .kr(1)
31647 .sr(1)
31648 .m(m)
31649 .n(n)
31650 .k(k)
31651 .iterations(1)
31652 .Test(xnn_f32_igemm_ukernel_5x8__fma3_broadcast);
31653 }
31654 }
31655 }
31656 }
31657
31658 TEST(F32_IGEMM_5X8__FMA3_BROADCAST, small_kernel) {
31659 TEST_REQUIRES_X86_FMA3;
31660 for (size_t k = 1; k <= 5; k += 2) {
31661 GemmMicrokernelTester()
31662 .mr(5)
31663 .nr(8)
31664 .kr(1)
31665 .sr(1)
31666 .m(5)
31667 .n(8)
31668 .k(k)
31669 .ks(3)
31670 .Test(xnn_f32_igemm_ukernel_5x8__fma3_broadcast);
31671 }
31672 }
31673
31674 TEST(F32_IGEMM_5X8__FMA3_BROADCAST, small_kernel_subtile) {
31675 TEST_REQUIRES_X86_FMA3;
31676 for (size_t k = 1; k <= 5; k += 2) {
31677 for (uint32_t m = 1; m <= 5; m++) {
31678 for (uint32_t n = 1; n <= 8; n++) {
31679 GemmMicrokernelTester()
31680 .mr(5)
31681 .nr(8)
31682 .kr(1)
31683 .sr(1)
31684 .m(m)
31685 .n(n)
31686 .k(k)
31687 .ks(3)
31688 .iterations(1)
31689 .Test(xnn_f32_igemm_ukernel_5x8__fma3_broadcast);
31690 }
31691 }
31692 }
31693 }
31694
31695 TEST(F32_IGEMM_5X8__FMA3_BROADCAST, n_gt_8_small_kernel) {
31696 TEST_REQUIRES_X86_FMA3;
31697 for (uint32_t n = 9; n < 16; n++) {
31698 for (size_t k = 1; k <= 5; k += 2) {
31699 GemmMicrokernelTester()
31700 .mr(5)
31701 .nr(8)
31702 .kr(1)
31703 .sr(1)
31704 .m(5)
31705 .n(8)
31706 .k(k)
31707 .ks(3)
31708 .Test(xnn_f32_igemm_ukernel_5x8__fma3_broadcast);
31709 }
31710 }
31711 }
31712
31713 TEST(F32_IGEMM_5X8__FMA3_BROADCAST, n_div_8_small_kernel) {
31714 TEST_REQUIRES_X86_FMA3;
31715 for (uint32_t n = 16; n <= 24; n += 8) {
31716 for (size_t k = 1; k <= 5; k += 2) {
31717 GemmMicrokernelTester()
31718 .mr(5)
31719 .nr(8)
31720 .kr(1)
31721 .sr(1)
31722 .m(5)
31723 .n(8)
31724 .k(k)
31725 .ks(3)
31726 .Test(xnn_f32_igemm_ukernel_5x8__fma3_broadcast);
31727 }
31728 }
31729 }
31730
31731 TEST(F32_IGEMM_5X8__FMA3_BROADCAST, strided_cm_subtile) {
31732 TEST_REQUIRES_X86_FMA3;
31733 for (size_t k = 1; k <= 5; k += 2) {
31734 for (uint32_t m = 1; m <= 5; m++) {
31735 for (uint32_t n = 1; n <= 8; n++) {
31736 GemmMicrokernelTester()
31737 .mr(5)
31738 .nr(8)
31739 .kr(1)
31740 .sr(1)
31741 .m(m)
31742 .n(n)
31743 .k(k)
31744 .cm_stride(11)
31745 .iterations(1)
31746 .Test(xnn_f32_igemm_ukernel_5x8__fma3_broadcast);
31747 }
31748 }
31749 }
31750 }
31751
31752 TEST(F32_IGEMM_5X8__FMA3_BROADCAST, a_offset) {
31753 TEST_REQUIRES_X86_FMA3;
31754 for (size_t k = 1; k <= 5; k += 2) {
31755 GemmMicrokernelTester()
31756 .mr(5)
31757 .nr(8)
31758 .kr(1)
31759 .sr(1)
31760 .m(5)
31761 .n(8)
31762 .k(k)
31763 .ks(3)
31764 .a_offset(29)
31765 .Test(xnn_f32_igemm_ukernel_5x8__fma3_broadcast);
31766 }
31767 }
31768
31769 TEST(F32_IGEMM_5X8__FMA3_BROADCAST, zero) {
31770 TEST_REQUIRES_X86_FMA3;
31771 for (uint32_t mz = 0; mz < 5; mz++) {
31772 for (size_t k = 1; k <= 5; k += 2) {
31773 GemmMicrokernelTester()
31774 .mr(5)
31775 .nr(8)
31776 .kr(1)
31777 .sr(1)
31778 .m(5)
31779 .n(8)
31780 .k(k)
31781 .ks(3)
31782 .a_offset(29)
31783 .zero_index(mz)
31784 .Test(xnn_f32_igemm_ukernel_5x8__fma3_broadcast);
31785 }
31786 }
31787 }
31788
31789 TEST(F32_IGEMM_5X8__FMA3_BROADCAST, qmin) {
31790 TEST_REQUIRES_X86_FMA3;
31791 GemmMicrokernelTester()
31792 .mr(5)
31793 .nr(8)
31794 .kr(1)
31795 .sr(1)
31796 .m(5)
31797 .n(8)
31798 .k(1)
31799 .qmin(128)
31800 .Test(xnn_f32_igemm_ukernel_5x8__fma3_broadcast);
31801 }
31802
31803 TEST(F32_IGEMM_5X8__FMA3_BROADCAST, qmax) {
31804 TEST_REQUIRES_X86_FMA3;
31805 GemmMicrokernelTester()
31806 .mr(5)
31807 .nr(8)
31808 .kr(1)
31809 .sr(1)
31810 .m(5)
31811 .n(8)
31812 .k(1)
31813 .qmax(128)
31814 .Test(xnn_f32_igemm_ukernel_5x8__fma3_broadcast);
31815 }
31816
31817 TEST(F32_IGEMM_5X8__FMA3_BROADCAST, strided_cm) {
31818 TEST_REQUIRES_X86_FMA3;
31819 GemmMicrokernelTester()
31820 .mr(5)
31821 .nr(8)
31822 .kr(1)
31823 .sr(1)
31824 .m(5)
31825 .n(8)
31826 .k(1)
31827 .cm_stride(11)
31828 .Test(xnn_f32_igemm_ukernel_5x8__fma3_broadcast);
31829 }
31830#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
31831
31832
31833#if XNN_ARCH_X86 || XNN_ARCH_X86_64
31834 TEST(F32_IGEMM_6X8__FMA3_BROADCAST, k_eq_1) {
31835 TEST_REQUIRES_X86_FMA3;
31836 GemmMicrokernelTester()
31837 .mr(6)
31838 .nr(8)
31839 .kr(1)
31840 .sr(1)
31841 .m(6)
31842 .n(8)
31843 .k(1)
31844 .Test(xnn_f32_igemm_ukernel_6x8__fma3_broadcast);
31845 }
31846
31847 TEST(F32_IGEMM_6X8__FMA3_BROADCAST, strided_cn) {
31848 TEST_REQUIRES_X86_FMA3;
31849 GemmMicrokernelTester()
31850 .mr(6)
31851 .nr(8)
31852 .kr(1)
31853 .sr(1)
31854 .m(6)
31855 .n(8)
31856 .k(1)
31857 .cn_stride(11)
31858 .Test(xnn_f32_igemm_ukernel_6x8__fma3_broadcast);
31859 }
31860
31861 TEST(F32_IGEMM_6X8__FMA3_BROADCAST, k_eq_1_subtile) {
31862 TEST_REQUIRES_X86_FMA3;
31863 for (uint32_t m = 1; m <= 6; m++) {
31864 for (uint32_t n = 1; n <= 8; n++) {
31865 GemmMicrokernelTester()
31866 .mr(6)
31867 .nr(8)
31868 .kr(1)
31869 .sr(1)
31870 .m(m)
31871 .n(n)
31872 .k(1)
31873 .iterations(1)
31874 .Test(xnn_f32_igemm_ukernel_6x8__fma3_broadcast);
31875 }
31876 }
31877 }
31878
31879 TEST(F32_IGEMM_6X8__FMA3_BROADCAST, k_eq_1_subtile_m) {
31880 TEST_REQUIRES_X86_FMA3;
31881 for (uint32_t m = 1; m <= 6; m++) {
31882 GemmMicrokernelTester()
31883 .mr(6)
31884 .nr(8)
31885 .kr(1)
31886 .sr(1)
31887 .m(m)
31888 .n(8)
31889 .k(1)
31890 .iterations(1)
31891 .Test(xnn_f32_igemm_ukernel_6x8__fma3_broadcast);
31892 }
31893 }
31894
31895 TEST(F32_IGEMM_6X8__FMA3_BROADCAST, k_eq_1_subtile_n) {
31896 TEST_REQUIRES_X86_FMA3;
31897 for (uint32_t n = 1; n <= 8; n++) {
31898 GemmMicrokernelTester()
31899 .mr(6)
31900 .nr(8)
31901 .kr(1)
31902 .sr(1)
31903 .m(6)
31904 .n(n)
31905 .k(1)
31906 .iterations(1)
31907 .Test(xnn_f32_igemm_ukernel_6x8__fma3_broadcast);
31908 }
31909 }
31910
31911 TEST(F32_IGEMM_6X8__FMA3_BROADCAST, k_gt_1) {
31912 TEST_REQUIRES_X86_FMA3;
31913 for (size_t k = 2; k < 10; k++) {
31914 GemmMicrokernelTester()
31915 .mr(6)
31916 .nr(8)
31917 .kr(1)
31918 .sr(1)
31919 .m(6)
31920 .n(8)
31921 .k(k)
31922 .Test(xnn_f32_igemm_ukernel_6x8__fma3_broadcast);
31923 }
31924 }
31925
31926 TEST(F32_IGEMM_6X8__FMA3_BROADCAST, k_gt_1_subtile) {
31927 TEST_REQUIRES_X86_FMA3;
31928 for (size_t k = 2; k < 10; k++) {
31929 for (uint32_t m = 1; m <= 6; m++) {
31930 for (uint32_t n = 1; n <= 8; n++) {
31931 GemmMicrokernelTester()
31932 .mr(6)
31933 .nr(8)
31934 .kr(1)
31935 .sr(1)
31936 .m(m)
31937 .n(n)
31938 .k(k)
31939 .iterations(1)
31940 .Test(xnn_f32_igemm_ukernel_6x8__fma3_broadcast);
31941 }
31942 }
31943 }
31944 }
31945
31946 TEST(F32_IGEMM_6X8__FMA3_BROADCAST, n_gt_8) {
31947 TEST_REQUIRES_X86_FMA3;
31948 for (uint32_t n = 9; n < 16; n++) {
31949 for (size_t k = 1; k <= 5; k += 2) {
31950 GemmMicrokernelTester()
31951 .mr(6)
31952 .nr(8)
31953 .kr(1)
31954 .sr(1)
31955 .m(6)
31956 .n(8)
31957 .k(k)
31958 .Test(xnn_f32_igemm_ukernel_6x8__fma3_broadcast);
31959 }
31960 }
31961 }
31962
31963 TEST(F32_IGEMM_6X8__FMA3_BROADCAST, n_gt_8_strided_cn) {
31964 TEST_REQUIRES_X86_FMA3;
31965 for (uint32_t n = 9; n < 16; n++) {
31966 for (size_t k = 1; k <= 5; k += 2) {
31967 GemmMicrokernelTester()
31968 .mr(6)
31969 .nr(8)
31970 .kr(1)
31971 .sr(1)
31972 .m(6)
31973 .n(8)
31974 .k(k)
31975 .cn_stride(11)
31976 .Test(xnn_f32_igemm_ukernel_6x8__fma3_broadcast);
31977 }
31978 }
31979 }
31980
31981 TEST(F32_IGEMM_6X8__FMA3_BROADCAST, n_gt_8_subtile) {
31982 TEST_REQUIRES_X86_FMA3;
31983 for (uint32_t n = 9; n < 16; n++) {
31984 for (size_t k = 1; k <= 5; k += 2) {
31985 for (uint32_t m = 1; m <= 6; m++) {
31986 GemmMicrokernelTester()
31987 .mr(6)
31988 .nr(8)
31989 .kr(1)
31990 .sr(1)
31991 .m(m)
31992 .n(n)
31993 .k(k)
31994 .iterations(1)
31995 .Test(xnn_f32_igemm_ukernel_6x8__fma3_broadcast);
31996 }
31997 }
31998 }
31999 }
32000
32001 TEST(F32_IGEMM_6X8__FMA3_BROADCAST, n_div_8) {
32002 TEST_REQUIRES_X86_FMA3;
32003 for (uint32_t n = 16; n <= 24; n += 8) {
32004 for (size_t k = 1; k <= 5; k += 2) {
32005 GemmMicrokernelTester()
32006 .mr(6)
32007 .nr(8)
32008 .kr(1)
32009 .sr(1)
32010 .m(6)
32011 .n(8)
32012 .k(k)
32013 .Test(xnn_f32_igemm_ukernel_6x8__fma3_broadcast);
32014 }
32015 }
32016 }
32017
32018 TEST(F32_IGEMM_6X8__FMA3_BROADCAST, n_div_8_strided_cn) {
32019 TEST_REQUIRES_X86_FMA3;
32020 for (uint32_t n = 16; n <= 24; n += 8) {
32021 for (size_t k = 1; k <= 5; k += 2) {
32022 GemmMicrokernelTester()
32023 .mr(6)
32024 .nr(8)
32025 .kr(1)
32026 .sr(1)
32027 .m(6)
32028 .n(n)
32029 .k(k)
32030 .cn_stride(11)
32031 .Test(xnn_f32_igemm_ukernel_6x8__fma3_broadcast);
32032 }
32033 }
32034 }
32035
32036 TEST(F32_IGEMM_6X8__FMA3_BROADCAST, n_div_8_subtile) {
32037 TEST_REQUIRES_X86_FMA3;
32038 for (uint32_t n = 16; n <= 24; n += 8) {
32039 for (size_t k = 1; k <= 5; k += 2) {
32040 for (uint32_t m = 1; m <= 6; m++) {
32041 GemmMicrokernelTester()
32042 .mr(6)
32043 .nr(8)
32044 .kr(1)
32045 .sr(1)
32046 .m(m)
32047 .n(n)
32048 .k(k)
32049 .iterations(1)
32050 .Test(xnn_f32_igemm_ukernel_6x8__fma3_broadcast);
32051 }
32052 }
32053 }
32054 }
32055
32056 TEST(F32_IGEMM_6X8__FMA3_BROADCAST, small_kernel) {
32057 TEST_REQUIRES_X86_FMA3;
32058 for (size_t k = 1; k <= 5; k += 2) {
32059 GemmMicrokernelTester()
32060 .mr(6)
32061 .nr(8)
32062 .kr(1)
32063 .sr(1)
32064 .m(6)
32065 .n(8)
32066 .k(k)
32067 .ks(3)
32068 .Test(xnn_f32_igemm_ukernel_6x8__fma3_broadcast);
32069 }
32070 }
32071
32072 TEST(F32_IGEMM_6X8__FMA3_BROADCAST, small_kernel_subtile) {
32073 TEST_REQUIRES_X86_FMA3;
32074 for (size_t k = 1; k <= 5; k += 2) {
32075 for (uint32_t m = 1; m <= 6; m++) {
32076 for (uint32_t n = 1; n <= 8; n++) {
32077 GemmMicrokernelTester()
32078 .mr(6)
32079 .nr(8)
32080 .kr(1)
32081 .sr(1)
32082 .m(m)
32083 .n(n)
32084 .k(k)
32085 .ks(3)
32086 .iterations(1)
32087 .Test(xnn_f32_igemm_ukernel_6x8__fma3_broadcast);
32088 }
32089 }
32090 }
32091 }
32092
32093 TEST(F32_IGEMM_6X8__FMA3_BROADCAST, n_gt_8_small_kernel) {
32094 TEST_REQUIRES_X86_FMA3;
32095 for (uint32_t n = 9; n < 16; n++) {
32096 for (size_t k = 1; k <= 5; k += 2) {
32097 GemmMicrokernelTester()
32098 .mr(6)
32099 .nr(8)
32100 .kr(1)
32101 .sr(1)
32102 .m(6)
32103 .n(8)
32104 .k(k)
32105 .ks(3)
32106 .Test(xnn_f32_igemm_ukernel_6x8__fma3_broadcast);
32107 }
32108 }
32109 }
32110
32111 TEST(F32_IGEMM_6X8__FMA3_BROADCAST, n_div_8_small_kernel) {
32112 TEST_REQUIRES_X86_FMA3;
32113 for (uint32_t n = 16; n <= 24; n += 8) {
32114 for (size_t k = 1; k <= 5; k += 2) {
32115 GemmMicrokernelTester()
32116 .mr(6)
32117 .nr(8)
32118 .kr(1)
32119 .sr(1)
32120 .m(6)
32121 .n(8)
32122 .k(k)
32123 .ks(3)
32124 .Test(xnn_f32_igemm_ukernel_6x8__fma3_broadcast);
32125 }
32126 }
32127 }
32128
32129 TEST(F32_IGEMM_6X8__FMA3_BROADCAST, strided_cm_subtile) {
32130 TEST_REQUIRES_X86_FMA3;
32131 for (size_t k = 1; k <= 5; k += 2) {
32132 for (uint32_t m = 1; m <= 6; m++) {
32133 for (uint32_t n = 1; n <= 8; n++) {
32134 GemmMicrokernelTester()
32135 .mr(6)
32136 .nr(8)
32137 .kr(1)
32138 .sr(1)
32139 .m(m)
32140 .n(n)
32141 .k(k)
32142 .cm_stride(11)
32143 .iterations(1)
32144 .Test(xnn_f32_igemm_ukernel_6x8__fma3_broadcast);
32145 }
32146 }
32147 }
32148 }
32149
32150 TEST(F32_IGEMM_6X8__FMA3_BROADCAST, a_offset) {
32151 TEST_REQUIRES_X86_FMA3;
32152 for (size_t k = 1; k <= 5; k += 2) {
32153 GemmMicrokernelTester()
32154 .mr(6)
32155 .nr(8)
32156 .kr(1)
32157 .sr(1)
32158 .m(6)
32159 .n(8)
32160 .k(k)
32161 .ks(3)
32162 .a_offset(37)
32163 .Test(xnn_f32_igemm_ukernel_6x8__fma3_broadcast);
32164 }
32165 }
32166
32167 TEST(F32_IGEMM_6X8__FMA3_BROADCAST, zero) {
32168 TEST_REQUIRES_X86_FMA3;
32169 for (uint32_t mz = 0; mz < 6; mz++) {
32170 for (size_t k = 1; k <= 5; k += 2) {
32171 GemmMicrokernelTester()
32172 .mr(6)
32173 .nr(8)
32174 .kr(1)
32175 .sr(1)
32176 .m(6)
32177 .n(8)
32178 .k(k)
32179 .ks(3)
32180 .a_offset(37)
32181 .zero_index(mz)
32182 .Test(xnn_f32_igemm_ukernel_6x8__fma3_broadcast);
32183 }
32184 }
32185 }
32186
32187 TEST(F32_IGEMM_6X8__FMA3_BROADCAST, qmin) {
32188 TEST_REQUIRES_X86_FMA3;
32189 GemmMicrokernelTester()
32190 .mr(6)
32191 .nr(8)
32192 .kr(1)
32193 .sr(1)
32194 .m(6)
32195 .n(8)
32196 .k(1)
32197 .qmin(128)
32198 .Test(xnn_f32_igemm_ukernel_6x8__fma3_broadcast);
32199 }
32200
32201 TEST(F32_IGEMM_6X8__FMA3_BROADCAST, qmax) {
32202 TEST_REQUIRES_X86_FMA3;
32203 GemmMicrokernelTester()
32204 .mr(6)
32205 .nr(8)
32206 .kr(1)
32207 .sr(1)
32208 .m(6)
32209 .n(8)
32210 .k(1)
32211 .qmax(128)
32212 .Test(xnn_f32_igemm_ukernel_6x8__fma3_broadcast);
32213 }
32214
32215 TEST(F32_IGEMM_6X8__FMA3_BROADCAST, strided_cm) {
32216 TEST_REQUIRES_X86_FMA3;
32217 GemmMicrokernelTester()
32218 .mr(6)
32219 .nr(8)
32220 .kr(1)
32221 .sr(1)
32222 .m(6)
32223 .n(8)
32224 .k(1)
32225 .cm_stride(11)
32226 .Test(xnn_f32_igemm_ukernel_6x8__fma3_broadcast);
32227 }
32228#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
32229
32230
32231#if XNN_ARCH_X86 || XNN_ARCH_X86_64
32232 TEST(F32_IGEMM_7X8__FMA3_BROADCAST, k_eq_1) {
32233 TEST_REQUIRES_X86_FMA3;
32234 GemmMicrokernelTester()
32235 .mr(7)
32236 .nr(8)
32237 .kr(1)
32238 .sr(1)
32239 .m(7)
32240 .n(8)
32241 .k(1)
32242 .Test(xnn_f32_igemm_ukernel_7x8__fma3_broadcast);
32243 }
32244
32245 TEST(F32_IGEMM_7X8__FMA3_BROADCAST, strided_cn) {
32246 TEST_REQUIRES_X86_FMA3;
32247 GemmMicrokernelTester()
32248 .mr(7)
32249 .nr(8)
32250 .kr(1)
32251 .sr(1)
32252 .m(7)
32253 .n(8)
32254 .k(1)
32255 .cn_stride(11)
32256 .Test(xnn_f32_igemm_ukernel_7x8__fma3_broadcast);
32257 }
32258
32259 TEST(F32_IGEMM_7X8__FMA3_BROADCAST, k_eq_1_subtile) {
32260 TEST_REQUIRES_X86_FMA3;
32261 for (uint32_t m = 1; m <= 7; m++) {
32262 for (uint32_t n = 1; n <= 8; n++) {
32263 GemmMicrokernelTester()
32264 .mr(7)
32265 .nr(8)
32266 .kr(1)
32267 .sr(1)
32268 .m(m)
32269 .n(n)
32270 .k(1)
32271 .iterations(1)
32272 .Test(xnn_f32_igemm_ukernel_7x8__fma3_broadcast);
32273 }
32274 }
32275 }
32276
32277 TEST(F32_IGEMM_7X8__FMA3_BROADCAST, k_eq_1_subtile_m) {
32278 TEST_REQUIRES_X86_FMA3;
32279 for (uint32_t m = 1; m <= 7; m++) {
32280 GemmMicrokernelTester()
32281 .mr(7)
32282 .nr(8)
32283 .kr(1)
32284 .sr(1)
32285 .m(m)
32286 .n(8)
32287 .k(1)
32288 .iterations(1)
32289 .Test(xnn_f32_igemm_ukernel_7x8__fma3_broadcast);
32290 }
32291 }
32292
32293 TEST(F32_IGEMM_7X8__FMA3_BROADCAST, k_eq_1_subtile_n) {
32294 TEST_REQUIRES_X86_FMA3;
32295 for (uint32_t n = 1; n <= 8; n++) {
32296 GemmMicrokernelTester()
32297 .mr(7)
32298 .nr(8)
32299 .kr(1)
32300 .sr(1)
32301 .m(7)
32302 .n(n)
32303 .k(1)
32304 .iterations(1)
32305 .Test(xnn_f32_igemm_ukernel_7x8__fma3_broadcast);
32306 }
32307 }
32308
32309 TEST(F32_IGEMM_7X8__FMA3_BROADCAST, k_gt_1) {
32310 TEST_REQUIRES_X86_FMA3;
32311 for (size_t k = 2; k < 10; k++) {
32312 GemmMicrokernelTester()
32313 .mr(7)
32314 .nr(8)
32315 .kr(1)
32316 .sr(1)
32317 .m(7)
32318 .n(8)
32319 .k(k)
32320 .Test(xnn_f32_igemm_ukernel_7x8__fma3_broadcast);
32321 }
32322 }
32323
32324 TEST(F32_IGEMM_7X8__FMA3_BROADCAST, k_gt_1_subtile) {
32325 TEST_REQUIRES_X86_FMA3;
32326 for (size_t k = 2; k < 10; k++) {
32327 for (uint32_t m = 1; m <= 7; m++) {
32328 for (uint32_t n = 1; n <= 8; n++) {
32329 GemmMicrokernelTester()
32330 .mr(7)
32331 .nr(8)
32332 .kr(1)
32333 .sr(1)
32334 .m(m)
32335 .n(n)
32336 .k(k)
32337 .iterations(1)
32338 .Test(xnn_f32_igemm_ukernel_7x8__fma3_broadcast);
32339 }
32340 }
32341 }
32342 }
32343
32344 TEST(F32_IGEMM_7X8__FMA3_BROADCAST, n_gt_8) {
32345 TEST_REQUIRES_X86_FMA3;
32346 for (uint32_t n = 9; n < 16; n++) {
32347 for (size_t k = 1; k <= 5; k += 2) {
32348 GemmMicrokernelTester()
32349 .mr(7)
32350 .nr(8)
32351 .kr(1)
32352 .sr(1)
32353 .m(7)
32354 .n(8)
32355 .k(k)
32356 .Test(xnn_f32_igemm_ukernel_7x8__fma3_broadcast);
32357 }
32358 }
32359 }
32360
32361 TEST(F32_IGEMM_7X8__FMA3_BROADCAST, n_gt_8_strided_cn) {
32362 TEST_REQUIRES_X86_FMA3;
32363 for (uint32_t n = 9; n < 16; n++) {
32364 for (size_t k = 1; k <= 5; k += 2) {
32365 GemmMicrokernelTester()
32366 .mr(7)
32367 .nr(8)
32368 .kr(1)
32369 .sr(1)
32370 .m(7)
32371 .n(8)
32372 .k(k)
32373 .cn_stride(11)
32374 .Test(xnn_f32_igemm_ukernel_7x8__fma3_broadcast);
32375 }
32376 }
32377 }
32378
32379 TEST(F32_IGEMM_7X8__FMA3_BROADCAST, n_gt_8_subtile) {
32380 TEST_REQUIRES_X86_FMA3;
32381 for (uint32_t n = 9; n < 16; n++) {
32382 for (size_t k = 1; k <= 5; k += 2) {
32383 for (uint32_t m = 1; m <= 7; m++) {
32384 GemmMicrokernelTester()
32385 .mr(7)
32386 .nr(8)
32387 .kr(1)
32388 .sr(1)
32389 .m(m)
32390 .n(n)
32391 .k(k)
32392 .iterations(1)
32393 .Test(xnn_f32_igemm_ukernel_7x8__fma3_broadcast);
32394 }
32395 }
32396 }
32397 }
32398
32399 TEST(F32_IGEMM_7X8__FMA3_BROADCAST, n_div_8) {
32400 TEST_REQUIRES_X86_FMA3;
32401 for (uint32_t n = 16; n <= 24; n += 8) {
32402 for (size_t k = 1; k <= 5; k += 2) {
32403 GemmMicrokernelTester()
32404 .mr(7)
32405 .nr(8)
32406 .kr(1)
32407 .sr(1)
32408 .m(7)
32409 .n(8)
32410 .k(k)
32411 .Test(xnn_f32_igemm_ukernel_7x8__fma3_broadcast);
32412 }
32413 }
32414 }
32415
32416 TEST(F32_IGEMM_7X8__FMA3_BROADCAST, n_div_8_strided_cn) {
32417 TEST_REQUIRES_X86_FMA3;
32418 for (uint32_t n = 16; n <= 24; n += 8) {
32419 for (size_t k = 1; k <= 5; k += 2) {
32420 GemmMicrokernelTester()
32421 .mr(7)
32422 .nr(8)
32423 .kr(1)
32424 .sr(1)
32425 .m(7)
32426 .n(n)
32427 .k(k)
32428 .cn_stride(11)
32429 .Test(xnn_f32_igemm_ukernel_7x8__fma3_broadcast);
32430 }
32431 }
32432 }
32433
32434 TEST(F32_IGEMM_7X8__FMA3_BROADCAST, n_div_8_subtile) {
32435 TEST_REQUIRES_X86_FMA3;
32436 for (uint32_t n = 16; n <= 24; n += 8) {
32437 for (size_t k = 1; k <= 5; k += 2) {
32438 for (uint32_t m = 1; m <= 7; m++) {
32439 GemmMicrokernelTester()
32440 .mr(7)
32441 .nr(8)
32442 .kr(1)
32443 .sr(1)
32444 .m(m)
32445 .n(n)
32446 .k(k)
32447 .iterations(1)
32448 .Test(xnn_f32_igemm_ukernel_7x8__fma3_broadcast);
32449 }
32450 }
32451 }
32452 }
32453
32454 TEST(F32_IGEMM_7X8__FMA3_BROADCAST, small_kernel) {
32455 TEST_REQUIRES_X86_FMA3;
32456 for (size_t k = 1; k <= 5; k += 2) {
32457 GemmMicrokernelTester()
32458 .mr(7)
32459 .nr(8)
32460 .kr(1)
32461 .sr(1)
32462 .m(7)
32463 .n(8)
32464 .k(k)
32465 .ks(3)
32466 .Test(xnn_f32_igemm_ukernel_7x8__fma3_broadcast);
32467 }
32468 }
32469
32470 TEST(F32_IGEMM_7X8__FMA3_BROADCAST, small_kernel_subtile) {
32471 TEST_REQUIRES_X86_FMA3;
32472 for (size_t k = 1; k <= 5; k += 2) {
32473 for (uint32_t m = 1; m <= 7; m++) {
32474 for (uint32_t n = 1; n <= 8; n++) {
32475 GemmMicrokernelTester()
32476 .mr(7)
32477 .nr(8)
32478 .kr(1)
32479 .sr(1)
32480 .m(m)
32481 .n(n)
32482 .k(k)
32483 .ks(3)
32484 .iterations(1)
32485 .Test(xnn_f32_igemm_ukernel_7x8__fma3_broadcast);
32486 }
32487 }
32488 }
32489 }
32490
32491 TEST(F32_IGEMM_7X8__FMA3_BROADCAST, n_gt_8_small_kernel) {
32492 TEST_REQUIRES_X86_FMA3;
32493 for (uint32_t n = 9; n < 16; n++) {
32494 for (size_t k = 1; k <= 5; k += 2) {
32495 GemmMicrokernelTester()
32496 .mr(7)
32497 .nr(8)
32498 .kr(1)
32499 .sr(1)
32500 .m(7)
32501 .n(8)
32502 .k(k)
32503 .ks(3)
32504 .Test(xnn_f32_igemm_ukernel_7x8__fma3_broadcast);
32505 }
32506 }
32507 }
32508
32509 TEST(F32_IGEMM_7X8__FMA3_BROADCAST, n_div_8_small_kernel) {
32510 TEST_REQUIRES_X86_FMA3;
32511 for (uint32_t n = 16; n <= 24; n += 8) {
32512 for (size_t k = 1; k <= 5; k += 2) {
32513 GemmMicrokernelTester()
32514 .mr(7)
32515 .nr(8)
32516 .kr(1)
32517 .sr(1)
32518 .m(7)
32519 .n(8)
32520 .k(k)
32521 .ks(3)
32522 .Test(xnn_f32_igemm_ukernel_7x8__fma3_broadcast);
32523 }
32524 }
32525 }
32526
32527 TEST(F32_IGEMM_7X8__FMA3_BROADCAST, strided_cm_subtile) {
32528 TEST_REQUIRES_X86_FMA3;
32529 for (size_t k = 1; k <= 5; k += 2) {
32530 for (uint32_t m = 1; m <= 7; m++) {
32531 for (uint32_t n = 1; n <= 8; n++) {
32532 GemmMicrokernelTester()
32533 .mr(7)
32534 .nr(8)
32535 .kr(1)
32536 .sr(1)
32537 .m(m)
32538 .n(n)
32539 .k(k)
32540 .cm_stride(11)
32541 .iterations(1)
32542 .Test(xnn_f32_igemm_ukernel_7x8__fma3_broadcast);
32543 }
32544 }
32545 }
32546 }
32547
32548 TEST(F32_IGEMM_7X8__FMA3_BROADCAST, a_offset) {
32549 TEST_REQUIRES_X86_FMA3;
32550 for (size_t k = 1; k <= 5; k += 2) {
32551 GemmMicrokernelTester()
32552 .mr(7)
32553 .nr(8)
32554 .kr(1)
32555 .sr(1)
32556 .m(7)
32557 .n(8)
32558 .k(k)
32559 .ks(3)
32560 .a_offset(37)
32561 .Test(xnn_f32_igemm_ukernel_7x8__fma3_broadcast);
32562 }
32563 }
32564
32565 TEST(F32_IGEMM_7X8__FMA3_BROADCAST, zero) {
32566 TEST_REQUIRES_X86_FMA3;
32567 for (uint32_t mz = 0; mz < 7; mz++) {
32568 for (size_t k = 1; k <= 5; k += 2) {
32569 GemmMicrokernelTester()
32570 .mr(7)
32571 .nr(8)
32572 .kr(1)
32573 .sr(1)
32574 .m(7)
32575 .n(8)
32576 .k(k)
32577 .ks(3)
32578 .a_offset(37)
32579 .zero_index(mz)
32580 .Test(xnn_f32_igemm_ukernel_7x8__fma3_broadcast);
32581 }
32582 }
32583 }
32584
32585 TEST(F32_IGEMM_7X8__FMA3_BROADCAST, qmin) {
32586 TEST_REQUIRES_X86_FMA3;
32587 GemmMicrokernelTester()
32588 .mr(7)
32589 .nr(8)
32590 .kr(1)
32591 .sr(1)
32592 .m(7)
32593 .n(8)
32594 .k(1)
32595 .qmin(128)
32596 .Test(xnn_f32_igemm_ukernel_7x8__fma3_broadcast);
32597 }
32598
32599 TEST(F32_IGEMM_7X8__FMA3_BROADCAST, qmax) {
32600 TEST_REQUIRES_X86_FMA3;
32601 GemmMicrokernelTester()
32602 .mr(7)
32603 .nr(8)
32604 .kr(1)
32605 .sr(1)
32606 .m(7)
32607 .n(8)
32608 .k(1)
32609 .qmax(128)
32610 .Test(xnn_f32_igemm_ukernel_7x8__fma3_broadcast);
32611 }
32612
32613 TEST(F32_IGEMM_7X8__FMA3_BROADCAST, strided_cm) {
32614 TEST_REQUIRES_X86_FMA3;
32615 GemmMicrokernelTester()
32616 .mr(7)
32617 .nr(8)
32618 .kr(1)
32619 .sr(1)
32620 .m(7)
32621 .n(8)
32622 .k(1)
32623 .cm_stride(11)
32624 .Test(xnn_f32_igemm_ukernel_7x8__fma3_broadcast);
32625 }
32626#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
32627
32628
32629#if XNN_ARCH_X86 || XNN_ARCH_X86_64
32630 TEST(F32_IGEMM_8X8__FMA3_BROADCAST, k_eq_1) {
32631 TEST_REQUIRES_X86_FMA3;
32632 GemmMicrokernelTester()
32633 .mr(8)
32634 .nr(8)
32635 .kr(1)
32636 .sr(1)
32637 .m(8)
32638 .n(8)
32639 .k(1)
32640 .Test(xnn_f32_igemm_ukernel_8x8__fma3_broadcast);
32641 }
32642
32643 TEST(F32_IGEMM_8X8__FMA3_BROADCAST, strided_cn) {
32644 TEST_REQUIRES_X86_FMA3;
32645 GemmMicrokernelTester()
32646 .mr(8)
32647 .nr(8)
32648 .kr(1)
32649 .sr(1)
32650 .m(8)
32651 .n(8)
32652 .k(1)
32653 .cn_stride(11)
32654 .Test(xnn_f32_igemm_ukernel_8x8__fma3_broadcast);
32655 }
32656
32657 TEST(F32_IGEMM_8X8__FMA3_BROADCAST, k_eq_1_subtile) {
32658 TEST_REQUIRES_X86_FMA3;
32659 for (uint32_t m = 1; m <= 8; m++) {
32660 for (uint32_t n = 1; n <= 8; n++) {
32661 GemmMicrokernelTester()
32662 .mr(8)
32663 .nr(8)
32664 .kr(1)
32665 .sr(1)
32666 .m(m)
32667 .n(n)
32668 .k(1)
32669 .iterations(1)
32670 .Test(xnn_f32_igemm_ukernel_8x8__fma3_broadcast);
32671 }
32672 }
32673 }
32674
32675 TEST(F32_IGEMM_8X8__FMA3_BROADCAST, k_eq_1_subtile_m) {
32676 TEST_REQUIRES_X86_FMA3;
32677 for (uint32_t m = 1; m <= 8; m++) {
32678 GemmMicrokernelTester()
32679 .mr(8)
32680 .nr(8)
32681 .kr(1)
32682 .sr(1)
32683 .m(m)
32684 .n(8)
32685 .k(1)
32686 .iterations(1)
32687 .Test(xnn_f32_igemm_ukernel_8x8__fma3_broadcast);
32688 }
32689 }
32690
32691 TEST(F32_IGEMM_8X8__FMA3_BROADCAST, k_eq_1_subtile_n) {
32692 TEST_REQUIRES_X86_FMA3;
32693 for (uint32_t n = 1; n <= 8; n++) {
32694 GemmMicrokernelTester()
32695 .mr(8)
32696 .nr(8)
32697 .kr(1)
32698 .sr(1)
32699 .m(8)
32700 .n(n)
32701 .k(1)
32702 .iterations(1)
32703 .Test(xnn_f32_igemm_ukernel_8x8__fma3_broadcast);
32704 }
32705 }
32706
32707 TEST(F32_IGEMM_8X8__FMA3_BROADCAST, k_gt_1) {
32708 TEST_REQUIRES_X86_FMA3;
32709 for (size_t k = 2; k < 10; k++) {
32710 GemmMicrokernelTester()
32711 .mr(8)
32712 .nr(8)
32713 .kr(1)
32714 .sr(1)
32715 .m(8)
32716 .n(8)
32717 .k(k)
32718 .Test(xnn_f32_igemm_ukernel_8x8__fma3_broadcast);
32719 }
32720 }
32721
32722 TEST(F32_IGEMM_8X8__FMA3_BROADCAST, k_gt_1_subtile) {
32723 TEST_REQUIRES_X86_FMA3;
32724 for (size_t k = 2; k < 10; k++) {
32725 for (uint32_t m = 1; m <= 8; m++) {
32726 for (uint32_t n = 1; n <= 8; n++) {
32727 GemmMicrokernelTester()
32728 .mr(8)
32729 .nr(8)
32730 .kr(1)
32731 .sr(1)
32732 .m(m)
32733 .n(n)
32734 .k(k)
32735 .iterations(1)
32736 .Test(xnn_f32_igemm_ukernel_8x8__fma3_broadcast);
32737 }
32738 }
32739 }
32740 }
32741
32742 TEST(F32_IGEMM_8X8__FMA3_BROADCAST, n_gt_8) {
32743 TEST_REQUIRES_X86_FMA3;
32744 for (uint32_t n = 9; n < 16; n++) {
32745 for (size_t k = 1; k <= 5; k += 2) {
32746 GemmMicrokernelTester()
32747 .mr(8)
32748 .nr(8)
32749 .kr(1)
32750 .sr(1)
32751 .m(8)
32752 .n(8)
32753 .k(k)
32754 .Test(xnn_f32_igemm_ukernel_8x8__fma3_broadcast);
32755 }
32756 }
32757 }
32758
32759 TEST(F32_IGEMM_8X8__FMA3_BROADCAST, n_gt_8_strided_cn) {
32760 TEST_REQUIRES_X86_FMA3;
32761 for (uint32_t n = 9; n < 16; n++) {
32762 for (size_t k = 1; k <= 5; k += 2) {
32763 GemmMicrokernelTester()
32764 .mr(8)
32765 .nr(8)
32766 .kr(1)
32767 .sr(1)
32768 .m(8)
32769 .n(8)
32770 .k(k)
32771 .cn_stride(11)
32772 .Test(xnn_f32_igemm_ukernel_8x8__fma3_broadcast);
32773 }
32774 }
32775 }
32776
32777 TEST(F32_IGEMM_8X8__FMA3_BROADCAST, n_gt_8_subtile) {
32778 TEST_REQUIRES_X86_FMA3;
32779 for (uint32_t n = 9; n < 16; n++) {
32780 for (size_t k = 1; k <= 5; k += 2) {
32781 for (uint32_t m = 1; m <= 8; m++) {
32782 GemmMicrokernelTester()
32783 .mr(8)
32784 .nr(8)
32785 .kr(1)
32786 .sr(1)
32787 .m(m)
32788 .n(n)
32789 .k(k)
32790 .iterations(1)
32791 .Test(xnn_f32_igemm_ukernel_8x8__fma3_broadcast);
32792 }
32793 }
32794 }
32795 }
32796
32797 TEST(F32_IGEMM_8X8__FMA3_BROADCAST, n_div_8) {
32798 TEST_REQUIRES_X86_FMA3;
32799 for (uint32_t n = 16; n <= 24; n += 8) {
32800 for (size_t k = 1; k <= 5; k += 2) {
32801 GemmMicrokernelTester()
32802 .mr(8)
32803 .nr(8)
32804 .kr(1)
32805 .sr(1)
32806 .m(8)
32807 .n(8)
32808 .k(k)
32809 .Test(xnn_f32_igemm_ukernel_8x8__fma3_broadcast);
32810 }
32811 }
32812 }
32813
32814 TEST(F32_IGEMM_8X8__FMA3_BROADCAST, n_div_8_strided_cn) {
32815 TEST_REQUIRES_X86_FMA3;
32816 for (uint32_t n = 16; n <= 24; n += 8) {
32817 for (size_t k = 1; k <= 5; k += 2) {
32818 GemmMicrokernelTester()
32819 .mr(8)
32820 .nr(8)
32821 .kr(1)
32822 .sr(1)
32823 .m(8)
32824 .n(n)
32825 .k(k)
32826 .cn_stride(11)
32827 .Test(xnn_f32_igemm_ukernel_8x8__fma3_broadcast);
32828 }
32829 }
32830 }
32831
32832 TEST(F32_IGEMM_8X8__FMA3_BROADCAST, n_div_8_subtile) {
32833 TEST_REQUIRES_X86_FMA3;
32834 for (uint32_t n = 16; n <= 24; n += 8) {
32835 for (size_t k = 1; k <= 5; k += 2) {
32836 for (uint32_t m = 1; m <= 8; m++) {
32837 GemmMicrokernelTester()
32838 .mr(8)
32839 .nr(8)
32840 .kr(1)
32841 .sr(1)
32842 .m(m)
32843 .n(n)
32844 .k(k)
32845 .iterations(1)
32846 .Test(xnn_f32_igemm_ukernel_8x8__fma3_broadcast);
32847 }
32848 }
32849 }
32850 }
32851
32852 TEST(F32_IGEMM_8X8__FMA3_BROADCAST, small_kernel) {
32853 TEST_REQUIRES_X86_FMA3;
32854 for (size_t k = 1; k <= 5; k += 2) {
32855 GemmMicrokernelTester()
32856 .mr(8)
32857 .nr(8)
32858 .kr(1)
32859 .sr(1)
32860 .m(8)
32861 .n(8)
32862 .k(k)
32863 .ks(3)
32864 .Test(xnn_f32_igemm_ukernel_8x8__fma3_broadcast);
32865 }
32866 }
32867
32868 TEST(F32_IGEMM_8X8__FMA3_BROADCAST, small_kernel_subtile) {
32869 TEST_REQUIRES_X86_FMA3;
32870 for (size_t k = 1; k <= 5; k += 2) {
32871 for (uint32_t m = 1; m <= 8; m++) {
32872 for (uint32_t n = 1; n <= 8; n++) {
32873 GemmMicrokernelTester()
32874 .mr(8)
32875 .nr(8)
32876 .kr(1)
32877 .sr(1)
32878 .m(m)
32879 .n(n)
32880 .k(k)
32881 .ks(3)
32882 .iterations(1)
32883 .Test(xnn_f32_igemm_ukernel_8x8__fma3_broadcast);
32884 }
32885 }
32886 }
32887 }
32888
32889 TEST(F32_IGEMM_8X8__FMA3_BROADCAST, n_gt_8_small_kernel) {
32890 TEST_REQUIRES_X86_FMA3;
32891 for (uint32_t n = 9; n < 16; n++) {
32892 for (size_t k = 1; k <= 5; k += 2) {
32893 GemmMicrokernelTester()
32894 .mr(8)
32895 .nr(8)
32896 .kr(1)
32897 .sr(1)
32898 .m(8)
32899 .n(8)
32900 .k(k)
32901 .ks(3)
32902 .Test(xnn_f32_igemm_ukernel_8x8__fma3_broadcast);
32903 }
32904 }
32905 }
32906
32907 TEST(F32_IGEMM_8X8__FMA3_BROADCAST, n_div_8_small_kernel) {
32908 TEST_REQUIRES_X86_FMA3;
32909 for (uint32_t n = 16; n <= 24; n += 8) {
32910 for (size_t k = 1; k <= 5; k += 2) {
32911 GemmMicrokernelTester()
32912 .mr(8)
32913 .nr(8)
32914 .kr(1)
32915 .sr(1)
32916 .m(8)
32917 .n(8)
32918 .k(k)
32919 .ks(3)
32920 .Test(xnn_f32_igemm_ukernel_8x8__fma3_broadcast);
32921 }
32922 }
32923 }
32924
32925 TEST(F32_IGEMM_8X8__FMA3_BROADCAST, strided_cm_subtile) {
32926 TEST_REQUIRES_X86_FMA3;
32927 for (size_t k = 1; k <= 5; k += 2) {
32928 for (uint32_t m = 1; m <= 8; m++) {
32929 for (uint32_t n = 1; n <= 8; n++) {
32930 GemmMicrokernelTester()
32931 .mr(8)
32932 .nr(8)
32933 .kr(1)
32934 .sr(1)
32935 .m(m)
32936 .n(n)
32937 .k(k)
32938 .cm_stride(11)
32939 .iterations(1)
32940 .Test(xnn_f32_igemm_ukernel_8x8__fma3_broadcast);
32941 }
32942 }
32943 }
32944 }
32945
32946 TEST(F32_IGEMM_8X8__FMA3_BROADCAST, a_offset) {
32947 TEST_REQUIRES_X86_FMA3;
32948 for (size_t k = 1; k <= 5; k += 2) {
32949 GemmMicrokernelTester()
32950 .mr(8)
32951 .nr(8)
32952 .kr(1)
32953 .sr(1)
32954 .m(8)
32955 .n(8)
32956 .k(k)
32957 .ks(3)
32958 .a_offset(43)
32959 .Test(xnn_f32_igemm_ukernel_8x8__fma3_broadcast);
32960 }
32961 }
32962
32963 TEST(F32_IGEMM_8X8__FMA3_BROADCAST, zero) {
32964 TEST_REQUIRES_X86_FMA3;
32965 for (uint32_t mz = 0; mz < 8; mz++) {
32966 for (size_t k = 1; k <= 5; k += 2) {
32967 GemmMicrokernelTester()
32968 .mr(8)
32969 .nr(8)
32970 .kr(1)
32971 .sr(1)
32972 .m(8)
32973 .n(8)
32974 .k(k)
32975 .ks(3)
32976 .a_offset(43)
32977 .zero_index(mz)
32978 .Test(xnn_f32_igemm_ukernel_8x8__fma3_broadcast);
32979 }
32980 }
32981 }
32982
32983 TEST(F32_IGEMM_8X8__FMA3_BROADCAST, qmin) {
32984 TEST_REQUIRES_X86_FMA3;
32985 GemmMicrokernelTester()
32986 .mr(8)
32987 .nr(8)
32988 .kr(1)
32989 .sr(1)
32990 .m(8)
32991 .n(8)
32992 .k(1)
32993 .qmin(128)
32994 .Test(xnn_f32_igemm_ukernel_8x8__fma3_broadcast);
32995 }
32996
32997 TEST(F32_IGEMM_8X8__FMA3_BROADCAST, qmax) {
32998 TEST_REQUIRES_X86_FMA3;
32999 GemmMicrokernelTester()
33000 .mr(8)
33001 .nr(8)
33002 .kr(1)
33003 .sr(1)
33004 .m(8)
33005 .n(8)
33006 .k(1)
33007 .qmax(128)
33008 .Test(xnn_f32_igemm_ukernel_8x8__fma3_broadcast);
33009 }
33010
33011 TEST(F32_IGEMM_8X8__FMA3_BROADCAST, strided_cm) {
33012 TEST_REQUIRES_X86_FMA3;
33013 GemmMicrokernelTester()
33014 .mr(8)
33015 .nr(8)
33016 .kr(1)
33017 .sr(1)
33018 .m(8)
33019 .n(8)
33020 .k(1)
33021 .cm_stride(11)
33022 .Test(xnn_f32_igemm_ukernel_8x8__fma3_broadcast);
33023 }
33024#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
33025
33026
Marat Dukhan0f349c42019-11-27 11:58:54 -080033027#if XNN_ARCH_X86 || XNN_ARCH_X86_64
Marat Dukhaneccfd712019-12-08 16:49:27 -080033028 TEST(F32_IGEMM_1X16__FMA3_BROADCAST, k_eq_1) {
33029 TEST_REQUIRES_X86_FMA3;
33030 GemmMicrokernelTester()
33031 .mr(1)
33032 .nr(16)
33033 .kr(1)
33034 .sr(1)
33035 .m(1)
33036 .n(16)
33037 .k(1)
33038 .Test(xnn_f32_igemm_ukernel_1x16__fma3_broadcast);
33039 }
33040
33041 TEST(F32_IGEMM_1X16__FMA3_BROADCAST, strided_cn) {
33042 TEST_REQUIRES_X86_FMA3;
33043 GemmMicrokernelTester()
33044 .mr(1)
33045 .nr(16)
33046 .kr(1)
33047 .sr(1)
33048 .m(1)
33049 .n(16)
33050 .k(1)
33051 .cn_stride(19)
33052 .Test(xnn_f32_igemm_ukernel_1x16__fma3_broadcast);
33053 }
33054
33055 TEST(F32_IGEMM_1X16__FMA3_BROADCAST, k_eq_1_subtile) {
33056 TEST_REQUIRES_X86_FMA3;
33057 for (uint32_t m = 1; m <= 1; m++) {
33058 for (uint32_t n = 1; n <= 16; n++) {
33059 GemmMicrokernelTester()
33060 .mr(1)
33061 .nr(16)
33062 .kr(1)
33063 .sr(1)
33064 .m(m)
33065 .n(n)
33066 .k(1)
33067 .iterations(1)
33068 .Test(xnn_f32_igemm_ukernel_1x16__fma3_broadcast);
33069 }
33070 }
33071 }
33072
33073 TEST(F32_IGEMM_1X16__FMA3_BROADCAST, k_eq_1_subtile_m) {
33074 TEST_REQUIRES_X86_FMA3;
33075 for (uint32_t m = 1; m <= 1; m++) {
33076 GemmMicrokernelTester()
33077 .mr(1)
33078 .nr(16)
33079 .kr(1)
33080 .sr(1)
33081 .m(m)
33082 .n(16)
33083 .k(1)
33084 .iterations(1)
33085 .Test(xnn_f32_igemm_ukernel_1x16__fma3_broadcast);
33086 }
33087 }
33088
33089 TEST(F32_IGEMM_1X16__FMA3_BROADCAST, k_eq_1_subtile_n) {
33090 TEST_REQUIRES_X86_FMA3;
33091 for (uint32_t n = 1; n <= 16; n++) {
33092 GemmMicrokernelTester()
33093 .mr(1)
33094 .nr(16)
33095 .kr(1)
33096 .sr(1)
33097 .m(1)
33098 .n(n)
33099 .k(1)
33100 .iterations(1)
33101 .Test(xnn_f32_igemm_ukernel_1x16__fma3_broadcast);
33102 }
33103 }
33104
33105 TEST(F32_IGEMM_1X16__FMA3_BROADCAST, k_gt_1) {
33106 TEST_REQUIRES_X86_FMA3;
33107 for (size_t k = 2; k < 10; k++) {
33108 GemmMicrokernelTester()
33109 .mr(1)
33110 .nr(16)
33111 .kr(1)
33112 .sr(1)
33113 .m(1)
33114 .n(16)
33115 .k(k)
33116 .Test(xnn_f32_igemm_ukernel_1x16__fma3_broadcast);
33117 }
33118 }
33119
33120 TEST(F32_IGEMM_1X16__FMA3_BROADCAST, k_gt_1_subtile) {
33121 TEST_REQUIRES_X86_FMA3;
33122 for (size_t k = 2; k < 10; k++) {
33123 for (uint32_t m = 1; m <= 1; m++) {
33124 for (uint32_t n = 1; n <= 16; n++) {
33125 GemmMicrokernelTester()
33126 .mr(1)
33127 .nr(16)
33128 .kr(1)
33129 .sr(1)
33130 .m(m)
33131 .n(n)
33132 .k(k)
33133 .iterations(1)
33134 .Test(xnn_f32_igemm_ukernel_1x16__fma3_broadcast);
33135 }
33136 }
33137 }
33138 }
33139
33140 TEST(F32_IGEMM_1X16__FMA3_BROADCAST, n_gt_16) {
33141 TEST_REQUIRES_X86_FMA3;
33142 for (uint32_t n = 17; n < 32; n++) {
33143 for (size_t k = 1; k <= 5; k += 2) {
33144 GemmMicrokernelTester()
33145 .mr(1)
33146 .nr(16)
33147 .kr(1)
33148 .sr(1)
33149 .m(1)
33150 .n(16)
33151 .k(k)
33152 .Test(xnn_f32_igemm_ukernel_1x16__fma3_broadcast);
33153 }
33154 }
33155 }
33156
33157 TEST(F32_IGEMM_1X16__FMA3_BROADCAST, n_gt_16_strided_cn) {
33158 TEST_REQUIRES_X86_FMA3;
33159 for (uint32_t n = 17; n < 32; n++) {
33160 for (size_t k = 1; k <= 5; k += 2) {
33161 GemmMicrokernelTester()
33162 .mr(1)
33163 .nr(16)
33164 .kr(1)
33165 .sr(1)
33166 .m(1)
33167 .n(16)
33168 .k(k)
33169 .cn_stride(19)
33170 .Test(xnn_f32_igemm_ukernel_1x16__fma3_broadcast);
33171 }
33172 }
33173 }
33174
33175 TEST(F32_IGEMM_1X16__FMA3_BROADCAST, n_gt_16_subtile) {
33176 TEST_REQUIRES_X86_FMA3;
33177 for (uint32_t n = 17; n < 32; n++) {
33178 for (size_t k = 1; k <= 5; k += 2) {
33179 for (uint32_t m = 1; m <= 1; m++) {
33180 GemmMicrokernelTester()
33181 .mr(1)
33182 .nr(16)
33183 .kr(1)
33184 .sr(1)
33185 .m(m)
33186 .n(n)
33187 .k(k)
33188 .iterations(1)
33189 .Test(xnn_f32_igemm_ukernel_1x16__fma3_broadcast);
33190 }
33191 }
33192 }
33193 }
33194
33195 TEST(F32_IGEMM_1X16__FMA3_BROADCAST, n_div_16) {
33196 TEST_REQUIRES_X86_FMA3;
33197 for (uint32_t n = 32; n <= 48; n += 16) {
33198 for (size_t k = 1; k <= 5; k += 2) {
33199 GemmMicrokernelTester()
33200 .mr(1)
33201 .nr(16)
33202 .kr(1)
33203 .sr(1)
33204 .m(1)
33205 .n(16)
33206 .k(k)
33207 .Test(xnn_f32_igemm_ukernel_1x16__fma3_broadcast);
33208 }
33209 }
33210 }
33211
33212 TEST(F32_IGEMM_1X16__FMA3_BROADCAST, n_div_16_strided_cn) {
33213 TEST_REQUIRES_X86_FMA3;
33214 for (uint32_t n = 32; n <= 48; n += 16) {
33215 for (size_t k = 1; k <= 5; k += 2) {
33216 GemmMicrokernelTester()
33217 .mr(1)
33218 .nr(16)
33219 .kr(1)
33220 .sr(1)
33221 .m(1)
33222 .n(n)
33223 .k(k)
33224 .cn_stride(19)
33225 .Test(xnn_f32_igemm_ukernel_1x16__fma3_broadcast);
33226 }
33227 }
33228 }
33229
33230 TEST(F32_IGEMM_1X16__FMA3_BROADCAST, n_div_16_subtile) {
33231 TEST_REQUIRES_X86_FMA3;
33232 for (uint32_t n = 32; n <= 48; n += 16) {
33233 for (size_t k = 1; k <= 5; k += 2) {
33234 for (uint32_t m = 1; m <= 1; m++) {
33235 GemmMicrokernelTester()
33236 .mr(1)
33237 .nr(16)
33238 .kr(1)
33239 .sr(1)
33240 .m(m)
33241 .n(n)
33242 .k(k)
33243 .iterations(1)
33244 .Test(xnn_f32_igemm_ukernel_1x16__fma3_broadcast);
33245 }
33246 }
33247 }
33248 }
33249
33250 TEST(F32_IGEMM_1X16__FMA3_BROADCAST, small_kernel) {
33251 TEST_REQUIRES_X86_FMA3;
33252 for (size_t k = 1; k <= 5; k += 2) {
33253 GemmMicrokernelTester()
33254 .mr(1)
33255 .nr(16)
33256 .kr(1)
33257 .sr(1)
33258 .m(1)
33259 .n(16)
33260 .k(k)
33261 .ks(3)
33262 .Test(xnn_f32_igemm_ukernel_1x16__fma3_broadcast);
33263 }
33264 }
33265
33266 TEST(F32_IGEMM_1X16__FMA3_BROADCAST, small_kernel_subtile) {
33267 TEST_REQUIRES_X86_FMA3;
33268 for (size_t k = 1; k <= 5; k += 2) {
33269 for (uint32_t m = 1; m <= 1; m++) {
33270 for (uint32_t n = 1; n <= 16; n++) {
33271 GemmMicrokernelTester()
33272 .mr(1)
33273 .nr(16)
33274 .kr(1)
33275 .sr(1)
33276 .m(m)
33277 .n(n)
33278 .k(k)
33279 .ks(3)
33280 .iterations(1)
33281 .Test(xnn_f32_igemm_ukernel_1x16__fma3_broadcast);
33282 }
33283 }
33284 }
33285 }
33286
33287 TEST(F32_IGEMM_1X16__FMA3_BROADCAST, n_gt_16_small_kernel) {
33288 TEST_REQUIRES_X86_FMA3;
33289 for (uint32_t n = 17; n < 32; n++) {
33290 for (size_t k = 1; k <= 5; k += 2) {
33291 GemmMicrokernelTester()
33292 .mr(1)
33293 .nr(16)
33294 .kr(1)
33295 .sr(1)
33296 .m(1)
33297 .n(16)
33298 .k(k)
33299 .ks(3)
33300 .Test(xnn_f32_igemm_ukernel_1x16__fma3_broadcast);
33301 }
33302 }
33303 }
33304
33305 TEST(F32_IGEMM_1X16__FMA3_BROADCAST, n_div_16_small_kernel) {
33306 TEST_REQUIRES_X86_FMA3;
33307 for (uint32_t n = 32; n <= 48; n += 16) {
33308 for (size_t k = 1; k <= 5; k += 2) {
33309 GemmMicrokernelTester()
33310 .mr(1)
33311 .nr(16)
33312 .kr(1)
33313 .sr(1)
33314 .m(1)
33315 .n(16)
33316 .k(k)
33317 .ks(3)
33318 .Test(xnn_f32_igemm_ukernel_1x16__fma3_broadcast);
33319 }
33320 }
33321 }
33322
33323 TEST(F32_IGEMM_1X16__FMA3_BROADCAST, strided_cm_subtile) {
33324 TEST_REQUIRES_X86_FMA3;
33325 for (size_t k = 1; k <= 5; k += 2) {
33326 for (uint32_t m = 1; m <= 1; m++) {
33327 for (uint32_t n = 1; n <= 16; n++) {
33328 GemmMicrokernelTester()
33329 .mr(1)
33330 .nr(16)
33331 .kr(1)
33332 .sr(1)
33333 .m(m)
33334 .n(n)
33335 .k(k)
33336 .cm_stride(19)
33337 .iterations(1)
33338 .Test(xnn_f32_igemm_ukernel_1x16__fma3_broadcast);
33339 }
33340 }
33341 }
33342 }
33343
33344 TEST(F32_IGEMM_1X16__FMA3_BROADCAST, a_offset) {
33345 TEST_REQUIRES_X86_FMA3;
33346 for (size_t k = 1; k <= 5; k += 2) {
33347 GemmMicrokernelTester()
33348 .mr(1)
33349 .nr(16)
33350 .kr(1)
33351 .sr(1)
33352 .m(1)
33353 .n(16)
33354 .k(k)
33355 .ks(3)
33356 .a_offset(7)
33357 .Test(xnn_f32_igemm_ukernel_1x16__fma3_broadcast);
33358 }
33359 }
33360
33361 TEST(F32_IGEMM_1X16__FMA3_BROADCAST, zero) {
33362 TEST_REQUIRES_X86_FMA3;
33363 for (uint32_t mz = 0; mz < 1; mz++) {
33364 for (size_t k = 1; k <= 5; k += 2) {
33365 GemmMicrokernelTester()
33366 .mr(1)
33367 .nr(16)
33368 .kr(1)
33369 .sr(1)
33370 .m(1)
33371 .n(16)
33372 .k(k)
33373 .ks(3)
33374 .a_offset(7)
33375 .zero_index(mz)
33376 .Test(xnn_f32_igemm_ukernel_1x16__fma3_broadcast);
33377 }
33378 }
33379 }
33380
33381 TEST(F32_IGEMM_1X16__FMA3_BROADCAST, qmin) {
33382 TEST_REQUIRES_X86_FMA3;
33383 GemmMicrokernelTester()
33384 .mr(1)
33385 .nr(16)
33386 .kr(1)
33387 .sr(1)
33388 .m(1)
33389 .n(16)
33390 .k(1)
33391 .qmin(128)
33392 .Test(xnn_f32_igemm_ukernel_1x16__fma3_broadcast);
33393 }
33394
33395 TEST(F32_IGEMM_1X16__FMA3_BROADCAST, qmax) {
33396 TEST_REQUIRES_X86_FMA3;
33397 GemmMicrokernelTester()
33398 .mr(1)
33399 .nr(16)
33400 .kr(1)
33401 .sr(1)
33402 .m(1)
33403 .n(16)
33404 .k(1)
33405 .qmax(128)
33406 .Test(xnn_f32_igemm_ukernel_1x16__fma3_broadcast);
33407 }
33408
33409 TEST(F32_IGEMM_1X16__FMA3_BROADCAST, strided_cm) {
33410 TEST_REQUIRES_X86_FMA3;
33411 GemmMicrokernelTester()
33412 .mr(1)
33413 .nr(16)
33414 .kr(1)
33415 .sr(1)
33416 .m(1)
33417 .n(16)
33418 .k(1)
33419 .cm_stride(19)
33420 .Test(xnn_f32_igemm_ukernel_1x16__fma3_broadcast);
33421 }
33422#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
33423
33424
33425#if XNN_ARCH_X86 || XNN_ARCH_X86_64
33426 TEST(F32_IGEMM_3X16__FMA3_BROADCAST, k_eq_1) {
33427 TEST_REQUIRES_X86_FMA3;
33428 GemmMicrokernelTester()
33429 .mr(3)
33430 .nr(16)
33431 .kr(1)
33432 .sr(1)
33433 .m(3)
33434 .n(16)
33435 .k(1)
33436 .Test(xnn_f32_igemm_ukernel_3x16__fma3_broadcast);
33437 }
33438
33439 TEST(F32_IGEMM_3X16__FMA3_BROADCAST, strided_cn) {
33440 TEST_REQUIRES_X86_FMA3;
33441 GemmMicrokernelTester()
33442 .mr(3)
33443 .nr(16)
33444 .kr(1)
33445 .sr(1)
33446 .m(3)
33447 .n(16)
33448 .k(1)
33449 .cn_stride(19)
33450 .Test(xnn_f32_igemm_ukernel_3x16__fma3_broadcast);
33451 }
33452
33453 TEST(F32_IGEMM_3X16__FMA3_BROADCAST, k_eq_1_subtile) {
33454 TEST_REQUIRES_X86_FMA3;
33455 for (uint32_t m = 1; m <= 3; m++) {
33456 for (uint32_t n = 1; n <= 16; n++) {
33457 GemmMicrokernelTester()
33458 .mr(3)
33459 .nr(16)
33460 .kr(1)
33461 .sr(1)
33462 .m(m)
33463 .n(n)
33464 .k(1)
33465 .iterations(1)
33466 .Test(xnn_f32_igemm_ukernel_3x16__fma3_broadcast);
33467 }
33468 }
33469 }
33470
33471 TEST(F32_IGEMM_3X16__FMA3_BROADCAST, k_eq_1_subtile_m) {
33472 TEST_REQUIRES_X86_FMA3;
33473 for (uint32_t m = 1; m <= 3; m++) {
33474 GemmMicrokernelTester()
33475 .mr(3)
33476 .nr(16)
33477 .kr(1)
33478 .sr(1)
33479 .m(m)
33480 .n(16)
33481 .k(1)
33482 .iterations(1)
33483 .Test(xnn_f32_igemm_ukernel_3x16__fma3_broadcast);
33484 }
33485 }
33486
33487 TEST(F32_IGEMM_3X16__FMA3_BROADCAST, k_eq_1_subtile_n) {
33488 TEST_REQUIRES_X86_FMA3;
33489 for (uint32_t n = 1; n <= 16; n++) {
33490 GemmMicrokernelTester()
33491 .mr(3)
33492 .nr(16)
33493 .kr(1)
33494 .sr(1)
33495 .m(3)
33496 .n(n)
33497 .k(1)
33498 .iterations(1)
33499 .Test(xnn_f32_igemm_ukernel_3x16__fma3_broadcast);
33500 }
33501 }
33502
33503 TEST(F32_IGEMM_3X16__FMA3_BROADCAST, k_gt_1) {
33504 TEST_REQUIRES_X86_FMA3;
33505 for (size_t k = 2; k < 10; k++) {
33506 GemmMicrokernelTester()
33507 .mr(3)
33508 .nr(16)
33509 .kr(1)
33510 .sr(1)
33511 .m(3)
33512 .n(16)
33513 .k(k)
33514 .Test(xnn_f32_igemm_ukernel_3x16__fma3_broadcast);
33515 }
33516 }
33517
33518 TEST(F32_IGEMM_3X16__FMA3_BROADCAST, k_gt_1_subtile) {
33519 TEST_REQUIRES_X86_FMA3;
33520 for (size_t k = 2; k < 10; k++) {
33521 for (uint32_t m = 1; m <= 3; m++) {
33522 for (uint32_t n = 1; n <= 16; n++) {
33523 GemmMicrokernelTester()
33524 .mr(3)
33525 .nr(16)
33526 .kr(1)
33527 .sr(1)
33528 .m(m)
33529 .n(n)
33530 .k(k)
33531 .iterations(1)
33532 .Test(xnn_f32_igemm_ukernel_3x16__fma3_broadcast);
33533 }
33534 }
33535 }
33536 }
33537
33538 TEST(F32_IGEMM_3X16__FMA3_BROADCAST, n_gt_16) {
33539 TEST_REQUIRES_X86_FMA3;
33540 for (uint32_t n = 17; n < 32; n++) {
33541 for (size_t k = 1; k <= 5; k += 2) {
33542 GemmMicrokernelTester()
33543 .mr(3)
33544 .nr(16)
33545 .kr(1)
33546 .sr(1)
33547 .m(3)
33548 .n(16)
33549 .k(k)
33550 .Test(xnn_f32_igemm_ukernel_3x16__fma3_broadcast);
33551 }
33552 }
33553 }
33554
33555 TEST(F32_IGEMM_3X16__FMA3_BROADCAST, n_gt_16_strided_cn) {
33556 TEST_REQUIRES_X86_FMA3;
33557 for (uint32_t n = 17; n < 32; n++) {
33558 for (size_t k = 1; k <= 5; k += 2) {
33559 GemmMicrokernelTester()
33560 .mr(3)
33561 .nr(16)
33562 .kr(1)
33563 .sr(1)
33564 .m(3)
33565 .n(16)
33566 .k(k)
33567 .cn_stride(19)
33568 .Test(xnn_f32_igemm_ukernel_3x16__fma3_broadcast);
33569 }
33570 }
33571 }
33572
33573 TEST(F32_IGEMM_3X16__FMA3_BROADCAST, n_gt_16_subtile) {
33574 TEST_REQUIRES_X86_FMA3;
33575 for (uint32_t n = 17; n < 32; n++) {
33576 for (size_t k = 1; k <= 5; k += 2) {
33577 for (uint32_t m = 1; m <= 3; m++) {
33578 GemmMicrokernelTester()
33579 .mr(3)
33580 .nr(16)
33581 .kr(1)
33582 .sr(1)
33583 .m(m)
33584 .n(n)
33585 .k(k)
33586 .iterations(1)
33587 .Test(xnn_f32_igemm_ukernel_3x16__fma3_broadcast);
33588 }
33589 }
33590 }
33591 }
33592
33593 TEST(F32_IGEMM_3X16__FMA3_BROADCAST, n_div_16) {
33594 TEST_REQUIRES_X86_FMA3;
33595 for (uint32_t n = 32; n <= 48; n += 16) {
33596 for (size_t k = 1; k <= 5; k += 2) {
33597 GemmMicrokernelTester()
33598 .mr(3)
33599 .nr(16)
33600 .kr(1)
33601 .sr(1)
33602 .m(3)
33603 .n(16)
33604 .k(k)
33605 .Test(xnn_f32_igemm_ukernel_3x16__fma3_broadcast);
33606 }
33607 }
33608 }
33609
33610 TEST(F32_IGEMM_3X16__FMA3_BROADCAST, n_div_16_strided_cn) {
33611 TEST_REQUIRES_X86_FMA3;
33612 for (uint32_t n = 32; n <= 48; n += 16) {
33613 for (size_t k = 1; k <= 5; k += 2) {
33614 GemmMicrokernelTester()
33615 .mr(3)
33616 .nr(16)
33617 .kr(1)
33618 .sr(1)
33619 .m(3)
33620 .n(n)
33621 .k(k)
33622 .cn_stride(19)
33623 .Test(xnn_f32_igemm_ukernel_3x16__fma3_broadcast);
33624 }
33625 }
33626 }
33627
33628 TEST(F32_IGEMM_3X16__FMA3_BROADCAST, n_div_16_subtile) {
33629 TEST_REQUIRES_X86_FMA3;
33630 for (uint32_t n = 32; n <= 48; n += 16) {
33631 for (size_t k = 1; k <= 5; k += 2) {
33632 for (uint32_t m = 1; m <= 3; m++) {
33633 GemmMicrokernelTester()
33634 .mr(3)
33635 .nr(16)
33636 .kr(1)
33637 .sr(1)
33638 .m(m)
33639 .n(n)
33640 .k(k)
33641 .iterations(1)
33642 .Test(xnn_f32_igemm_ukernel_3x16__fma3_broadcast);
33643 }
33644 }
33645 }
33646 }
33647
33648 TEST(F32_IGEMM_3X16__FMA3_BROADCAST, small_kernel) {
33649 TEST_REQUIRES_X86_FMA3;
33650 for (size_t k = 1; k <= 5; k += 2) {
33651 GemmMicrokernelTester()
33652 .mr(3)
33653 .nr(16)
33654 .kr(1)
33655 .sr(1)
33656 .m(3)
33657 .n(16)
33658 .k(k)
33659 .ks(3)
33660 .Test(xnn_f32_igemm_ukernel_3x16__fma3_broadcast);
33661 }
33662 }
33663
33664 TEST(F32_IGEMM_3X16__FMA3_BROADCAST, small_kernel_subtile) {
33665 TEST_REQUIRES_X86_FMA3;
33666 for (size_t k = 1; k <= 5; k += 2) {
33667 for (uint32_t m = 1; m <= 3; m++) {
33668 for (uint32_t n = 1; n <= 16; n++) {
33669 GemmMicrokernelTester()
33670 .mr(3)
33671 .nr(16)
33672 .kr(1)
33673 .sr(1)
33674 .m(m)
33675 .n(n)
33676 .k(k)
33677 .ks(3)
33678 .iterations(1)
33679 .Test(xnn_f32_igemm_ukernel_3x16__fma3_broadcast);
33680 }
33681 }
33682 }
33683 }
33684
33685 TEST(F32_IGEMM_3X16__FMA3_BROADCAST, n_gt_16_small_kernel) {
33686 TEST_REQUIRES_X86_FMA3;
33687 for (uint32_t n = 17; n < 32; n++) {
33688 for (size_t k = 1; k <= 5; k += 2) {
33689 GemmMicrokernelTester()
33690 .mr(3)
33691 .nr(16)
33692 .kr(1)
33693 .sr(1)
33694 .m(3)
33695 .n(16)
33696 .k(k)
33697 .ks(3)
33698 .Test(xnn_f32_igemm_ukernel_3x16__fma3_broadcast);
33699 }
33700 }
33701 }
33702
33703 TEST(F32_IGEMM_3X16__FMA3_BROADCAST, n_div_16_small_kernel) {
33704 TEST_REQUIRES_X86_FMA3;
33705 for (uint32_t n = 32; n <= 48; n += 16) {
33706 for (size_t k = 1; k <= 5; k += 2) {
33707 GemmMicrokernelTester()
33708 .mr(3)
33709 .nr(16)
33710 .kr(1)
33711 .sr(1)
33712 .m(3)
33713 .n(16)
33714 .k(k)
33715 .ks(3)
33716 .Test(xnn_f32_igemm_ukernel_3x16__fma3_broadcast);
33717 }
33718 }
33719 }
33720
33721 TEST(F32_IGEMM_3X16__FMA3_BROADCAST, strided_cm_subtile) {
33722 TEST_REQUIRES_X86_FMA3;
33723 for (size_t k = 1; k <= 5; k += 2) {
33724 for (uint32_t m = 1; m <= 3; m++) {
33725 for (uint32_t n = 1; n <= 16; n++) {
33726 GemmMicrokernelTester()
33727 .mr(3)
33728 .nr(16)
33729 .kr(1)
33730 .sr(1)
33731 .m(m)
33732 .n(n)
33733 .k(k)
33734 .cm_stride(19)
33735 .iterations(1)
33736 .Test(xnn_f32_igemm_ukernel_3x16__fma3_broadcast);
33737 }
33738 }
33739 }
33740 }
33741
33742 TEST(F32_IGEMM_3X16__FMA3_BROADCAST, a_offset) {
33743 TEST_REQUIRES_X86_FMA3;
33744 for (size_t k = 1; k <= 5; k += 2) {
33745 GemmMicrokernelTester()
33746 .mr(3)
33747 .nr(16)
33748 .kr(1)
33749 .sr(1)
33750 .m(3)
33751 .n(16)
33752 .k(k)
33753 .ks(3)
33754 .a_offset(17)
33755 .Test(xnn_f32_igemm_ukernel_3x16__fma3_broadcast);
33756 }
33757 }
33758
33759 TEST(F32_IGEMM_3X16__FMA3_BROADCAST, zero) {
33760 TEST_REQUIRES_X86_FMA3;
33761 for (uint32_t mz = 0; mz < 3; mz++) {
33762 for (size_t k = 1; k <= 5; k += 2) {
33763 GemmMicrokernelTester()
33764 .mr(3)
33765 .nr(16)
33766 .kr(1)
33767 .sr(1)
33768 .m(3)
33769 .n(16)
33770 .k(k)
33771 .ks(3)
33772 .a_offset(17)
33773 .zero_index(mz)
33774 .Test(xnn_f32_igemm_ukernel_3x16__fma3_broadcast);
33775 }
33776 }
33777 }
33778
33779 TEST(F32_IGEMM_3X16__FMA3_BROADCAST, qmin) {
33780 TEST_REQUIRES_X86_FMA3;
33781 GemmMicrokernelTester()
33782 .mr(3)
33783 .nr(16)
33784 .kr(1)
33785 .sr(1)
33786 .m(3)
33787 .n(16)
33788 .k(1)
33789 .qmin(128)
33790 .Test(xnn_f32_igemm_ukernel_3x16__fma3_broadcast);
33791 }
33792
33793 TEST(F32_IGEMM_3X16__FMA3_BROADCAST, qmax) {
33794 TEST_REQUIRES_X86_FMA3;
33795 GemmMicrokernelTester()
33796 .mr(3)
33797 .nr(16)
33798 .kr(1)
33799 .sr(1)
33800 .m(3)
33801 .n(16)
33802 .k(1)
33803 .qmax(128)
33804 .Test(xnn_f32_igemm_ukernel_3x16__fma3_broadcast);
33805 }
33806
33807 TEST(F32_IGEMM_3X16__FMA3_BROADCAST, strided_cm) {
33808 TEST_REQUIRES_X86_FMA3;
33809 GemmMicrokernelTester()
33810 .mr(3)
33811 .nr(16)
33812 .kr(1)
33813 .sr(1)
33814 .m(3)
33815 .n(16)
33816 .k(1)
33817 .cm_stride(19)
33818 .Test(xnn_f32_igemm_ukernel_3x16__fma3_broadcast);
33819 }
33820#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
33821
33822
33823#if XNN_ARCH_X86 || XNN_ARCH_X86_64
33824 TEST(F32_IGEMM_4X16__FMA3_BROADCAST, k_eq_1) {
33825 TEST_REQUIRES_X86_FMA3;
33826 GemmMicrokernelTester()
33827 .mr(4)
33828 .nr(16)
33829 .kr(1)
33830 .sr(1)
33831 .m(4)
33832 .n(16)
33833 .k(1)
33834 .Test(xnn_f32_igemm_ukernel_4x16__fma3_broadcast);
33835 }
33836
33837 TEST(F32_IGEMM_4X16__FMA3_BROADCAST, strided_cn) {
33838 TEST_REQUIRES_X86_FMA3;
33839 GemmMicrokernelTester()
33840 .mr(4)
33841 .nr(16)
33842 .kr(1)
33843 .sr(1)
33844 .m(4)
33845 .n(16)
33846 .k(1)
33847 .cn_stride(19)
33848 .Test(xnn_f32_igemm_ukernel_4x16__fma3_broadcast);
33849 }
33850
33851 TEST(F32_IGEMM_4X16__FMA3_BROADCAST, k_eq_1_subtile) {
33852 TEST_REQUIRES_X86_FMA3;
33853 for (uint32_t m = 1; m <= 4; m++) {
33854 for (uint32_t n = 1; n <= 16; n++) {
33855 GemmMicrokernelTester()
33856 .mr(4)
33857 .nr(16)
33858 .kr(1)
33859 .sr(1)
33860 .m(m)
33861 .n(n)
33862 .k(1)
33863 .iterations(1)
33864 .Test(xnn_f32_igemm_ukernel_4x16__fma3_broadcast);
33865 }
33866 }
33867 }
33868
33869 TEST(F32_IGEMM_4X16__FMA3_BROADCAST, k_eq_1_subtile_m) {
33870 TEST_REQUIRES_X86_FMA3;
33871 for (uint32_t m = 1; m <= 4; m++) {
33872 GemmMicrokernelTester()
33873 .mr(4)
33874 .nr(16)
33875 .kr(1)
33876 .sr(1)
33877 .m(m)
33878 .n(16)
33879 .k(1)
33880 .iterations(1)
33881 .Test(xnn_f32_igemm_ukernel_4x16__fma3_broadcast);
33882 }
33883 }
33884
33885 TEST(F32_IGEMM_4X16__FMA3_BROADCAST, k_eq_1_subtile_n) {
33886 TEST_REQUIRES_X86_FMA3;
33887 for (uint32_t n = 1; n <= 16; n++) {
33888 GemmMicrokernelTester()
33889 .mr(4)
33890 .nr(16)
33891 .kr(1)
33892 .sr(1)
33893 .m(4)
33894 .n(n)
33895 .k(1)
33896 .iterations(1)
33897 .Test(xnn_f32_igemm_ukernel_4x16__fma3_broadcast);
33898 }
33899 }
33900
33901 TEST(F32_IGEMM_4X16__FMA3_BROADCAST, k_gt_1) {
33902 TEST_REQUIRES_X86_FMA3;
33903 for (size_t k = 2; k < 10; k++) {
33904 GemmMicrokernelTester()
33905 .mr(4)
33906 .nr(16)
33907 .kr(1)
33908 .sr(1)
33909 .m(4)
33910 .n(16)
33911 .k(k)
33912 .Test(xnn_f32_igemm_ukernel_4x16__fma3_broadcast);
33913 }
33914 }
33915
33916 TEST(F32_IGEMM_4X16__FMA3_BROADCAST, k_gt_1_subtile) {
33917 TEST_REQUIRES_X86_FMA3;
33918 for (size_t k = 2; k < 10; k++) {
33919 for (uint32_t m = 1; m <= 4; m++) {
33920 for (uint32_t n = 1; n <= 16; n++) {
33921 GemmMicrokernelTester()
33922 .mr(4)
33923 .nr(16)
33924 .kr(1)
33925 .sr(1)
33926 .m(m)
33927 .n(n)
33928 .k(k)
33929 .iterations(1)
33930 .Test(xnn_f32_igemm_ukernel_4x16__fma3_broadcast);
33931 }
33932 }
33933 }
33934 }
33935
33936 TEST(F32_IGEMM_4X16__FMA3_BROADCAST, n_gt_16) {
33937 TEST_REQUIRES_X86_FMA3;
33938 for (uint32_t n = 17; n < 32; n++) {
33939 for (size_t k = 1; k <= 5; k += 2) {
33940 GemmMicrokernelTester()
33941 .mr(4)
33942 .nr(16)
33943 .kr(1)
33944 .sr(1)
33945 .m(4)
33946 .n(16)
33947 .k(k)
33948 .Test(xnn_f32_igemm_ukernel_4x16__fma3_broadcast);
33949 }
33950 }
33951 }
33952
33953 TEST(F32_IGEMM_4X16__FMA3_BROADCAST, n_gt_16_strided_cn) {
33954 TEST_REQUIRES_X86_FMA3;
33955 for (uint32_t n = 17; n < 32; n++) {
33956 for (size_t k = 1; k <= 5; k += 2) {
33957 GemmMicrokernelTester()
33958 .mr(4)
33959 .nr(16)
33960 .kr(1)
33961 .sr(1)
33962 .m(4)
33963 .n(16)
33964 .k(k)
33965 .cn_stride(19)
33966 .Test(xnn_f32_igemm_ukernel_4x16__fma3_broadcast);
33967 }
33968 }
33969 }
33970
33971 TEST(F32_IGEMM_4X16__FMA3_BROADCAST, n_gt_16_subtile) {
33972 TEST_REQUIRES_X86_FMA3;
33973 for (uint32_t n = 17; n < 32; n++) {
33974 for (size_t k = 1; k <= 5; k += 2) {
33975 for (uint32_t m = 1; m <= 4; m++) {
33976 GemmMicrokernelTester()
33977 .mr(4)
33978 .nr(16)
33979 .kr(1)
33980 .sr(1)
33981 .m(m)
33982 .n(n)
33983 .k(k)
33984 .iterations(1)
33985 .Test(xnn_f32_igemm_ukernel_4x16__fma3_broadcast);
33986 }
33987 }
33988 }
33989 }
33990
33991 TEST(F32_IGEMM_4X16__FMA3_BROADCAST, n_div_16) {
33992 TEST_REQUIRES_X86_FMA3;
33993 for (uint32_t n = 32; n <= 48; n += 16) {
33994 for (size_t k = 1; k <= 5; k += 2) {
33995 GemmMicrokernelTester()
33996 .mr(4)
33997 .nr(16)
33998 .kr(1)
33999 .sr(1)
34000 .m(4)
34001 .n(16)
34002 .k(k)
34003 .Test(xnn_f32_igemm_ukernel_4x16__fma3_broadcast);
34004 }
34005 }
34006 }
34007
34008 TEST(F32_IGEMM_4X16__FMA3_BROADCAST, n_div_16_strided_cn) {
34009 TEST_REQUIRES_X86_FMA3;
34010 for (uint32_t n = 32; n <= 48; n += 16) {
34011 for (size_t k = 1; k <= 5; k += 2) {
34012 GemmMicrokernelTester()
34013 .mr(4)
34014 .nr(16)
34015 .kr(1)
34016 .sr(1)
34017 .m(4)
34018 .n(n)
34019 .k(k)
34020 .cn_stride(19)
34021 .Test(xnn_f32_igemm_ukernel_4x16__fma3_broadcast);
34022 }
34023 }
34024 }
34025
34026 TEST(F32_IGEMM_4X16__FMA3_BROADCAST, n_div_16_subtile) {
34027 TEST_REQUIRES_X86_FMA3;
34028 for (uint32_t n = 32; n <= 48; n += 16) {
34029 for (size_t k = 1; k <= 5; k += 2) {
34030 for (uint32_t m = 1; m <= 4; m++) {
34031 GemmMicrokernelTester()
34032 .mr(4)
34033 .nr(16)
34034 .kr(1)
34035 .sr(1)
34036 .m(m)
34037 .n(n)
34038 .k(k)
34039 .iterations(1)
34040 .Test(xnn_f32_igemm_ukernel_4x16__fma3_broadcast);
34041 }
34042 }
34043 }
34044 }
34045
34046 TEST(F32_IGEMM_4X16__FMA3_BROADCAST, small_kernel) {
34047 TEST_REQUIRES_X86_FMA3;
34048 for (size_t k = 1; k <= 5; k += 2) {
34049 GemmMicrokernelTester()
34050 .mr(4)
34051 .nr(16)
34052 .kr(1)
34053 .sr(1)
34054 .m(4)
34055 .n(16)
34056 .k(k)
34057 .ks(3)
34058 .Test(xnn_f32_igemm_ukernel_4x16__fma3_broadcast);
34059 }
34060 }
34061
34062 TEST(F32_IGEMM_4X16__FMA3_BROADCAST, small_kernel_subtile) {
34063 TEST_REQUIRES_X86_FMA3;
34064 for (size_t k = 1; k <= 5; k += 2) {
34065 for (uint32_t m = 1; m <= 4; m++) {
34066 for (uint32_t n = 1; n <= 16; n++) {
34067 GemmMicrokernelTester()
34068 .mr(4)
34069 .nr(16)
34070 .kr(1)
34071 .sr(1)
34072 .m(m)
34073 .n(n)
34074 .k(k)
34075 .ks(3)
34076 .iterations(1)
34077 .Test(xnn_f32_igemm_ukernel_4x16__fma3_broadcast);
34078 }
34079 }
34080 }
34081 }
34082
34083 TEST(F32_IGEMM_4X16__FMA3_BROADCAST, n_gt_16_small_kernel) {
34084 TEST_REQUIRES_X86_FMA3;
34085 for (uint32_t n = 17; n < 32; n++) {
34086 for (size_t k = 1; k <= 5; k += 2) {
34087 GemmMicrokernelTester()
34088 .mr(4)
34089 .nr(16)
34090 .kr(1)
34091 .sr(1)
34092 .m(4)
34093 .n(16)
34094 .k(k)
34095 .ks(3)
34096 .Test(xnn_f32_igemm_ukernel_4x16__fma3_broadcast);
34097 }
34098 }
34099 }
34100
34101 TEST(F32_IGEMM_4X16__FMA3_BROADCAST, n_div_16_small_kernel) {
34102 TEST_REQUIRES_X86_FMA3;
34103 for (uint32_t n = 32; n <= 48; n += 16) {
34104 for (size_t k = 1; k <= 5; k += 2) {
34105 GemmMicrokernelTester()
34106 .mr(4)
34107 .nr(16)
34108 .kr(1)
34109 .sr(1)
34110 .m(4)
34111 .n(16)
34112 .k(k)
34113 .ks(3)
34114 .Test(xnn_f32_igemm_ukernel_4x16__fma3_broadcast);
34115 }
34116 }
34117 }
34118
34119 TEST(F32_IGEMM_4X16__FMA3_BROADCAST, strided_cm_subtile) {
34120 TEST_REQUIRES_X86_FMA3;
34121 for (size_t k = 1; k <= 5; k += 2) {
34122 for (uint32_t m = 1; m <= 4; m++) {
34123 for (uint32_t n = 1; n <= 16; n++) {
34124 GemmMicrokernelTester()
34125 .mr(4)
34126 .nr(16)
34127 .kr(1)
34128 .sr(1)
34129 .m(m)
34130 .n(n)
34131 .k(k)
34132 .cm_stride(19)
34133 .iterations(1)
34134 .Test(xnn_f32_igemm_ukernel_4x16__fma3_broadcast);
34135 }
34136 }
34137 }
34138 }
34139
34140 TEST(F32_IGEMM_4X16__FMA3_BROADCAST, a_offset) {
34141 TEST_REQUIRES_X86_FMA3;
34142 for (size_t k = 1; k <= 5; k += 2) {
34143 GemmMicrokernelTester()
34144 .mr(4)
34145 .nr(16)
34146 .kr(1)
34147 .sr(1)
34148 .m(4)
34149 .n(16)
34150 .k(k)
34151 .ks(3)
34152 .a_offset(23)
34153 .Test(xnn_f32_igemm_ukernel_4x16__fma3_broadcast);
34154 }
34155 }
34156
34157 TEST(F32_IGEMM_4X16__FMA3_BROADCAST, zero) {
34158 TEST_REQUIRES_X86_FMA3;
34159 for (uint32_t mz = 0; mz < 4; mz++) {
34160 for (size_t k = 1; k <= 5; k += 2) {
34161 GemmMicrokernelTester()
34162 .mr(4)
34163 .nr(16)
34164 .kr(1)
34165 .sr(1)
34166 .m(4)
34167 .n(16)
34168 .k(k)
34169 .ks(3)
34170 .a_offset(23)
34171 .zero_index(mz)
34172 .Test(xnn_f32_igemm_ukernel_4x16__fma3_broadcast);
34173 }
34174 }
34175 }
34176
34177 TEST(F32_IGEMM_4X16__FMA3_BROADCAST, qmin) {
34178 TEST_REQUIRES_X86_FMA3;
34179 GemmMicrokernelTester()
34180 .mr(4)
34181 .nr(16)
34182 .kr(1)
34183 .sr(1)
34184 .m(4)
34185 .n(16)
34186 .k(1)
34187 .qmin(128)
34188 .Test(xnn_f32_igemm_ukernel_4x16__fma3_broadcast);
34189 }
34190
34191 TEST(F32_IGEMM_4X16__FMA3_BROADCAST, qmax) {
34192 TEST_REQUIRES_X86_FMA3;
34193 GemmMicrokernelTester()
34194 .mr(4)
34195 .nr(16)
34196 .kr(1)
34197 .sr(1)
34198 .m(4)
34199 .n(16)
34200 .k(1)
34201 .qmax(128)
34202 .Test(xnn_f32_igemm_ukernel_4x16__fma3_broadcast);
34203 }
34204
34205 TEST(F32_IGEMM_4X16__FMA3_BROADCAST, strided_cm) {
34206 TEST_REQUIRES_X86_FMA3;
34207 GemmMicrokernelTester()
34208 .mr(4)
34209 .nr(16)
34210 .kr(1)
34211 .sr(1)
34212 .m(4)
34213 .n(16)
34214 .k(1)
34215 .cm_stride(19)
34216 .Test(xnn_f32_igemm_ukernel_4x16__fma3_broadcast);
34217 }
34218#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
34219
34220
34221#if XNN_ARCH_X86 || XNN_ARCH_X86_64
34222 TEST(F32_IGEMM_5X16__FMA3_BROADCAST, k_eq_1) {
34223 TEST_REQUIRES_X86_FMA3;
34224 GemmMicrokernelTester()
34225 .mr(5)
34226 .nr(16)
34227 .kr(1)
34228 .sr(1)
34229 .m(5)
34230 .n(16)
34231 .k(1)
34232 .Test(xnn_f32_igemm_ukernel_5x16__fma3_broadcast);
34233 }
34234
34235 TEST(F32_IGEMM_5X16__FMA3_BROADCAST, strided_cn) {
34236 TEST_REQUIRES_X86_FMA3;
34237 GemmMicrokernelTester()
34238 .mr(5)
34239 .nr(16)
34240 .kr(1)
34241 .sr(1)
34242 .m(5)
34243 .n(16)
34244 .k(1)
34245 .cn_stride(19)
34246 .Test(xnn_f32_igemm_ukernel_5x16__fma3_broadcast);
34247 }
34248
34249 TEST(F32_IGEMM_5X16__FMA3_BROADCAST, k_eq_1_subtile) {
34250 TEST_REQUIRES_X86_FMA3;
34251 for (uint32_t m = 1; m <= 5; m++) {
34252 for (uint32_t n = 1; n <= 16; n++) {
34253 GemmMicrokernelTester()
34254 .mr(5)
34255 .nr(16)
34256 .kr(1)
34257 .sr(1)
34258 .m(m)
34259 .n(n)
34260 .k(1)
34261 .iterations(1)
34262 .Test(xnn_f32_igemm_ukernel_5x16__fma3_broadcast);
34263 }
34264 }
34265 }
34266
34267 TEST(F32_IGEMM_5X16__FMA3_BROADCAST, k_eq_1_subtile_m) {
34268 TEST_REQUIRES_X86_FMA3;
34269 for (uint32_t m = 1; m <= 5; m++) {
34270 GemmMicrokernelTester()
34271 .mr(5)
34272 .nr(16)
34273 .kr(1)
34274 .sr(1)
34275 .m(m)
34276 .n(16)
34277 .k(1)
34278 .iterations(1)
34279 .Test(xnn_f32_igemm_ukernel_5x16__fma3_broadcast);
34280 }
34281 }
34282
34283 TEST(F32_IGEMM_5X16__FMA3_BROADCAST, k_eq_1_subtile_n) {
34284 TEST_REQUIRES_X86_FMA3;
34285 for (uint32_t n = 1; n <= 16; n++) {
34286 GemmMicrokernelTester()
34287 .mr(5)
34288 .nr(16)
34289 .kr(1)
34290 .sr(1)
34291 .m(5)
34292 .n(n)
34293 .k(1)
34294 .iterations(1)
34295 .Test(xnn_f32_igemm_ukernel_5x16__fma3_broadcast);
34296 }
34297 }
34298
34299 TEST(F32_IGEMM_5X16__FMA3_BROADCAST, k_gt_1) {
34300 TEST_REQUIRES_X86_FMA3;
34301 for (size_t k = 2; k < 10; k++) {
34302 GemmMicrokernelTester()
34303 .mr(5)
34304 .nr(16)
34305 .kr(1)
34306 .sr(1)
34307 .m(5)
34308 .n(16)
34309 .k(k)
34310 .Test(xnn_f32_igemm_ukernel_5x16__fma3_broadcast);
34311 }
34312 }
34313
34314 TEST(F32_IGEMM_5X16__FMA3_BROADCAST, k_gt_1_subtile) {
34315 TEST_REQUIRES_X86_FMA3;
34316 for (size_t k = 2; k < 10; k++) {
34317 for (uint32_t m = 1; m <= 5; m++) {
34318 for (uint32_t n = 1; n <= 16; n++) {
34319 GemmMicrokernelTester()
34320 .mr(5)
34321 .nr(16)
34322 .kr(1)
34323 .sr(1)
34324 .m(m)
34325 .n(n)
34326 .k(k)
34327 .iterations(1)
34328 .Test(xnn_f32_igemm_ukernel_5x16__fma3_broadcast);
34329 }
34330 }
34331 }
34332 }
34333
34334 TEST(F32_IGEMM_5X16__FMA3_BROADCAST, n_gt_16) {
34335 TEST_REQUIRES_X86_FMA3;
34336 for (uint32_t n = 17; n < 32; n++) {
34337 for (size_t k = 1; k <= 5; k += 2) {
34338 GemmMicrokernelTester()
34339 .mr(5)
34340 .nr(16)
34341 .kr(1)
34342 .sr(1)
34343 .m(5)
34344 .n(16)
34345 .k(k)
34346 .Test(xnn_f32_igemm_ukernel_5x16__fma3_broadcast);
34347 }
34348 }
34349 }
34350
34351 TEST(F32_IGEMM_5X16__FMA3_BROADCAST, n_gt_16_strided_cn) {
34352 TEST_REQUIRES_X86_FMA3;
34353 for (uint32_t n = 17; n < 32; n++) {
34354 for (size_t k = 1; k <= 5; k += 2) {
34355 GemmMicrokernelTester()
34356 .mr(5)
34357 .nr(16)
34358 .kr(1)
34359 .sr(1)
34360 .m(5)
34361 .n(16)
34362 .k(k)
34363 .cn_stride(19)
34364 .Test(xnn_f32_igemm_ukernel_5x16__fma3_broadcast);
34365 }
34366 }
34367 }
34368
34369 TEST(F32_IGEMM_5X16__FMA3_BROADCAST, n_gt_16_subtile) {
34370 TEST_REQUIRES_X86_FMA3;
34371 for (uint32_t n = 17; n < 32; n++) {
34372 for (size_t k = 1; k <= 5; k += 2) {
34373 for (uint32_t m = 1; m <= 5; m++) {
34374 GemmMicrokernelTester()
34375 .mr(5)
34376 .nr(16)
34377 .kr(1)
34378 .sr(1)
34379 .m(m)
34380 .n(n)
34381 .k(k)
34382 .iterations(1)
34383 .Test(xnn_f32_igemm_ukernel_5x16__fma3_broadcast);
34384 }
34385 }
34386 }
34387 }
34388
34389 TEST(F32_IGEMM_5X16__FMA3_BROADCAST, n_div_16) {
34390 TEST_REQUIRES_X86_FMA3;
34391 for (uint32_t n = 32; n <= 48; n += 16) {
34392 for (size_t k = 1; k <= 5; k += 2) {
34393 GemmMicrokernelTester()
34394 .mr(5)
34395 .nr(16)
34396 .kr(1)
34397 .sr(1)
34398 .m(5)
34399 .n(16)
34400 .k(k)
34401 .Test(xnn_f32_igemm_ukernel_5x16__fma3_broadcast);
34402 }
34403 }
34404 }
34405
34406 TEST(F32_IGEMM_5X16__FMA3_BROADCAST, n_div_16_strided_cn) {
34407 TEST_REQUIRES_X86_FMA3;
34408 for (uint32_t n = 32; n <= 48; n += 16) {
34409 for (size_t k = 1; k <= 5; k += 2) {
34410 GemmMicrokernelTester()
34411 .mr(5)
34412 .nr(16)
34413 .kr(1)
34414 .sr(1)
34415 .m(5)
34416 .n(n)
34417 .k(k)
34418 .cn_stride(19)
34419 .Test(xnn_f32_igemm_ukernel_5x16__fma3_broadcast);
34420 }
34421 }
34422 }
34423
34424 TEST(F32_IGEMM_5X16__FMA3_BROADCAST, n_div_16_subtile) {
34425 TEST_REQUIRES_X86_FMA3;
34426 for (uint32_t n = 32; n <= 48; n += 16) {
34427 for (size_t k = 1; k <= 5; k += 2) {
34428 for (uint32_t m = 1; m <= 5; m++) {
34429 GemmMicrokernelTester()
34430 .mr(5)
34431 .nr(16)
34432 .kr(1)
34433 .sr(1)
34434 .m(m)
34435 .n(n)
34436 .k(k)
34437 .iterations(1)
34438 .Test(xnn_f32_igemm_ukernel_5x16__fma3_broadcast);
34439 }
34440 }
34441 }
34442 }
34443
34444 TEST(F32_IGEMM_5X16__FMA3_BROADCAST, small_kernel) {
34445 TEST_REQUIRES_X86_FMA3;
34446 for (size_t k = 1; k <= 5; k += 2) {
34447 GemmMicrokernelTester()
34448 .mr(5)
34449 .nr(16)
34450 .kr(1)
34451 .sr(1)
34452 .m(5)
34453 .n(16)
34454 .k(k)
34455 .ks(3)
34456 .Test(xnn_f32_igemm_ukernel_5x16__fma3_broadcast);
34457 }
34458 }
34459
34460 TEST(F32_IGEMM_5X16__FMA3_BROADCAST, small_kernel_subtile) {
34461 TEST_REQUIRES_X86_FMA3;
34462 for (size_t k = 1; k <= 5; k += 2) {
34463 for (uint32_t m = 1; m <= 5; m++) {
34464 for (uint32_t n = 1; n <= 16; n++) {
34465 GemmMicrokernelTester()
34466 .mr(5)
34467 .nr(16)
34468 .kr(1)
34469 .sr(1)
34470 .m(m)
34471 .n(n)
34472 .k(k)
34473 .ks(3)
34474 .iterations(1)
34475 .Test(xnn_f32_igemm_ukernel_5x16__fma3_broadcast);
34476 }
34477 }
34478 }
34479 }
34480
34481 TEST(F32_IGEMM_5X16__FMA3_BROADCAST, n_gt_16_small_kernel) {
34482 TEST_REQUIRES_X86_FMA3;
34483 for (uint32_t n = 17; n < 32; n++) {
34484 for (size_t k = 1; k <= 5; k += 2) {
34485 GemmMicrokernelTester()
34486 .mr(5)
34487 .nr(16)
34488 .kr(1)
34489 .sr(1)
34490 .m(5)
34491 .n(16)
34492 .k(k)
34493 .ks(3)
34494 .Test(xnn_f32_igemm_ukernel_5x16__fma3_broadcast);
34495 }
34496 }
34497 }
34498
34499 TEST(F32_IGEMM_5X16__FMA3_BROADCAST, n_div_16_small_kernel) {
34500 TEST_REQUIRES_X86_FMA3;
34501 for (uint32_t n = 32; n <= 48; n += 16) {
34502 for (size_t k = 1; k <= 5; k += 2) {
34503 GemmMicrokernelTester()
34504 .mr(5)
34505 .nr(16)
34506 .kr(1)
34507 .sr(1)
34508 .m(5)
34509 .n(16)
34510 .k(k)
34511 .ks(3)
34512 .Test(xnn_f32_igemm_ukernel_5x16__fma3_broadcast);
34513 }
34514 }
34515 }
34516
34517 TEST(F32_IGEMM_5X16__FMA3_BROADCAST, strided_cm_subtile) {
34518 TEST_REQUIRES_X86_FMA3;
34519 for (size_t k = 1; k <= 5; k += 2) {
34520 for (uint32_t m = 1; m <= 5; m++) {
34521 for (uint32_t n = 1; n <= 16; n++) {
34522 GemmMicrokernelTester()
34523 .mr(5)
34524 .nr(16)
34525 .kr(1)
34526 .sr(1)
34527 .m(m)
34528 .n(n)
34529 .k(k)
34530 .cm_stride(19)
34531 .iterations(1)
34532 .Test(xnn_f32_igemm_ukernel_5x16__fma3_broadcast);
34533 }
34534 }
34535 }
34536 }
34537
34538 TEST(F32_IGEMM_5X16__FMA3_BROADCAST, a_offset) {
34539 TEST_REQUIRES_X86_FMA3;
34540 for (size_t k = 1; k <= 5; k += 2) {
34541 GemmMicrokernelTester()
34542 .mr(5)
34543 .nr(16)
34544 .kr(1)
34545 .sr(1)
34546 .m(5)
34547 .n(16)
34548 .k(k)
34549 .ks(3)
34550 .a_offset(29)
34551 .Test(xnn_f32_igemm_ukernel_5x16__fma3_broadcast);
34552 }
34553 }
34554
34555 TEST(F32_IGEMM_5X16__FMA3_BROADCAST, zero) {
34556 TEST_REQUIRES_X86_FMA3;
34557 for (uint32_t mz = 0; mz < 5; mz++) {
34558 for (size_t k = 1; k <= 5; k += 2) {
34559 GemmMicrokernelTester()
34560 .mr(5)
34561 .nr(16)
34562 .kr(1)
34563 .sr(1)
34564 .m(5)
34565 .n(16)
34566 .k(k)
34567 .ks(3)
34568 .a_offset(29)
34569 .zero_index(mz)
34570 .Test(xnn_f32_igemm_ukernel_5x16__fma3_broadcast);
34571 }
34572 }
34573 }
34574
34575 TEST(F32_IGEMM_5X16__FMA3_BROADCAST, qmin) {
34576 TEST_REQUIRES_X86_FMA3;
34577 GemmMicrokernelTester()
34578 .mr(5)
34579 .nr(16)
34580 .kr(1)
34581 .sr(1)
34582 .m(5)
34583 .n(16)
34584 .k(1)
34585 .qmin(128)
34586 .Test(xnn_f32_igemm_ukernel_5x16__fma3_broadcast);
34587 }
34588
34589 TEST(F32_IGEMM_5X16__FMA3_BROADCAST, qmax) {
34590 TEST_REQUIRES_X86_FMA3;
34591 GemmMicrokernelTester()
34592 .mr(5)
34593 .nr(16)
34594 .kr(1)
34595 .sr(1)
34596 .m(5)
34597 .n(16)
34598 .k(1)
34599 .qmax(128)
34600 .Test(xnn_f32_igemm_ukernel_5x16__fma3_broadcast);
34601 }
34602
34603 TEST(F32_IGEMM_5X16__FMA3_BROADCAST, strided_cm) {
34604 TEST_REQUIRES_X86_FMA3;
34605 GemmMicrokernelTester()
34606 .mr(5)
34607 .nr(16)
34608 .kr(1)
34609 .sr(1)
34610 .m(5)
34611 .n(16)
34612 .k(1)
34613 .cm_stride(19)
34614 .Test(xnn_f32_igemm_ukernel_5x16__fma3_broadcast);
34615 }
34616#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
34617
34618
34619#if XNN_ARCH_X86 || XNN_ARCH_X86_64
Marat Dukhan27121322019-12-09 14:57:40 -080034620 TEST(F32_IGEMM_1X16S4__FMA3_BROADCAST, k_eq_4) {
34621 TEST_REQUIRES_X86_FMA3;
34622 GemmMicrokernelTester()
34623 .mr(1)
34624 .nr(16)
34625 .kr(1)
34626 .sr(4)
34627 .m(1)
34628 .n(16)
34629 .k(4)
34630 .Test(xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast);
34631 }
34632
34633 TEST(F32_IGEMM_1X16S4__FMA3_BROADCAST, strided_cn) {
34634 TEST_REQUIRES_X86_FMA3;
34635 GemmMicrokernelTester()
34636 .mr(1)
34637 .nr(16)
34638 .kr(1)
34639 .sr(4)
34640 .m(1)
34641 .n(16)
34642 .k(4)
34643 .cn_stride(19)
34644 .Test(xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast);
34645 }
34646
34647 TEST(F32_IGEMM_1X16S4__FMA3_BROADCAST, k_eq_4_subtile) {
34648 TEST_REQUIRES_X86_FMA3;
34649 for (uint32_t m = 1; m <= 1; m++) {
34650 for (uint32_t n = 1; n <= 16; n++) {
34651 GemmMicrokernelTester()
34652 .mr(1)
34653 .nr(16)
34654 .kr(1)
34655 .sr(4)
34656 .m(m)
34657 .n(n)
34658 .k(4)
34659 .iterations(1)
34660 .Test(xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast);
34661 }
34662 }
34663 }
34664
34665 TEST(F32_IGEMM_1X16S4__FMA3_BROADCAST, k_eq_4_subtile_m) {
34666 TEST_REQUIRES_X86_FMA3;
34667 for (uint32_t m = 1; m <= 1; m++) {
34668 GemmMicrokernelTester()
34669 .mr(1)
34670 .nr(16)
34671 .kr(1)
34672 .sr(4)
34673 .m(m)
34674 .n(16)
34675 .k(4)
34676 .iterations(1)
34677 .Test(xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast);
34678 }
34679 }
34680
34681 TEST(F32_IGEMM_1X16S4__FMA3_BROADCAST, k_eq_4_subtile_n) {
34682 TEST_REQUIRES_X86_FMA3;
34683 for (uint32_t n = 1; n <= 16; n++) {
34684 GemmMicrokernelTester()
34685 .mr(1)
34686 .nr(16)
34687 .kr(1)
34688 .sr(4)
34689 .m(1)
34690 .n(n)
34691 .k(4)
34692 .iterations(1)
34693 .Test(xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast);
34694 }
34695 }
34696
34697 TEST(F32_IGEMM_1X16S4__FMA3_BROADCAST, k_lt_4) {
34698 TEST_REQUIRES_X86_FMA3;
34699 for (size_t k = 1; k < 4; k++) {
34700 GemmMicrokernelTester()
34701 .mr(1)
34702 .nr(16)
34703 .kr(1)
34704 .sr(4)
34705 .m(1)
34706 .n(16)
34707 .k(k)
34708 .Test(xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast);
34709 }
34710 }
34711
34712 TEST(F32_IGEMM_1X16S4__FMA3_BROADCAST, k_lt_4_subtile) {
34713 TEST_REQUIRES_X86_FMA3;
34714 for (size_t k = 1; k < 4; k++) {
34715 for (uint32_t m = 1; m <= 1; m++) {
34716 for (uint32_t n = 1; n <= 16; n++) {
34717 GemmMicrokernelTester()
34718 .mr(1)
34719 .nr(16)
34720 .kr(1)
34721 .sr(4)
34722 .m(m)
34723 .n(n)
34724 .k(k)
34725 .iterations(1)
34726 .Test(xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast);
34727 }
34728 }
34729 }
34730 }
34731
34732 TEST(F32_IGEMM_1X16S4__FMA3_BROADCAST, k_gt_4) {
34733 TEST_REQUIRES_X86_FMA3;
34734 for (size_t k = 5; k < 8; k++) {
34735 GemmMicrokernelTester()
34736 .mr(1)
34737 .nr(16)
34738 .kr(1)
34739 .sr(4)
34740 .m(1)
34741 .n(16)
34742 .k(k)
34743 .Test(xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast);
34744 }
34745 }
34746
34747 TEST(F32_IGEMM_1X16S4__FMA3_BROADCAST, k_gt_4_subtile) {
34748 TEST_REQUIRES_X86_FMA3;
34749 for (size_t k = 5; k < 8; k++) {
34750 for (uint32_t m = 1; m <= 1; m++) {
34751 for (uint32_t n = 1; n <= 16; n++) {
34752 GemmMicrokernelTester()
34753 .mr(1)
34754 .nr(16)
34755 .kr(1)
34756 .sr(4)
34757 .m(m)
34758 .n(n)
34759 .k(k)
34760 .iterations(1)
34761 .Test(xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast);
34762 }
34763 }
34764 }
34765 }
34766
34767 TEST(F32_IGEMM_1X16S4__FMA3_BROADCAST, k_div_4) {
34768 TEST_REQUIRES_X86_FMA3;
34769 for (size_t k = 8; k <= 40; k += 4) {
34770 GemmMicrokernelTester()
34771 .mr(1)
34772 .nr(16)
34773 .kr(1)
34774 .sr(4)
34775 .m(1)
34776 .n(16)
34777 .k(k)
34778 .Test(xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast);
34779 }
34780 }
34781
34782 TEST(F32_IGEMM_1X16S4__FMA3_BROADCAST, k_div_4_subtile) {
34783 TEST_REQUIRES_X86_FMA3;
34784 for (size_t k = 8; k <= 40; k += 4) {
34785 for (uint32_t m = 1; m <= 1; m++) {
34786 for (uint32_t n = 1; n <= 16; n++) {
34787 GemmMicrokernelTester()
34788 .mr(1)
34789 .nr(16)
34790 .kr(1)
34791 .sr(4)
34792 .m(m)
34793 .n(n)
34794 .k(k)
34795 .iterations(1)
34796 .Test(xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast);
34797 }
34798 }
34799 }
34800 }
34801
34802 TEST(F32_IGEMM_1X16S4__FMA3_BROADCAST, n_gt_16) {
34803 TEST_REQUIRES_X86_FMA3;
34804 for (uint32_t n = 17; n < 32; n++) {
34805 for (size_t k = 1; k <= 20; k += 5) {
34806 GemmMicrokernelTester()
34807 .mr(1)
34808 .nr(16)
34809 .kr(1)
34810 .sr(4)
34811 .m(1)
34812 .n(16)
34813 .k(k)
34814 .Test(xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast);
34815 }
34816 }
34817 }
34818
34819 TEST(F32_IGEMM_1X16S4__FMA3_BROADCAST, n_gt_16_strided_cn) {
34820 TEST_REQUIRES_X86_FMA3;
34821 for (uint32_t n = 17; n < 32; n++) {
34822 for (size_t k = 1; k <= 20; k += 5) {
34823 GemmMicrokernelTester()
34824 .mr(1)
34825 .nr(16)
34826 .kr(1)
34827 .sr(4)
34828 .m(1)
34829 .n(16)
34830 .k(k)
34831 .cn_stride(19)
34832 .Test(xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast);
34833 }
34834 }
34835 }
34836
34837 TEST(F32_IGEMM_1X16S4__FMA3_BROADCAST, n_gt_16_subtile) {
34838 TEST_REQUIRES_X86_FMA3;
34839 for (uint32_t n = 17; n < 32; n++) {
34840 for (size_t k = 1; k <= 20; k += 5) {
34841 for (uint32_t m = 1; m <= 1; m++) {
34842 GemmMicrokernelTester()
34843 .mr(1)
34844 .nr(16)
34845 .kr(1)
34846 .sr(4)
34847 .m(m)
34848 .n(n)
34849 .k(k)
34850 .iterations(1)
34851 .Test(xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast);
34852 }
34853 }
34854 }
34855 }
34856
34857 TEST(F32_IGEMM_1X16S4__FMA3_BROADCAST, n_div_16) {
34858 TEST_REQUIRES_X86_FMA3;
34859 for (uint32_t n = 32; n <= 48; n += 16) {
34860 for (size_t k = 1; k <= 20; k += 5) {
34861 GemmMicrokernelTester()
34862 .mr(1)
34863 .nr(16)
34864 .kr(1)
34865 .sr(4)
34866 .m(1)
34867 .n(16)
34868 .k(k)
34869 .Test(xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast);
34870 }
34871 }
34872 }
34873
34874 TEST(F32_IGEMM_1X16S4__FMA3_BROADCAST, n_div_16_strided_cn) {
34875 TEST_REQUIRES_X86_FMA3;
34876 for (uint32_t n = 32; n <= 48; n += 16) {
34877 for (size_t k = 1; k <= 20; k += 5) {
34878 GemmMicrokernelTester()
34879 .mr(1)
34880 .nr(16)
34881 .kr(1)
34882 .sr(4)
34883 .m(1)
34884 .n(n)
34885 .k(k)
34886 .cn_stride(19)
34887 .Test(xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast);
34888 }
34889 }
34890 }
34891
34892 TEST(F32_IGEMM_1X16S4__FMA3_BROADCAST, n_div_16_subtile) {
34893 TEST_REQUIRES_X86_FMA3;
34894 for (uint32_t n = 32; n <= 48; n += 16) {
34895 for (size_t k = 1; k <= 20; k += 5) {
34896 for (uint32_t m = 1; m <= 1; m++) {
34897 GemmMicrokernelTester()
34898 .mr(1)
34899 .nr(16)
34900 .kr(1)
34901 .sr(4)
34902 .m(m)
34903 .n(n)
34904 .k(k)
34905 .iterations(1)
34906 .Test(xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast);
34907 }
34908 }
34909 }
34910 }
34911
34912 TEST(F32_IGEMM_1X16S4__FMA3_BROADCAST, small_kernel) {
34913 TEST_REQUIRES_X86_FMA3;
34914 for (size_t k = 1; k <= 20; k += 5) {
34915 GemmMicrokernelTester()
34916 .mr(1)
34917 .nr(16)
34918 .kr(1)
34919 .sr(4)
34920 .m(1)
34921 .n(16)
34922 .k(k)
34923 .ks(3)
34924 .Test(xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast);
34925 }
34926 }
34927
34928 TEST(F32_IGEMM_1X16S4__FMA3_BROADCAST, small_kernel_subtile) {
34929 TEST_REQUIRES_X86_FMA3;
34930 for (size_t k = 1; k <= 20; k += 5) {
34931 for (uint32_t m = 1; m <= 1; m++) {
34932 for (uint32_t n = 1; n <= 16; n++) {
34933 GemmMicrokernelTester()
34934 .mr(1)
34935 .nr(16)
34936 .kr(1)
34937 .sr(4)
34938 .m(m)
34939 .n(n)
34940 .k(k)
34941 .ks(3)
34942 .iterations(1)
34943 .Test(xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast);
34944 }
34945 }
34946 }
34947 }
34948
34949 TEST(F32_IGEMM_1X16S4__FMA3_BROADCAST, n_gt_16_small_kernel) {
34950 TEST_REQUIRES_X86_FMA3;
34951 for (uint32_t n = 17; n < 32; n++) {
34952 for (size_t k = 1; k <= 20; k += 5) {
34953 GemmMicrokernelTester()
34954 .mr(1)
34955 .nr(16)
34956 .kr(1)
34957 .sr(4)
34958 .m(1)
34959 .n(16)
34960 .k(k)
34961 .ks(3)
34962 .Test(xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast);
34963 }
34964 }
34965 }
34966
34967 TEST(F32_IGEMM_1X16S4__FMA3_BROADCAST, n_div_16_small_kernel) {
34968 TEST_REQUIRES_X86_FMA3;
34969 for (uint32_t n = 32; n <= 48; n += 16) {
34970 for (size_t k = 1; k <= 20; k += 5) {
34971 GemmMicrokernelTester()
34972 .mr(1)
34973 .nr(16)
34974 .kr(1)
34975 .sr(4)
34976 .m(1)
34977 .n(16)
34978 .k(k)
34979 .ks(3)
34980 .Test(xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast);
34981 }
34982 }
34983 }
34984
34985 TEST(F32_IGEMM_1X16S4__FMA3_BROADCAST, strided_cm_subtile) {
34986 TEST_REQUIRES_X86_FMA3;
34987 for (size_t k = 1; k <= 20; k += 5) {
34988 for (uint32_t m = 1; m <= 1; m++) {
34989 for (uint32_t n = 1; n <= 16; n++) {
34990 GemmMicrokernelTester()
34991 .mr(1)
34992 .nr(16)
34993 .kr(1)
34994 .sr(4)
34995 .m(m)
34996 .n(n)
34997 .k(k)
34998 .cm_stride(19)
34999 .iterations(1)
35000 .Test(xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast);
35001 }
35002 }
35003 }
35004 }
35005
35006 TEST(F32_IGEMM_1X16S4__FMA3_BROADCAST, a_offset) {
35007 TEST_REQUIRES_X86_FMA3;
35008 for (size_t k = 1; k <= 20; k += 5) {
35009 GemmMicrokernelTester()
35010 .mr(1)
35011 .nr(16)
35012 .kr(1)
35013 .sr(4)
35014 .m(1)
35015 .n(16)
35016 .k(k)
35017 .ks(3)
35018 .a_offset(23)
35019 .Test(xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast);
35020 }
35021 }
35022
35023 TEST(F32_IGEMM_1X16S4__FMA3_BROADCAST, zero) {
35024 TEST_REQUIRES_X86_FMA3;
35025 for (uint32_t mz = 0; mz < 1; mz++) {
35026 for (size_t k = 1; k <= 20; k += 5) {
35027 GemmMicrokernelTester()
35028 .mr(1)
35029 .nr(16)
35030 .kr(1)
35031 .sr(4)
35032 .m(1)
35033 .n(16)
35034 .k(k)
35035 .ks(3)
35036 .a_offset(23)
35037 .zero_index(mz)
35038 .Test(xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast);
35039 }
35040 }
35041 }
35042
35043 TEST(F32_IGEMM_1X16S4__FMA3_BROADCAST, qmin) {
35044 TEST_REQUIRES_X86_FMA3;
35045 GemmMicrokernelTester()
35046 .mr(1)
35047 .nr(16)
35048 .kr(1)
35049 .sr(4)
35050 .m(1)
35051 .n(16)
35052 .k(4)
35053 .qmin(128)
35054 .Test(xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast);
35055 }
35056
35057 TEST(F32_IGEMM_1X16S4__FMA3_BROADCAST, qmax) {
35058 TEST_REQUIRES_X86_FMA3;
35059 GemmMicrokernelTester()
35060 .mr(1)
35061 .nr(16)
35062 .kr(1)
35063 .sr(4)
35064 .m(1)
35065 .n(16)
35066 .k(4)
35067 .qmax(128)
35068 .Test(xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast);
35069 }
35070
35071 TEST(F32_IGEMM_1X16S4__FMA3_BROADCAST, strided_cm) {
35072 TEST_REQUIRES_X86_FMA3;
35073 GemmMicrokernelTester()
35074 .mr(1)
35075 .nr(16)
35076 .kr(1)
35077 .sr(4)
35078 .m(1)
35079 .n(16)
35080 .k(4)
35081 .cm_stride(19)
35082 .Test(xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast);
35083 }
35084#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
35085
35086
35087#if XNN_ARCH_X86 || XNN_ARCH_X86_64
35088 TEST(F32_IGEMM_3X16S4__FMA3_BROADCAST, k_eq_4) {
35089 TEST_REQUIRES_X86_FMA3;
35090 GemmMicrokernelTester()
35091 .mr(3)
35092 .nr(16)
35093 .kr(1)
35094 .sr(4)
35095 .m(3)
35096 .n(16)
35097 .k(4)
35098 .Test(xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast);
35099 }
35100
35101 TEST(F32_IGEMM_3X16S4__FMA3_BROADCAST, strided_cn) {
35102 TEST_REQUIRES_X86_FMA3;
35103 GemmMicrokernelTester()
35104 .mr(3)
35105 .nr(16)
35106 .kr(1)
35107 .sr(4)
35108 .m(3)
35109 .n(16)
35110 .k(4)
35111 .cn_stride(19)
35112 .Test(xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast);
35113 }
35114
35115 TEST(F32_IGEMM_3X16S4__FMA3_BROADCAST, k_eq_4_subtile) {
35116 TEST_REQUIRES_X86_FMA3;
35117 for (uint32_t m = 1; m <= 3; m++) {
35118 for (uint32_t n = 1; n <= 16; n++) {
35119 GemmMicrokernelTester()
35120 .mr(3)
35121 .nr(16)
35122 .kr(1)
35123 .sr(4)
35124 .m(m)
35125 .n(n)
35126 .k(4)
35127 .iterations(1)
35128 .Test(xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast);
35129 }
35130 }
35131 }
35132
35133 TEST(F32_IGEMM_3X16S4__FMA3_BROADCAST, k_eq_4_subtile_m) {
35134 TEST_REQUIRES_X86_FMA3;
35135 for (uint32_t m = 1; m <= 3; m++) {
35136 GemmMicrokernelTester()
35137 .mr(3)
35138 .nr(16)
35139 .kr(1)
35140 .sr(4)
35141 .m(m)
35142 .n(16)
35143 .k(4)
35144 .iterations(1)
35145 .Test(xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast);
35146 }
35147 }
35148
35149 TEST(F32_IGEMM_3X16S4__FMA3_BROADCAST, k_eq_4_subtile_n) {
35150 TEST_REQUIRES_X86_FMA3;
35151 for (uint32_t n = 1; n <= 16; n++) {
35152 GemmMicrokernelTester()
35153 .mr(3)
35154 .nr(16)
35155 .kr(1)
35156 .sr(4)
35157 .m(3)
35158 .n(n)
35159 .k(4)
35160 .iterations(1)
35161 .Test(xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast);
35162 }
35163 }
35164
35165 TEST(F32_IGEMM_3X16S4__FMA3_BROADCAST, k_lt_4) {
35166 TEST_REQUIRES_X86_FMA3;
35167 for (size_t k = 1; k < 4; k++) {
35168 GemmMicrokernelTester()
35169 .mr(3)
35170 .nr(16)
35171 .kr(1)
35172 .sr(4)
35173 .m(3)
35174 .n(16)
35175 .k(k)
35176 .Test(xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast);
35177 }
35178 }
35179
35180 TEST(F32_IGEMM_3X16S4__FMA3_BROADCAST, k_lt_4_subtile) {
35181 TEST_REQUIRES_X86_FMA3;
35182 for (size_t k = 1; k < 4; k++) {
35183 for (uint32_t m = 1; m <= 3; m++) {
35184 for (uint32_t n = 1; n <= 16; n++) {
35185 GemmMicrokernelTester()
35186 .mr(3)
35187 .nr(16)
35188 .kr(1)
35189 .sr(4)
35190 .m(m)
35191 .n(n)
35192 .k(k)
35193 .iterations(1)
35194 .Test(xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast);
35195 }
35196 }
35197 }
35198 }
35199
35200 TEST(F32_IGEMM_3X16S4__FMA3_BROADCAST, k_gt_4) {
35201 TEST_REQUIRES_X86_FMA3;
35202 for (size_t k = 5; k < 8; k++) {
35203 GemmMicrokernelTester()
35204 .mr(3)
35205 .nr(16)
35206 .kr(1)
35207 .sr(4)
35208 .m(3)
35209 .n(16)
35210 .k(k)
35211 .Test(xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast);
35212 }
35213 }
35214
35215 TEST(F32_IGEMM_3X16S4__FMA3_BROADCAST, k_gt_4_subtile) {
35216 TEST_REQUIRES_X86_FMA3;
35217 for (size_t k = 5; k < 8; k++) {
35218 for (uint32_t m = 1; m <= 3; m++) {
35219 for (uint32_t n = 1; n <= 16; n++) {
35220 GemmMicrokernelTester()
35221 .mr(3)
35222 .nr(16)
35223 .kr(1)
35224 .sr(4)
35225 .m(m)
35226 .n(n)
35227 .k(k)
35228 .iterations(1)
35229 .Test(xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast);
35230 }
35231 }
35232 }
35233 }
35234
35235 TEST(F32_IGEMM_3X16S4__FMA3_BROADCAST, k_div_4) {
35236 TEST_REQUIRES_X86_FMA3;
35237 for (size_t k = 8; k <= 40; k += 4) {
35238 GemmMicrokernelTester()
35239 .mr(3)
35240 .nr(16)
35241 .kr(1)
35242 .sr(4)
35243 .m(3)
35244 .n(16)
35245 .k(k)
35246 .Test(xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast);
35247 }
35248 }
35249
35250 TEST(F32_IGEMM_3X16S4__FMA3_BROADCAST, k_div_4_subtile) {
35251 TEST_REQUIRES_X86_FMA3;
35252 for (size_t k = 8; k <= 40; k += 4) {
35253 for (uint32_t m = 1; m <= 3; m++) {
35254 for (uint32_t n = 1; n <= 16; n++) {
35255 GemmMicrokernelTester()
35256 .mr(3)
35257 .nr(16)
35258 .kr(1)
35259 .sr(4)
35260 .m(m)
35261 .n(n)
35262 .k(k)
35263 .iterations(1)
35264 .Test(xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast);
35265 }
35266 }
35267 }
35268 }
35269
35270 TEST(F32_IGEMM_3X16S4__FMA3_BROADCAST, n_gt_16) {
35271 TEST_REQUIRES_X86_FMA3;
35272 for (uint32_t n = 17; n < 32; n++) {
35273 for (size_t k = 1; k <= 20; k += 5) {
35274 GemmMicrokernelTester()
35275 .mr(3)
35276 .nr(16)
35277 .kr(1)
35278 .sr(4)
35279 .m(3)
35280 .n(16)
35281 .k(k)
35282 .Test(xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast);
35283 }
35284 }
35285 }
35286
35287 TEST(F32_IGEMM_3X16S4__FMA3_BROADCAST, n_gt_16_strided_cn) {
35288 TEST_REQUIRES_X86_FMA3;
35289 for (uint32_t n = 17; n < 32; n++) {
35290 for (size_t k = 1; k <= 20; k += 5) {
35291 GemmMicrokernelTester()
35292 .mr(3)
35293 .nr(16)
35294 .kr(1)
35295 .sr(4)
35296 .m(3)
35297 .n(16)
35298 .k(k)
35299 .cn_stride(19)
35300 .Test(xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast);
35301 }
35302 }
35303 }
35304
35305 TEST(F32_IGEMM_3X16S4__FMA3_BROADCAST, n_gt_16_subtile) {
35306 TEST_REQUIRES_X86_FMA3;
35307 for (uint32_t n = 17; n < 32; n++) {
35308 for (size_t k = 1; k <= 20; k += 5) {
35309 for (uint32_t m = 1; m <= 3; m++) {
35310 GemmMicrokernelTester()
35311 .mr(3)
35312 .nr(16)
35313 .kr(1)
35314 .sr(4)
35315 .m(m)
35316 .n(n)
35317 .k(k)
35318 .iterations(1)
35319 .Test(xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast);
35320 }
35321 }
35322 }
35323 }
35324
35325 TEST(F32_IGEMM_3X16S4__FMA3_BROADCAST, n_div_16) {
35326 TEST_REQUIRES_X86_FMA3;
35327 for (uint32_t n = 32; n <= 48; n += 16) {
35328 for (size_t k = 1; k <= 20; k += 5) {
35329 GemmMicrokernelTester()
35330 .mr(3)
35331 .nr(16)
35332 .kr(1)
35333 .sr(4)
35334 .m(3)
35335 .n(16)
35336 .k(k)
35337 .Test(xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast);
35338 }
35339 }
35340 }
35341
35342 TEST(F32_IGEMM_3X16S4__FMA3_BROADCAST, n_div_16_strided_cn) {
35343 TEST_REQUIRES_X86_FMA3;
35344 for (uint32_t n = 32; n <= 48; n += 16) {
35345 for (size_t k = 1; k <= 20; k += 5) {
35346 GemmMicrokernelTester()
35347 .mr(3)
35348 .nr(16)
35349 .kr(1)
35350 .sr(4)
35351 .m(3)
35352 .n(n)
35353 .k(k)
35354 .cn_stride(19)
35355 .Test(xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast);
35356 }
35357 }
35358 }
35359
35360 TEST(F32_IGEMM_3X16S4__FMA3_BROADCAST, n_div_16_subtile) {
35361 TEST_REQUIRES_X86_FMA3;
35362 for (uint32_t n = 32; n <= 48; n += 16) {
35363 for (size_t k = 1; k <= 20; k += 5) {
35364 for (uint32_t m = 1; m <= 3; m++) {
35365 GemmMicrokernelTester()
35366 .mr(3)
35367 .nr(16)
35368 .kr(1)
35369 .sr(4)
35370 .m(m)
35371 .n(n)
35372 .k(k)
35373 .iterations(1)
35374 .Test(xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast);
35375 }
35376 }
35377 }
35378 }
35379
35380 TEST(F32_IGEMM_3X16S4__FMA3_BROADCAST, small_kernel) {
35381 TEST_REQUIRES_X86_FMA3;
35382 for (size_t k = 1; k <= 20; k += 5) {
35383 GemmMicrokernelTester()
35384 .mr(3)
35385 .nr(16)
35386 .kr(1)
35387 .sr(4)
35388 .m(3)
35389 .n(16)
35390 .k(k)
35391 .ks(3)
35392 .Test(xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast);
35393 }
35394 }
35395
35396 TEST(F32_IGEMM_3X16S4__FMA3_BROADCAST, small_kernel_subtile) {
35397 TEST_REQUIRES_X86_FMA3;
35398 for (size_t k = 1; k <= 20; k += 5) {
35399 for (uint32_t m = 1; m <= 3; m++) {
35400 for (uint32_t n = 1; n <= 16; n++) {
35401 GemmMicrokernelTester()
35402 .mr(3)
35403 .nr(16)
35404 .kr(1)
35405 .sr(4)
35406 .m(m)
35407 .n(n)
35408 .k(k)
35409 .ks(3)
35410 .iterations(1)
35411 .Test(xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast);
35412 }
35413 }
35414 }
35415 }
35416
35417 TEST(F32_IGEMM_3X16S4__FMA3_BROADCAST, n_gt_16_small_kernel) {
35418 TEST_REQUIRES_X86_FMA3;
35419 for (uint32_t n = 17; n < 32; n++) {
35420 for (size_t k = 1; k <= 20; k += 5) {
35421 GemmMicrokernelTester()
35422 .mr(3)
35423 .nr(16)
35424 .kr(1)
35425 .sr(4)
35426 .m(3)
35427 .n(16)
35428 .k(k)
35429 .ks(3)
35430 .Test(xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast);
35431 }
35432 }
35433 }
35434
35435 TEST(F32_IGEMM_3X16S4__FMA3_BROADCAST, n_div_16_small_kernel) {
35436 TEST_REQUIRES_X86_FMA3;
35437 for (uint32_t n = 32; n <= 48; n += 16) {
35438 for (size_t k = 1; k <= 20; k += 5) {
35439 GemmMicrokernelTester()
35440 .mr(3)
35441 .nr(16)
35442 .kr(1)
35443 .sr(4)
35444 .m(3)
35445 .n(16)
35446 .k(k)
35447 .ks(3)
35448 .Test(xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast);
35449 }
35450 }
35451 }
35452
35453 TEST(F32_IGEMM_3X16S4__FMA3_BROADCAST, strided_cm_subtile) {
35454 TEST_REQUIRES_X86_FMA3;
35455 for (size_t k = 1; k <= 20; k += 5) {
35456 for (uint32_t m = 1; m <= 3; m++) {
35457 for (uint32_t n = 1; n <= 16; n++) {
35458 GemmMicrokernelTester()
35459 .mr(3)
35460 .nr(16)
35461 .kr(1)
35462 .sr(4)
35463 .m(m)
35464 .n(n)
35465 .k(k)
35466 .cm_stride(19)
35467 .iterations(1)
35468 .Test(xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast);
35469 }
35470 }
35471 }
35472 }
35473
35474 TEST(F32_IGEMM_3X16S4__FMA3_BROADCAST, a_offset) {
35475 TEST_REQUIRES_X86_FMA3;
35476 for (size_t k = 1; k <= 20; k += 5) {
35477 GemmMicrokernelTester()
35478 .mr(3)
35479 .nr(16)
35480 .kr(1)
35481 .sr(4)
35482 .m(3)
35483 .n(16)
35484 .k(k)
35485 .ks(3)
35486 .a_offset(67)
35487 .Test(xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast);
35488 }
35489 }
35490
35491 TEST(F32_IGEMM_3X16S4__FMA3_BROADCAST, zero) {
35492 TEST_REQUIRES_X86_FMA3;
35493 for (uint32_t mz = 0; mz < 3; mz++) {
35494 for (size_t k = 1; k <= 20; k += 5) {
35495 GemmMicrokernelTester()
35496 .mr(3)
35497 .nr(16)
35498 .kr(1)
35499 .sr(4)
35500 .m(3)
35501 .n(16)
35502 .k(k)
35503 .ks(3)
35504 .a_offset(67)
35505 .zero_index(mz)
35506 .Test(xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast);
35507 }
35508 }
35509 }
35510
35511 TEST(F32_IGEMM_3X16S4__FMA3_BROADCAST, qmin) {
35512 TEST_REQUIRES_X86_FMA3;
35513 GemmMicrokernelTester()
35514 .mr(3)
35515 .nr(16)
35516 .kr(1)
35517 .sr(4)
35518 .m(3)
35519 .n(16)
35520 .k(4)
35521 .qmin(128)
35522 .Test(xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast);
35523 }
35524
35525 TEST(F32_IGEMM_3X16S4__FMA3_BROADCAST, qmax) {
35526 TEST_REQUIRES_X86_FMA3;
35527 GemmMicrokernelTester()
35528 .mr(3)
35529 .nr(16)
35530 .kr(1)
35531 .sr(4)
35532 .m(3)
35533 .n(16)
35534 .k(4)
35535 .qmax(128)
35536 .Test(xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast);
35537 }
35538
35539 TEST(F32_IGEMM_3X16S4__FMA3_BROADCAST, strided_cm) {
35540 TEST_REQUIRES_X86_FMA3;
35541 GemmMicrokernelTester()
35542 .mr(3)
35543 .nr(16)
35544 .kr(1)
35545 .sr(4)
35546 .m(3)
35547 .n(16)
35548 .k(4)
35549 .cm_stride(19)
35550 .Test(xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast);
35551 }
35552#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
35553
35554
35555#if XNN_ARCH_X86 || XNN_ARCH_X86_64
35556 TEST(F32_IGEMM_4X16S4__FMA3_BROADCAST, k_eq_4) {
35557 TEST_REQUIRES_X86_FMA3;
35558 GemmMicrokernelTester()
35559 .mr(4)
35560 .nr(16)
35561 .kr(1)
35562 .sr(4)
35563 .m(4)
35564 .n(16)
35565 .k(4)
35566 .Test(xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast);
35567 }
35568
35569 TEST(F32_IGEMM_4X16S4__FMA3_BROADCAST, strided_cn) {
35570 TEST_REQUIRES_X86_FMA3;
35571 GemmMicrokernelTester()
35572 .mr(4)
35573 .nr(16)
35574 .kr(1)
35575 .sr(4)
35576 .m(4)
35577 .n(16)
35578 .k(4)
35579 .cn_stride(19)
35580 .Test(xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast);
35581 }
35582
35583 TEST(F32_IGEMM_4X16S4__FMA3_BROADCAST, k_eq_4_subtile) {
35584 TEST_REQUIRES_X86_FMA3;
35585 for (uint32_t m = 1; m <= 4; m++) {
35586 for (uint32_t n = 1; n <= 16; n++) {
35587 GemmMicrokernelTester()
35588 .mr(4)
35589 .nr(16)
35590 .kr(1)
35591 .sr(4)
35592 .m(m)
35593 .n(n)
35594 .k(4)
35595 .iterations(1)
35596 .Test(xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast);
35597 }
35598 }
35599 }
35600
35601 TEST(F32_IGEMM_4X16S4__FMA3_BROADCAST, k_eq_4_subtile_m) {
35602 TEST_REQUIRES_X86_FMA3;
35603 for (uint32_t m = 1; m <= 4; m++) {
35604 GemmMicrokernelTester()
35605 .mr(4)
35606 .nr(16)
35607 .kr(1)
35608 .sr(4)
35609 .m(m)
35610 .n(16)
35611 .k(4)
35612 .iterations(1)
35613 .Test(xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast);
35614 }
35615 }
35616
35617 TEST(F32_IGEMM_4X16S4__FMA3_BROADCAST, k_eq_4_subtile_n) {
35618 TEST_REQUIRES_X86_FMA3;
35619 for (uint32_t n = 1; n <= 16; n++) {
35620 GemmMicrokernelTester()
35621 .mr(4)
35622 .nr(16)
35623 .kr(1)
35624 .sr(4)
35625 .m(4)
35626 .n(n)
35627 .k(4)
35628 .iterations(1)
35629 .Test(xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast);
35630 }
35631 }
35632
35633 TEST(F32_IGEMM_4X16S4__FMA3_BROADCAST, k_lt_4) {
35634 TEST_REQUIRES_X86_FMA3;
35635 for (size_t k = 1; k < 4; k++) {
35636 GemmMicrokernelTester()
35637 .mr(4)
35638 .nr(16)
35639 .kr(1)
35640 .sr(4)
35641 .m(4)
35642 .n(16)
35643 .k(k)
35644 .Test(xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast);
35645 }
35646 }
35647
35648 TEST(F32_IGEMM_4X16S4__FMA3_BROADCAST, k_lt_4_subtile) {
35649 TEST_REQUIRES_X86_FMA3;
35650 for (size_t k = 1; k < 4; k++) {
35651 for (uint32_t m = 1; m <= 4; m++) {
35652 for (uint32_t n = 1; n <= 16; n++) {
35653 GemmMicrokernelTester()
35654 .mr(4)
35655 .nr(16)
35656 .kr(1)
35657 .sr(4)
35658 .m(m)
35659 .n(n)
35660 .k(k)
35661 .iterations(1)
35662 .Test(xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast);
35663 }
35664 }
35665 }
35666 }
35667
35668 TEST(F32_IGEMM_4X16S4__FMA3_BROADCAST, k_gt_4) {
35669 TEST_REQUIRES_X86_FMA3;
35670 for (size_t k = 5; k < 8; k++) {
35671 GemmMicrokernelTester()
35672 .mr(4)
35673 .nr(16)
35674 .kr(1)
35675 .sr(4)
35676 .m(4)
35677 .n(16)
35678 .k(k)
35679 .Test(xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast);
35680 }
35681 }
35682
35683 TEST(F32_IGEMM_4X16S4__FMA3_BROADCAST, k_gt_4_subtile) {
35684 TEST_REQUIRES_X86_FMA3;
35685 for (size_t k = 5; k < 8; k++) {
35686 for (uint32_t m = 1; m <= 4; m++) {
35687 for (uint32_t n = 1; n <= 16; n++) {
35688 GemmMicrokernelTester()
35689 .mr(4)
35690 .nr(16)
35691 .kr(1)
35692 .sr(4)
35693 .m(m)
35694 .n(n)
35695 .k(k)
35696 .iterations(1)
35697 .Test(xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast);
35698 }
35699 }
35700 }
35701 }
35702
35703 TEST(F32_IGEMM_4X16S4__FMA3_BROADCAST, k_div_4) {
35704 TEST_REQUIRES_X86_FMA3;
35705 for (size_t k = 8; k <= 40; k += 4) {
35706 GemmMicrokernelTester()
35707 .mr(4)
35708 .nr(16)
35709 .kr(1)
35710 .sr(4)
35711 .m(4)
35712 .n(16)
35713 .k(k)
35714 .Test(xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast);
35715 }
35716 }
35717
35718 TEST(F32_IGEMM_4X16S4__FMA3_BROADCAST, k_div_4_subtile) {
35719 TEST_REQUIRES_X86_FMA3;
35720 for (size_t k = 8; k <= 40; k += 4) {
35721 for (uint32_t m = 1; m <= 4; m++) {
35722 for (uint32_t n = 1; n <= 16; n++) {
35723 GemmMicrokernelTester()
35724 .mr(4)
35725 .nr(16)
35726 .kr(1)
35727 .sr(4)
35728 .m(m)
35729 .n(n)
35730 .k(k)
35731 .iterations(1)
35732 .Test(xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast);
35733 }
35734 }
35735 }
35736 }
35737
35738 TEST(F32_IGEMM_4X16S4__FMA3_BROADCAST, n_gt_16) {
35739 TEST_REQUIRES_X86_FMA3;
35740 for (uint32_t n = 17; n < 32; n++) {
35741 for (size_t k = 1; k <= 20; k += 5) {
35742 GemmMicrokernelTester()
35743 .mr(4)
35744 .nr(16)
35745 .kr(1)
35746 .sr(4)
35747 .m(4)
35748 .n(16)
35749 .k(k)
35750 .Test(xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast);
35751 }
35752 }
35753 }
35754
35755 TEST(F32_IGEMM_4X16S4__FMA3_BROADCAST, n_gt_16_strided_cn) {
35756 TEST_REQUIRES_X86_FMA3;
35757 for (uint32_t n = 17; n < 32; n++) {
35758 for (size_t k = 1; k <= 20; k += 5) {
35759 GemmMicrokernelTester()
35760 .mr(4)
35761 .nr(16)
35762 .kr(1)
35763 .sr(4)
35764 .m(4)
35765 .n(16)
35766 .k(k)
35767 .cn_stride(19)
35768 .Test(xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast);
35769 }
35770 }
35771 }
35772
35773 TEST(F32_IGEMM_4X16S4__FMA3_BROADCAST, n_gt_16_subtile) {
35774 TEST_REQUIRES_X86_FMA3;
35775 for (uint32_t n = 17; n < 32; n++) {
35776 for (size_t k = 1; k <= 20; k += 5) {
35777 for (uint32_t m = 1; m <= 4; m++) {
35778 GemmMicrokernelTester()
35779 .mr(4)
35780 .nr(16)
35781 .kr(1)
35782 .sr(4)
35783 .m(m)
35784 .n(n)
35785 .k(k)
35786 .iterations(1)
35787 .Test(xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast);
35788 }
35789 }
35790 }
35791 }
35792
35793 TEST(F32_IGEMM_4X16S4__FMA3_BROADCAST, n_div_16) {
35794 TEST_REQUIRES_X86_FMA3;
35795 for (uint32_t n = 32; n <= 48; n += 16) {
35796 for (size_t k = 1; k <= 20; k += 5) {
35797 GemmMicrokernelTester()
35798 .mr(4)
35799 .nr(16)
35800 .kr(1)
35801 .sr(4)
35802 .m(4)
35803 .n(16)
35804 .k(k)
35805 .Test(xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast);
35806 }
35807 }
35808 }
35809
35810 TEST(F32_IGEMM_4X16S4__FMA3_BROADCAST, n_div_16_strided_cn) {
35811 TEST_REQUIRES_X86_FMA3;
35812 for (uint32_t n = 32; n <= 48; n += 16) {
35813 for (size_t k = 1; k <= 20; k += 5) {
35814 GemmMicrokernelTester()
35815 .mr(4)
35816 .nr(16)
35817 .kr(1)
35818 .sr(4)
35819 .m(4)
35820 .n(n)
35821 .k(k)
35822 .cn_stride(19)
35823 .Test(xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast);
35824 }
35825 }
35826 }
35827
35828 TEST(F32_IGEMM_4X16S4__FMA3_BROADCAST, n_div_16_subtile) {
35829 TEST_REQUIRES_X86_FMA3;
35830 for (uint32_t n = 32; n <= 48; n += 16) {
35831 for (size_t k = 1; k <= 20; k += 5) {
35832 for (uint32_t m = 1; m <= 4; m++) {
35833 GemmMicrokernelTester()
35834 .mr(4)
35835 .nr(16)
35836 .kr(1)
35837 .sr(4)
35838 .m(m)
35839 .n(n)
35840 .k(k)
35841 .iterations(1)
35842 .Test(xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast);
35843 }
35844 }
35845 }
35846 }
35847
35848 TEST(F32_IGEMM_4X16S4__FMA3_BROADCAST, small_kernel) {
35849 TEST_REQUIRES_X86_FMA3;
35850 for (size_t k = 1; k <= 20; k += 5) {
35851 GemmMicrokernelTester()
35852 .mr(4)
35853 .nr(16)
35854 .kr(1)
35855 .sr(4)
35856 .m(4)
35857 .n(16)
35858 .k(k)
35859 .ks(3)
35860 .Test(xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast);
35861 }
35862 }
35863
35864 TEST(F32_IGEMM_4X16S4__FMA3_BROADCAST, small_kernel_subtile) {
35865 TEST_REQUIRES_X86_FMA3;
35866 for (size_t k = 1; k <= 20; k += 5) {
35867 for (uint32_t m = 1; m <= 4; m++) {
35868 for (uint32_t n = 1; n <= 16; n++) {
35869 GemmMicrokernelTester()
35870 .mr(4)
35871 .nr(16)
35872 .kr(1)
35873 .sr(4)
35874 .m(m)
35875 .n(n)
35876 .k(k)
35877 .ks(3)
35878 .iterations(1)
35879 .Test(xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast);
35880 }
35881 }
35882 }
35883 }
35884
35885 TEST(F32_IGEMM_4X16S4__FMA3_BROADCAST, n_gt_16_small_kernel) {
35886 TEST_REQUIRES_X86_FMA3;
35887 for (uint32_t n = 17; n < 32; n++) {
35888 for (size_t k = 1; k <= 20; k += 5) {
35889 GemmMicrokernelTester()
35890 .mr(4)
35891 .nr(16)
35892 .kr(1)
35893 .sr(4)
35894 .m(4)
35895 .n(16)
35896 .k(k)
35897 .ks(3)
35898 .Test(xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast);
35899 }
35900 }
35901 }
35902
35903 TEST(F32_IGEMM_4X16S4__FMA3_BROADCAST, n_div_16_small_kernel) {
35904 TEST_REQUIRES_X86_FMA3;
35905 for (uint32_t n = 32; n <= 48; n += 16) {
35906 for (size_t k = 1; k <= 20; k += 5) {
35907 GemmMicrokernelTester()
35908 .mr(4)
35909 .nr(16)
35910 .kr(1)
35911 .sr(4)
35912 .m(4)
35913 .n(16)
35914 .k(k)
35915 .ks(3)
35916 .Test(xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast);
35917 }
35918 }
35919 }
35920
35921 TEST(F32_IGEMM_4X16S4__FMA3_BROADCAST, strided_cm_subtile) {
35922 TEST_REQUIRES_X86_FMA3;
35923 for (size_t k = 1; k <= 20; k += 5) {
35924 for (uint32_t m = 1; m <= 4; m++) {
35925 for (uint32_t n = 1; n <= 16; n++) {
35926 GemmMicrokernelTester()
35927 .mr(4)
35928 .nr(16)
35929 .kr(1)
35930 .sr(4)
35931 .m(m)
35932 .n(n)
35933 .k(k)
35934 .cm_stride(19)
35935 .iterations(1)
35936 .Test(xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast);
35937 }
35938 }
35939 }
35940 }
35941
35942 TEST(F32_IGEMM_4X16S4__FMA3_BROADCAST, a_offset) {
35943 TEST_REQUIRES_X86_FMA3;
35944 for (size_t k = 1; k <= 20; k += 5) {
35945 GemmMicrokernelTester()
35946 .mr(4)
35947 .nr(16)
35948 .kr(1)
35949 .sr(4)
35950 .m(4)
35951 .n(16)
35952 .k(k)
35953 .ks(3)
35954 .a_offset(83)
35955 .Test(xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast);
35956 }
35957 }
35958
35959 TEST(F32_IGEMM_4X16S4__FMA3_BROADCAST, zero) {
35960 TEST_REQUIRES_X86_FMA3;
35961 for (uint32_t mz = 0; mz < 4; mz++) {
35962 for (size_t k = 1; k <= 20; k += 5) {
35963 GemmMicrokernelTester()
35964 .mr(4)
35965 .nr(16)
35966 .kr(1)
35967 .sr(4)
35968 .m(4)
35969 .n(16)
35970 .k(k)
35971 .ks(3)
35972 .a_offset(83)
35973 .zero_index(mz)
35974 .Test(xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast);
35975 }
35976 }
35977 }
35978
35979 TEST(F32_IGEMM_4X16S4__FMA3_BROADCAST, qmin) {
35980 TEST_REQUIRES_X86_FMA3;
35981 GemmMicrokernelTester()
35982 .mr(4)
35983 .nr(16)
35984 .kr(1)
35985 .sr(4)
35986 .m(4)
35987 .n(16)
35988 .k(4)
35989 .qmin(128)
35990 .Test(xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast);
35991 }
35992
35993 TEST(F32_IGEMM_4X16S4__FMA3_BROADCAST, qmax) {
35994 TEST_REQUIRES_X86_FMA3;
35995 GemmMicrokernelTester()
35996 .mr(4)
35997 .nr(16)
35998 .kr(1)
35999 .sr(4)
36000 .m(4)
36001 .n(16)
36002 .k(4)
36003 .qmax(128)
36004 .Test(xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast);
36005 }
36006
36007 TEST(F32_IGEMM_4X16S4__FMA3_BROADCAST, strided_cm) {
36008 TEST_REQUIRES_X86_FMA3;
36009 GemmMicrokernelTester()
36010 .mr(4)
36011 .nr(16)
36012 .kr(1)
36013 .sr(4)
36014 .m(4)
36015 .n(16)
36016 .k(4)
36017 .cm_stride(19)
36018 .Test(xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast);
36019 }
36020#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
36021
36022
36023#if XNN_ARCH_X86 || XNN_ARCH_X86_64
36024 TEST(F32_IGEMM_5X16S4__FMA3_BROADCAST, k_eq_4) {
36025 TEST_REQUIRES_X86_FMA3;
36026 GemmMicrokernelTester()
36027 .mr(5)
36028 .nr(16)
36029 .kr(1)
36030 .sr(4)
36031 .m(5)
36032 .n(16)
36033 .k(4)
36034 .Test(xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast);
36035 }
36036
36037 TEST(F32_IGEMM_5X16S4__FMA3_BROADCAST, strided_cn) {
36038 TEST_REQUIRES_X86_FMA3;
36039 GemmMicrokernelTester()
36040 .mr(5)
36041 .nr(16)
36042 .kr(1)
36043 .sr(4)
36044 .m(5)
36045 .n(16)
36046 .k(4)
36047 .cn_stride(19)
36048 .Test(xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast);
36049 }
36050
36051 TEST(F32_IGEMM_5X16S4__FMA3_BROADCAST, k_eq_4_subtile) {
36052 TEST_REQUIRES_X86_FMA3;
36053 for (uint32_t m = 1; m <= 5; m++) {
36054 for (uint32_t n = 1; n <= 16; n++) {
36055 GemmMicrokernelTester()
36056 .mr(5)
36057 .nr(16)
36058 .kr(1)
36059 .sr(4)
36060 .m(m)
36061 .n(n)
36062 .k(4)
36063 .iterations(1)
36064 .Test(xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast);
36065 }
36066 }
36067 }
36068
36069 TEST(F32_IGEMM_5X16S4__FMA3_BROADCAST, k_eq_4_subtile_m) {
36070 TEST_REQUIRES_X86_FMA3;
36071 for (uint32_t m = 1; m <= 5; m++) {
36072 GemmMicrokernelTester()
36073 .mr(5)
36074 .nr(16)
36075 .kr(1)
36076 .sr(4)
36077 .m(m)
36078 .n(16)
36079 .k(4)
36080 .iterations(1)
36081 .Test(xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast);
36082 }
36083 }
36084
36085 TEST(F32_IGEMM_5X16S4__FMA3_BROADCAST, k_eq_4_subtile_n) {
36086 TEST_REQUIRES_X86_FMA3;
36087 for (uint32_t n = 1; n <= 16; n++) {
36088 GemmMicrokernelTester()
36089 .mr(5)
36090 .nr(16)
36091 .kr(1)
36092 .sr(4)
36093 .m(5)
36094 .n(n)
36095 .k(4)
36096 .iterations(1)
36097 .Test(xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast);
36098 }
36099 }
36100
36101 TEST(F32_IGEMM_5X16S4__FMA3_BROADCAST, k_lt_4) {
36102 TEST_REQUIRES_X86_FMA3;
36103 for (size_t k = 1; k < 4; k++) {
36104 GemmMicrokernelTester()
36105 .mr(5)
36106 .nr(16)
36107 .kr(1)
36108 .sr(4)
36109 .m(5)
36110 .n(16)
36111 .k(k)
36112 .Test(xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast);
36113 }
36114 }
36115
36116 TEST(F32_IGEMM_5X16S4__FMA3_BROADCAST, k_lt_4_subtile) {
36117 TEST_REQUIRES_X86_FMA3;
36118 for (size_t k = 1; k < 4; k++) {
36119 for (uint32_t m = 1; m <= 5; m++) {
36120 for (uint32_t n = 1; n <= 16; n++) {
36121 GemmMicrokernelTester()
36122 .mr(5)
36123 .nr(16)
36124 .kr(1)
36125 .sr(4)
36126 .m(m)
36127 .n(n)
36128 .k(k)
36129 .iterations(1)
36130 .Test(xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast);
36131 }
36132 }
36133 }
36134 }
36135
36136 TEST(F32_IGEMM_5X16S4__FMA3_BROADCAST, k_gt_4) {
36137 TEST_REQUIRES_X86_FMA3;
36138 for (size_t k = 5; k < 8; k++) {
36139 GemmMicrokernelTester()
36140 .mr(5)
36141 .nr(16)
36142 .kr(1)
36143 .sr(4)
36144 .m(5)
36145 .n(16)
36146 .k(k)
36147 .Test(xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast);
36148 }
36149 }
36150
36151 TEST(F32_IGEMM_5X16S4__FMA3_BROADCAST, k_gt_4_subtile) {
36152 TEST_REQUIRES_X86_FMA3;
36153 for (size_t k = 5; k < 8; k++) {
36154 for (uint32_t m = 1; m <= 5; m++) {
36155 for (uint32_t n = 1; n <= 16; n++) {
36156 GemmMicrokernelTester()
36157 .mr(5)
36158 .nr(16)
36159 .kr(1)
36160 .sr(4)
36161 .m(m)
36162 .n(n)
36163 .k(k)
36164 .iterations(1)
36165 .Test(xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast);
36166 }
36167 }
36168 }
36169 }
36170
36171 TEST(F32_IGEMM_5X16S4__FMA3_BROADCAST, k_div_4) {
36172 TEST_REQUIRES_X86_FMA3;
36173 for (size_t k = 8; k <= 40; k += 4) {
36174 GemmMicrokernelTester()
36175 .mr(5)
36176 .nr(16)
36177 .kr(1)
36178 .sr(4)
36179 .m(5)
36180 .n(16)
36181 .k(k)
36182 .Test(xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast);
36183 }
36184 }
36185
36186 TEST(F32_IGEMM_5X16S4__FMA3_BROADCAST, k_div_4_subtile) {
36187 TEST_REQUIRES_X86_FMA3;
36188 for (size_t k = 8; k <= 40; k += 4) {
36189 for (uint32_t m = 1; m <= 5; m++) {
36190 for (uint32_t n = 1; n <= 16; n++) {
36191 GemmMicrokernelTester()
36192 .mr(5)
36193 .nr(16)
36194 .kr(1)
36195 .sr(4)
36196 .m(m)
36197 .n(n)
36198 .k(k)
36199 .iterations(1)
36200 .Test(xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast);
36201 }
36202 }
36203 }
36204 }
36205
36206 TEST(F32_IGEMM_5X16S4__FMA3_BROADCAST, n_gt_16) {
36207 TEST_REQUIRES_X86_FMA3;
36208 for (uint32_t n = 17; n < 32; n++) {
36209 for (size_t k = 1; k <= 20; k += 5) {
36210 GemmMicrokernelTester()
36211 .mr(5)
36212 .nr(16)
36213 .kr(1)
36214 .sr(4)
36215 .m(5)
36216 .n(16)
36217 .k(k)
36218 .Test(xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast);
36219 }
36220 }
36221 }
36222
36223 TEST(F32_IGEMM_5X16S4__FMA3_BROADCAST, n_gt_16_strided_cn) {
36224 TEST_REQUIRES_X86_FMA3;
36225 for (uint32_t n = 17; n < 32; n++) {
36226 for (size_t k = 1; k <= 20; k += 5) {
36227 GemmMicrokernelTester()
36228 .mr(5)
36229 .nr(16)
36230 .kr(1)
36231 .sr(4)
36232 .m(5)
36233 .n(16)
36234 .k(k)
36235 .cn_stride(19)
36236 .Test(xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast);
36237 }
36238 }
36239 }
36240
36241 TEST(F32_IGEMM_5X16S4__FMA3_BROADCAST, n_gt_16_subtile) {
36242 TEST_REQUIRES_X86_FMA3;
36243 for (uint32_t n = 17; n < 32; n++) {
36244 for (size_t k = 1; k <= 20; k += 5) {
36245 for (uint32_t m = 1; m <= 5; m++) {
36246 GemmMicrokernelTester()
36247 .mr(5)
36248 .nr(16)
36249 .kr(1)
36250 .sr(4)
36251 .m(m)
36252 .n(n)
36253 .k(k)
36254 .iterations(1)
36255 .Test(xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast);
36256 }
36257 }
36258 }
36259 }
36260
36261 TEST(F32_IGEMM_5X16S4__FMA3_BROADCAST, n_div_16) {
36262 TEST_REQUIRES_X86_FMA3;
36263 for (uint32_t n = 32; n <= 48; n += 16) {
36264 for (size_t k = 1; k <= 20; k += 5) {
36265 GemmMicrokernelTester()
36266 .mr(5)
36267 .nr(16)
36268 .kr(1)
36269 .sr(4)
36270 .m(5)
36271 .n(16)
36272 .k(k)
36273 .Test(xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast);
36274 }
36275 }
36276 }
36277
36278 TEST(F32_IGEMM_5X16S4__FMA3_BROADCAST, n_div_16_strided_cn) {
36279 TEST_REQUIRES_X86_FMA3;
36280 for (uint32_t n = 32; n <= 48; n += 16) {
36281 for (size_t k = 1; k <= 20; k += 5) {
36282 GemmMicrokernelTester()
36283 .mr(5)
36284 .nr(16)
36285 .kr(1)
36286 .sr(4)
36287 .m(5)
36288 .n(n)
36289 .k(k)
36290 .cn_stride(19)
36291 .Test(xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast);
36292 }
36293 }
36294 }
36295
36296 TEST(F32_IGEMM_5X16S4__FMA3_BROADCAST, n_div_16_subtile) {
36297 TEST_REQUIRES_X86_FMA3;
36298 for (uint32_t n = 32; n <= 48; n += 16) {
36299 for (size_t k = 1; k <= 20; k += 5) {
36300 for (uint32_t m = 1; m <= 5; m++) {
36301 GemmMicrokernelTester()
36302 .mr(5)
36303 .nr(16)
36304 .kr(1)
36305 .sr(4)
36306 .m(m)
36307 .n(n)
36308 .k(k)
36309 .iterations(1)
36310 .Test(xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast);
36311 }
36312 }
36313 }
36314 }
36315
36316 TEST(F32_IGEMM_5X16S4__FMA3_BROADCAST, small_kernel) {
36317 TEST_REQUIRES_X86_FMA3;
36318 for (size_t k = 1; k <= 20; k += 5) {
36319 GemmMicrokernelTester()
36320 .mr(5)
36321 .nr(16)
36322 .kr(1)
36323 .sr(4)
36324 .m(5)
36325 .n(16)
36326 .k(k)
36327 .ks(3)
36328 .Test(xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast);
36329 }
36330 }
36331
36332 TEST(F32_IGEMM_5X16S4__FMA3_BROADCAST, small_kernel_subtile) {
36333 TEST_REQUIRES_X86_FMA3;
36334 for (size_t k = 1; k <= 20; k += 5) {
36335 for (uint32_t m = 1; m <= 5; m++) {
36336 for (uint32_t n = 1; n <= 16; n++) {
36337 GemmMicrokernelTester()
36338 .mr(5)
36339 .nr(16)
36340 .kr(1)
36341 .sr(4)
36342 .m(m)
36343 .n(n)
36344 .k(k)
36345 .ks(3)
36346 .iterations(1)
36347 .Test(xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast);
36348 }
36349 }
36350 }
36351 }
36352
36353 TEST(F32_IGEMM_5X16S4__FMA3_BROADCAST, n_gt_16_small_kernel) {
36354 TEST_REQUIRES_X86_FMA3;
36355 for (uint32_t n = 17; n < 32; n++) {
36356 for (size_t k = 1; k <= 20; k += 5) {
36357 GemmMicrokernelTester()
36358 .mr(5)
36359 .nr(16)
36360 .kr(1)
36361 .sr(4)
36362 .m(5)
36363 .n(16)
36364 .k(k)
36365 .ks(3)
36366 .Test(xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast);
36367 }
36368 }
36369 }
36370
36371 TEST(F32_IGEMM_5X16S4__FMA3_BROADCAST, n_div_16_small_kernel) {
36372 TEST_REQUIRES_X86_FMA3;
36373 for (uint32_t n = 32; n <= 48; n += 16) {
36374 for (size_t k = 1; k <= 20; k += 5) {
36375 GemmMicrokernelTester()
36376 .mr(5)
36377 .nr(16)
36378 .kr(1)
36379 .sr(4)
36380 .m(5)
36381 .n(16)
36382 .k(k)
36383 .ks(3)
36384 .Test(xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast);
36385 }
36386 }
36387 }
36388
36389 TEST(F32_IGEMM_5X16S4__FMA3_BROADCAST, strided_cm_subtile) {
36390 TEST_REQUIRES_X86_FMA3;
36391 for (size_t k = 1; k <= 20; k += 5) {
36392 for (uint32_t m = 1; m <= 5; m++) {
36393 for (uint32_t n = 1; n <= 16; n++) {
36394 GemmMicrokernelTester()
36395 .mr(5)
36396 .nr(16)
36397 .kr(1)
36398 .sr(4)
36399 .m(m)
36400 .n(n)
36401 .k(k)
36402 .cm_stride(19)
36403 .iterations(1)
36404 .Test(xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast);
36405 }
36406 }
36407 }
36408 }
36409
36410 TEST(F32_IGEMM_5X16S4__FMA3_BROADCAST, a_offset) {
36411 TEST_REQUIRES_X86_FMA3;
36412 for (size_t k = 1; k <= 20; k += 5) {
36413 GemmMicrokernelTester()
36414 .mr(5)
36415 .nr(16)
36416 .kr(1)
36417 .sr(4)
36418 .m(5)
36419 .n(16)
36420 .k(k)
36421 .ks(3)
36422 .a_offset(103)
36423 .Test(xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast);
36424 }
36425 }
36426
36427 TEST(F32_IGEMM_5X16S4__FMA3_BROADCAST, zero) {
36428 TEST_REQUIRES_X86_FMA3;
36429 for (uint32_t mz = 0; mz < 5; mz++) {
36430 for (size_t k = 1; k <= 20; k += 5) {
36431 GemmMicrokernelTester()
36432 .mr(5)
36433 .nr(16)
36434 .kr(1)
36435 .sr(4)
36436 .m(5)
36437 .n(16)
36438 .k(k)
36439 .ks(3)
36440 .a_offset(103)
36441 .zero_index(mz)
36442 .Test(xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast);
36443 }
36444 }
36445 }
36446
36447 TEST(F32_IGEMM_5X16S4__FMA3_BROADCAST, qmin) {
36448 TEST_REQUIRES_X86_FMA3;
36449 GemmMicrokernelTester()
36450 .mr(5)
36451 .nr(16)
36452 .kr(1)
36453 .sr(4)
36454 .m(5)
36455 .n(16)
36456 .k(4)
36457 .qmin(128)
36458 .Test(xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast);
36459 }
36460
36461 TEST(F32_IGEMM_5X16S4__FMA3_BROADCAST, qmax) {
36462 TEST_REQUIRES_X86_FMA3;
36463 GemmMicrokernelTester()
36464 .mr(5)
36465 .nr(16)
36466 .kr(1)
36467 .sr(4)
36468 .m(5)
36469 .n(16)
36470 .k(4)
36471 .qmax(128)
36472 .Test(xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast);
36473 }
36474
36475 TEST(F32_IGEMM_5X16S4__FMA3_BROADCAST, strided_cm) {
36476 TEST_REQUIRES_X86_FMA3;
36477 GemmMicrokernelTester()
36478 .mr(5)
36479 .nr(16)
36480 .kr(1)
36481 .sr(4)
36482 .m(5)
36483 .n(16)
36484 .k(4)
36485 .cm_stride(19)
36486 .Test(xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast);
36487 }
36488#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
36489
36490
36491#if XNN_ARCH_X86 || XNN_ARCH_X86_64
Marat Dukhan0f349c42019-11-27 11:58:54 -080036492 TEST(F32_IGEMM_1X16__AVX512F_BROADCAST, k_eq_1) {
36493 TEST_REQUIRES_X86_AVX512F;
36494 GemmMicrokernelTester()
36495 .mr(1)
36496 .nr(16)
36497 .kr(1)
36498 .sr(1)
36499 .m(1)
36500 .n(16)
36501 .k(1)
36502 .Test(xnn_f32_igemm_ukernel_1x16__avx512f_broadcast);
36503 }
36504
36505 TEST(F32_IGEMM_1X16__AVX512F_BROADCAST, strided_cn) {
36506 TEST_REQUIRES_X86_AVX512F;
36507 GemmMicrokernelTester()
36508 .mr(1)
36509 .nr(16)
36510 .kr(1)
36511 .sr(1)
36512 .m(1)
36513 .n(16)
36514 .k(1)
36515 .cn_stride(19)
36516 .Test(xnn_f32_igemm_ukernel_1x16__avx512f_broadcast);
36517 }
36518
36519 TEST(F32_IGEMM_1X16__AVX512F_BROADCAST, k_eq_1_subtile) {
36520 TEST_REQUIRES_X86_AVX512F;
36521 for (uint32_t m = 1; m <= 1; m++) {
36522 for (uint32_t n = 1; n <= 16; n++) {
36523 GemmMicrokernelTester()
36524 .mr(1)
36525 .nr(16)
36526 .kr(1)
36527 .sr(1)
36528 .m(m)
36529 .n(n)
36530 .k(1)
36531 .iterations(1)
36532 .Test(xnn_f32_igemm_ukernel_1x16__avx512f_broadcast);
36533 }
36534 }
36535 }
36536
36537 TEST(F32_IGEMM_1X16__AVX512F_BROADCAST, k_eq_1_subtile_m) {
36538 TEST_REQUIRES_X86_AVX512F;
36539 for (uint32_t m = 1; m <= 1; m++) {
36540 GemmMicrokernelTester()
36541 .mr(1)
36542 .nr(16)
36543 .kr(1)
36544 .sr(1)
36545 .m(m)
36546 .n(16)
36547 .k(1)
36548 .iterations(1)
36549 .Test(xnn_f32_igemm_ukernel_1x16__avx512f_broadcast);
36550 }
36551 }
36552
36553 TEST(F32_IGEMM_1X16__AVX512F_BROADCAST, k_eq_1_subtile_n) {
36554 TEST_REQUIRES_X86_AVX512F;
36555 for (uint32_t n = 1; n <= 16; n++) {
36556 GemmMicrokernelTester()
36557 .mr(1)
36558 .nr(16)
36559 .kr(1)
36560 .sr(1)
36561 .m(1)
36562 .n(n)
36563 .k(1)
36564 .iterations(1)
36565 .Test(xnn_f32_igemm_ukernel_1x16__avx512f_broadcast);
36566 }
36567 }
36568
36569 TEST(F32_IGEMM_1X16__AVX512F_BROADCAST, k_gt_1) {
36570 TEST_REQUIRES_X86_AVX512F;
36571 for (size_t k = 2; k < 10; k++) {
36572 GemmMicrokernelTester()
36573 .mr(1)
36574 .nr(16)
36575 .kr(1)
36576 .sr(1)
36577 .m(1)
36578 .n(16)
36579 .k(k)
36580 .Test(xnn_f32_igemm_ukernel_1x16__avx512f_broadcast);
36581 }
36582 }
36583
36584 TEST(F32_IGEMM_1X16__AVX512F_BROADCAST, k_gt_1_subtile) {
36585 TEST_REQUIRES_X86_AVX512F;
36586 for (size_t k = 2; k < 10; k++) {
36587 for (uint32_t m = 1; m <= 1; m++) {
36588 for (uint32_t n = 1; n <= 16; n++) {
36589 GemmMicrokernelTester()
36590 .mr(1)
36591 .nr(16)
36592 .kr(1)
36593 .sr(1)
36594 .m(m)
36595 .n(n)
36596 .k(k)
36597 .iterations(1)
36598 .Test(xnn_f32_igemm_ukernel_1x16__avx512f_broadcast);
36599 }
36600 }
36601 }
36602 }
36603
36604 TEST(F32_IGEMM_1X16__AVX512F_BROADCAST, n_gt_16) {
36605 TEST_REQUIRES_X86_AVX512F;
36606 for (uint32_t n = 17; n < 32; n++) {
36607 for (size_t k = 1; k <= 5; k += 2) {
36608 GemmMicrokernelTester()
36609 .mr(1)
36610 .nr(16)
36611 .kr(1)
36612 .sr(1)
36613 .m(1)
36614 .n(16)
36615 .k(k)
36616 .Test(xnn_f32_igemm_ukernel_1x16__avx512f_broadcast);
36617 }
36618 }
36619 }
36620
36621 TEST(F32_IGEMM_1X16__AVX512F_BROADCAST, n_gt_16_strided_cn) {
36622 TEST_REQUIRES_X86_AVX512F;
36623 for (uint32_t n = 17; n < 32; n++) {
36624 for (size_t k = 1; k <= 5; k += 2) {
36625 GemmMicrokernelTester()
36626 .mr(1)
36627 .nr(16)
36628 .kr(1)
36629 .sr(1)
36630 .m(1)
36631 .n(16)
36632 .k(k)
36633 .cn_stride(19)
36634 .Test(xnn_f32_igemm_ukernel_1x16__avx512f_broadcast);
36635 }
36636 }
36637 }
36638
36639 TEST(F32_IGEMM_1X16__AVX512F_BROADCAST, n_gt_16_subtile) {
36640 TEST_REQUIRES_X86_AVX512F;
36641 for (uint32_t n = 17; n < 32; n++) {
36642 for (size_t k = 1; k <= 5; k += 2) {
36643 for (uint32_t m = 1; m <= 1; m++) {
36644 GemmMicrokernelTester()
36645 .mr(1)
36646 .nr(16)
36647 .kr(1)
36648 .sr(1)
36649 .m(m)
36650 .n(n)
36651 .k(k)
36652 .iterations(1)
36653 .Test(xnn_f32_igemm_ukernel_1x16__avx512f_broadcast);
36654 }
36655 }
36656 }
36657 }
36658
36659 TEST(F32_IGEMM_1X16__AVX512F_BROADCAST, n_div_16) {
36660 TEST_REQUIRES_X86_AVX512F;
36661 for (uint32_t n = 32; n <= 48; n += 16) {
36662 for (size_t k = 1; k <= 5; k += 2) {
36663 GemmMicrokernelTester()
36664 .mr(1)
36665 .nr(16)
36666 .kr(1)
36667 .sr(1)
36668 .m(1)
36669 .n(16)
36670 .k(k)
36671 .Test(xnn_f32_igemm_ukernel_1x16__avx512f_broadcast);
36672 }
36673 }
36674 }
36675
36676 TEST(F32_IGEMM_1X16__AVX512F_BROADCAST, n_div_16_strided_cn) {
36677 TEST_REQUIRES_X86_AVX512F;
36678 for (uint32_t n = 32; n <= 48; n += 16) {
36679 for (size_t k = 1; k <= 5; k += 2) {
36680 GemmMicrokernelTester()
36681 .mr(1)
36682 .nr(16)
36683 .kr(1)
36684 .sr(1)
36685 .m(1)
36686 .n(n)
36687 .k(k)
36688 .cn_stride(19)
36689 .Test(xnn_f32_igemm_ukernel_1x16__avx512f_broadcast);
36690 }
36691 }
36692 }
36693
36694 TEST(F32_IGEMM_1X16__AVX512F_BROADCAST, n_div_16_subtile) {
36695 TEST_REQUIRES_X86_AVX512F;
36696 for (uint32_t n = 32; n <= 48; n += 16) {
36697 for (size_t k = 1; k <= 5; k += 2) {
36698 for (uint32_t m = 1; m <= 1; m++) {
36699 GemmMicrokernelTester()
36700 .mr(1)
36701 .nr(16)
36702 .kr(1)
36703 .sr(1)
36704 .m(m)
36705 .n(n)
36706 .k(k)
36707 .iterations(1)
36708 .Test(xnn_f32_igemm_ukernel_1x16__avx512f_broadcast);
36709 }
36710 }
36711 }
36712 }
36713
36714 TEST(F32_IGEMM_1X16__AVX512F_BROADCAST, small_kernel) {
36715 TEST_REQUIRES_X86_AVX512F;
36716 for (size_t k = 1; k <= 5; k += 2) {
36717 GemmMicrokernelTester()
36718 .mr(1)
36719 .nr(16)
36720 .kr(1)
36721 .sr(1)
36722 .m(1)
36723 .n(16)
36724 .k(k)
36725 .ks(3)
36726 .Test(xnn_f32_igemm_ukernel_1x16__avx512f_broadcast);
36727 }
36728 }
36729
36730 TEST(F32_IGEMM_1X16__AVX512F_BROADCAST, small_kernel_subtile) {
36731 TEST_REQUIRES_X86_AVX512F;
36732 for (size_t k = 1; k <= 5; k += 2) {
36733 for (uint32_t m = 1; m <= 1; m++) {
36734 for (uint32_t n = 1; n <= 16; n++) {
36735 GemmMicrokernelTester()
36736 .mr(1)
36737 .nr(16)
36738 .kr(1)
36739 .sr(1)
36740 .m(m)
36741 .n(n)
36742 .k(k)
36743 .ks(3)
36744 .iterations(1)
36745 .Test(xnn_f32_igemm_ukernel_1x16__avx512f_broadcast);
36746 }
36747 }
36748 }
36749 }
36750
36751 TEST(F32_IGEMM_1X16__AVX512F_BROADCAST, n_gt_16_small_kernel) {
36752 TEST_REQUIRES_X86_AVX512F;
36753 for (uint32_t n = 17; n < 32; n++) {
36754 for (size_t k = 1; k <= 5; k += 2) {
36755 GemmMicrokernelTester()
36756 .mr(1)
36757 .nr(16)
36758 .kr(1)
36759 .sr(1)
36760 .m(1)
36761 .n(16)
36762 .k(k)
36763 .ks(3)
36764 .Test(xnn_f32_igemm_ukernel_1x16__avx512f_broadcast);
36765 }
36766 }
36767 }
36768
36769 TEST(F32_IGEMM_1X16__AVX512F_BROADCAST, n_div_16_small_kernel) {
36770 TEST_REQUIRES_X86_AVX512F;
36771 for (uint32_t n = 32; n <= 48; n += 16) {
36772 for (size_t k = 1; k <= 5; k += 2) {
36773 GemmMicrokernelTester()
36774 .mr(1)
36775 .nr(16)
36776 .kr(1)
36777 .sr(1)
36778 .m(1)
36779 .n(16)
36780 .k(k)
36781 .ks(3)
36782 .Test(xnn_f32_igemm_ukernel_1x16__avx512f_broadcast);
36783 }
36784 }
36785 }
36786
36787 TEST(F32_IGEMM_1X16__AVX512F_BROADCAST, strided_cm_subtile) {
36788 TEST_REQUIRES_X86_AVX512F;
36789 for (size_t k = 1; k <= 5; k += 2) {
36790 for (uint32_t m = 1; m <= 1; m++) {
36791 for (uint32_t n = 1; n <= 16; n++) {
36792 GemmMicrokernelTester()
36793 .mr(1)
36794 .nr(16)
36795 .kr(1)
36796 .sr(1)
36797 .m(m)
36798 .n(n)
36799 .k(k)
36800 .cm_stride(19)
36801 .iterations(1)
36802 .Test(xnn_f32_igemm_ukernel_1x16__avx512f_broadcast);
36803 }
36804 }
36805 }
36806 }
36807
36808 TEST(F32_IGEMM_1X16__AVX512F_BROADCAST, a_offset) {
36809 TEST_REQUIRES_X86_AVX512F;
36810 for (size_t k = 1; k <= 5; k += 2) {
36811 GemmMicrokernelTester()
36812 .mr(1)
36813 .nr(16)
36814 .kr(1)
36815 .sr(1)
36816 .m(1)
36817 .n(16)
36818 .k(k)
36819 .ks(3)
36820 .a_offset(7)
36821 .Test(xnn_f32_igemm_ukernel_1x16__avx512f_broadcast);
36822 }
36823 }
36824
36825 TEST(F32_IGEMM_1X16__AVX512F_BROADCAST, zero) {
36826 TEST_REQUIRES_X86_AVX512F;
36827 for (uint32_t mz = 0; mz < 1; mz++) {
36828 for (size_t k = 1; k <= 5; k += 2) {
36829 GemmMicrokernelTester()
36830 .mr(1)
36831 .nr(16)
36832 .kr(1)
36833 .sr(1)
36834 .m(1)
36835 .n(16)
36836 .k(k)
36837 .ks(3)
36838 .a_offset(7)
36839 .zero_index(mz)
36840 .Test(xnn_f32_igemm_ukernel_1x16__avx512f_broadcast);
36841 }
36842 }
36843 }
36844
36845 TEST(F32_IGEMM_1X16__AVX512F_BROADCAST, qmin) {
36846 TEST_REQUIRES_X86_AVX512F;
36847 GemmMicrokernelTester()
36848 .mr(1)
36849 .nr(16)
36850 .kr(1)
36851 .sr(1)
36852 .m(1)
36853 .n(16)
36854 .k(1)
36855 .qmin(128)
36856 .Test(xnn_f32_igemm_ukernel_1x16__avx512f_broadcast);
36857 }
36858
36859 TEST(F32_IGEMM_1X16__AVX512F_BROADCAST, qmax) {
36860 TEST_REQUIRES_X86_AVX512F;
36861 GemmMicrokernelTester()
36862 .mr(1)
36863 .nr(16)
36864 .kr(1)
36865 .sr(1)
36866 .m(1)
36867 .n(16)
36868 .k(1)
36869 .qmax(128)
36870 .Test(xnn_f32_igemm_ukernel_1x16__avx512f_broadcast);
36871 }
36872
36873 TEST(F32_IGEMM_1X16__AVX512F_BROADCAST, strided_cm) {
36874 TEST_REQUIRES_X86_AVX512F;
36875 GemmMicrokernelTester()
36876 .mr(1)
36877 .nr(16)
36878 .kr(1)
36879 .sr(1)
36880 .m(1)
36881 .n(16)
36882 .k(1)
36883 .cm_stride(19)
36884 .Test(xnn_f32_igemm_ukernel_1x16__avx512f_broadcast);
36885 }
36886#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
36887
36888
36889#if XNN_ARCH_X86 || XNN_ARCH_X86_64
36890 TEST(F32_IGEMM_4X16__AVX512F_BROADCAST, k_eq_1) {
36891 TEST_REQUIRES_X86_AVX512F;
36892 GemmMicrokernelTester()
36893 .mr(4)
36894 .nr(16)
36895 .kr(1)
36896 .sr(1)
36897 .m(4)
36898 .n(16)
36899 .k(1)
36900 .Test(xnn_f32_igemm_ukernel_4x16__avx512f_broadcast);
36901 }
36902
36903 TEST(F32_IGEMM_4X16__AVX512F_BROADCAST, strided_cn) {
36904 TEST_REQUIRES_X86_AVX512F;
36905 GemmMicrokernelTester()
36906 .mr(4)
36907 .nr(16)
36908 .kr(1)
36909 .sr(1)
36910 .m(4)
36911 .n(16)
36912 .k(1)
36913 .cn_stride(19)
36914 .Test(xnn_f32_igemm_ukernel_4x16__avx512f_broadcast);
36915 }
36916
36917 TEST(F32_IGEMM_4X16__AVX512F_BROADCAST, k_eq_1_subtile) {
36918 TEST_REQUIRES_X86_AVX512F;
36919 for (uint32_t m = 1; m <= 4; m++) {
36920 for (uint32_t n = 1; n <= 16; n++) {
36921 GemmMicrokernelTester()
36922 .mr(4)
36923 .nr(16)
36924 .kr(1)
36925 .sr(1)
36926 .m(m)
36927 .n(n)
36928 .k(1)
36929 .iterations(1)
36930 .Test(xnn_f32_igemm_ukernel_4x16__avx512f_broadcast);
36931 }
36932 }
36933 }
36934
36935 TEST(F32_IGEMM_4X16__AVX512F_BROADCAST, k_eq_1_subtile_m) {
36936 TEST_REQUIRES_X86_AVX512F;
36937 for (uint32_t m = 1; m <= 4; m++) {
36938 GemmMicrokernelTester()
36939 .mr(4)
36940 .nr(16)
36941 .kr(1)
36942 .sr(1)
36943 .m(m)
36944 .n(16)
36945 .k(1)
36946 .iterations(1)
36947 .Test(xnn_f32_igemm_ukernel_4x16__avx512f_broadcast);
36948 }
36949 }
36950
36951 TEST(F32_IGEMM_4X16__AVX512F_BROADCAST, k_eq_1_subtile_n) {
36952 TEST_REQUIRES_X86_AVX512F;
36953 for (uint32_t n = 1; n <= 16; n++) {
36954 GemmMicrokernelTester()
36955 .mr(4)
36956 .nr(16)
36957 .kr(1)
36958 .sr(1)
36959 .m(4)
36960 .n(n)
36961 .k(1)
36962 .iterations(1)
36963 .Test(xnn_f32_igemm_ukernel_4x16__avx512f_broadcast);
36964 }
36965 }
36966
36967 TEST(F32_IGEMM_4X16__AVX512F_BROADCAST, k_gt_1) {
36968 TEST_REQUIRES_X86_AVX512F;
36969 for (size_t k = 2; k < 10; k++) {
36970 GemmMicrokernelTester()
36971 .mr(4)
36972 .nr(16)
36973 .kr(1)
36974 .sr(1)
36975 .m(4)
36976 .n(16)
36977 .k(k)
36978 .Test(xnn_f32_igemm_ukernel_4x16__avx512f_broadcast);
36979 }
36980 }
36981
36982 TEST(F32_IGEMM_4X16__AVX512F_BROADCAST, k_gt_1_subtile) {
36983 TEST_REQUIRES_X86_AVX512F;
36984 for (size_t k = 2; k < 10; k++) {
36985 for (uint32_t m = 1; m <= 4; m++) {
36986 for (uint32_t n = 1; n <= 16; n++) {
36987 GemmMicrokernelTester()
36988 .mr(4)
36989 .nr(16)
36990 .kr(1)
36991 .sr(1)
36992 .m(m)
36993 .n(n)
36994 .k(k)
36995 .iterations(1)
36996 .Test(xnn_f32_igemm_ukernel_4x16__avx512f_broadcast);
36997 }
36998 }
36999 }
37000 }
37001
37002 TEST(F32_IGEMM_4X16__AVX512F_BROADCAST, n_gt_16) {
37003 TEST_REQUIRES_X86_AVX512F;
37004 for (uint32_t n = 17; n < 32; n++) {
37005 for (size_t k = 1; k <= 5; k += 2) {
37006 GemmMicrokernelTester()
37007 .mr(4)
37008 .nr(16)
37009 .kr(1)
37010 .sr(1)
37011 .m(4)
37012 .n(16)
37013 .k(k)
37014 .Test(xnn_f32_igemm_ukernel_4x16__avx512f_broadcast);
37015 }
37016 }
37017 }
37018
37019 TEST(F32_IGEMM_4X16__AVX512F_BROADCAST, n_gt_16_strided_cn) {
37020 TEST_REQUIRES_X86_AVX512F;
37021 for (uint32_t n = 17; n < 32; n++) {
37022 for (size_t k = 1; k <= 5; k += 2) {
37023 GemmMicrokernelTester()
37024 .mr(4)
37025 .nr(16)
37026 .kr(1)
37027 .sr(1)
37028 .m(4)
37029 .n(16)
37030 .k(k)
37031 .cn_stride(19)
37032 .Test(xnn_f32_igemm_ukernel_4x16__avx512f_broadcast);
37033 }
37034 }
37035 }
37036
37037 TEST(F32_IGEMM_4X16__AVX512F_BROADCAST, n_gt_16_subtile) {
37038 TEST_REQUIRES_X86_AVX512F;
37039 for (uint32_t n = 17; n < 32; n++) {
37040 for (size_t k = 1; k <= 5; k += 2) {
37041 for (uint32_t m = 1; m <= 4; m++) {
37042 GemmMicrokernelTester()
37043 .mr(4)
37044 .nr(16)
37045 .kr(1)
37046 .sr(1)
37047 .m(m)
37048 .n(n)
37049 .k(k)
37050 .iterations(1)
37051 .Test(xnn_f32_igemm_ukernel_4x16__avx512f_broadcast);
37052 }
37053 }
37054 }
37055 }
37056
37057 TEST(F32_IGEMM_4X16__AVX512F_BROADCAST, n_div_16) {
37058 TEST_REQUIRES_X86_AVX512F;
37059 for (uint32_t n = 32; n <= 48; n += 16) {
37060 for (size_t k = 1; k <= 5; k += 2) {
37061 GemmMicrokernelTester()
37062 .mr(4)
37063 .nr(16)
37064 .kr(1)
37065 .sr(1)
37066 .m(4)
37067 .n(16)
37068 .k(k)
37069 .Test(xnn_f32_igemm_ukernel_4x16__avx512f_broadcast);
37070 }
37071 }
37072 }
37073
37074 TEST(F32_IGEMM_4X16__AVX512F_BROADCAST, n_div_16_strided_cn) {
37075 TEST_REQUIRES_X86_AVX512F;
37076 for (uint32_t n = 32; n <= 48; n += 16) {
37077 for (size_t k = 1; k <= 5; k += 2) {
37078 GemmMicrokernelTester()
37079 .mr(4)
37080 .nr(16)
37081 .kr(1)
37082 .sr(1)
37083 .m(4)
37084 .n(n)
37085 .k(k)
37086 .cn_stride(19)
37087 .Test(xnn_f32_igemm_ukernel_4x16__avx512f_broadcast);
37088 }
37089 }
37090 }
37091
37092 TEST(F32_IGEMM_4X16__AVX512F_BROADCAST, n_div_16_subtile) {
37093 TEST_REQUIRES_X86_AVX512F;
37094 for (uint32_t n = 32; n <= 48; n += 16) {
37095 for (size_t k = 1; k <= 5; k += 2) {
37096 for (uint32_t m = 1; m <= 4; m++) {
37097 GemmMicrokernelTester()
37098 .mr(4)
37099 .nr(16)
37100 .kr(1)
37101 .sr(1)
37102 .m(m)
37103 .n(n)
37104 .k(k)
37105 .iterations(1)
37106 .Test(xnn_f32_igemm_ukernel_4x16__avx512f_broadcast);
37107 }
37108 }
37109 }
37110 }
37111
37112 TEST(F32_IGEMM_4X16__AVX512F_BROADCAST, small_kernel) {
37113 TEST_REQUIRES_X86_AVX512F;
37114 for (size_t k = 1; k <= 5; k += 2) {
37115 GemmMicrokernelTester()
37116 .mr(4)
37117 .nr(16)
37118 .kr(1)
37119 .sr(1)
37120 .m(4)
37121 .n(16)
37122 .k(k)
37123 .ks(3)
37124 .Test(xnn_f32_igemm_ukernel_4x16__avx512f_broadcast);
37125 }
37126 }
37127
37128 TEST(F32_IGEMM_4X16__AVX512F_BROADCAST, small_kernel_subtile) {
37129 TEST_REQUIRES_X86_AVX512F;
37130 for (size_t k = 1; k <= 5; k += 2) {
37131 for (uint32_t m = 1; m <= 4; m++) {
37132 for (uint32_t n = 1; n <= 16; n++) {
37133 GemmMicrokernelTester()
37134 .mr(4)
37135 .nr(16)
37136 .kr(1)
37137 .sr(1)
37138 .m(m)
37139 .n(n)
37140 .k(k)
37141 .ks(3)
37142 .iterations(1)
37143 .Test(xnn_f32_igemm_ukernel_4x16__avx512f_broadcast);
37144 }
37145 }
37146 }
37147 }
37148
37149 TEST(F32_IGEMM_4X16__AVX512F_BROADCAST, n_gt_16_small_kernel) {
37150 TEST_REQUIRES_X86_AVX512F;
37151 for (uint32_t n = 17; n < 32; n++) {
37152 for (size_t k = 1; k <= 5; k += 2) {
37153 GemmMicrokernelTester()
37154 .mr(4)
37155 .nr(16)
37156 .kr(1)
37157 .sr(1)
37158 .m(4)
37159 .n(16)
37160 .k(k)
37161 .ks(3)
37162 .Test(xnn_f32_igemm_ukernel_4x16__avx512f_broadcast);
37163 }
37164 }
37165 }
37166
37167 TEST(F32_IGEMM_4X16__AVX512F_BROADCAST, n_div_16_small_kernel) {
37168 TEST_REQUIRES_X86_AVX512F;
37169 for (uint32_t n = 32; n <= 48; n += 16) {
37170 for (size_t k = 1; k <= 5; k += 2) {
37171 GemmMicrokernelTester()
37172 .mr(4)
37173 .nr(16)
37174 .kr(1)
37175 .sr(1)
37176 .m(4)
37177 .n(16)
37178 .k(k)
37179 .ks(3)
37180 .Test(xnn_f32_igemm_ukernel_4x16__avx512f_broadcast);
37181 }
37182 }
37183 }
37184
37185 TEST(F32_IGEMM_4X16__AVX512F_BROADCAST, strided_cm_subtile) {
37186 TEST_REQUIRES_X86_AVX512F;
37187 for (size_t k = 1; k <= 5; k += 2) {
37188 for (uint32_t m = 1; m <= 4; m++) {
37189 for (uint32_t n = 1; n <= 16; n++) {
37190 GemmMicrokernelTester()
37191 .mr(4)
37192 .nr(16)
37193 .kr(1)
37194 .sr(1)
37195 .m(m)
37196 .n(n)
37197 .k(k)
37198 .cm_stride(19)
37199 .iterations(1)
37200 .Test(xnn_f32_igemm_ukernel_4x16__avx512f_broadcast);
37201 }
37202 }
37203 }
37204 }
37205
37206 TEST(F32_IGEMM_4X16__AVX512F_BROADCAST, a_offset) {
37207 TEST_REQUIRES_X86_AVX512F;
37208 for (size_t k = 1; k <= 5; k += 2) {
37209 GemmMicrokernelTester()
37210 .mr(4)
37211 .nr(16)
37212 .kr(1)
37213 .sr(1)
37214 .m(4)
37215 .n(16)
37216 .k(k)
37217 .ks(3)
37218 .a_offset(23)
37219 .Test(xnn_f32_igemm_ukernel_4x16__avx512f_broadcast);
37220 }
37221 }
37222
37223 TEST(F32_IGEMM_4X16__AVX512F_BROADCAST, zero) {
37224 TEST_REQUIRES_X86_AVX512F;
37225 for (uint32_t mz = 0; mz < 4; mz++) {
37226 for (size_t k = 1; k <= 5; k += 2) {
37227 GemmMicrokernelTester()
37228 .mr(4)
37229 .nr(16)
37230 .kr(1)
37231 .sr(1)
37232 .m(4)
37233 .n(16)
37234 .k(k)
37235 .ks(3)
37236 .a_offset(23)
37237 .zero_index(mz)
37238 .Test(xnn_f32_igemm_ukernel_4x16__avx512f_broadcast);
37239 }
37240 }
37241 }
37242
37243 TEST(F32_IGEMM_4X16__AVX512F_BROADCAST, qmin) {
37244 TEST_REQUIRES_X86_AVX512F;
37245 GemmMicrokernelTester()
37246 .mr(4)
37247 .nr(16)
37248 .kr(1)
37249 .sr(1)
37250 .m(4)
37251 .n(16)
37252 .k(1)
37253 .qmin(128)
37254 .Test(xnn_f32_igemm_ukernel_4x16__avx512f_broadcast);
37255 }
37256
37257 TEST(F32_IGEMM_4X16__AVX512F_BROADCAST, qmax) {
37258 TEST_REQUIRES_X86_AVX512F;
37259 GemmMicrokernelTester()
37260 .mr(4)
37261 .nr(16)
37262 .kr(1)
37263 .sr(1)
37264 .m(4)
37265 .n(16)
37266 .k(1)
37267 .qmax(128)
37268 .Test(xnn_f32_igemm_ukernel_4x16__avx512f_broadcast);
37269 }
37270
37271 TEST(F32_IGEMM_4X16__AVX512F_BROADCAST, strided_cm) {
37272 TEST_REQUIRES_X86_AVX512F;
37273 GemmMicrokernelTester()
37274 .mr(4)
37275 .nr(16)
37276 .kr(1)
37277 .sr(1)
37278 .m(4)
37279 .n(16)
37280 .k(1)
37281 .cm_stride(19)
37282 .Test(xnn_f32_igemm_ukernel_4x16__avx512f_broadcast);
37283 }
37284#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
37285
37286
37287#if XNN_ARCH_X86 || XNN_ARCH_X86_64
37288 TEST(F32_IGEMM_5X16__AVX512F_BROADCAST, k_eq_1) {
37289 TEST_REQUIRES_X86_AVX512F;
37290 GemmMicrokernelTester()
37291 .mr(5)
37292 .nr(16)
37293 .kr(1)
37294 .sr(1)
37295 .m(5)
37296 .n(16)
37297 .k(1)
37298 .Test(xnn_f32_igemm_ukernel_5x16__avx512f_broadcast);
37299 }
37300
37301 TEST(F32_IGEMM_5X16__AVX512F_BROADCAST, strided_cn) {
37302 TEST_REQUIRES_X86_AVX512F;
37303 GemmMicrokernelTester()
37304 .mr(5)
37305 .nr(16)
37306 .kr(1)
37307 .sr(1)
37308 .m(5)
37309 .n(16)
37310 .k(1)
37311 .cn_stride(19)
37312 .Test(xnn_f32_igemm_ukernel_5x16__avx512f_broadcast);
37313 }
37314
37315 TEST(F32_IGEMM_5X16__AVX512F_BROADCAST, k_eq_1_subtile) {
37316 TEST_REQUIRES_X86_AVX512F;
37317 for (uint32_t m = 1; m <= 5; m++) {
37318 for (uint32_t n = 1; n <= 16; n++) {
37319 GemmMicrokernelTester()
37320 .mr(5)
37321 .nr(16)
37322 .kr(1)
37323 .sr(1)
37324 .m(m)
37325 .n(n)
37326 .k(1)
37327 .iterations(1)
37328 .Test(xnn_f32_igemm_ukernel_5x16__avx512f_broadcast);
37329 }
37330 }
37331 }
37332
37333 TEST(F32_IGEMM_5X16__AVX512F_BROADCAST, k_eq_1_subtile_m) {
37334 TEST_REQUIRES_X86_AVX512F;
37335 for (uint32_t m = 1; m <= 5; m++) {
37336 GemmMicrokernelTester()
37337 .mr(5)
37338 .nr(16)
37339 .kr(1)
37340 .sr(1)
37341 .m(m)
37342 .n(16)
37343 .k(1)
37344 .iterations(1)
37345 .Test(xnn_f32_igemm_ukernel_5x16__avx512f_broadcast);
37346 }
37347 }
37348
37349 TEST(F32_IGEMM_5X16__AVX512F_BROADCAST, k_eq_1_subtile_n) {
37350 TEST_REQUIRES_X86_AVX512F;
37351 for (uint32_t n = 1; n <= 16; n++) {
37352 GemmMicrokernelTester()
37353 .mr(5)
37354 .nr(16)
37355 .kr(1)
37356 .sr(1)
37357 .m(5)
37358 .n(n)
37359 .k(1)
37360 .iterations(1)
37361 .Test(xnn_f32_igemm_ukernel_5x16__avx512f_broadcast);
37362 }
37363 }
37364
37365 TEST(F32_IGEMM_5X16__AVX512F_BROADCAST, k_gt_1) {
37366 TEST_REQUIRES_X86_AVX512F;
37367 for (size_t k = 2; k < 10; k++) {
37368 GemmMicrokernelTester()
37369 .mr(5)
37370 .nr(16)
37371 .kr(1)
37372 .sr(1)
37373 .m(5)
37374 .n(16)
37375 .k(k)
37376 .Test(xnn_f32_igemm_ukernel_5x16__avx512f_broadcast);
37377 }
37378 }
37379
37380 TEST(F32_IGEMM_5X16__AVX512F_BROADCAST, k_gt_1_subtile) {
37381 TEST_REQUIRES_X86_AVX512F;
37382 for (size_t k = 2; k < 10; k++) {
37383 for (uint32_t m = 1; m <= 5; m++) {
37384 for (uint32_t n = 1; n <= 16; n++) {
37385 GemmMicrokernelTester()
37386 .mr(5)
37387 .nr(16)
37388 .kr(1)
37389 .sr(1)
37390 .m(m)
37391 .n(n)
37392 .k(k)
37393 .iterations(1)
37394 .Test(xnn_f32_igemm_ukernel_5x16__avx512f_broadcast);
37395 }
37396 }
37397 }
37398 }
37399
37400 TEST(F32_IGEMM_5X16__AVX512F_BROADCAST, n_gt_16) {
37401 TEST_REQUIRES_X86_AVX512F;
37402 for (uint32_t n = 17; n < 32; n++) {
37403 for (size_t k = 1; k <= 5; k += 2) {
37404 GemmMicrokernelTester()
37405 .mr(5)
37406 .nr(16)
37407 .kr(1)
37408 .sr(1)
37409 .m(5)
37410 .n(16)
37411 .k(k)
37412 .Test(xnn_f32_igemm_ukernel_5x16__avx512f_broadcast);
37413 }
37414 }
37415 }
37416
37417 TEST(F32_IGEMM_5X16__AVX512F_BROADCAST, n_gt_16_strided_cn) {
37418 TEST_REQUIRES_X86_AVX512F;
37419 for (uint32_t n = 17; n < 32; n++) {
37420 for (size_t k = 1; k <= 5; k += 2) {
37421 GemmMicrokernelTester()
37422 .mr(5)
37423 .nr(16)
37424 .kr(1)
37425 .sr(1)
37426 .m(5)
37427 .n(16)
37428 .k(k)
37429 .cn_stride(19)
37430 .Test(xnn_f32_igemm_ukernel_5x16__avx512f_broadcast);
37431 }
37432 }
37433 }
37434
37435 TEST(F32_IGEMM_5X16__AVX512F_BROADCAST, n_gt_16_subtile) {
37436 TEST_REQUIRES_X86_AVX512F;
37437 for (uint32_t n = 17; n < 32; n++) {
37438 for (size_t k = 1; k <= 5; k += 2) {
37439 for (uint32_t m = 1; m <= 5; m++) {
37440 GemmMicrokernelTester()
37441 .mr(5)
37442 .nr(16)
37443 .kr(1)
37444 .sr(1)
37445 .m(m)
37446 .n(n)
37447 .k(k)
37448 .iterations(1)
37449 .Test(xnn_f32_igemm_ukernel_5x16__avx512f_broadcast);
37450 }
37451 }
37452 }
37453 }
37454
37455 TEST(F32_IGEMM_5X16__AVX512F_BROADCAST, n_div_16) {
37456 TEST_REQUIRES_X86_AVX512F;
37457 for (uint32_t n = 32; n <= 48; n += 16) {
37458 for (size_t k = 1; k <= 5; k += 2) {
37459 GemmMicrokernelTester()
37460 .mr(5)
37461 .nr(16)
37462 .kr(1)
37463 .sr(1)
37464 .m(5)
37465 .n(16)
37466 .k(k)
37467 .Test(xnn_f32_igemm_ukernel_5x16__avx512f_broadcast);
37468 }
37469 }
37470 }
37471
37472 TEST(F32_IGEMM_5X16__AVX512F_BROADCAST, n_div_16_strided_cn) {
37473 TEST_REQUIRES_X86_AVX512F;
37474 for (uint32_t n = 32; n <= 48; n += 16) {
37475 for (size_t k = 1; k <= 5; k += 2) {
37476 GemmMicrokernelTester()
37477 .mr(5)
37478 .nr(16)
37479 .kr(1)
37480 .sr(1)
37481 .m(5)
37482 .n(n)
37483 .k(k)
37484 .cn_stride(19)
37485 .Test(xnn_f32_igemm_ukernel_5x16__avx512f_broadcast);
37486 }
37487 }
37488 }
37489
37490 TEST(F32_IGEMM_5X16__AVX512F_BROADCAST, n_div_16_subtile) {
37491 TEST_REQUIRES_X86_AVX512F;
37492 for (uint32_t n = 32; n <= 48; n += 16) {
37493 for (size_t k = 1; k <= 5; k += 2) {
37494 for (uint32_t m = 1; m <= 5; m++) {
37495 GemmMicrokernelTester()
37496 .mr(5)
37497 .nr(16)
37498 .kr(1)
37499 .sr(1)
37500 .m(m)
37501 .n(n)
37502 .k(k)
37503 .iterations(1)
37504 .Test(xnn_f32_igemm_ukernel_5x16__avx512f_broadcast);
37505 }
37506 }
37507 }
37508 }
37509
37510 TEST(F32_IGEMM_5X16__AVX512F_BROADCAST, small_kernel) {
37511 TEST_REQUIRES_X86_AVX512F;
37512 for (size_t k = 1; k <= 5; k += 2) {
37513 GemmMicrokernelTester()
37514 .mr(5)
37515 .nr(16)
37516 .kr(1)
37517 .sr(1)
37518 .m(5)
37519 .n(16)
37520 .k(k)
37521 .ks(3)
37522 .Test(xnn_f32_igemm_ukernel_5x16__avx512f_broadcast);
37523 }
37524 }
37525
37526 TEST(F32_IGEMM_5X16__AVX512F_BROADCAST, small_kernel_subtile) {
37527 TEST_REQUIRES_X86_AVX512F;
37528 for (size_t k = 1; k <= 5; k += 2) {
37529 for (uint32_t m = 1; m <= 5; m++) {
37530 for (uint32_t n = 1; n <= 16; n++) {
37531 GemmMicrokernelTester()
37532 .mr(5)
37533 .nr(16)
37534 .kr(1)
37535 .sr(1)
37536 .m(m)
37537 .n(n)
37538 .k(k)
37539 .ks(3)
37540 .iterations(1)
37541 .Test(xnn_f32_igemm_ukernel_5x16__avx512f_broadcast);
37542 }
37543 }
37544 }
37545 }
37546
37547 TEST(F32_IGEMM_5X16__AVX512F_BROADCAST, n_gt_16_small_kernel) {
37548 TEST_REQUIRES_X86_AVX512F;
37549 for (uint32_t n = 17; n < 32; n++) {
37550 for (size_t k = 1; k <= 5; k += 2) {
37551 GemmMicrokernelTester()
37552 .mr(5)
37553 .nr(16)
37554 .kr(1)
37555 .sr(1)
37556 .m(5)
37557 .n(16)
37558 .k(k)
37559 .ks(3)
37560 .Test(xnn_f32_igemm_ukernel_5x16__avx512f_broadcast);
37561 }
37562 }
37563 }
37564
37565 TEST(F32_IGEMM_5X16__AVX512F_BROADCAST, n_div_16_small_kernel) {
37566 TEST_REQUIRES_X86_AVX512F;
37567 for (uint32_t n = 32; n <= 48; n += 16) {
37568 for (size_t k = 1; k <= 5; k += 2) {
37569 GemmMicrokernelTester()
37570 .mr(5)
37571 .nr(16)
37572 .kr(1)
37573 .sr(1)
37574 .m(5)
37575 .n(16)
37576 .k(k)
37577 .ks(3)
37578 .Test(xnn_f32_igemm_ukernel_5x16__avx512f_broadcast);
37579 }
37580 }
37581 }
37582
37583 TEST(F32_IGEMM_5X16__AVX512F_BROADCAST, strided_cm_subtile) {
37584 TEST_REQUIRES_X86_AVX512F;
37585 for (size_t k = 1; k <= 5; k += 2) {
37586 for (uint32_t m = 1; m <= 5; m++) {
37587 for (uint32_t n = 1; n <= 16; n++) {
37588 GemmMicrokernelTester()
37589 .mr(5)
37590 .nr(16)
37591 .kr(1)
37592 .sr(1)
37593 .m(m)
37594 .n(n)
37595 .k(k)
37596 .cm_stride(19)
37597 .iterations(1)
37598 .Test(xnn_f32_igemm_ukernel_5x16__avx512f_broadcast);
37599 }
37600 }
37601 }
37602 }
37603
37604 TEST(F32_IGEMM_5X16__AVX512F_BROADCAST, a_offset) {
37605 TEST_REQUIRES_X86_AVX512F;
37606 for (size_t k = 1; k <= 5; k += 2) {
37607 GemmMicrokernelTester()
37608 .mr(5)
37609 .nr(16)
37610 .kr(1)
37611 .sr(1)
37612 .m(5)
37613 .n(16)
37614 .k(k)
37615 .ks(3)
37616 .a_offset(29)
37617 .Test(xnn_f32_igemm_ukernel_5x16__avx512f_broadcast);
37618 }
37619 }
37620
37621 TEST(F32_IGEMM_5X16__AVX512F_BROADCAST, zero) {
37622 TEST_REQUIRES_X86_AVX512F;
37623 for (uint32_t mz = 0; mz < 5; mz++) {
37624 for (size_t k = 1; k <= 5; k += 2) {
37625 GemmMicrokernelTester()
37626 .mr(5)
37627 .nr(16)
37628 .kr(1)
37629 .sr(1)
37630 .m(5)
37631 .n(16)
37632 .k(k)
37633 .ks(3)
37634 .a_offset(29)
37635 .zero_index(mz)
37636 .Test(xnn_f32_igemm_ukernel_5x16__avx512f_broadcast);
37637 }
37638 }
37639 }
37640
37641 TEST(F32_IGEMM_5X16__AVX512F_BROADCAST, qmin) {
37642 TEST_REQUIRES_X86_AVX512F;
37643 GemmMicrokernelTester()
37644 .mr(5)
37645 .nr(16)
37646 .kr(1)
37647 .sr(1)
37648 .m(5)
37649 .n(16)
37650 .k(1)
37651 .qmin(128)
37652 .Test(xnn_f32_igemm_ukernel_5x16__avx512f_broadcast);
37653 }
37654
37655 TEST(F32_IGEMM_5X16__AVX512F_BROADCAST, qmax) {
37656 TEST_REQUIRES_X86_AVX512F;
37657 GemmMicrokernelTester()
37658 .mr(5)
37659 .nr(16)
37660 .kr(1)
37661 .sr(1)
37662 .m(5)
37663 .n(16)
37664 .k(1)
37665 .qmax(128)
37666 .Test(xnn_f32_igemm_ukernel_5x16__avx512f_broadcast);
37667 }
37668
37669 TEST(F32_IGEMM_5X16__AVX512F_BROADCAST, strided_cm) {
37670 TEST_REQUIRES_X86_AVX512F;
37671 GemmMicrokernelTester()
37672 .mr(5)
37673 .nr(16)
37674 .kr(1)
37675 .sr(1)
37676 .m(5)
37677 .n(16)
37678 .k(1)
37679 .cm_stride(19)
37680 .Test(xnn_f32_igemm_ukernel_5x16__avx512f_broadcast);
37681 }
37682#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
37683
37684
37685#if XNN_ARCH_X86 || XNN_ARCH_X86_64
37686 TEST(F32_IGEMM_6X16__AVX512F_BROADCAST, k_eq_1) {
37687 TEST_REQUIRES_X86_AVX512F;
37688 GemmMicrokernelTester()
37689 .mr(6)
37690 .nr(16)
37691 .kr(1)
37692 .sr(1)
37693 .m(6)
37694 .n(16)
37695 .k(1)
37696 .Test(xnn_f32_igemm_ukernel_6x16__avx512f_broadcast);
37697 }
37698
37699 TEST(F32_IGEMM_6X16__AVX512F_BROADCAST, strided_cn) {
37700 TEST_REQUIRES_X86_AVX512F;
37701 GemmMicrokernelTester()
37702 .mr(6)
37703 .nr(16)
37704 .kr(1)
37705 .sr(1)
37706 .m(6)
37707 .n(16)
37708 .k(1)
37709 .cn_stride(19)
37710 .Test(xnn_f32_igemm_ukernel_6x16__avx512f_broadcast);
37711 }
37712
37713 TEST(F32_IGEMM_6X16__AVX512F_BROADCAST, k_eq_1_subtile) {
37714 TEST_REQUIRES_X86_AVX512F;
37715 for (uint32_t m = 1; m <= 6; m++) {
37716 for (uint32_t n = 1; n <= 16; n++) {
37717 GemmMicrokernelTester()
37718 .mr(6)
37719 .nr(16)
37720 .kr(1)
37721 .sr(1)
37722 .m(m)
37723 .n(n)
37724 .k(1)
37725 .iterations(1)
37726 .Test(xnn_f32_igemm_ukernel_6x16__avx512f_broadcast);
37727 }
37728 }
37729 }
37730
37731 TEST(F32_IGEMM_6X16__AVX512F_BROADCAST, k_eq_1_subtile_m) {
37732 TEST_REQUIRES_X86_AVX512F;
37733 for (uint32_t m = 1; m <= 6; m++) {
37734 GemmMicrokernelTester()
37735 .mr(6)
37736 .nr(16)
37737 .kr(1)
37738 .sr(1)
37739 .m(m)
37740 .n(16)
37741 .k(1)
37742 .iterations(1)
37743 .Test(xnn_f32_igemm_ukernel_6x16__avx512f_broadcast);
37744 }
37745 }
37746
37747 TEST(F32_IGEMM_6X16__AVX512F_BROADCAST, k_eq_1_subtile_n) {
37748 TEST_REQUIRES_X86_AVX512F;
37749 for (uint32_t n = 1; n <= 16; n++) {
37750 GemmMicrokernelTester()
37751 .mr(6)
37752 .nr(16)
37753 .kr(1)
37754 .sr(1)
37755 .m(6)
37756 .n(n)
37757 .k(1)
37758 .iterations(1)
37759 .Test(xnn_f32_igemm_ukernel_6x16__avx512f_broadcast);
37760 }
37761 }
37762
37763 TEST(F32_IGEMM_6X16__AVX512F_BROADCAST, k_gt_1) {
37764 TEST_REQUIRES_X86_AVX512F;
37765 for (size_t k = 2; k < 10; k++) {
37766 GemmMicrokernelTester()
37767 .mr(6)
37768 .nr(16)
37769 .kr(1)
37770 .sr(1)
37771 .m(6)
37772 .n(16)
37773 .k(k)
37774 .Test(xnn_f32_igemm_ukernel_6x16__avx512f_broadcast);
37775 }
37776 }
37777
37778 TEST(F32_IGEMM_6X16__AVX512F_BROADCAST, k_gt_1_subtile) {
37779 TEST_REQUIRES_X86_AVX512F;
37780 for (size_t k = 2; k < 10; k++) {
37781 for (uint32_t m = 1; m <= 6; m++) {
37782 for (uint32_t n = 1; n <= 16; n++) {
37783 GemmMicrokernelTester()
37784 .mr(6)
37785 .nr(16)
37786 .kr(1)
37787 .sr(1)
37788 .m(m)
37789 .n(n)
37790 .k(k)
37791 .iterations(1)
37792 .Test(xnn_f32_igemm_ukernel_6x16__avx512f_broadcast);
37793 }
37794 }
37795 }
37796 }
37797
37798 TEST(F32_IGEMM_6X16__AVX512F_BROADCAST, n_gt_16) {
37799 TEST_REQUIRES_X86_AVX512F;
37800 for (uint32_t n = 17; n < 32; n++) {
37801 for (size_t k = 1; k <= 5; k += 2) {
37802 GemmMicrokernelTester()
37803 .mr(6)
37804 .nr(16)
37805 .kr(1)
37806 .sr(1)
37807 .m(6)
37808 .n(16)
37809 .k(k)
37810 .Test(xnn_f32_igemm_ukernel_6x16__avx512f_broadcast);
37811 }
37812 }
37813 }
37814
37815 TEST(F32_IGEMM_6X16__AVX512F_BROADCAST, n_gt_16_strided_cn) {
37816 TEST_REQUIRES_X86_AVX512F;
37817 for (uint32_t n = 17; n < 32; n++) {
37818 for (size_t k = 1; k <= 5; k += 2) {
37819 GemmMicrokernelTester()
37820 .mr(6)
37821 .nr(16)
37822 .kr(1)
37823 .sr(1)
37824 .m(6)
37825 .n(16)
37826 .k(k)
37827 .cn_stride(19)
37828 .Test(xnn_f32_igemm_ukernel_6x16__avx512f_broadcast);
37829 }
37830 }
37831 }
37832
37833 TEST(F32_IGEMM_6X16__AVX512F_BROADCAST, n_gt_16_subtile) {
37834 TEST_REQUIRES_X86_AVX512F;
37835 for (uint32_t n = 17; n < 32; n++) {
37836 for (size_t k = 1; k <= 5; k += 2) {
37837 for (uint32_t m = 1; m <= 6; m++) {
37838 GemmMicrokernelTester()
37839 .mr(6)
37840 .nr(16)
37841 .kr(1)
37842 .sr(1)
37843 .m(m)
37844 .n(n)
37845 .k(k)
37846 .iterations(1)
37847 .Test(xnn_f32_igemm_ukernel_6x16__avx512f_broadcast);
37848 }
37849 }
37850 }
37851 }
37852
37853 TEST(F32_IGEMM_6X16__AVX512F_BROADCAST, n_div_16) {
37854 TEST_REQUIRES_X86_AVX512F;
37855 for (uint32_t n = 32; n <= 48; n += 16) {
37856 for (size_t k = 1; k <= 5; k += 2) {
37857 GemmMicrokernelTester()
37858 .mr(6)
37859 .nr(16)
37860 .kr(1)
37861 .sr(1)
37862 .m(6)
37863 .n(16)
37864 .k(k)
37865 .Test(xnn_f32_igemm_ukernel_6x16__avx512f_broadcast);
37866 }
37867 }
37868 }
37869
37870 TEST(F32_IGEMM_6X16__AVX512F_BROADCAST, n_div_16_strided_cn) {
37871 TEST_REQUIRES_X86_AVX512F;
37872 for (uint32_t n = 32; n <= 48; n += 16) {
37873 for (size_t k = 1; k <= 5; k += 2) {
37874 GemmMicrokernelTester()
37875 .mr(6)
37876 .nr(16)
37877 .kr(1)
37878 .sr(1)
37879 .m(6)
37880 .n(n)
37881 .k(k)
37882 .cn_stride(19)
37883 .Test(xnn_f32_igemm_ukernel_6x16__avx512f_broadcast);
37884 }
37885 }
37886 }
37887
37888 TEST(F32_IGEMM_6X16__AVX512F_BROADCAST, n_div_16_subtile) {
37889 TEST_REQUIRES_X86_AVX512F;
37890 for (uint32_t n = 32; n <= 48; n += 16) {
37891 for (size_t k = 1; k <= 5; k += 2) {
37892 for (uint32_t m = 1; m <= 6; m++) {
37893 GemmMicrokernelTester()
37894 .mr(6)
37895 .nr(16)
37896 .kr(1)
37897 .sr(1)
37898 .m(m)
37899 .n(n)
37900 .k(k)
37901 .iterations(1)
37902 .Test(xnn_f32_igemm_ukernel_6x16__avx512f_broadcast);
37903 }
37904 }
37905 }
37906 }
37907
37908 TEST(F32_IGEMM_6X16__AVX512F_BROADCAST, small_kernel) {
37909 TEST_REQUIRES_X86_AVX512F;
37910 for (size_t k = 1; k <= 5; k += 2) {
37911 GemmMicrokernelTester()
37912 .mr(6)
37913 .nr(16)
37914 .kr(1)
37915 .sr(1)
37916 .m(6)
37917 .n(16)
37918 .k(k)
37919 .ks(3)
37920 .Test(xnn_f32_igemm_ukernel_6x16__avx512f_broadcast);
37921 }
37922 }
37923
37924 TEST(F32_IGEMM_6X16__AVX512F_BROADCAST, small_kernel_subtile) {
37925 TEST_REQUIRES_X86_AVX512F;
37926 for (size_t k = 1; k <= 5; k += 2) {
37927 for (uint32_t m = 1; m <= 6; m++) {
37928 for (uint32_t n = 1; n <= 16; n++) {
37929 GemmMicrokernelTester()
37930 .mr(6)
37931 .nr(16)
37932 .kr(1)
37933 .sr(1)
37934 .m(m)
37935 .n(n)
37936 .k(k)
37937 .ks(3)
37938 .iterations(1)
37939 .Test(xnn_f32_igemm_ukernel_6x16__avx512f_broadcast);
37940 }
37941 }
37942 }
37943 }
37944
37945 TEST(F32_IGEMM_6X16__AVX512F_BROADCAST, n_gt_16_small_kernel) {
37946 TEST_REQUIRES_X86_AVX512F;
37947 for (uint32_t n = 17; n < 32; n++) {
37948 for (size_t k = 1; k <= 5; k += 2) {
37949 GemmMicrokernelTester()
37950 .mr(6)
37951 .nr(16)
37952 .kr(1)
37953 .sr(1)
37954 .m(6)
37955 .n(16)
37956 .k(k)
37957 .ks(3)
37958 .Test(xnn_f32_igemm_ukernel_6x16__avx512f_broadcast);
37959 }
37960 }
37961 }
37962
37963 TEST(F32_IGEMM_6X16__AVX512F_BROADCAST, n_div_16_small_kernel) {
37964 TEST_REQUIRES_X86_AVX512F;
37965 for (uint32_t n = 32; n <= 48; n += 16) {
37966 for (size_t k = 1; k <= 5; k += 2) {
37967 GemmMicrokernelTester()
37968 .mr(6)
37969 .nr(16)
37970 .kr(1)
37971 .sr(1)
37972 .m(6)
37973 .n(16)
37974 .k(k)
37975 .ks(3)
37976 .Test(xnn_f32_igemm_ukernel_6x16__avx512f_broadcast);
37977 }
37978 }
37979 }
37980
37981 TEST(F32_IGEMM_6X16__AVX512F_BROADCAST, strided_cm_subtile) {
37982 TEST_REQUIRES_X86_AVX512F;
37983 for (size_t k = 1; k <= 5; k += 2) {
37984 for (uint32_t m = 1; m <= 6; m++) {
37985 for (uint32_t n = 1; n <= 16; n++) {
37986 GemmMicrokernelTester()
37987 .mr(6)
37988 .nr(16)
37989 .kr(1)
37990 .sr(1)
37991 .m(m)
37992 .n(n)
37993 .k(k)
37994 .cm_stride(19)
37995 .iterations(1)
37996 .Test(xnn_f32_igemm_ukernel_6x16__avx512f_broadcast);
37997 }
37998 }
37999 }
38000 }
38001
38002 TEST(F32_IGEMM_6X16__AVX512F_BROADCAST, a_offset) {
38003 TEST_REQUIRES_X86_AVX512F;
38004 for (size_t k = 1; k <= 5; k += 2) {
38005 GemmMicrokernelTester()
38006 .mr(6)
38007 .nr(16)
38008 .kr(1)
38009 .sr(1)
38010 .m(6)
38011 .n(16)
38012 .k(k)
38013 .ks(3)
38014 .a_offset(37)
38015 .Test(xnn_f32_igemm_ukernel_6x16__avx512f_broadcast);
38016 }
38017 }
38018
38019 TEST(F32_IGEMM_6X16__AVX512F_BROADCAST, zero) {
38020 TEST_REQUIRES_X86_AVX512F;
38021 for (uint32_t mz = 0; mz < 6; mz++) {
38022 for (size_t k = 1; k <= 5; k += 2) {
38023 GemmMicrokernelTester()
38024 .mr(6)
38025 .nr(16)
38026 .kr(1)
38027 .sr(1)
38028 .m(6)
38029 .n(16)
38030 .k(k)
38031 .ks(3)
38032 .a_offset(37)
38033 .zero_index(mz)
38034 .Test(xnn_f32_igemm_ukernel_6x16__avx512f_broadcast);
38035 }
38036 }
38037 }
38038
38039 TEST(F32_IGEMM_6X16__AVX512F_BROADCAST, qmin) {
38040 TEST_REQUIRES_X86_AVX512F;
38041 GemmMicrokernelTester()
38042 .mr(6)
38043 .nr(16)
38044 .kr(1)
38045 .sr(1)
38046 .m(6)
38047 .n(16)
38048 .k(1)
38049 .qmin(128)
38050 .Test(xnn_f32_igemm_ukernel_6x16__avx512f_broadcast);
38051 }
38052
38053 TEST(F32_IGEMM_6X16__AVX512F_BROADCAST, qmax) {
38054 TEST_REQUIRES_X86_AVX512F;
38055 GemmMicrokernelTester()
38056 .mr(6)
38057 .nr(16)
38058 .kr(1)
38059 .sr(1)
38060 .m(6)
38061 .n(16)
38062 .k(1)
38063 .qmax(128)
38064 .Test(xnn_f32_igemm_ukernel_6x16__avx512f_broadcast);
38065 }
38066
38067 TEST(F32_IGEMM_6X16__AVX512F_BROADCAST, strided_cm) {
38068 TEST_REQUIRES_X86_AVX512F;
38069 GemmMicrokernelTester()
38070 .mr(6)
38071 .nr(16)
38072 .kr(1)
38073 .sr(1)
38074 .m(6)
38075 .n(16)
38076 .k(1)
38077 .cm_stride(19)
38078 .Test(xnn_f32_igemm_ukernel_6x16__avx512f_broadcast);
38079 }
38080#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
38081
38082
38083#if XNN_ARCH_X86 || XNN_ARCH_X86_64
38084 TEST(F32_IGEMM_7X16__AVX512F_BROADCAST, k_eq_1) {
38085 TEST_REQUIRES_X86_AVX512F;
38086 GemmMicrokernelTester()
38087 .mr(7)
38088 .nr(16)
38089 .kr(1)
38090 .sr(1)
38091 .m(7)
38092 .n(16)
38093 .k(1)
38094 .Test(xnn_f32_igemm_ukernel_7x16__avx512f_broadcast);
38095 }
38096
38097 TEST(F32_IGEMM_7X16__AVX512F_BROADCAST, strided_cn) {
38098 TEST_REQUIRES_X86_AVX512F;
38099 GemmMicrokernelTester()
38100 .mr(7)
38101 .nr(16)
38102 .kr(1)
38103 .sr(1)
38104 .m(7)
38105 .n(16)
38106 .k(1)
38107 .cn_stride(19)
38108 .Test(xnn_f32_igemm_ukernel_7x16__avx512f_broadcast);
38109 }
38110
38111 TEST(F32_IGEMM_7X16__AVX512F_BROADCAST, k_eq_1_subtile) {
38112 TEST_REQUIRES_X86_AVX512F;
38113 for (uint32_t m = 1; m <= 7; m++) {
38114 for (uint32_t n = 1; n <= 16; n++) {
38115 GemmMicrokernelTester()
38116 .mr(7)
38117 .nr(16)
38118 .kr(1)
38119 .sr(1)
38120 .m(m)
38121 .n(n)
38122 .k(1)
38123 .iterations(1)
38124 .Test(xnn_f32_igemm_ukernel_7x16__avx512f_broadcast);
38125 }
38126 }
38127 }
38128
38129 TEST(F32_IGEMM_7X16__AVX512F_BROADCAST, k_eq_1_subtile_m) {
38130 TEST_REQUIRES_X86_AVX512F;
38131 for (uint32_t m = 1; m <= 7; m++) {
38132 GemmMicrokernelTester()
38133 .mr(7)
38134 .nr(16)
38135 .kr(1)
38136 .sr(1)
38137 .m(m)
38138 .n(16)
38139 .k(1)
38140 .iterations(1)
38141 .Test(xnn_f32_igemm_ukernel_7x16__avx512f_broadcast);
38142 }
38143 }
38144
38145 TEST(F32_IGEMM_7X16__AVX512F_BROADCAST, k_eq_1_subtile_n) {
38146 TEST_REQUIRES_X86_AVX512F;
38147 for (uint32_t n = 1; n <= 16; n++) {
38148 GemmMicrokernelTester()
38149 .mr(7)
38150 .nr(16)
38151 .kr(1)
38152 .sr(1)
38153 .m(7)
38154 .n(n)
38155 .k(1)
38156 .iterations(1)
38157 .Test(xnn_f32_igemm_ukernel_7x16__avx512f_broadcast);
38158 }
38159 }
38160
38161 TEST(F32_IGEMM_7X16__AVX512F_BROADCAST, k_gt_1) {
38162 TEST_REQUIRES_X86_AVX512F;
38163 for (size_t k = 2; k < 10; k++) {
38164 GemmMicrokernelTester()
38165 .mr(7)
38166 .nr(16)
38167 .kr(1)
38168 .sr(1)
38169 .m(7)
38170 .n(16)
38171 .k(k)
38172 .Test(xnn_f32_igemm_ukernel_7x16__avx512f_broadcast);
38173 }
38174 }
38175
38176 TEST(F32_IGEMM_7X16__AVX512F_BROADCAST, k_gt_1_subtile) {
38177 TEST_REQUIRES_X86_AVX512F;
38178 for (size_t k = 2; k < 10; k++) {
38179 for (uint32_t m = 1; m <= 7; m++) {
38180 for (uint32_t n = 1; n <= 16; n++) {
38181 GemmMicrokernelTester()
38182 .mr(7)
38183 .nr(16)
38184 .kr(1)
38185 .sr(1)
38186 .m(m)
38187 .n(n)
38188 .k(k)
38189 .iterations(1)
38190 .Test(xnn_f32_igemm_ukernel_7x16__avx512f_broadcast);
38191 }
38192 }
38193 }
38194 }
38195
38196 TEST(F32_IGEMM_7X16__AVX512F_BROADCAST, n_gt_16) {
38197 TEST_REQUIRES_X86_AVX512F;
38198 for (uint32_t n = 17; n < 32; n++) {
38199 for (size_t k = 1; k <= 5; k += 2) {
38200 GemmMicrokernelTester()
38201 .mr(7)
38202 .nr(16)
38203 .kr(1)
38204 .sr(1)
38205 .m(7)
38206 .n(16)
38207 .k(k)
38208 .Test(xnn_f32_igemm_ukernel_7x16__avx512f_broadcast);
38209 }
38210 }
38211 }
38212
38213 TEST(F32_IGEMM_7X16__AVX512F_BROADCAST, n_gt_16_strided_cn) {
38214 TEST_REQUIRES_X86_AVX512F;
38215 for (uint32_t n = 17; n < 32; n++) {
38216 for (size_t k = 1; k <= 5; k += 2) {
38217 GemmMicrokernelTester()
38218 .mr(7)
38219 .nr(16)
38220 .kr(1)
38221 .sr(1)
38222 .m(7)
38223 .n(16)
38224 .k(k)
38225 .cn_stride(19)
38226 .Test(xnn_f32_igemm_ukernel_7x16__avx512f_broadcast);
38227 }
38228 }
38229 }
38230
38231 TEST(F32_IGEMM_7X16__AVX512F_BROADCAST, n_gt_16_subtile) {
38232 TEST_REQUIRES_X86_AVX512F;
38233 for (uint32_t n = 17; n < 32; n++) {
38234 for (size_t k = 1; k <= 5; k += 2) {
38235 for (uint32_t m = 1; m <= 7; m++) {
38236 GemmMicrokernelTester()
38237 .mr(7)
38238 .nr(16)
38239 .kr(1)
38240 .sr(1)
38241 .m(m)
38242 .n(n)
38243 .k(k)
38244 .iterations(1)
38245 .Test(xnn_f32_igemm_ukernel_7x16__avx512f_broadcast);
38246 }
38247 }
38248 }
38249 }
38250
38251 TEST(F32_IGEMM_7X16__AVX512F_BROADCAST, n_div_16) {
38252 TEST_REQUIRES_X86_AVX512F;
38253 for (uint32_t n = 32; n <= 48; n += 16) {
38254 for (size_t k = 1; k <= 5; k += 2) {
38255 GemmMicrokernelTester()
38256 .mr(7)
38257 .nr(16)
38258 .kr(1)
38259 .sr(1)
38260 .m(7)
38261 .n(16)
38262 .k(k)
38263 .Test(xnn_f32_igemm_ukernel_7x16__avx512f_broadcast);
38264 }
38265 }
38266 }
38267
38268 TEST(F32_IGEMM_7X16__AVX512F_BROADCAST, n_div_16_strided_cn) {
38269 TEST_REQUIRES_X86_AVX512F;
38270 for (uint32_t n = 32; n <= 48; n += 16) {
38271 for (size_t k = 1; k <= 5; k += 2) {
38272 GemmMicrokernelTester()
38273 .mr(7)
38274 .nr(16)
38275 .kr(1)
38276 .sr(1)
38277 .m(7)
38278 .n(n)
38279 .k(k)
38280 .cn_stride(19)
38281 .Test(xnn_f32_igemm_ukernel_7x16__avx512f_broadcast);
38282 }
38283 }
38284 }
38285
38286 TEST(F32_IGEMM_7X16__AVX512F_BROADCAST, n_div_16_subtile) {
38287 TEST_REQUIRES_X86_AVX512F;
38288 for (uint32_t n = 32; n <= 48; n += 16) {
38289 for (size_t k = 1; k <= 5; k += 2) {
38290 for (uint32_t m = 1; m <= 7; m++) {
38291 GemmMicrokernelTester()
38292 .mr(7)
38293 .nr(16)
38294 .kr(1)
38295 .sr(1)
38296 .m(m)
38297 .n(n)
38298 .k(k)
38299 .iterations(1)
38300 .Test(xnn_f32_igemm_ukernel_7x16__avx512f_broadcast);
38301 }
38302 }
38303 }
38304 }
38305
38306 TEST(F32_IGEMM_7X16__AVX512F_BROADCAST, small_kernel) {
38307 TEST_REQUIRES_X86_AVX512F;
38308 for (size_t k = 1; k <= 5; k += 2) {
38309 GemmMicrokernelTester()
38310 .mr(7)
38311 .nr(16)
38312 .kr(1)
38313 .sr(1)
38314 .m(7)
38315 .n(16)
38316 .k(k)
38317 .ks(3)
38318 .Test(xnn_f32_igemm_ukernel_7x16__avx512f_broadcast);
38319 }
38320 }
38321
38322 TEST(F32_IGEMM_7X16__AVX512F_BROADCAST, small_kernel_subtile) {
38323 TEST_REQUIRES_X86_AVX512F;
38324 for (size_t k = 1; k <= 5; k += 2) {
38325 for (uint32_t m = 1; m <= 7; m++) {
38326 for (uint32_t n = 1; n <= 16; n++) {
38327 GemmMicrokernelTester()
38328 .mr(7)
38329 .nr(16)
38330 .kr(1)
38331 .sr(1)
38332 .m(m)
38333 .n(n)
38334 .k(k)
38335 .ks(3)
38336 .iterations(1)
38337 .Test(xnn_f32_igemm_ukernel_7x16__avx512f_broadcast);
38338 }
38339 }
38340 }
38341 }
38342
38343 TEST(F32_IGEMM_7X16__AVX512F_BROADCAST, n_gt_16_small_kernel) {
38344 TEST_REQUIRES_X86_AVX512F;
38345 for (uint32_t n = 17; n < 32; n++) {
38346 for (size_t k = 1; k <= 5; k += 2) {
38347 GemmMicrokernelTester()
38348 .mr(7)
38349 .nr(16)
38350 .kr(1)
38351 .sr(1)
38352 .m(7)
38353 .n(16)
38354 .k(k)
38355 .ks(3)
38356 .Test(xnn_f32_igemm_ukernel_7x16__avx512f_broadcast);
38357 }
38358 }
38359 }
38360
38361 TEST(F32_IGEMM_7X16__AVX512F_BROADCAST, n_div_16_small_kernel) {
38362 TEST_REQUIRES_X86_AVX512F;
38363 for (uint32_t n = 32; n <= 48; n += 16) {
38364 for (size_t k = 1; k <= 5; k += 2) {
38365 GemmMicrokernelTester()
38366 .mr(7)
38367 .nr(16)
38368 .kr(1)
38369 .sr(1)
38370 .m(7)
38371 .n(16)
38372 .k(k)
38373 .ks(3)
38374 .Test(xnn_f32_igemm_ukernel_7x16__avx512f_broadcast);
38375 }
38376 }
38377 }
38378
38379 TEST(F32_IGEMM_7X16__AVX512F_BROADCAST, strided_cm_subtile) {
38380 TEST_REQUIRES_X86_AVX512F;
38381 for (size_t k = 1; k <= 5; k += 2) {
38382 for (uint32_t m = 1; m <= 7; m++) {
38383 for (uint32_t n = 1; n <= 16; n++) {
38384 GemmMicrokernelTester()
38385 .mr(7)
38386 .nr(16)
38387 .kr(1)
38388 .sr(1)
38389 .m(m)
38390 .n(n)
38391 .k(k)
38392 .cm_stride(19)
38393 .iterations(1)
38394 .Test(xnn_f32_igemm_ukernel_7x16__avx512f_broadcast);
38395 }
38396 }
38397 }
38398 }
38399
38400 TEST(F32_IGEMM_7X16__AVX512F_BROADCAST, a_offset) {
38401 TEST_REQUIRES_X86_AVX512F;
38402 for (size_t k = 1; k <= 5; k += 2) {
38403 GemmMicrokernelTester()
38404 .mr(7)
38405 .nr(16)
38406 .kr(1)
38407 .sr(1)
38408 .m(7)
38409 .n(16)
38410 .k(k)
38411 .ks(3)
38412 .a_offset(37)
38413 .Test(xnn_f32_igemm_ukernel_7x16__avx512f_broadcast);
38414 }
38415 }
38416
38417 TEST(F32_IGEMM_7X16__AVX512F_BROADCAST, zero) {
38418 TEST_REQUIRES_X86_AVX512F;
38419 for (uint32_t mz = 0; mz < 7; mz++) {
38420 for (size_t k = 1; k <= 5; k += 2) {
38421 GemmMicrokernelTester()
38422 .mr(7)
38423 .nr(16)
38424 .kr(1)
38425 .sr(1)
38426 .m(7)
38427 .n(16)
38428 .k(k)
38429 .ks(3)
38430 .a_offset(37)
38431 .zero_index(mz)
38432 .Test(xnn_f32_igemm_ukernel_7x16__avx512f_broadcast);
38433 }
38434 }
38435 }
38436
38437 TEST(F32_IGEMM_7X16__AVX512F_BROADCAST, qmin) {
38438 TEST_REQUIRES_X86_AVX512F;
38439 GemmMicrokernelTester()
38440 .mr(7)
38441 .nr(16)
38442 .kr(1)
38443 .sr(1)
38444 .m(7)
38445 .n(16)
38446 .k(1)
38447 .qmin(128)
38448 .Test(xnn_f32_igemm_ukernel_7x16__avx512f_broadcast);
38449 }
38450
38451 TEST(F32_IGEMM_7X16__AVX512F_BROADCAST, qmax) {
38452 TEST_REQUIRES_X86_AVX512F;
38453 GemmMicrokernelTester()
38454 .mr(7)
38455 .nr(16)
38456 .kr(1)
38457 .sr(1)
38458 .m(7)
38459 .n(16)
38460 .k(1)
38461 .qmax(128)
38462 .Test(xnn_f32_igemm_ukernel_7x16__avx512f_broadcast);
38463 }
38464
38465 TEST(F32_IGEMM_7X16__AVX512F_BROADCAST, strided_cm) {
38466 TEST_REQUIRES_X86_AVX512F;
38467 GemmMicrokernelTester()
38468 .mr(7)
38469 .nr(16)
38470 .kr(1)
38471 .sr(1)
38472 .m(7)
38473 .n(16)
38474 .k(1)
38475 .cm_stride(19)
38476 .Test(xnn_f32_igemm_ukernel_7x16__avx512f_broadcast);
38477 }
38478#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
38479
38480
38481#if XNN_ARCH_X86 || XNN_ARCH_X86_64
38482 TEST(F32_IGEMM_8X16__AVX512F_BROADCAST, k_eq_1) {
38483 TEST_REQUIRES_X86_AVX512F;
38484 GemmMicrokernelTester()
38485 .mr(8)
38486 .nr(16)
38487 .kr(1)
38488 .sr(1)
38489 .m(8)
38490 .n(16)
38491 .k(1)
38492 .Test(xnn_f32_igemm_ukernel_8x16__avx512f_broadcast);
38493 }
38494
38495 TEST(F32_IGEMM_8X16__AVX512F_BROADCAST, strided_cn) {
38496 TEST_REQUIRES_X86_AVX512F;
38497 GemmMicrokernelTester()
38498 .mr(8)
38499 .nr(16)
38500 .kr(1)
38501 .sr(1)
38502 .m(8)
38503 .n(16)
38504 .k(1)
38505 .cn_stride(19)
38506 .Test(xnn_f32_igemm_ukernel_8x16__avx512f_broadcast);
38507 }
38508
38509 TEST(F32_IGEMM_8X16__AVX512F_BROADCAST, k_eq_1_subtile) {
38510 TEST_REQUIRES_X86_AVX512F;
38511 for (uint32_t m = 1; m <= 8; m++) {
38512 for (uint32_t n = 1; n <= 16; n++) {
38513 GemmMicrokernelTester()
38514 .mr(8)
38515 .nr(16)
38516 .kr(1)
38517 .sr(1)
38518 .m(m)
38519 .n(n)
38520 .k(1)
38521 .iterations(1)
38522 .Test(xnn_f32_igemm_ukernel_8x16__avx512f_broadcast);
38523 }
38524 }
38525 }
38526
38527 TEST(F32_IGEMM_8X16__AVX512F_BROADCAST, k_eq_1_subtile_m) {
38528 TEST_REQUIRES_X86_AVX512F;
38529 for (uint32_t m = 1; m <= 8; m++) {
38530 GemmMicrokernelTester()
38531 .mr(8)
38532 .nr(16)
38533 .kr(1)
38534 .sr(1)
38535 .m(m)
38536 .n(16)
38537 .k(1)
38538 .iterations(1)
38539 .Test(xnn_f32_igemm_ukernel_8x16__avx512f_broadcast);
38540 }
38541 }
38542
38543 TEST(F32_IGEMM_8X16__AVX512F_BROADCAST, k_eq_1_subtile_n) {
38544 TEST_REQUIRES_X86_AVX512F;
38545 for (uint32_t n = 1; n <= 16; n++) {
38546 GemmMicrokernelTester()
38547 .mr(8)
38548 .nr(16)
38549 .kr(1)
38550 .sr(1)
38551 .m(8)
38552 .n(n)
38553 .k(1)
38554 .iterations(1)
38555 .Test(xnn_f32_igemm_ukernel_8x16__avx512f_broadcast);
38556 }
38557 }
38558
38559 TEST(F32_IGEMM_8X16__AVX512F_BROADCAST, k_gt_1) {
38560 TEST_REQUIRES_X86_AVX512F;
38561 for (size_t k = 2; k < 10; k++) {
38562 GemmMicrokernelTester()
38563 .mr(8)
38564 .nr(16)
38565 .kr(1)
38566 .sr(1)
38567 .m(8)
38568 .n(16)
38569 .k(k)
38570 .Test(xnn_f32_igemm_ukernel_8x16__avx512f_broadcast);
38571 }
38572 }
38573
38574 TEST(F32_IGEMM_8X16__AVX512F_BROADCAST, k_gt_1_subtile) {
38575 TEST_REQUIRES_X86_AVX512F;
38576 for (size_t k = 2; k < 10; k++) {
38577 for (uint32_t m = 1; m <= 8; m++) {
38578 for (uint32_t n = 1; n <= 16; n++) {
38579 GemmMicrokernelTester()
38580 .mr(8)
38581 .nr(16)
38582 .kr(1)
38583 .sr(1)
38584 .m(m)
38585 .n(n)
38586 .k(k)
38587 .iterations(1)
38588 .Test(xnn_f32_igemm_ukernel_8x16__avx512f_broadcast);
38589 }
38590 }
38591 }
38592 }
38593
38594 TEST(F32_IGEMM_8X16__AVX512F_BROADCAST, n_gt_16) {
38595 TEST_REQUIRES_X86_AVX512F;
38596 for (uint32_t n = 17; n < 32; n++) {
38597 for (size_t k = 1; k <= 5; k += 2) {
38598 GemmMicrokernelTester()
38599 .mr(8)
38600 .nr(16)
38601 .kr(1)
38602 .sr(1)
38603 .m(8)
38604 .n(16)
38605 .k(k)
38606 .Test(xnn_f32_igemm_ukernel_8x16__avx512f_broadcast);
38607 }
38608 }
38609 }
38610
38611 TEST(F32_IGEMM_8X16__AVX512F_BROADCAST, n_gt_16_strided_cn) {
38612 TEST_REQUIRES_X86_AVX512F;
38613 for (uint32_t n = 17; n < 32; n++) {
38614 for (size_t k = 1; k <= 5; k += 2) {
38615 GemmMicrokernelTester()
38616 .mr(8)
38617 .nr(16)
38618 .kr(1)
38619 .sr(1)
38620 .m(8)
38621 .n(16)
38622 .k(k)
38623 .cn_stride(19)
38624 .Test(xnn_f32_igemm_ukernel_8x16__avx512f_broadcast);
38625 }
38626 }
38627 }
38628
38629 TEST(F32_IGEMM_8X16__AVX512F_BROADCAST, n_gt_16_subtile) {
38630 TEST_REQUIRES_X86_AVX512F;
38631 for (uint32_t n = 17; n < 32; n++) {
38632 for (size_t k = 1; k <= 5; k += 2) {
38633 for (uint32_t m = 1; m <= 8; m++) {
38634 GemmMicrokernelTester()
38635 .mr(8)
38636 .nr(16)
38637 .kr(1)
38638 .sr(1)
38639 .m(m)
38640 .n(n)
38641 .k(k)
38642 .iterations(1)
38643 .Test(xnn_f32_igemm_ukernel_8x16__avx512f_broadcast);
38644 }
38645 }
38646 }
38647 }
38648
38649 TEST(F32_IGEMM_8X16__AVX512F_BROADCAST, n_div_16) {
38650 TEST_REQUIRES_X86_AVX512F;
38651 for (uint32_t n = 32; n <= 48; n += 16) {
38652 for (size_t k = 1; k <= 5; k += 2) {
38653 GemmMicrokernelTester()
38654 .mr(8)
38655 .nr(16)
38656 .kr(1)
38657 .sr(1)
38658 .m(8)
38659 .n(16)
38660 .k(k)
38661 .Test(xnn_f32_igemm_ukernel_8x16__avx512f_broadcast);
38662 }
38663 }
38664 }
38665
38666 TEST(F32_IGEMM_8X16__AVX512F_BROADCAST, n_div_16_strided_cn) {
38667 TEST_REQUIRES_X86_AVX512F;
38668 for (uint32_t n = 32; n <= 48; n += 16) {
38669 for (size_t k = 1; k <= 5; k += 2) {
38670 GemmMicrokernelTester()
38671 .mr(8)
38672 .nr(16)
38673 .kr(1)
38674 .sr(1)
38675 .m(8)
38676 .n(n)
38677 .k(k)
38678 .cn_stride(19)
38679 .Test(xnn_f32_igemm_ukernel_8x16__avx512f_broadcast);
38680 }
38681 }
38682 }
38683
38684 TEST(F32_IGEMM_8X16__AVX512F_BROADCAST, n_div_16_subtile) {
38685 TEST_REQUIRES_X86_AVX512F;
38686 for (uint32_t n = 32; n <= 48; n += 16) {
38687 for (size_t k = 1; k <= 5; k += 2) {
38688 for (uint32_t m = 1; m <= 8; m++) {
38689 GemmMicrokernelTester()
38690 .mr(8)
38691 .nr(16)
38692 .kr(1)
38693 .sr(1)
38694 .m(m)
38695 .n(n)
38696 .k(k)
38697 .iterations(1)
38698 .Test(xnn_f32_igemm_ukernel_8x16__avx512f_broadcast);
38699 }
38700 }
38701 }
38702 }
38703
38704 TEST(F32_IGEMM_8X16__AVX512F_BROADCAST, small_kernel) {
38705 TEST_REQUIRES_X86_AVX512F;
38706 for (size_t k = 1; k <= 5; k += 2) {
38707 GemmMicrokernelTester()
38708 .mr(8)
38709 .nr(16)
38710 .kr(1)
38711 .sr(1)
38712 .m(8)
38713 .n(16)
38714 .k(k)
38715 .ks(3)
38716 .Test(xnn_f32_igemm_ukernel_8x16__avx512f_broadcast);
38717 }
38718 }
38719
38720 TEST(F32_IGEMM_8X16__AVX512F_BROADCAST, small_kernel_subtile) {
38721 TEST_REQUIRES_X86_AVX512F;
38722 for (size_t k = 1; k <= 5; k += 2) {
38723 for (uint32_t m = 1; m <= 8; m++) {
38724 for (uint32_t n = 1; n <= 16; n++) {
38725 GemmMicrokernelTester()
38726 .mr(8)
38727 .nr(16)
38728 .kr(1)
38729 .sr(1)
38730 .m(m)
38731 .n(n)
38732 .k(k)
38733 .ks(3)
38734 .iterations(1)
38735 .Test(xnn_f32_igemm_ukernel_8x16__avx512f_broadcast);
38736 }
38737 }
38738 }
38739 }
38740
38741 TEST(F32_IGEMM_8X16__AVX512F_BROADCAST, n_gt_16_small_kernel) {
38742 TEST_REQUIRES_X86_AVX512F;
38743 for (uint32_t n = 17; n < 32; n++) {
38744 for (size_t k = 1; k <= 5; k += 2) {
38745 GemmMicrokernelTester()
38746 .mr(8)
38747 .nr(16)
38748 .kr(1)
38749 .sr(1)
38750 .m(8)
38751 .n(16)
38752 .k(k)
38753 .ks(3)
38754 .Test(xnn_f32_igemm_ukernel_8x16__avx512f_broadcast);
38755 }
38756 }
38757 }
38758
38759 TEST(F32_IGEMM_8X16__AVX512F_BROADCAST, n_div_16_small_kernel) {
38760 TEST_REQUIRES_X86_AVX512F;
38761 for (uint32_t n = 32; n <= 48; n += 16) {
38762 for (size_t k = 1; k <= 5; k += 2) {
38763 GemmMicrokernelTester()
38764 .mr(8)
38765 .nr(16)
38766 .kr(1)
38767 .sr(1)
38768 .m(8)
38769 .n(16)
38770 .k(k)
38771 .ks(3)
38772 .Test(xnn_f32_igemm_ukernel_8x16__avx512f_broadcast);
38773 }
38774 }
38775 }
38776
38777 TEST(F32_IGEMM_8X16__AVX512F_BROADCAST, strided_cm_subtile) {
38778 TEST_REQUIRES_X86_AVX512F;
38779 for (size_t k = 1; k <= 5; k += 2) {
38780 for (uint32_t m = 1; m <= 8; m++) {
38781 for (uint32_t n = 1; n <= 16; n++) {
38782 GemmMicrokernelTester()
38783 .mr(8)
38784 .nr(16)
38785 .kr(1)
38786 .sr(1)
38787 .m(m)
38788 .n(n)
38789 .k(k)
38790 .cm_stride(19)
38791 .iterations(1)
38792 .Test(xnn_f32_igemm_ukernel_8x16__avx512f_broadcast);
38793 }
38794 }
38795 }
38796 }
38797
38798 TEST(F32_IGEMM_8X16__AVX512F_BROADCAST, a_offset) {
38799 TEST_REQUIRES_X86_AVX512F;
38800 for (size_t k = 1; k <= 5; k += 2) {
38801 GemmMicrokernelTester()
38802 .mr(8)
38803 .nr(16)
38804 .kr(1)
38805 .sr(1)
38806 .m(8)
38807 .n(16)
38808 .k(k)
38809 .ks(3)
38810 .a_offset(43)
38811 .Test(xnn_f32_igemm_ukernel_8x16__avx512f_broadcast);
38812 }
38813 }
38814
38815 TEST(F32_IGEMM_8X16__AVX512F_BROADCAST, zero) {
38816 TEST_REQUIRES_X86_AVX512F;
38817 for (uint32_t mz = 0; mz < 8; mz++) {
38818 for (size_t k = 1; k <= 5; k += 2) {
38819 GemmMicrokernelTester()
38820 .mr(8)
38821 .nr(16)
38822 .kr(1)
38823 .sr(1)
38824 .m(8)
38825 .n(16)
38826 .k(k)
38827 .ks(3)
38828 .a_offset(43)
38829 .zero_index(mz)
38830 .Test(xnn_f32_igemm_ukernel_8x16__avx512f_broadcast);
38831 }
38832 }
38833 }
38834
38835 TEST(F32_IGEMM_8X16__AVX512F_BROADCAST, qmin) {
38836 TEST_REQUIRES_X86_AVX512F;
38837 GemmMicrokernelTester()
38838 .mr(8)
38839 .nr(16)
38840 .kr(1)
38841 .sr(1)
38842 .m(8)
38843 .n(16)
38844 .k(1)
38845 .qmin(128)
38846 .Test(xnn_f32_igemm_ukernel_8x16__avx512f_broadcast);
38847 }
38848
38849 TEST(F32_IGEMM_8X16__AVX512F_BROADCAST, qmax) {
38850 TEST_REQUIRES_X86_AVX512F;
38851 GemmMicrokernelTester()
38852 .mr(8)
38853 .nr(16)
38854 .kr(1)
38855 .sr(1)
38856 .m(8)
38857 .n(16)
38858 .k(1)
38859 .qmax(128)
38860 .Test(xnn_f32_igemm_ukernel_8x16__avx512f_broadcast);
38861 }
38862
38863 TEST(F32_IGEMM_8X16__AVX512F_BROADCAST, strided_cm) {
38864 TEST_REQUIRES_X86_AVX512F;
38865 GemmMicrokernelTester()
38866 .mr(8)
38867 .nr(16)
38868 .kr(1)
38869 .sr(1)
38870 .m(8)
38871 .n(16)
38872 .k(1)
38873 .cm_stride(19)
38874 .Test(xnn_f32_igemm_ukernel_8x16__avx512f_broadcast);
38875 }
38876#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
38877
38878
Marat Dukhan1dadbf72019-10-01 10:46:20 -070038879#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
XNNPACK Teamb455b122019-09-27 18:10:33 -070038880 TEST(F32_IGEMM_1X8__PSIMD_LOADSPLAT, k_eq_1) {
38881 TEST_REQUIRES_PSIMD;
38882 GemmMicrokernelTester()
38883 .mr(1)
38884 .nr(8)
38885 .kr(1)
38886 .sr(1)
38887 .m(1)
38888 .n(8)
38889 .k(1)
38890 .Test(xnn_f32_igemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
38891 }
38892
38893 TEST(F32_IGEMM_1X8__PSIMD_LOADSPLAT, strided_cn) {
38894 TEST_REQUIRES_PSIMD;
38895 GemmMicrokernelTester()
38896 .mr(1)
38897 .nr(8)
38898 .kr(1)
38899 .sr(1)
38900 .m(1)
38901 .n(8)
38902 .k(1)
38903 .cn_stride(11)
38904 .Test(xnn_f32_igemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
38905 }
38906
38907 TEST(F32_IGEMM_1X8__PSIMD_LOADSPLAT, k_eq_1_subtile) {
38908 TEST_REQUIRES_PSIMD;
38909 for (uint32_t m = 1; m <= 1; m++) {
38910 for (uint32_t n = 1; n <= 8; n++) {
38911 GemmMicrokernelTester()
38912 .mr(1)
38913 .nr(8)
38914 .kr(1)
38915 .sr(1)
38916 .m(m)
38917 .n(n)
38918 .k(1)
38919 .iterations(1)
38920 .Test(xnn_f32_igemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
38921 }
38922 }
38923 }
38924
38925 TEST(F32_IGEMM_1X8__PSIMD_LOADSPLAT, k_eq_1_subtile_m) {
38926 TEST_REQUIRES_PSIMD;
38927 for (uint32_t m = 1; m <= 1; m++) {
38928 GemmMicrokernelTester()
38929 .mr(1)
38930 .nr(8)
38931 .kr(1)
38932 .sr(1)
38933 .m(m)
38934 .n(8)
38935 .k(1)
38936 .iterations(1)
38937 .Test(xnn_f32_igemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
38938 }
38939 }
38940
38941 TEST(F32_IGEMM_1X8__PSIMD_LOADSPLAT, k_eq_1_subtile_n) {
38942 TEST_REQUIRES_PSIMD;
38943 for (uint32_t n = 1; n <= 8; n++) {
38944 GemmMicrokernelTester()
38945 .mr(1)
38946 .nr(8)
38947 .kr(1)
38948 .sr(1)
38949 .m(1)
38950 .n(n)
38951 .k(1)
38952 .iterations(1)
38953 .Test(xnn_f32_igemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
38954 }
38955 }
38956
38957 TEST(F32_IGEMM_1X8__PSIMD_LOADSPLAT, k_gt_1) {
38958 TEST_REQUIRES_PSIMD;
38959 for (size_t k = 2; k < 10; k++) {
38960 GemmMicrokernelTester()
38961 .mr(1)
38962 .nr(8)
38963 .kr(1)
38964 .sr(1)
38965 .m(1)
38966 .n(8)
38967 .k(k)
38968 .Test(xnn_f32_igemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
38969 }
38970 }
38971
38972 TEST(F32_IGEMM_1X8__PSIMD_LOADSPLAT, k_gt_1_subtile) {
38973 TEST_REQUIRES_PSIMD;
38974 for (size_t k = 2; k < 10; k++) {
38975 for (uint32_t m = 1; m <= 1; m++) {
38976 for (uint32_t n = 1; n <= 8; n++) {
38977 GemmMicrokernelTester()
38978 .mr(1)
38979 .nr(8)
38980 .kr(1)
38981 .sr(1)
38982 .m(m)
38983 .n(n)
38984 .k(k)
38985 .iterations(1)
38986 .Test(xnn_f32_igemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
38987 }
38988 }
38989 }
38990 }
38991
38992 TEST(F32_IGEMM_1X8__PSIMD_LOADSPLAT, n_gt_8) {
38993 TEST_REQUIRES_PSIMD;
38994 for (uint32_t n = 9; n < 16; n++) {
38995 for (size_t k = 1; k <= 5; k += 2) {
38996 GemmMicrokernelTester()
38997 .mr(1)
38998 .nr(8)
38999 .kr(1)
39000 .sr(1)
39001 .m(1)
39002 .n(8)
39003 .k(k)
39004 .Test(xnn_f32_igemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39005 }
39006 }
39007 }
39008
39009 TEST(F32_IGEMM_1X8__PSIMD_LOADSPLAT, n_gt_8_strided_cn) {
39010 TEST_REQUIRES_PSIMD;
39011 for (uint32_t n = 9; n < 16; n++) {
39012 for (size_t k = 1; k <= 5; k += 2) {
39013 GemmMicrokernelTester()
39014 .mr(1)
39015 .nr(8)
39016 .kr(1)
39017 .sr(1)
39018 .m(1)
39019 .n(8)
39020 .k(k)
39021 .cn_stride(11)
39022 .Test(xnn_f32_igemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39023 }
39024 }
39025 }
39026
39027 TEST(F32_IGEMM_1X8__PSIMD_LOADSPLAT, n_gt_8_subtile) {
39028 TEST_REQUIRES_PSIMD;
39029 for (uint32_t n = 9; n < 16; n++) {
39030 for (size_t k = 1; k <= 5; k += 2) {
39031 for (uint32_t m = 1; m <= 1; m++) {
39032 GemmMicrokernelTester()
39033 .mr(1)
39034 .nr(8)
39035 .kr(1)
39036 .sr(1)
39037 .m(m)
39038 .n(n)
39039 .k(k)
39040 .iterations(1)
39041 .Test(xnn_f32_igemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39042 }
39043 }
39044 }
39045 }
39046
39047 TEST(F32_IGEMM_1X8__PSIMD_LOADSPLAT, n_div_8) {
39048 TEST_REQUIRES_PSIMD;
39049 for (uint32_t n = 16; n <= 24; n += 8) {
39050 for (size_t k = 1; k <= 5; k += 2) {
39051 GemmMicrokernelTester()
39052 .mr(1)
39053 .nr(8)
39054 .kr(1)
39055 .sr(1)
39056 .m(1)
39057 .n(8)
39058 .k(k)
39059 .Test(xnn_f32_igemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39060 }
39061 }
39062 }
39063
39064 TEST(F32_IGEMM_1X8__PSIMD_LOADSPLAT, n_div_8_strided_cn) {
39065 TEST_REQUIRES_PSIMD;
39066 for (uint32_t n = 16; n <= 24; n += 8) {
39067 for (size_t k = 1; k <= 5; k += 2) {
39068 GemmMicrokernelTester()
39069 .mr(1)
39070 .nr(8)
39071 .kr(1)
39072 .sr(1)
39073 .m(1)
39074 .n(n)
39075 .k(k)
39076 .cn_stride(11)
39077 .Test(xnn_f32_igemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39078 }
39079 }
39080 }
39081
39082 TEST(F32_IGEMM_1X8__PSIMD_LOADSPLAT, n_div_8_subtile) {
39083 TEST_REQUIRES_PSIMD;
39084 for (uint32_t n = 16; n <= 24; n += 8) {
39085 for (size_t k = 1; k <= 5; k += 2) {
39086 for (uint32_t m = 1; m <= 1; m++) {
39087 GemmMicrokernelTester()
39088 .mr(1)
39089 .nr(8)
39090 .kr(1)
39091 .sr(1)
39092 .m(m)
39093 .n(n)
39094 .k(k)
39095 .iterations(1)
39096 .Test(xnn_f32_igemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39097 }
39098 }
39099 }
39100 }
39101
39102 TEST(F32_IGEMM_1X8__PSIMD_LOADSPLAT, small_kernel) {
39103 TEST_REQUIRES_PSIMD;
39104 for (size_t k = 1; k <= 5; k += 2) {
39105 GemmMicrokernelTester()
39106 .mr(1)
39107 .nr(8)
39108 .kr(1)
39109 .sr(1)
39110 .m(1)
39111 .n(8)
39112 .k(k)
39113 .ks(3)
39114 .Test(xnn_f32_igemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39115 }
39116 }
39117
39118 TEST(F32_IGEMM_1X8__PSIMD_LOADSPLAT, small_kernel_subtile) {
39119 TEST_REQUIRES_PSIMD;
39120 for (size_t k = 1; k <= 5; k += 2) {
39121 for (uint32_t m = 1; m <= 1; m++) {
39122 for (uint32_t n = 1; n <= 8; n++) {
39123 GemmMicrokernelTester()
39124 .mr(1)
39125 .nr(8)
39126 .kr(1)
39127 .sr(1)
39128 .m(m)
39129 .n(n)
39130 .k(k)
39131 .ks(3)
39132 .iterations(1)
39133 .Test(xnn_f32_igemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39134 }
39135 }
39136 }
39137 }
39138
39139 TEST(F32_IGEMM_1X8__PSIMD_LOADSPLAT, n_gt_8_small_kernel) {
39140 TEST_REQUIRES_PSIMD;
39141 for (uint32_t n = 9; n < 16; n++) {
39142 for (size_t k = 1; k <= 5; k += 2) {
39143 GemmMicrokernelTester()
39144 .mr(1)
39145 .nr(8)
39146 .kr(1)
39147 .sr(1)
39148 .m(1)
39149 .n(8)
39150 .k(k)
39151 .ks(3)
39152 .Test(xnn_f32_igemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39153 }
39154 }
39155 }
39156
39157 TEST(F32_IGEMM_1X8__PSIMD_LOADSPLAT, n_div_8_small_kernel) {
39158 TEST_REQUIRES_PSIMD;
39159 for (uint32_t n = 16; n <= 24; n += 8) {
39160 for (size_t k = 1; k <= 5; k += 2) {
39161 GemmMicrokernelTester()
39162 .mr(1)
39163 .nr(8)
39164 .kr(1)
39165 .sr(1)
39166 .m(1)
39167 .n(8)
39168 .k(k)
39169 .ks(3)
39170 .Test(xnn_f32_igemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39171 }
39172 }
39173 }
39174
39175 TEST(F32_IGEMM_1X8__PSIMD_LOADSPLAT, strided_cm_subtile) {
39176 TEST_REQUIRES_PSIMD;
39177 for (size_t k = 1; k <= 5; k += 2) {
39178 for (uint32_t m = 1; m <= 1; m++) {
39179 for (uint32_t n = 1; n <= 8; n++) {
39180 GemmMicrokernelTester()
39181 .mr(1)
39182 .nr(8)
39183 .kr(1)
39184 .sr(1)
39185 .m(m)
39186 .n(n)
39187 .k(k)
39188 .cm_stride(11)
39189 .iterations(1)
39190 .Test(xnn_f32_igemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39191 }
39192 }
39193 }
39194 }
39195
39196 TEST(F32_IGEMM_1X8__PSIMD_LOADSPLAT, a_offset) {
39197 TEST_REQUIRES_PSIMD;
39198 for (size_t k = 1; k <= 5; k += 2) {
39199 GemmMicrokernelTester()
39200 .mr(1)
39201 .nr(8)
39202 .kr(1)
39203 .sr(1)
39204 .m(1)
39205 .n(8)
39206 .k(k)
39207 .ks(3)
39208 .a_offset(7)
39209 .Test(xnn_f32_igemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39210 }
39211 }
39212
39213 TEST(F32_IGEMM_1X8__PSIMD_LOADSPLAT, zero) {
39214 TEST_REQUIRES_PSIMD;
39215 for (uint32_t mz = 0; mz < 1; mz++) {
39216 for (size_t k = 1; k <= 5; k += 2) {
39217 GemmMicrokernelTester()
39218 .mr(1)
39219 .nr(8)
39220 .kr(1)
39221 .sr(1)
39222 .m(1)
39223 .n(8)
39224 .k(k)
39225 .ks(3)
39226 .a_offset(7)
39227 .zero_index(mz)
39228 .Test(xnn_f32_igemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39229 }
39230 }
39231 }
39232
39233 TEST(F32_IGEMM_1X8__PSIMD_LOADSPLAT, qmin) {
39234 TEST_REQUIRES_PSIMD;
39235 GemmMicrokernelTester()
39236 .mr(1)
39237 .nr(8)
39238 .kr(1)
39239 .sr(1)
39240 .m(1)
39241 .n(8)
39242 .k(1)
39243 .qmin(128)
39244 .Test(xnn_f32_igemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39245 }
39246
39247 TEST(F32_IGEMM_1X8__PSIMD_LOADSPLAT, qmax) {
39248 TEST_REQUIRES_PSIMD;
39249 GemmMicrokernelTester()
39250 .mr(1)
39251 .nr(8)
39252 .kr(1)
39253 .sr(1)
39254 .m(1)
39255 .n(8)
39256 .k(1)
39257 .qmax(128)
39258 .Test(xnn_f32_igemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39259 }
39260
39261 TEST(F32_IGEMM_1X8__PSIMD_LOADSPLAT, strided_cm) {
39262 TEST_REQUIRES_PSIMD;
39263 GemmMicrokernelTester()
39264 .mr(1)
39265 .nr(8)
39266 .kr(1)
39267 .sr(1)
39268 .m(1)
39269 .n(8)
39270 .k(1)
39271 .cm_stride(11)
39272 .Test(xnn_f32_igemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39273 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -070039274#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
XNNPACK Teamb455b122019-09-27 18:10:33 -070039275
39276
Marat Dukhan1dadbf72019-10-01 10:46:20 -070039277#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
XNNPACK Teamb455b122019-09-27 18:10:33 -070039278 TEST(F32_IGEMM_4X8__PSIMD_LOADSPLAT, k_eq_1) {
39279 TEST_REQUIRES_PSIMD;
39280 GemmMicrokernelTester()
39281 .mr(4)
39282 .nr(8)
39283 .kr(1)
39284 .sr(1)
39285 .m(4)
39286 .n(8)
39287 .k(1)
39288 .Test(xnn_f32_igemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39289 }
39290
39291 TEST(F32_IGEMM_4X8__PSIMD_LOADSPLAT, strided_cn) {
39292 TEST_REQUIRES_PSIMD;
39293 GemmMicrokernelTester()
39294 .mr(4)
39295 .nr(8)
39296 .kr(1)
39297 .sr(1)
39298 .m(4)
39299 .n(8)
39300 .k(1)
39301 .cn_stride(11)
39302 .Test(xnn_f32_igemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39303 }
39304
39305 TEST(F32_IGEMM_4X8__PSIMD_LOADSPLAT, k_eq_1_subtile) {
39306 TEST_REQUIRES_PSIMD;
39307 for (uint32_t m = 1; m <= 4; m++) {
39308 for (uint32_t n = 1; n <= 8; n++) {
39309 GemmMicrokernelTester()
39310 .mr(4)
39311 .nr(8)
39312 .kr(1)
39313 .sr(1)
39314 .m(m)
39315 .n(n)
39316 .k(1)
39317 .iterations(1)
39318 .Test(xnn_f32_igemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39319 }
39320 }
39321 }
39322
39323 TEST(F32_IGEMM_4X8__PSIMD_LOADSPLAT, k_eq_1_subtile_m) {
39324 TEST_REQUIRES_PSIMD;
39325 for (uint32_t m = 1; m <= 4; m++) {
39326 GemmMicrokernelTester()
39327 .mr(4)
39328 .nr(8)
39329 .kr(1)
39330 .sr(1)
39331 .m(m)
39332 .n(8)
39333 .k(1)
39334 .iterations(1)
39335 .Test(xnn_f32_igemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39336 }
39337 }
39338
39339 TEST(F32_IGEMM_4X8__PSIMD_LOADSPLAT, k_eq_1_subtile_n) {
39340 TEST_REQUIRES_PSIMD;
39341 for (uint32_t n = 1; n <= 8; n++) {
39342 GemmMicrokernelTester()
39343 .mr(4)
39344 .nr(8)
39345 .kr(1)
39346 .sr(1)
39347 .m(4)
39348 .n(n)
39349 .k(1)
39350 .iterations(1)
39351 .Test(xnn_f32_igemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39352 }
39353 }
39354
39355 TEST(F32_IGEMM_4X8__PSIMD_LOADSPLAT, k_gt_1) {
39356 TEST_REQUIRES_PSIMD;
39357 for (size_t k = 2; k < 10; k++) {
39358 GemmMicrokernelTester()
39359 .mr(4)
39360 .nr(8)
39361 .kr(1)
39362 .sr(1)
39363 .m(4)
39364 .n(8)
39365 .k(k)
39366 .Test(xnn_f32_igemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39367 }
39368 }
39369
39370 TEST(F32_IGEMM_4X8__PSIMD_LOADSPLAT, k_gt_1_subtile) {
39371 TEST_REQUIRES_PSIMD;
39372 for (size_t k = 2; k < 10; k++) {
39373 for (uint32_t m = 1; m <= 4; m++) {
39374 for (uint32_t n = 1; n <= 8; n++) {
39375 GemmMicrokernelTester()
39376 .mr(4)
39377 .nr(8)
39378 .kr(1)
39379 .sr(1)
39380 .m(m)
39381 .n(n)
39382 .k(k)
39383 .iterations(1)
39384 .Test(xnn_f32_igemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39385 }
39386 }
39387 }
39388 }
39389
39390 TEST(F32_IGEMM_4X8__PSIMD_LOADSPLAT, n_gt_8) {
39391 TEST_REQUIRES_PSIMD;
39392 for (uint32_t n = 9; n < 16; n++) {
39393 for (size_t k = 1; k <= 5; k += 2) {
39394 GemmMicrokernelTester()
39395 .mr(4)
39396 .nr(8)
39397 .kr(1)
39398 .sr(1)
39399 .m(4)
39400 .n(8)
39401 .k(k)
39402 .Test(xnn_f32_igemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39403 }
39404 }
39405 }
39406
39407 TEST(F32_IGEMM_4X8__PSIMD_LOADSPLAT, n_gt_8_strided_cn) {
39408 TEST_REQUIRES_PSIMD;
39409 for (uint32_t n = 9; n < 16; n++) {
39410 for (size_t k = 1; k <= 5; k += 2) {
39411 GemmMicrokernelTester()
39412 .mr(4)
39413 .nr(8)
39414 .kr(1)
39415 .sr(1)
39416 .m(4)
39417 .n(8)
39418 .k(k)
39419 .cn_stride(11)
39420 .Test(xnn_f32_igemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39421 }
39422 }
39423 }
39424
39425 TEST(F32_IGEMM_4X8__PSIMD_LOADSPLAT, n_gt_8_subtile) {
39426 TEST_REQUIRES_PSIMD;
39427 for (uint32_t n = 9; n < 16; n++) {
39428 for (size_t k = 1; k <= 5; k += 2) {
39429 for (uint32_t m = 1; m <= 4; m++) {
39430 GemmMicrokernelTester()
39431 .mr(4)
39432 .nr(8)
39433 .kr(1)
39434 .sr(1)
39435 .m(m)
39436 .n(n)
39437 .k(k)
39438 .iterations(1)
39439 .Test(xnn_f32_igemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39440 }
39441 }
39442 }
39443 }
39444
39445 TEST(F32_IGEMM_4X8__PSIMD_LOADSPLAT, n_div_8) {
39446 TEST_REQUIRES_PSIMD;
39447 for (uint32_t n = 16; n <= 24; n += 8) {
39448 for (size_t k = 1; k <= 5; k += 2) {
39449 GemmMicrokernelTester()
39450 .mr(4)
39451 .nr(8)
39452 .kr(1)
39453 .sr(1)
39454 .m(4)
39455 .n(8)
39456 .k(k)
39457 .Test(xnn_f32_igemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39458 }
39459 }
39460 }
39461
39462 TEST(F32_IGEMM_4X8__PSIMD_LOADSPLAT, n_div_8_strided_cn) {
39463 TEST_REQUIRES_PSIMD;
39464 for (uint32_t n = 16; n <= 24; n += 8) {
39465 for (size_t k = 1; k <= 5; k += 2) {
39466 GemmMicrokernelTester()
39467 .mr(4)
39468 .nr(8)
39469 .kr(1)
39470 .sr(1)
39471 .m(4)
39472 .n(n)
39473 .k(k)
39474 .cn_stride(11)
39475 .Test(xnn_f32_igemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39476 }
39477 }
39478 }
39479
39480 TEST(F32_IGEMM_4X8__PSIMD_LOADSPLAT, n_div_8_subtile) {
39481 TEST_REQUIRES_PSIMD;
39482 for (uint32_t n = 16; n <= 24; n += 8) {
39483 for (size_t k = 1; k <= 5; k += 2) {
39484 for (uint32_t m = 1; m <= 4; m++) {
39485 GemmMicrokernelTester()
39486 .mr(4)
39487 .nr(8)
39488 .kr(1)
39489 .sr(1)
39490 .m(m)
39491 .n(n)
39492 .k(k)
39493 .iterations(1)
39494 .Test(xnn_f32_igemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39495 }
39496 }
39497 }
39498 }
39499
39500 TEST(F32_IGEMM_4X8__PSIMD_LOADSPLAT, small_kernel) {
39501 TEST_REQUIRES_PSIMD;
39502 for (size_t k = 1; k <= 5; k += 2) {
39503 GemmMicrokernelTester()
39504 .mr(4)
39505 .nr(8)
39506 .kr(1)
39507 .sr(1)
39508 .m(4)
39509 .n(8)
39510 .k(k)
39511 .ks(3)
39512 .Test(xnn_f32_igemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39513 }
39514 }
39515
39516 TEST(F32_IGEMM_4X8__PSIMD_LOADSPLAT, small_kernel_subtile) {
39517 TEST_REQUIRES_PSIMD;
39518 for (size_t k = 1; k <= 5; k += 2) {
39519 for (uint32_t m = 1; m <= 4; m++) {
39520 for (uint32_t n = 1; n <= 8; n++) {
39521 GemmMicrokernelTester()
39522 .mr(4)
39523 .nr(8)
39524 .kr(1)
39525 .sr(1)
39526 .m(m)
39527 .n(n)
39528 .k(k)
39529 .ks(3)
39530 .iterations(1)
39531 .Test(xnn_f32_igemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39532 }
39533 }
39534 }
39535 }
39536
39537 TEST(F32_IGEMM_4X8__PSIMD_LOADSPLAT, n_gt_8_small_kernel) {
39538 TEST_REQUIRES_PSIMD;
39539 for (uint32_t n = 9; n < 16; n++) {
39540 for (size_t k = 1; k <= 5; k += 2) {
39541 GemmMicrokernelTester()
39542 .mr(4)
39543 .nr(8)
39544 .kr(1)
39545 .sr(1)
39546 .m(4)
39547 .n(8)
39548 .k(k)
39549 .ks(3)
39550 .Test(xnn_f32_igemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39551 }
39552 }
39553 }
39554
39555 TEST(F32_IGEMM_4X8__PSIMD_LOADSPLAT, n_div_8_small_kernel) {
39556 TEST_REQUIRES_PSIMD;
39557 for (uint32_t n = 16; n <= 24; n += 8) {
39558 for (size_t k = 1; k <= 5; k += 2) {
39559 GemmMicrokernelTester()
39560 .mr(4)
39561 .nr(8)
39562 .kr(1)
39563 .sr(1)
39564 .m(4)
39565 .n(8)
39566 .k(k)
39567 .ks(3)
39568 .Test(xnn_f32_igemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39569 }
39570 }
39571 }
39572
39573 TEST(F32_IGEMM_4X8__PSIMD_LOADSPLAT, strided_cm_subtile) {
39574 TEST_REQUIRES_PSIMD;
39575 for (size_t k = 1; k <= 5; k += 2) {
39576 for (uint32_t m = 1; m <= 4; m++) {
39577 for (uint32_t n = 1; n <= 8; n++) {
39578 GemmMicrokernelTester()
39579 .mr(4)
39580 .nr(8)
39581 .kr(1)
39582 .sr(1)
39583 .m(m)
39584 .n(n)
39585 .k(k)
39586 .cm_stride(11)
39587 .iterations(1)
39588 .Test(xnn_f32_igemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39589 }
39590 }
39591 }
39592 }
39593
39594 TEST(F32_IGEMM_4X8__PSIMD_LOADSPLAT, a_offset) {
39595 TEST_REQUIRES_PSIMD;
39596 for (size_t k = 1; k <= 5; k += 2) {
39597 GemmMicrokernelTester()
39598 .mr(4)
39599 .nr(8)
39600 .kr(1)
39601 .sr(1)
39602 .m(4)
39603 .n(8)
39604 .k(k)
39605 .ks(3)
39606 .a_offset(23)
39607 .Test(xnn_f32_igemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39608 }
39609 }
39610
39611 TEST(F32_IGEMM_4X8__PSIMD_LOADSPLAT, zero) {
39612 TEST_REQUIRES_PSIMD;
39613 for (uint32_t mz = 0; mz < 4; mz++) {
39614 for (size_t k = 1; k <= 5; k += 2) {
39615 GemmMicrokernelTester()
39616 .mr(4)
39617 .nr(8)
39618 .kr(1)
39619 .sr(1)
39620 .m(4)
39621 .n(8)
39622 .k(k)
39623 .ks(3)
39624 .a_offset(23)
39625 .zero_index(mz)
39626 .Test(xnn_f32_igemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39627 }
39628 }
39629 }
39630
39631 TEST(F32_IGEMM_4X8__PSIMD_LOADSPLAT, qmin) {
39632 TEST_REQUIRES_PSIMD;
39633 GemmMicrokernelTester()
39634 .mr(4)
39635 .nr(8)
39636 .kr(1)
39637 .sr(1)
39638 .m(4)
39639 .n(8)
39640 .k(1)
39641 .qmin(128)
39642 .Test(xnn_f32_igemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39643 }
39644
39645 TEST(F32_IGEMM_4X8__PSIMD_LOADSPLAT, qmax) {
39646 TEST_REQUIRES_PSIMD;
39647 GemmMicrokernelTester()
39648 .mr(4)
39649 .nr(8)
39650 .kr(1)
39651 .sr(1)
39652 .m(4)
39653 .n(8)
39654 .k(1)
39655 .qmax(128)
39656 .Test(xnn_f32_igemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39657 }
39658
39659 TEST(F32_IGEMM_4X8__PSIMD_LOADSPLAT, strided_cm) {
39660 TEST_REQUIRES_PSIMD;
39661 GemmMicrokernelTester()
39662 .mr(4)
39663 .nr(8)
39664 .kr(1)
39665 .sr(1)
39666 .m(4)
39667 .n(8)
39668 .k(1)
39669 .cm_stride(11)
39670 .Test(xnn_f32_igemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39671 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -070039672#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
XNNPACK Teamb455b122019-09-27 18:10:33 -070039673
39674
Marat Dukhan1dadbf72019-10-01 10:46:20 -070039675#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
XNNPACK Teamb455b122019-09-27 18:10:33 -070039676 TEST(F32_IGEMM_6X8__PSIMD_LOADSPLAT, k_eq_1) {
39677 TEST_REQUIRES_PSIMD;
39678 GemmMicrokernelTester()
39679 .mr(6)
39680 .nr(8)
39681 .kr(1)
39682 .sr(1)
39683 .m(6)
39684 .n(8)
39685 .k(1)
39686 .Test(xnn_f32_igemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39687 }
39688
39689 TEST(F32_IGEMM_6X8__PSIMD_LOADSPLAT, strided_cn) {
39690 TEST_REQUIRES_PSIMD;
39691 GemmMicrokernelTester()
39692 .mr(6)
39693 .nr(8)
39694 .kr(1)
39695 .sr(1)
39696 .m(6)
39697 .n(8)
39698 .k(1)
39699 .cn_stride(11)
39700 .Test(xnn_f32_igemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39701 }
39702
39703 TEST(F32_IGEMM_6X8__PSIMD_LOADSPLAT, k_eq_1_subtile) {
39704 TEST_REQUIRES_PSIMD;
39705 for (uint32_t m = 1; m <= 6; m++) {
39706 for (uint32_t n = 1; n <= 8; n++) {
39707 GemmMicrokernelTester()
39708 .mr(6)
39709 .nr(8)
39710 .kr(1)
39711 .sr(1)
39712 .m(m)
39713 .n(n)
39714 .k(1)
39715 .iterations(1)
39716 .Test(xnn_f32_igemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39717 }
39718 }
39719 }
39720
39721 TEST(F32_IGEMM_6X8__PSIMD_LOADSPLAT, k_eq_1_subtile_m) {
39722 TEST_REQUIRES_PSIMD;
39723 for (uint32_t m = 1; m <= 6; m++) {
39724 GemmMicrokernelTester()
39725 .mr(6)
39726 .nr(8)
39727 .kr(1)
39728 .sr(1)
39729 .m(m)
39730 .n(8)
39731 .k(1)
39732 .iterations(1)
39733 .Test(xnn_f32_igemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39734 }
39735 }
39736
39737 TEST(F32_IGEMM_6X8__PSIMD_LOADSPLAT, k_eq_1_subtile_n) {
39738 TEST_REQUIRES_PSIMD;
39739 for (uint32_t n = 1; n <= 8; n++) {
39740 GemmMicrokernelTester()
39741 .mr(6)
39742 .nr(8)
39743 .kr(1)
39744 .sr(1)
39745 .m(6)
39746 .n(n)
39747 .k(1)
39748 .iterations(1)
39749 .Test(xnn_f32_igemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39750 }
39751 }
39752
39753 TEST(F32_IGEMM_6X8__PSIMD_LOADSPLAT, k_gt_1) {
39754 TEST_REQUIRES_PSIMD;
39755 for (size_t k = 2; k < 10; k++) {
39756 GemmMicrokernelTester()
39757 .mr(6)
39758 .nr(8)
39759 .kr(1)
39760 .sr(1)
39761 .m(6)
39762 .n(8)
39763 .k(k)
39764 .Test(xnn_f32_igemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39765 }
39766 }
39767
39768 TEST(F32_IGEMM_6X8__PSIMD_LOADSPLAT, k_gt_1_subtile) {
39769 TEST_REQUIRES_PSIMD;
39770 for (size_t k = 2; k < 10; k++) {
39771 for (uint32_t m = 1; m <= 6; m++) {
39772 for (uint32_t n = 1; n <= 8; n++) {
39773 GemmMicrokernelTester()
39774 .mr(6)
39775 .nr(8)
39776 .kr(1)
39777 .sr(1)
39778 .m(m)
39779 .n(n)
39780 .k(k)
39781 .iterations(1)
39782 .Test(xnn_f32_igemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39783 }
39784 }
39785 }
39786 }
39787
39788 TEST(F32_IGEMM_6X8__PSIMD_LOADSPLAT, n_gt_8) {
39789 TEST_REQUIRES_PSIMD;
39790 for (uint32_t n = 9; n < 16; n++) {
39791 for (size_t k = 1; k <= 5; k += 2) {
39792 GemmMicrokernelTester()
39793 .mr(6)
39794 .nr(8)
39795 .kr(1)
39796 .sr(1)
39797 .m(6)
39798 .n(8)
39799 .k(k)
39800 .Test(xnn_f32_igemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39801 }
39802 }
39803 }
39804
39805 TEST(F32_IGEMM_6X8__PSIMD_LOADSPLAT, n_gt_8_strided_cn) {
39806 TEST_REQUIRES_PSIMD;
39807 for (uint32_t n = 9; n < 16; n++) {
39808 for (size_t k = 1; k <= 5; k += 2) {
39809 GemmMicrokernelTester()
39810 .mr(6)
39811 .nr(8)
39812 .kr(1)
39813 .sr(1)
39814 .m(6)
39815 .n(8)
39816 .k(k)
39817 .cn_stride(11)
39818 .Test(xnn_f32_igemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39819 }
39820 }
39821 }
39822
39823 TEST(F32_IGEMM_6X8__PSIMD_LOADSPLAT, n_gt_8_subtile) {
39824 TEST_REQUIRES_PSIMD;
39825 for (uint32_t n = 9; n < 16; n++) {
39826 for (size_t k = 1; k <= 5; k += 2) {
39827 for (uint32_t m = 1; m <= 6; m++) {
39828 GemmMicrokernelTester()
39829 .mr(6)
39830 .nr(8)
39831 .kr(1)
39832 .sr(1)
39833 .m(m)
39834 .n(n)
39835 .k(k)
39836 .iterations(1)
39837 .Test(xnn_f32_igemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39838 }
39839 }
39840 }
39841 }
39842
39843 TEST(F32_IGEMM_6X8__PSIMD_LOADSPLAT, n_div_8) {
39844 TEST_REQUIRES_PSIMD;
39845 for (uint32_t n = 16; n <= 24; n += 8) {
39846 for (size_t k = 1; k <= 5; k += 2) {
39847 GemmMicrokernelTester()
39848 .mr(6)
39849 .nr(8)
39850 .kr(1)
39851 .sr(1)
39852 .m(6)
39853 .n(8)
39854 .k(k)
39855 .Test(xnn_f32_igemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39856 }
39857 }
39858 }
39859
39860 TEST(F32_IGEMM_6X8__PSIMD_LOADSPLAT, n_div_8_strided_cn) {
39861 TEST_REQUIRES_PSIMD;
39862 for (uint32_t n = 16; n <= 24; n += 8) {
39863 for (size_t k = 1; k <= 5; k += 2) {
39864 GemmMicrokernelTester()
39865 .mr(6)
39866 .nr(8)
39867 .kr(1)
39868 .sr(1)
39869 .m(6)
39870 .n(n)
39871 .k(k)
39872 .cn_stride(11)
39873 .Test(xnn_f32_igemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39874 }
39875 }
39876 }
39877
39878 TEST(F32_IGEMM_6X8__PSIMD_LOADSPLAT, n_div_8_subtile) {
39879 TEST_REQUIRES_PSIMD;
39880 for (uint32_t n = 16; n <= 24; n += 8) {
39881 for (size_t k = 1; k <= 5; k += 2) {
39882 for (uint32_t m = 1; m <= 6; m++) {
39883 GemmMicrokernelTester()
39884 .mr(6)
39885 .nr(8)
39886 .kr(1)
39887 .sr(1)
39888 .m(m)
39889 .n(n)
39890 .k(k)
39891 .iterations(1)
39892 .Test(xnn_f32_igemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39893 }
39894 }
39895 }
39896 }
39897
39898 TEST(F32_IGEMM_6X8__PSIMD_LOADSPLAT, small_kernel) {
39899 TEST_REQUIRES_PSIMD;
39900 for (size_t k = 1; k <= 5; k += 2) {
39901 GemmMicrokernelTester()
39902 .mr(6)
39903 .nr(8)
39904 .kr(1)
39905 .sr(1)
39906 .m(6)
39907 .n(8)
39908 .k(k)
39909 .ks(3)
39910 .Test(xnn_f32_igemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39911 }
39912 }
39913
39914 TEST(F32_IGEMM_6X8__PSIMD_LOADSPLAT, small_kernel_subtile) {
39915 TEST_REQUIRES_PSIMD;
39916 for (size_t k = 1; k <= 5; k += 2) {
39917 for (uint32_t m = 1; m <= 6; m++) {
39918 for (uint32_t n = 1; n <= 8; n++) {
39919 GemmMicrokernelTester()
39920 .mr(6)
39921 .nr(8)
39922 .kr(1)
39923 .sr(1)
39924 .m(m)
39925 .n(n)
39926 .k(k)
39927 .ks(3)
39928 .iterations(1)
39929 .Test(xnn_f32_igemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39930 }
39931 }
39932 }
39933 }
39934
39935 TEST(F32_IGEMM_6X8__PSIMD_LOADSPLAT, n_gt_8_small_kernel) {
39936 TEST_REQUIRES_PSIMD;
39937 for (uint32_t n = 9; n < 16; n++) {
39938 for (size_t k = 1; k <= 5; k += 2) {
39939 GemmMicrokernelTester()
39940 .mr(6)
39941 .nr(8)
39942 .kr(1)
39943 .sr(1)
39944 .m(6)
39945 .n(8)
39946 .k(k)
39947 .ks(3)
39948 .Test(xnn_f32_igemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39949 }
39950 }
39951 }
39952
39953 TEST(F32_IGEMM_6X8__PSIMD_LOADSPLAT, n_div_8_small_kernel) {
39954 TEST_REQUIRES_PSIMD;
39955 for (uint32_t n = 16; n <= 24; n += 8) {
39956 for (size_t k = 1; k <= 5; k += 2) {
39957 GemmMicrokernelTester()
39958 .mr(6)
39959 .nr(8)
39960 .kr(1)
39961 .sr(1)
39962 .m(6)
39963 .n(8)
39964 .k(k)
39965 .ks(3)
39966 .Test(xnn_f32_igemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39967 }
39968 }
39969 }
39970
39971 TEST(F32_IGEMM_6X8__PSIMD_LOADSPLAT, strided_cm_subtile) {
39972 TEST_REQUIRES_PSIMD;
39973 for (size_t k = 1; k <= 5; k += 2) {
39974 for (uint32_t m = 1; m <= 6; m++) {
39975 for (uint32_t n = 1; n <= 8; n++) {
39976 GemmMicrokernelTester()
39977 .mr(6)
39978 .nr(8)
39979 .kr(1)
39980 .sr(1)
39981 .m(m)
39982 .n(n)
39983 .k(k)
39984 .cm_stride(11)
39985 .iterations(1)
39986 .Test(xnn_f32_igemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39987 }
39988 }
39989 }
39990 }
39991
39992 TEST(F32_IGEMM_6X8__PSIMD_LOADSPLAT, a_offset) {
39993 TEST_REQUIRES_PSIMD;
39994 for (size_t k = 1; k <= 5; k += 2) {
39995 GemmMicrokernelTester()
39996 .mr(6)
39997 .nr(8)
39998 .kr(1)
39999 .sr(1)
40000 .m(6)
40001 .n(8)
40002 .k(k)
40003 .ks(3)
40004 .a_offset(37)
40005 .Test(xnn_f32_igemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
40006 }
40007 }
40008
40009 TEST(F32_IGEMM_6X8__PSIMD_LOADSPLAT, zero) {
40010 TEST_REQUIRES_PSIMD;
40011 for (uint32_t mz = 0; mz < 6; mz++) {
40012 for (size_t k = 1; k <= 5; k += 2) {
40013 GemmMicrokernelTester()
40014 .mr(6)
40015 .nr(8)
40016 .kr(1)
40017 .sr(1)
40018 .m(6)
40019 .n(8)
40020 .k(k)
40021 .ks(3)
40022 .a_offset(37)
40023 .zero_index(mz)
40024 .Test(xnn_f32_igemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
40025 }
40026 }
40027 }
40028
40029 TEST(F32_IGEMM_6X8__PSIMD_LOADSPLAT, qmin) {
40030 TEST_REQUIRES_PSIMD;
40031 GemmMicrokernelTester()
40032 .mr(6)
40033 .nr(8)
40034 .kr(1)
40035 .sr(1)
40036 .m(6)
40037 .n(8)
40038 .k(1)
40039 .qmin(128)
40040 .Test(xnn_f32_igemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
40041 }
40042
40043 TEST(F32_IGEMM_6X8__PSIMD_LOADSPLAT, qmax) {
40044 TEST_REQUIRES_PSIMD;
40045 GemmMicrokernelTester()
40046 .mr(6)
40047 .nr(8)
40048 .kr(1)
40049 .sr(1)
40050 .m(6)
40051 .n(8)
40052 .k(1)
40053 .qmax(128)
40054 .Test(xnn_f32_igemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
40055 }
40056
40057 TEST(F32_IGEMM_6X8__PSIMD_LOADSPLAT, strided_cm) {
40058 TEST_REQUIRES_PSIMD;
40059 GemmMicrokernelTester()
40060 .mr(6)
40061 .nr(8)
40062 .kr(1)
40063 .sr(1)
40064 .m(6)
40065 .n(8)
40066 .k(1)
40067 .cm_stride(11)
40068 .Test(xnn_f32_igemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
40069 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -070040070#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
XNNPACK Teamb455b122019-09-27 18:10:33 -070040071
40072
Marat Dukhan1dadbf72019-10-01 10:46:20 -070040073#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
XNNPACK Teamb455b122019-09-27 18:10:33 -070040074 TEST(F32_IGEMM_1X8__PSIMD_SPLAT, k_eq_4) {
40075 TEST_REQUIRES_PSIMD;
40076 GemmMicrokernelTester()
40077 .mr(1)
40078 .nr(8)
40079 .kr(1)
40080 .sr(1)
40081 .m(1)
40082 .n(8)
40083 .k(4)
40084 .Test(xnn_f32_igemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40085 }
40086
40087 TEST(F32_IGEMM_1X8__PSIMD_SPLAT, strided_cn) {
40088 TEST_REQUIRES_PSIMD;
40089 GemmMicrokernelTester()
40090 .mr(1)
40091 .nr(8)
40092 .kr(1)
40093 .sr(1)
40094 .m(1)
40095 .n(8)
40096 .k(4)
40097 .cn_stride(11)
40098 .Test(xnn_f32_igemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40099 }
40100
40101 TEST(F32_IGEMM_1X8__PSIMD_SPLAT, k_eq_4_subtile) {
40102 TEST_REQUIRES_PSIMD;
40103 for (uint32_t m = 1; m <= 1; m++) {
40104 for (uint32_t n = 1; n <= 8; n++) {
40105 GemmMicrokernelTester()
40106 .mr(1)
40107 .nr(8)
40108 .kr(1)
40109 .sr(1)
40110 .m(m)
40111 .n(n)
40112 .k(4)
40113 .iterations(1)
40114 .Test(xnn_f32_igemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40115 }
40116 }
40117 }
40118
40119 TEST(F32_IGEMM_1X8__PSIMD_SPLAT, k_eq_4_subtile_m) {
40120 TEST_REQUIRES_PSIMD;
40121 for (uint32_t m = 1; m <= 1; m++) {
40122 GemmMicrokernelTester()
40123 .mr(1)
40124 .nr(8)
40125 .kr(1)
40126 .sr(1)
40127 .m(m)
40128 .n(8)
40129 .k(4)
40130 .iterations(1)
40131 .Test(xnn_f32_igemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40132 }
40133 }
40134
40135 TEST(F32_IGEMM_1X8__PSIMD_SPLAT, k_eq_4_subtile_n) {
40136 TEST_REQUIRES_PSIMD;
40137 for (uint32_t n = 1; n <= 8; n++) {
40138 GemmMicrokernelTester()
40139 .mr(1)
40140 .nr(8)
40141 .kr(1)
40142 .sr(1)
40143 .m(1)
40144 .n(n)
40145 .k(4)
40146 .iterations(1)
40147 .Test(xnn_f32_igemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40148 }
40149 }
40150
40151 TEST(F32_IGEMM_1X8__PSIMD_SPLAT, k_lt_4) {
40152 TEST_REQUIRES_PSIMD;
40153 for (size_t k = 1; k < 4; k++) {
40154 GemmMicrokernelTester()
40155 .mr(1)
40156 .nr(8)
40157 .kr(1)
40158 .sr(1)
40159 .m(1)
40160 .n(8)
40161 .k(k)
40162 .Test(xnn_f32_igemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40163 }
40164 }
40165
40166 TEST(F32_IGEMM_1X8__PSIMD_SPLAT, k_lt_4_subtile) {
40167 TEST_REQUIRES_PSIMD;
40168 for (size_t k = 1; k < 4; k++) {
40169 for (uint32_t m = 1; m <= 1; m++) {
40170 for (uint32_t n = 1; n <= 8; n++) {
40171 GemmMicrokernelTester()
40172 .mr(1)
40173 .nr(8)
40174 .kr(1)
40175 .sr(1)
40176 .m(m)
40177 .n(n)
40178 .k(k)
40179 .iterations(1)
40180 .Test(xnn_f32_igemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40181 }
40182 }
40183 }
40184 }
40185
40186 TEST(F32_IGEMM_1X8__PSIMD_SPLAT, k_gt_4) {
40187 TEST_REQUIRES_PSIMD;
40188 for (size_t k = 5; k < 8; k++) {
40189 GemmMicrokernelTester()
40190 .mr(1)
40191 .nr(8)
40192 .kr(1)
40193 .sr(1)
40194 .m(1)
40195 .n(8)
40196 .k(k)
40197 .Test(xnn_f32_igemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40198 }
40199 }
40200
40201 TEST(F32_IGEMM_1X8__PSIMD_SPLAT, k_gt_4_subtile) {
40202 TEST_REQUIRES_PSIMD;
40203 for (size_t k = 5; k < 8; k++) {
40204 for (uint32_t m = 1; m <= 1; m++) {
40205 for (uint32_t n = 1; n <= 8; n++) {
40206 GemmMicrokernelTester()
40207 .mr(1)
40208 .nr(8)
40209 .kr(1)
40210 .sr(1)
40211 .m(m)
40212 .n(n)
40213 .k(k)
40214 .iterations(1)
40215 .Test(xnn_f32_igemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40216 }
40217 }
40218 }
40219 }
40220
40221 TEST(F32_IGEMM_1X8__PSIMD_SPLAT, k_div_4) {
40222 TEST_REQUIRES_PSIMD;
40223 for (size_t k = 8; k <= 40; k += 4) {
40224 GemmMicrokernelTester()
40225 .mr(1)
40226 .nr(8)
40227 .kr(1)
40228 .sr(1)
40229 .m(1)
40230 .n(8)
40231 .k(k)
40232 .Test(xnn_f32_igemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40233 }
40234 }
40235
40236 TEST(F32_IGEMM_1X8__PSIMD_SPLAT, k_div_4_subtile) {
40237 TEST_REQUIRES_PSIMD;
40238 for (size_t k = 8; k <= 40; k += 4) {
40239 for (uint32_t m = 1; m <= 1; m++) {
40240 for (uint32_t n = 1; n <= 8; n++) {
40241 GemmMicrokernelTester()
40242 .mr(1)
40243 .nr(8)
40244 .kr(1)
40245 .sr(1)
40246 .m(m)
40247 .n(n)
40248 .k(k)
40249 .iterations(1)
40250 .Test(xnn_f32_igemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40251 }
40252 }
40253 }
40254 }
40255
40256 TEST(F32_IGEMM_1X8__PSIMD_SPLAT, n_gt_8) {
40257 TEST_REQUIRES_PSIMD;
40258 for (uint32_t n = 9; n < 16; n++) {
40259 for (size_t k = 1; k <= 20; k += 5) {
40260 GemmMicrokernelTester()
40261 .mr(1)
40262 .nr(8)
40263 .kr(1)
40264 .sr(1)
40265 .m(1)
40266 .n(8)
40267 .k(k)
40268 .Test(xnn_f32_igemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40269 }
40270 }
40271 }
40272
40273 TEST(F32_IGEMM_1X8__PSIMD_SPLAT, n_gt_8_strided_cn) {
40274 TEST_REQUIRES_PSIMD;
40275 for (uint32_t n = 9; n < 16; n++) {
40276 for (size_t k = 1; k <= 20; k += 5) {
40277 GemmMicrokernelTester()
40278 .mr(1)
40279 .nr(8)
40280 .kr(1)
40281 .sr(1)
40282 .m(1)
40283 .n(8)
40284 .k(k)
40285 .cn_stride(11)
40286 .Test(xnn_f32_igemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40287 }
40288 }
40289 }
40290
40291 TEST(F32_IGEMM_1X8__PSIMD_SPLAT, n_gt_8_subtile) {
40292 TEST_REQUIRES_PSIMD;
40293 for (uint32_t n = 9; n < 16; n++) {
40294 for (size_t k = 1; k <= 20; k += 5) {
40295 for (uint32_t m = 1; m <= 1; m++) {
40296 GemmMicrokernelTester()
40297 .mr(1)
40298 .nr(8)
40299 .kr(1)
40300 .sr(1)
40301 .m(m)
40302 .n(n)
40303 .k(k)
40304 .iterations(1)
40305 .Test(xnn_f32_igemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40306 }
40307 }
40308 }
40309 }
40310
40311 TEST(F32_IGEMM_1X8__PSIMD_SPLAT, n_div_8) {
40312 TEST_REQUIRES_PSIMD;
40313 for (uint32_t n = 16; n <= 24; n += 8) {
40314 for (size_t k = 1; k <= 20; k += 5) {
40315 GemmMicrokernelTester()
40316 .mr(1)
40317 .nr(8)
40318 .kr(1)
40319 .sr(1)
40320 .m(1)
40321 .n(8)
40322 .k(k)
40323 .Test(xnn_f32_igemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40324 }
40325 }
40326 }
40327
40328 TEST(F32_IGEMM_1X8__PSIMD_SPLAT, n_div_8_strided_cn) {
40329 TEST_REQUIRES_PSIMD;
40330 for (uint32_t n = 16; n <= 24; n += 8) {
40331 for (size_t k = 1; k <= 20; k += 5) {
40332 GemmMicrokernelTester()
40333 .mr(1)
40334 .nr(8)
40335 .kr(1)
40336 .sr(1)
40337 .m(1)
40338 .n(n)
40339 .k(k)
40340 .cn_stride(11)
40341 .Test(xnn_f32_igemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40342 }
40343 }
40344 }
40345
40346 TEST(F32_IGEMM_1X8__PSIMD_SPLAT, n_div_8_subtile) {
40347 TEST_REQUIRES_PSIMD;
40348 for (uint32_t n = 16; n <= 24; n += 8) {
40349 for (size_t k = 1; k <= 20; k += 5) {
40350 for (uint32_t m = 1; m <= 1; m++) {
40351 GemmMicrokernelTester()
40352 .mr(1)
40353 .nr(8)
40354 .kr(1)
40355 .sr(1)
40356 .m(m)
40357 .n(n)
40358 .k(k)
40359 .iterations(1)
40360 .Test(xnn_f32_igemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40361 }
40362 }
40363 }
40364 }
40365
40366 TEST(F32_IGEMM_1X8__PSIMD_SPLAT, small_kernel) {
40367 TEST_REQUIRES_PSIMD;
40368 for (size_t k = 1; k <= 20; k += 5) {
40369 GemmMicrokernelTester()
40370 .mr(1)
40371 .nr(8)
40372 .kr(1)
40373 .sr(1)
40374 .m(1)
40375 .n(8)
40376 .k(k)
40377 .ks(3)
40378 .Test(xnn_f32_igemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40379 }
40380 }
40381
40382 TEST(F32_IGEMM_1X8__PSIMD_SPLAT, small_kernel_subtile) {
40383 TEST_REQUIRES_PSIMD;
40384 for (size_t k = 1; k <= 20; k += 5) {
40385 for (uint32_t m = 1; m <= 1; m++) {
40386 for (uint32_t n = 1; n <= 8; n++) {
40387 GemmMicrokernelTester()
40388 .mr(1)
40389 .nr(8)
40390 .kr(1)
40391 .sr(1)
40392 .m(m)
40393 .n(n)
40394 .k(k)
40395 .ks(3)
40396 .iterations(1)
40397 .Test(xnn_f32_igemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40398 }
40399 }
40400 }
40401 }
40402
40403 TEST(F32_IGEMM_1X8__PSIMD_SPLAT, n_gt_8_small_kernel) {
40404 TEST_REQUIRES_PSIMD;
40405 for (uint32_t n = 9; n < 16; n++) {
40406 for (size_t k = 1; k <= 20; k += 5) {
40407 GemmMicrokernelTester()
40408 .mr(1)
40409 .nr(8)
40410 .kr(1)
40411 .sr(1)
40412 .m(1)
40413 .n(8)
40414 .k(k)
40415 .ks(3)
40416 .Test(xnn_f32_igemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40417 }
40418 }
40419 }
40420
40421 TEST(F32_IGEMM_1X8__PSIMD_SPLAT, n_div_8_small_kernel) {
40422 TEST_REQUIRES_PSIMD;
40423 for (uint32_t n = 16; n <= 24; n += 8) {
40424 for (size_t k = 1; k <= 20; k += 5) {
40425 GemmMicrokernelTester()
40426 .mr(1)
40427 .nr(8)
40428 .kr(1)
40429 .sr(1)
40430 .m(1)
40431 .n(8)
40432 .k(k)
40433 .ks(3)
40434 .Test(xnn_f32_igemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40435 }
40436 }
40437 }
40438
40439 TEST(F32_IGEMM_1X8__PSIMD_SPLAT, strided_cm_subtile) {
40440 TEST_REQUIRES_PSIMD;
40441 for (size_t k = 1; k <= 20; k += 5) {
40442 for (uint32_t m = 1; m <= 1; m++) {
40443 for (uint32_t n = 1; n <= 8; n++) {
40444 GemmMicrokernelTester()
40445 .mr(1)
40446 .nr(8)
40447 .kr(1)
40448 .sr(1)
40449 .m(m)
40450 .n(n)
40451 .k(k)
40452 .cm_stride(11)
40453 .iterations(1)
40454 .Test(xnn_f32_igemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40455 }
40456 }
40457 }
40458 }
40459
40460 TEST(F32_IGEMM_1X8__PSIMD_SPLAT, a_offset) {
40461 TEST_REQUIRES_PSIMD;
40462 for (size_t k = 1; k <= 20; k += 5) {
40463 GemmMicrokernelTester()
40464 .mr(1)
40465 .nr(8)
40466 .kr(1)
40467 .sr(1)
40468 .m(1)
40469 .n(8)
40470 .k(k)
40471 .ks(3)
40472 .a_offset(23)
40473 .Test(xnn_f32_igemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40474 }
40475 }
40476
40477 TEST(F32_IGEMM_1X8__PSIMD_SPLAT, zero) {
40478 TEST_REQUIRES_PSIMD;
40479 for (uint32_t mz = 0; mz < 1; mz++) {
40480 for (size_t k = 1; k <= 20; k += 5) {
40481 GemmMicrokernelTester()
40482 .mr(1)
40483 .nr(8)
40484 .kr(1)
40485 .sr(1)
40486 .m(1)
40487 .n(8)
40488 .k(k)
40489 .ks(3)
40490 .a_offset(23)
40491 .zero_index(mz)
40492 .Test(xnn_f32_igemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40493 }
40494 }
40495 }
40496
40497 TEST(F32_IGEMM_1X8__PSIMD_SPLAT, qmin) {
40498 TEST_REQUIRES_PSIMD;
40499 GemmMicrokernelTester()
40500 .mr(1)
40501 .nr(8)
40502 .kr(1)
40503 .sr(1)
40504 .m(1)
40505 .n(8)
40506 .k(4)
40507 .qmin(128)
40508 .Test(xnn_f32_igemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40509 }
40510
40511 TEST(F32_IGEMM_1X8__PSIMD_SPLAT, qmax) {
40512 TEST_REQUIRES_PSIMD;
40513 GemmMicrokernelTester()
40514 .mr(1)
40515 .nr(8)
40516 .kr(1)
40517 .sr(1)
40518 .m(1)
40519 .n(8)
40520 .k(4)
40521 .qmax(128)
40522 .Test(xnn_f32_igemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40523 }
40524
40525 TEST(F32_IGEMM_1X8__PSIMD_SPLAT, strided_cm) {
40526 TEST_REQUIRES_PSIMD;
40527 GemmMicrokernelTester()
40528 .mr(1)
40529 .nr(8)
40530 .kr(1)
40531 .sr(1)
40532 .m(1)
40533 .n(8)
40534 .k(4)
40535 .cm_stride(11)
40536 .Test(xnn_f32_igemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40537 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -070040538#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
XNNPACK Teamb455b122019-09-27 18:10:33 -070040539
40540
Marat Dukhan1dadbf72019-10-01 10:46:20 -070040541#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
XNNPACK Teamb455b122019-09-27 18:10:33 -070040542 TEST(F32_IGEMM_4X8__PSIMD_SPLAT, k_eq_4) {
40543 TEST_REQUIRES_PSIMD;
40544 GemmMicrokernelTester()
40545 .mr(4)
40546 .nr(8)
40547 .kr(1)
40548 .sr(1)
40549 .m(4)
40550 .n(8)
40551 .k(4)
40552 .Test(xnn_f32_igemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40553 }
40554
40555 TEST(F32_IGEMM_4X8__PSIMD_SPLAT, strided_cn) {
40556 TEST_REQUIRES_PSIMD;
40557 GemmMicrokernelTester()
40558 .mr(4)
40559 .nr(8)
40560 .kr(1)
40561 .sr(1)
40562 .m(4)
40563 .n(8)
40564 .k(4)
40565 .cn_stride(11)
40566 .Test(xnn_f32_igemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40567 }
40568
40569 TEST(F32_IGEMM_4X8__PSIMD_SPLAT, k_eq_4_subtile) {
40570 TEST_REQUIRES_PSIMD;
40571 for (uint32_t m = 1; m <= 4; m++) {
40572 for (uint32_t n = 1; n <= 8; n++) {
40573 GemmMicrokernelTester()
40574 .mr(4)
40575 .nr(8)
40576 .kr(1)
40577 .sr(1)
40578 .m(m)
40579 .n(n)
40580 .k(4)
40581 .iterations(1)
40582 .Test(xnn_f32_igemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40583 }
40584 }
40585 }
40586
40587 TEST(F32_IGEMM_4X8__PSIMD_SPLAT, k_eq_4_subtile_m) {
40588 TEST_REQUIRES_PSIMD;
40589 for (uint32_t m = 1; m <= 4; m++) {
40590 GemmMicrokernelTester()
40591 .mr(4)
40592 .nr(8)
40593 .kr(1)
40594 .sr(1)
40595 .m(m)
40596 .n(8)
40597 .k(4)
40598 .iterations(1)
40599 .Test(xnn_f32_igemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40600 }
40601 }
40602
40603 TEST(F32_IGEMM_4X8__PSIMD_SPLAT, k_eq_4_subtile_n) {
40604 TEST_REQUIRES_PSIMD;
40605 for (uint32_t n = 1; n <= 8; n++) {
40606 GemmMicrokernelTester()
40607 .mr(4)
40608 .nr(8)
40609 .kr(1)
40610 .sr(1)
40611 .m(4)
40612 .n(n)
40613 .k(4)
40614 .iterations(1)
40615 .Test(xnn_f32_igemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40616 }
40617 }
40618
40619 TEST(F32_IGEMM_4X8__PSIMD_SPLAT, k_lt_4) {
40620 TEST_REQUIRES_PSIMD;
40621 for (size_t k = 1; k < 4; k++) {
40622 GemmMicrokernelTester()
40623 .mr(4)
40624 .nr(8)
40625 .kr(1)
40626 .sr(1)
40627 .m(4)
40628 .n(8)
40629 .k(k)
40630 .Test(xnn_f32_igemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40631 }
40632 }
40633
40634 TEST(F32_IGEMM_4X8__PSIMD_SPLAT, k_lt_4_subtile) {
40635 TEST_REQUIRES_PSIMD;
40636 for (size_t k = 1; k < 4; k++) {
40637 for (uint32_t m = 1; m <= 4; m++) {
40638 for (uint32_t n = 1; n <= 8; n++) {
40639 GemmMicrokernelTester()
40640 .mr(4)
40641 .nr(8)
40642 .kr(1)
40643 .sr(1)
40644 .m(m)
40645 .n(n)
40646 .k(k)
40647 .iterations(1)
40648 .Test(xnn_f32_igemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40649 }
40650 }
40651 }
40652 }
40653
40654 TEST(F32_IGEMM_4X8__PSIMD_SPLAT, k_gt_4) {
40655 TEST_REQUIRES_PSIMD;
40656 for (size_t k = 5; k < 8; k++) {
40657 GemmMicrokernelTester()
40658 .mr(4)
40659 .nr(8)
40660 .kr(1)
40661 .sr(1)
40662 .m(4)
40663 .n(8)
40664 .k(k)
40665 .Test(xnn_f32_igemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40666 }
40667 }
40668
40669 TEST(F32_IGEMM_4X8__PSIMD_SPLAT, k_gt_4_subtile) {
40670 TEST_REQUIRES_PSIMD;
40671 for (size_t k = 5; k < 8; k++) {
40672 for (uint32_t m = 1; m <= 4; m++) {
40673 for (uint32_t n = 1; n <= 8; n++) {
40674 GemmMicrokernelTester()
40675 .mr(4)
40676 .nr(8)
40677 .kr(1)
40678 .sr(1)
40679 .m(m)
40680 .n(n)
40681 .k(k)
40682 .iterations(1)
40683 .Test(xnn_f32_igemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40684 }
40685 }
40686 }
40687 }
40688
40689 TEST(F32_IGEMM_4X8__PSIMD_SPLAT, k_div_4) {
40690 TEST_REQUIRES_PSIMD;
40691 for (size_t k = 8; k <= 40; k += 4) {
40692 GemmMicrokernelTester()
40693 .mr(4)
40694 .nr(8)
40695 .kr(1)
40696 .sr(1)
40697 .m(4)
40698 .n(8)
40699 .k(k)
40700 .Test(xnn_f32_igemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40701 }
40702 }
40703
40704 TEST(F32_IGEMM_4X8__PSIMD_SPLAT, k_div_4_subtile) {
40705 TEST_REQUIRES_PSIMD;
40706 for (size_t k = 8; k <= 40; k += 4) {
40707 for (uint32_t m = 1; m <= 4; m++) {
40708 for (uint32_t n = 1; n <= 8; n++) {
40709 GemmMicrokernelTester()
40710 .mr(4)
40711 .nr(8)
40712 .kr(1)
40713 .sr(1)
40714 .m(m)
40715 .n(n)
40716 .k(k)
40717 .iterations(1)
40718 .Test(xnn_f32_igemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40719 }
40720 }
40721 }
40722 }
40723
40724 TEST(F32_IGEMM_4X8__PSIMD_SPLAT, n_gt_8) {
40725 TEST_REQUIRES_PSIMD;
40726 for (uint32_t n = 9; n < 16; n++) {
40727 for (size_t k = 1; k <= 20; k += 5) {
40728 GemmMicrokernelTester()
40729 .mr(4)
40730 .nr(8)
40731 .kr(1)
40732 .sr(1)
40733 .m(4)
40734 .n(8)
40735 .k(k)
40736 .Test(xnn_f32_igemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40737 }
40738 }
40739 }
40740
40741 TEST(F32_IGEMM_4X8__PSIMD_SPLAT, n_gt_8_strided_cn) {
40742 TEST_REQUIRES_PSIMD;
40743 for (uint32_t n = 9; n < 16; n++) {
40744 for (size_t k = 1; k <= 20; k += 5) {
40745 GemmMicrokernelTester()
40746 .mr(4)
40747 .nr(8)
40748 .kr(1)
40749 .sr(1)
40750 .m(4)
40751 .n(8)
40752 .k(k)
40753 .cn_stride(11)
40754 .Test(xnn_f32_igemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40755 }
40756 }
40757 }
40758
40759 TEST(F32_IGEMM_4X8__PSIMD_SPLAT, n_gt_8_subtile) {
40760 TEST_REQUIRES_PSIMD;
40761 for (uint32_t n = 9; n < 16; n++) {
40762 for (size_t k = 1; k <= 20; k += 5) {
40763 for (uint32_t m = 1; m <= 4; m++) {
40764 GemmMicrokernelTester()
40765 .mr(4)
40766 .nr(8)
40767 .kr(1)
40768 .sr(1)
40769 .m(m)
40770 .n(n)
40771 .k(k)
40772 .iterations(1)
40773 .Test(xnn_f32_igemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40774 }
40775 }
40776 }
40777 }
40778
40779 TEST(F32_IGEMM_4X8__PSIMD_SPLAT, n_div_8) {
40780 TEST_REQUIRES_PSIMD;
40781 for (uint32_t n = 16; n <= 24; n += 8) {
40782 for (size_t k = 1; k <= 20; k += 5) {
40783 GemmMicrokernelTester()
40784 .mr(4)
40785 .nr(8)
40786 .kr(1)
40787 .sr(1)
40788 .m(4)
40789 .n(8)
40790 .k(k)
40791 .Test(xnn_f32_igemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40792 }
40793 }
40794 }
40795
40796 TEST(F32_IGEMM_4X8__PSIMD_SPLAT, n_div_8_strided_cn) {
40797 TEST_REQUIRES_PSIMD;
40798 for (uint32_t n = 16; n <= 24; n += 8) {
40799 for (size_t k = 1; k <= 20; k += 5) {
40800 GemmMicrokernelTester()
40801 .mr(4)
40802 .nr(8)
40803 .kr(1)
40804 .sr(1)
40805 .m(4)
40806 .n(n)
40807 .k(k)
40808 .cn_stride(11)
40809 .Test(xnn_f32_igemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40810 }
40811 }
40812 }
40813
40814 TEST(F32_IGEMM_4X8__PSIMD_SPLAT, n_div_8_subtile) {
40815 TEST_REQUIRES_PSIMD;
40816 for (uint32_t n = 16; n <= 24; n += 8) {
40817 for (size_t k = 1; k <= 20; k += 5) {
40818 for (uint32_t m = 1; m <= 4; m++) {
40819 GemmMicrokernelTester()
40820 .mr(4)
40821 .nr(8)
40822 .kr(1)
40823 .sr(1)
40824 .m(m)
40825 .n(n)
40826 .k(k)
40827 .iterations(1)
40828 .Test(xnn_f32_igemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40829 }
40830 }
40831 }
40832 }
40833
40834 TEST(F32_IGEMM_4X8__PSIMD_SPLAT, small_kernel) {
40835 TEST_REQUIRES_PSIMD;
40836 for (size_t k = 1; k <= 20; k += 5) {
40837 GemmMicrokernelTester()
40838 .mr(4)
40839 .nr(8)
40840 .kr(1)
40841 .sr(1)
40842 .m(4)
40843 .n(8)
40844 .k(k)
40845 .ks(3)
40846 .Test(xnn_f32_igemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40847 }
40848 }
40849
40850 TEST(F32_IGEMM_4X8__PSIMD_SPLAT, small_kernel_subtile) {
40851 TEST_REQUIRES_PSIMD;
40852 for (size_t k = 1; k <= 20; k += 5) {
40853 for (uint32_t m = 1; m <= 4; m++) {
40854 for (uint32_t n = 1; n <= 8; n++) {
40855 GemmMicrokernelTester()
40856 .mr(4)
40857 .nr(8)
40858 .kr(1)
40859 .sr(1)
40860 .m(m)
40861 .n(n)
40862 .k(k)
40863 .ks(3)
40864 .iterations(1)
40865 .Test(xnn_f32_igemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40866 }
40867 }
40868 }
40869 }
40870
40871 TEST(F32_IGEMM_4X8__PSIMD_SPLAT, n_gt_8_small_kernel) {
40872 TEST_REQUIRES_PSIMD;
40873 for (uint32_t n = 9; n < 16; n++) {
40874 for (size_t k = 1; k <= 20; k += 5) {
40875 GemmMicrokernelTester()
40876 .mr(4)
40877 .nr(8)
40878 .kr(1)
40879 .sr(1)
40880 .m(4)
40881 .n(8)
40882 .k(k)
40883 .ks(3)
40884 .Test(xnn_f32_igemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40885 }
40886 }
40887 }
40888
40889 TEST(F32_IGEMM_4X8__PSIMD_SPLAT, n_div_8_small_kernel) {
40890 TEST_REQUIRES_PSIMD;
40891 for (uint32_t n = 16; n <= 24; n += 8) {
40892 for (size_t k = 1; k <= 20; k += 5) {
40893 GemmMicrokernelTester()
40894 .mr(4)
40895 .nr(8)
40896 .kr(1)
40897 .sr(1)
40898 .m(4)
40899 .n(8)
40900 .k(k)
40901 .ks(3)
40902 .Test(xnn_f32_igemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40903 }
40904 }
40905 }
40906
40907 TEST(F32_IGEMM_4X8__PSIMD_SPLAT, strided_cm_subtile) {
40908 TEST_REQUIRES_PSIMD;
40909 for (size_t k = 1; k <= 20; k += 5) {
40910 for (uint32_t m = 1; m <= 4; m++) {
40911 for (uint32_t n = 1; n <= 8; n++) {
40912 GemmMicrokernelTester()
40913 .mr(4)
40914 .nr(8)
40915 .kr(1)
40916 .sr(1)
40917 .m(m)
40918 .n(n)
40919 .k(k)
40920 .cm_stride(11)
40921 .iterations(1)
40922 .Test(xnn_f32_igemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40923 }
40924 }
40925 }
40926 }
40927
40928 TEST(F32_IGEMM_4X8__PSIMD_SPLAT, a_offset) {
40929 TEST_REQUIRES_PSIMD;
40930 for (size_t k = 1; k <= 20; k += 5) {
40931 GemmMicrokernelTester()
40932 .mr(4)
40933 .nr(8)
40934 .kr(1)
40935 .sr(1)
40936 .m(4)
40937 .n(8)
40938 .k(k)
40939 .ks(3)
40940 .a_offset(83)
40941 .Test(xnn_f32_igemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40942 }
40943 }
40944
40945 TEST(F32_IGEMM_4X8__PSIMD_SPLAT, zero) {
40946 TEST_REQUIRES_PSIMD;
40947 for (uint32_t mz = 0; mz < 4; mz++) {
40948 for (size_t k = 1; k <= 20; k += 5) {
40949 GemmMicrokernelTester()
40950 .mr(4)
40951 .nr(8)
40952 .kr(1)
40953 .sr(1)
40954 .m(4)
40955 .n(8)
40956 .k(k)
40957 .ks(3)
40958 .a_offset(83)
40959 .zero_index(mz)
40960 .Test(xnn_f32_igemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40961 }
40962 }
40963 }
40964
40965 TEST(F32_IGEMM_4X8__PSIMD_SPLAT, qmin) {
40966 TEST_REQUIRES_PSIMD;
40967 GemmMicrokernelTester()
40968 .mr(4)
40969 .nr(8)
40970 .kr(1)
40971 .sr(1)
40972 .m(4)
40973 .n(8)
40974 .k(4)
40975 .qmin(128)
40976 .Test(xnn_f32_igemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40977 }
40978
40979 TEST(F32_IGEMM_4X8__PSIMD_SPLAT, qmax) {
40980 TEST_REQUIRES_PSIMD;
40981 GemmMicrokernelTester()
40982 .mr(4)
40983 .nr(8)
40984 .kr(1)
40985 .sr(1)
40986 .m(4)
40987 .n(8)
40988 .k(4)
40989 .qmax(128)
40990 .Test(xnn_f32_igemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40991 }
40992
40993 TEST(F32_IGEMM_4X8__PSIMD_SPLAT, strided_cm) {
40994 TEST_REQUIRES_PSIMD;
40995 GemmMicrokernelTester()
40996 .mr(4)
40997 .nr(8)
40998 .kr(1)
40999 .sr(1)
41000 .m(4)
41001 .n(8)
41002 .k(4)
41003 .cm_stride(11)
41004 .Test(xnn_f32_igemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
41005 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -070041006#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
XNNPACK Teamb455b122019-09-27 18:10:33 -070041007
41008
Marat Dukhan1dadbf72019-10-01 10:46:20 -070041009#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
XNNPACK Teamb455b122019-09-27 18:10:33 -070041010 TEST(F32_IGEMM_6X8__PSIMD_SPLAT, k_eq_4) {
41011 TEST_REQUIRES_PSIMD;
41012 GemmMicrokernelTester()
41013 .mr(6)
41014 .nr(8)
41015 .kr(1)
41016 .sr(1)
41017 .m(6)
41018 .n(8)
41019 .k(4)
41020 .Test(xnn_f32_igemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
41021 }
41022
41023 TEST(F32_IGEMM_6X8__PSIMD_SPLAT, strided_cn) {
41024 TEST_REQUIRES_PSIMD;
41025 GemmMicrokernelTester()
41026 .mr(6)
41027 .nr(8)
41028 .kr(1)
41029 .sr(1)
41030 .m(6)
41031 .n(8)
41032 .k(4)
41033 .cn_stride(11)
41034 .Test(xnn_f32_igemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
41035 }
41036
41037 TEST(F32_IGEMM_6X8__PSIMD_SPLAT, k_eq_4_subtile) {
41038 TEST_REQUIRES_PSIMD;
41039 for (uint32_t m = 1; m <= 6; m++) {
41040 for (uint32_t n = 1; n <= 8; n++) {
41041 GemmMicrokernelTester()
41042 .mr(6)
41043 .nr(8)
41044 .kr(1)
41045 .sr(1)
41046 .m(m)
41047 .n(n)
41048 .k(4)
41049 .iterations(1)
41050 .Test(xnn_f32_igemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
41051 }
41052 }
41053 }
41054
41055 TEST(F32_IGEMM_6X8__PSIMD_SPLAT, k_eq_4_subtile_m) {
41056 TEST_REQUIRES_PSIMD;
41057 for (uint32_t m = 1; m <= 6; m++) {
41058 GemmMicrokernelTester()
41059 .mr(6)
41060 .nr(8)
41061 .kr(1)
41062 .sr(1)
41063 .m(m)
41064 .n(8)
41065 .k(4)
41066 .iterations(1)
41067 .Test(xnn_f32_igemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
41068 }
41069 }
41070
41071 TEST(F32_IGEMM_6X8__PSIMD_SPLAT, k_eq_4_subtile_n) {
41072 TEST_REQUIRES_PSIMD;
41073 for (uint32_t n = 1; n <= 8; n++) {
41074 GemmMicrokernelTester()
41075 .mr(6)
41076 .nr(8)
41077 .kr(1)
41078 .sr(1)
41079 .m(6)
41080 .n(n)
41081 .k(4)
41082 .iterations(1)
41083 .Test(xnn_f32_igemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
41084 }
41085 }
41086
41087 TEST(F32_IGEMM_6X8__PSIMD_SPLAT, k_lt_4) {
41088 TEST_REQUIRES_PSIMD;
41089 for (size_t k = 1; k < 4; k++) {
41090 GemmMicrokernelTester()
41091 .mr(6)
41092 .nr(8)
41093 .kr(1)
41094 .sr(1)
41095 .m(6)
41096 .n(8)
41097 .k(k)
41098 .Test(xnn_f32_igemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
41099 }
41100 }
41101
41102 TEST(F32_IGEMM_6X8__PSIMD_SPLAT, k_lt_4_subtile) {
41103 TEST_REQUIRES_PSIMD;
41104 for (size_t k = 1; k < 4; k++) {
41105 for (uint32_t m = 1; m <= 6; m++) {
41106 for (uint32_t n = 1; n <= 8; n++) {
41107 GemmMicrokernelTester()
41108 .mr(6)
41109 .nr(8)
41110 .kr(1)
41111 .sr(1)
41112 .m(m)
41113 .n(n)
41114 .k(k)
41115 .iterations(1)
41116 .Test(xnn_f32_igemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
41117 }
41118 }
41119 }
41120 }
41121
41122 TEST(F32_IGEMM_6X8__PSIMD_SPLAT, k_gt_4) {
41123 TEST_REQUIRES_PSIMD;
41124 for (size_t k = 5; k < 8; k++) {
41125 GemmMicrokernelTester()
41126 .mr(6)
41127 .nr(8)
41128 .kr(1)
41129 .sr(1)
41130 .m(6)
41131 .n(8)
41132 .k(k)
41133 .Test(xnn_f32_igemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
41134 }
41135 }
41136
41137 TEST(F32_IGEMM_6X8__PSIMD_SPLAT, k_gt_4_subtile) {
41138 TEST_REQUIRES_PSIMD;
41139 for (size_t k = 5; k < 8; k++) {
41140 for (uint32_t m = 1; m <= 6; m++) {
41141 for (uint32_t n = 1; n <= 8; n++) {
41142 GemmMicrokernelTester()
41143 .mr(6)
41144 .nr(8)
41145 .kr(1)
41146 .sr(1)
41147 .m(m)
41148 .n(n)
41149 .k(k)
41150 .iterations(1)
41151 .Test(xnn_f32_igemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
41152 }
41153 }
41154 }
41155 }
41156
41157 TEST(F32_IGEMM_6X8__PSIMD_SPLAT, k_div_4) {
41158 TEST_REQUIRES_PSIMD;
41159 for (size_t k = 8; k <= 40; k += 4) {
41160 GemmMicrokernelTester()
41161 .mr(6)
41162 .nr(8)
41163 .kr(1)
41164 .sr(1)
41165 .m(6)
41166 .n(8)
41167 .k(k)
41168 .Test(xnn_f32_igemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
41169 }
41170 }
41171
41172 TEST(F32_IGEMM_6X8__PSIMD_SPLAT, k_div_4_subtile) {
41173 TEST_REQUIRES_PSIMD;
41174 for (size_t k = 8; k <= 40; k += 4) {
41175 for (uint32_t m = 1; m <= 6; m++) {
41176 for (uint32_t n = 1; n <= 8; n++) {
41177 GemmMicrokernelTester()
41178 .mr(6)
41179 .nr(8)
41180 .kr(1)
41181 .sr(1)
41182 .m(m)
41183 .n(n)
41184 .k(k)
41185 .iterations(1)
41186 .Test(xnn_f32_igemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
41187 }
41188 }
41189 }
41190 }
41191
41192 TEST(F32_IGEMM_6X8__PSIMD_SPLAT, n_gt_8) {
41193 TEST_REQUIRES_PSIMD;
41194 for (uint32_t n = 9; n < 16; n++) {
41195 for (size_t k = 1; k <= 20; k += 5) {
41196 GemmMicrokernelTester()
41197 .mr(6)
41198 .nr(8)
41199 .kr(1)
41200 .sr(1)
41201 .m(6)
41202 .n(8)
41203 .k(k)
41204 .Test(xnn_f32_igemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
41205 }
41206 }
41207 }
41208
41209 TEST(F32_IGEMM_6X8__PSIMD_SPLAT, n_gt_8_strided_cn) {
41210 TEST_REQUIRES_PSIMD;
41211 for (uint32_t n = 9; n < 16; n++) {
41212 for (size_t k = 1; k <= 20; k += 5) {
41213 GemmMicrokernelTester()
41214 .mr(6)
41215 .nr(8)
41216 .kr(1)
41217 .sr(1)
41218 .m(6)
41219 .n(8)
41220 .k(k)
41221 .cn_stride(11)
41222 .Test(xnn_f32_igemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
41223 }
41224 }
41225 }
41226
41227 TEST(F32_IGEMM_6X8__PSIMD_SPLAT, n_gt_8_subtile) {
41228 TEST_REQUIRES_PSIMD;
41229 for (uint32_t n = 9; n < 16; n++) {
41230 for (size_t k = 1; k <= 20; k += 5) {
41231 for (uint32_t m = 1; m <= 6; m++) {
41232 GemmMicrokernelTester()
41233 .mr(6)
41234 .nr(8)
41235 .kr(1)
41236 .sr(1)
41237 .m(m)
41238 .n(n)
41239 .k(k)
41240 .iterations(1)
41241 .Test(xnn_f32_igemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
41242 }
41243 }
41244 }
41245 }
41246
41247 TEST(F32_IGEMM_6X8__PSIMD_SPLAT, n_div_8) {
41248 TEST_REQUIRES_PSIMD;
41249 for (uint32_t n = 16; n <= 24; n += 8) {
41250 for (size_t k = 1; k <= 20; k += 5) {
41251 GemmMicrokernelTester()
41252 .mr(6)
41253 .nr(8)
41254 .kr(1)
41255 .sr(1)
41256 .m(6)
41257 .n(8)
41258 .k(k)
41259 .Test(xnn_f32_igemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
41260 }
41261 }
41262 }
41263
41264 TEST(F32_IGEMM_6X8__PSIMD_SPLAT, n_div_8_strided_cn) {
41265 TEST_REQUIRES_PSIMD;
41266 for (uint32_t n = 16; n <= 24; n += 8) {
41267 for (size_t k = 1; k <= 20; k += 5) {
41268 GemmMicrokernelTester()
41269 .mr(6)
41270 .nr(8)
41271 .kr(1)
41272 .sr(1)
41273 .m(6)
41274 .n(n)
41275 .k(k)
41276 .cn_stride(11)
41277 .Test(xnn_f32_igemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
41278 }
41279 }
41280 }
41281
41282 TEST(F32_IGEMM_6X8__PSIMD_SPLAT, n_div_8_subtile) {
41283 TEST_REQUIRES_PSIMD;
41284 for (uint32_t n = 16; n <= 24; n += 8) {
41285 for (size_t k = 1; k <= 20; k += 5) {
41286 for (uint32_t m = 1; m <= 6; m++) {
41287 GemmMicrokernelTester()
41288 .mr(6)
41289 .nr(8)
41290 .kr(1)
41291 .sr(1)
41292 .m(m)
41293 .n(n)
41294 .k(k)
41295 .iterations(1)
41296 .Test(xnn_f32_igemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
41297 }
41298 }
41299 }
41300 }
41301
41302 TEST(F32_IGEMM_6X8__PSIMD_SPLAT, small_kernel) {
41303 TEST_REQUIRES_PSIMD;
41304 for (size_t k = 1; k <= 20; k += 5) {
41305 GemmMicrokernelTester()
41306 .mr(6)
41307 .nr(8)
41308 .kr(1)
41309 .sr(1)
41310 .m(6)
41311 .n(8)
41312 .k(k)
41313 .ks(3)
41314 .Test(xnn_f32_igemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
41315 }
41316 }
41317
41318 TEST(F32_IGEMM_6X8__PSIMD_SPLAT, small_kernel_subtile) {
41319 TEST_REQUIRES_PSIMD;
41320 for (size_t k = 1; k <= 20; k += 5) {
41321 for (uint32_t m = 1; m <= 6; m++) {
41322 for (uint32_t n = 1; n <= 8; n++) {
41323 GemmMicrokernelTester()
41324 .mr(6)
41325 .nr(8)
41326 .kr(1)
41327 .sr(1)
41328 .m(m)
41329 .n(n)
41330 .k(k)
41331 .ks(3)
41332 .iterations(1)
41333 .Test(xnn_f32_igemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
41334 }
41335 }
41336 }
41337 }
41338
41339 TEST(F32_IGEMM_6X8__PSIMD_SPLAT, n_gt_8_small_kernel) {
41340 TEST_REQUIRES_PSIMD;
41341 for (uint32_t n = 9; n < 16; n++) {
41342 for (size_t k = 1; k <= 20; k += 5) {
41343 GemmMicrokernelTester()
41344 .mr(6)
41345 .nr(8)
41346 .kr(1)
41347 .sr(1)
41348 .m(6)
41349 .n(8)
41350 .k(k)
41351 .ks(3)
41352 .Test(xnn_f32_igemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
41353 }
41354 }
41355 }
41356
41357 TEST(F32_IGEMM_6X8__PSIMD_SPLAT, n_div_8_small_kernel) {
41358 TEST_REQUIRES_PSIMD;
41359 for (uint32_t n = 16; n <= 24; n += 8) {
41360 for (size_t k = 1; k <= 20; k += 5) {
41361 GemmMicrokernelTester()
41362 .mr(6)
41363 .nr(8)
41364 .kr(1)
41365 .sr(1)
41366 .m(6)
41367 .n(8)
41368 .k(k)
41369 .ks(3)
41370 .Test(xnn_f32_igemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
41371 }
41372 }
41373 }
41374
41375 TEST(F32_IGEMM_6X8__PSIMD_SPLAT, strided_cm_subtile) {
41376 TEST_REQUIRES_PSIMD;
41377 for (size_t k = 1; k <= 20; k += 5) {
41378 for (uint32_t m = 1; m <= 6; m++) {
41379 for (uint32_t n = 1; n <= 8; n++) {
41380 GemmMicrokernelTester()
41381 .mr(6)
41382 .nr(8)
41383 .kr(1)
41384 .sr(1)
41385 .m(m)
41386 .n(n)
41387 .k(k)
41388 .cm_stride(11)
41389 .iterations(1)
41390 .Test(xnn_f32_igemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
41391 }
41392 }
41393 }
41394 }
41395
41396 TEST(F32_IGEMM_6X8__PSIMD_SPLAT, a_offset) {
41397 TEST_REQUIRES_PSIMD;
41398 for (size_t k = 1; k <= 20; k += 5) {
41399 GemmMicrokernelTester()
41400 .mr(6)
41401 .nr(8)
41402 .kr(1)
41403 .sr(1)
41404 .m(6)
41405 .n(8)
41406 .k(k)
41407 .ks(3)
41408 .a_offset(127)
41409 .Test(xnn_f32_igemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
41410 }
41411 }
41412
41413 TEST(F32_IGEMM_6X8__PSIMD_SPLAT, zero) {
41414 TEST_REQUIRES_PSIMD;
41415 for (uint32_t mz = 0; mz < 6; mz++) {
41416 for (size_t k = 1; k <= 20; k += 5) {
41417 GemmMicrokernelTester()
41418 .mr(6)
41419 .nr(8)
41420 .kr(1)
41421 .sr(1)
41422 .m(6)
41423 .n(8)
41424 .k(k)
41425 .ks(3)
41426 .a_offset(127)
41427 .zero_index(mz)
41428 .Test(xnn_f32_igemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
41429 }
41430 }
41431 }
41432
41433 TEST(F32_IGEMM_6X8__PSIMD_SPLAT, qmin) {
41434 TEST_REQUIRES_PSIMD;
41435 GemmMicrokernelTester()
41436 .mr(6)
41437 .nr(8)
41438 .kr(1)
41439 .sr(1)
41440 .m(6)
41441 .n(8)
41442 .k(4)
41443 .qmin(128)
41444 .Test(xnn_f32_igemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
41445 }
41446
41447 TEST(F32_IGEMM_6X8__PSIMD_SPLAT, qmax) {
41448 TEST_REQUIRES_PSIMD;
41449 GemmMicrokernelTester()
41450 .mr(6)
41451 .nr(8)
41452 .kr(1)
41453 .sr(1)
41454 .m(6)
41455 .n(8)
41456 .k(4)
41457 .qmax(128)
41458 .Test(xnn_f32_igemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
41459 }
41460
41461 TEST(F32_IGEMM_6X8__PSIMD_SPLAT, strided_cm) {
41462 TEST_REQUIRES_PSIMD;
41463 GemmMicrokernelTester()
41464 .mr(6)
41465 .nr(8)
41466 .kr(1)
41467 .sr(1)
41468 .m(6)
41469 .n(8)
41470 .k(4)
41471 .cm_stride(11)
41472 .Test(xnn_f32_igemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
41473 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -070041474#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
XNNPACK Teamb455b122019-09-27 18:10:33 -070041475
41476
Marat Dukhan1dadbf72019-10-01 10:46:20 -070041477#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
XNNPACK Teamb455b122019-09-27 18:10:33 -070041478 TEST(F32_IGEMM_1X8S4__PSIMD, k_eq_4) {
41479 TEST_REQUIRES_PSIMD;
41480 GemmMicrokernelTester()
41481 .mr(1)
41482 .nr(8)
41483 .kr(1)
41484 .sr(4)
41485 .m(1)
41486 .n(8)
41487 .k(4)
41488 .Test(xnn_f32_igemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41489 }
41490
41491 TEST(F32_IGEMM_1X8S4__PSIMD, strided_cn) {
41492 TEST_REQUIRES_PSIMD;
41493 GemmMicrokernelTester()
41494 .mr(1)
41495 .nr(8)
41496 .kr(1)
41497 .sr(4)
41498 .m(1)
41499 .n(8)
41500 .k(4)
41501 .cn_stride(11)
41502 .Test(xnn_f32_igemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41503 }
41504
41505 TEST(F32_IGEMM_1X8S4__PSIMD, k_eq_4_subtile) {
41506 TEST_REQUIRES_PSIMD;
41507 for (uint32_t m = 1; m <= 1; m++) {
41508 for (uint32_t n = 1; n <= 8; n++) {
41509 GemmMicrokernelTester()
41510 .mr(1)
41511 .nr(8)
41512 .kr(1)
41513 .sr(4)
41514 .m(m)
41515 .n(n)
41516 .k(4)
41517 .iterations(1)
41518 .Test(xnn_f32_igemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41519 }
41520 }
41521 }
41522
41523 TEST(F32_IGEMM_1X8S4__PSIMD, k_eq_4_subtile_m) {
41524 TEST_REQUIRES_PSIMD;
41525 for (uint32_t m = 1; m <= 1; m++) {
41526 GemmMicrokernelTester()
41527 .mr(1)
41528 .nr(8)
41529 .kr(1)
41530 .sr(4)
41531 .m(m)
41532 .n(8)
41533 .k(4)
41534 .iterations(1)
41535 .Test(xnn_f32_igemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41536 }
41537 }
41538
41539 TEST(F32_IGEMM_1X8S4__PSIMD, k_eq_4_subtile_n) {
41540 TEST_REQUIRES_PSIMD;
41541 for (uint32_t n = 1; n <= 8; n++) {
41542 GemmMicrokernelTester()
41543 .mr(1)
41544 .nr(8)
41545 .kr(1)
41546 .sr(4)
41547 .m(1)
41548 .n(n)
41549 .k(4)
41550 .iterations(1)
41551 .Test(xnn_f32_igemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41552 }
41553 }
41554
41555 TEST(F32_IGEMM_1X8S4__PSIMD, k_lt_4) {
41556 TEST_REQUIRES_PSIMD;
41557 for (size_t k = 1; k < 4; k++) {
41558 GemmMicrokernelTester()
41559 .mr(1)
41560 .nr(8)
41561 .kr(1)
41562 .sr(4)
41563 .m(1)
41564 .n(8)
41565 .k(k)
41566 .Test(xnn_f32_igemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41567 }
41568 }
41569
41570 TEST(F32_IGEMM_1X8S4__PSIMD, k_lt_4_subtile) {
41571 TEST_REQUIRES_PSIMD;
41572 for (size_t k = 1; k < 4; k++) {
41573 for (uint32_t m = 1; m <= 1; m++) {
41574 for (uint32_t n = 1; n <= 8; n++) {
41575 GemmMicrokernelTester()
41576 .mr(1)
41577 .nr(8)
41578 .kr(1)
41579 .sr(4)
41580 .m(m)
41581 .n(n)
41582 .k(k)
41583 .iterations(1)
41584 .Test(xnn_f32_igemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41585 }
41586 }
41587 }
41588 }
41589
41590 TEST(F32_IGEMM_1X8S4__PSIMD, k_gt_4) {
41591 TEST_REQUIRES_PSIMD;
41592 for (size_t k = 5; k < 8; k++) {
41593 GemmMicrokernelTester()
41594 .mr(1)
41595 .nr(8)
41596 .kr(1)
41597 .sr(4)
41598 .m(1)
41599 .n(8)
41600 .k(k)
41601 .Test(xnn_f32_igemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41602 }
41603 }
41604
41605 TEST(F32_IGEMM_1X8S4__PSIMD, k_gt_4_subtile) {
41606 TEST_REQUIRES_PSIMD;
41607 for (size_t k = 5; k < 8; k++) {
41608 for (uint32_t m = 1; m <= 1; m++) {
41609 for (uint32_t n = 1; n <= 8; n++) {
41610 GemmMicrokernelTester()
41611 .mr(1)
41612 .nr(8)
41613 .kr(1)
41614 .sr(4)
41615 .m(m)
41616 .n(n)
41617 .k(k)
41618 .iterations(1)
41619 .Test(xnn_f32_igemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41620 }
41621 }
41622 }
41623 }
41624
41625 TEST(F32_IGEMM_1X8S4__PSIMD, k_div_4) {
41626 TEST_REQUIRES_PSIMD;
41627 for (size_t k = 8; k <= 40; k += 4) {
41628 GemmMicrokernelTester()
41629 .mr(1)
41630 .nr(8)
41631 .kr(1)
41632 .sr(4)
41633 .m(1)
41634 .n(8)
41635 .k(k)
41636 .Test(xnn_f32_igemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41637 }
41638 }
41639
41640 TEST(F32_IGEMM_1X8S4__PSIMD, k_div_4_subtile) {
41641 TEST_REQUIRES_PSIMD;
41642 for (size_t k = 8; k <= 40; k += 4) {
41643 for (uint32_t m = 1; m <= 1; m++) {
41644 for (uint32_t n = 1; n <= 8; n++) {
41645 GemmMicrokernelTester()
41646 .mr(1)
41647 .nr(8)
41648 .kr(1)
41649 .sr(4)
41650 .m(m)
41651 .n(n)
41652 .k(k)
41653 .iterations(1)
41654 .Test(xnn_f32_igemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41655 }
41656 }
41657 }
41658 }
41659
41660 TEST(F32_IGEMM_1X8S4__PSIMD, n_gt_8) {
41661 TEST_REQUIRES_PSIMD;
41662 for (uint32_t n = 9; n < 16; n++) {
41663 for (size_t k = 1; k <= 20; k += 5) {
41664 GemmMicrokernelTester()
41665 .mr(1)
41666 .nr(8)
41667 .kr(1)
41668 .sr(4)
41669 .m(1)
41670 .n(8)
41671 .k(k)
41672 .Test(xnn_f32_igemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41673 }
41674 }
41675 }
41676
41677 TEST(F32_IGEMM_1X8S4__PSIMD, n_gt_8_strided_cn) {
41678 TEST_REQUIRES_PSIMD;
41679 for (uint32_t n = 9; n < 16; n++) {
41680 for (size_t k = 1; k <= 20; k += 5) {
41681 GemmMicrokernelTester()
41682 .mr(1)
41683 .nr(8)
41684 .kr(1)
41685 .sr(4)
41686 .m(1)
41687 .n(8)
41688 .k(k)
41689 .cn_stride(11)
41690 .Test(xnn_f32_igemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41691 }
41692 }
41693 }
41694
41695 TEST(F32_IGEMM_1X8S4__PSIMD, n_gt_8_subtile) {
41696 TEST_REQUIRES_PSIMD;
41697 for (uint32_t n = 9; n < 16; n++) {
41698 for (size_t k = 1; k <= 20; k += 5) {
41699 for (uint32_t m = 1; m <= 1; m++) {
41700 GemmMicrokernelTester()
41701 .mr(1)
41702 .nr(8)
41703 .kr(1)
41704 .sr(4)
41705 .m(m)
41706 .n(n)
41707 .k(k)
41708 .iterations(1)
41709 .Test(xnn_f32_igemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41710 }
41711 }
41712 }
41713 }
41714
41715 TEST(F32_IGEMM_1X8S4__PSIMD, n_div_8) {
41716 TEST_REQUIRES_PSIMD;
41717 for (uint32_t n = 16; n <= 24; n += 8) {
41718 for (size_t k = 1; k <= 20; k += 5) {
41719 GemmMicrokernelTester()
41720 .mr(1)
41721 .nr(8)
41722 .kr(1)
41723 .sr(4)
41724 .m(1)
41725 .n(8)
41726 .k(k)
41727 .Test(xnn_f32_igemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41728 }
41729 }
41730 }
41731
41732 TEST(F32_IGEMM_1X8S4__PSIMD, n_div_8_strided_cn) {
41733 TEST_REQUIRES_PSIMD;
41734 for (uint32_t n = 16; n <= 24; n += 8) {
41735 for (size_t k = 1; k <= 20; k += 5) {
41736 GemmMicrokernelTester()
41737 .mr(1)
41738 .nr(8)
41739 .kr(1)
41740 .sr(4)
41741 .m(1)
41742 .n(n)
41743 .k(k)
41744 .cn_stride(11)
41745 .Test(xnn_f32_igemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41746 }
41747 }
41748 }
41749
41750 TEST(F32_IGEMM_1X8S4__PSIMD, n_div_8_subtile) {
41751 TEST_REQUIRES_PSIMD;
41752 for (uint32_t n = 16; n <= 24; n += 8) {
41753 for (size_t k = 1; k <= 20; k += 5) {
41754 for (uint32_t m = 1; m <= 1; m++) {
41755 GemmMicrokernelTester()
41756 .mr(1)
41757 .nr(8)
41758 .kr(1)
41759 .sr(4)
41760 .m(m)
41761 .n(n)
41762 .k(k)
41763 .iterations(1)
41764 .Test(xnn_f32_igemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41765 }
41766 }
41767 }
41768 }
41769
41770 TEST(F32_IGEMM_1X8S4__PSIMD, small_kernel) {
41771 TEST_REQUIRES_PSIMD;
41772 for (size_t k = 1; k <= 20; k += 5) {
41773 GemmMicrokernelTester()
41774 .mr(1)
41775 .nr(8)
41776 .kr(1)
41777 .sr(4)
41778 .m(1)
41779 .n(8)
41780 .k(k)
41781 .ks(3)
41782 .Test(xnn_f32_igemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41783 }
41784 }
41785
41786 TEST(F32_IGEMM_1X8S4__PSIMD, small_kernel_subtile) {
41787 TEST_REQUIRES_PSIMD;
41788 for (size_t k = 1; k <= 20; k += 5) {
41789 for (uint32_t m = 1; m <= 1; m++) {
41790 for (uint32_t n = 1; n <= 8; n++) {
41791 GemmMicrokernelTester()
41792 .mr(1)
41793 .nr(8)
41794 .kr(1)
41795 .sr(4)
41796 .m(m)
41797 .n(n)
41798 .k(k)
41799 .ks(3)
41800 .iterations(1)
41801 .Test(xnn_f32_igemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41802 }
41803 }
41804 }
41805 }
41806
41807 TEST(F32_IGEMM_1X8S4__PSIMD, n_gt_8_small_kernel) {
41808 TEST_REQUIRES_PSIMD;
41809 for (uint32_t n = 9; n < 16; n++) {
41810 for (size_t k = 1; k <= 20; k += 5) {
41811 GemmMicrokernelTester()
41812 .mr(1)
41813 .nr(8)
41814 .kr(1)
41815 .sr(4)
41816 .m(1)
41817 .n(8)
41818 .k(k)
41819 .ks(3)
41820 .Test(xnn_f32_igemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41821 }
41822 }
41823 }
41824
41825 TEST(F32_IGEMM_1X8S4__PSIMD, n_div_8_small_kernel) {
41826 TEST_REQUIRES_PSIMD;
41827 for (uint32_t n = 16; n <= 24; n += 8) {
41828 for (size_t k = 1; k <= 20; k += 5) {
41829 GemmMicrokernelTester()
41830 .mr(1)
41831 .nr(8)
41832 .kr(1)
41833 .sr(4)
41834 .m(1)
41835 .n(8)
41836 .k(k)
41837 .ks(3)
41838 .Test(xnn_f32_igemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41839 }
41840 }
41841 }
41842
41843 TEST(F32_IGEMM_1X8S4__PSIMD, strided_cm_subtile) {
41844 TEST_REQUIRES_PSIMD;
41845 for (size_t k = 1; k <= 20; k += 5) {
41846 for (uint32_t m = 1; m <= 1; m++) {
41847 for (uint32_t n = 1; n <= 8; n++) {
41848 GemmMicrokernelTester()
41849 .mr(1)
41850 .nr(8)
41851 .kr(1)
41852 .sr(4)
41853 .m(m)
41854 .n(n)
41855 .k(k)
41856 .cm_stride(11)
41857 .iterations(1)
41858 .Test(xnn_f32_igemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41859 }
41860 }
41861 }
41862 }
41863
41864 TEST(F32_IGEMM_1X8S4__PSIMD, a_offset) {
41865 TEST_REQUIRES_PSIMD;
41866 for (size_t k = 1; k <= 20; k += 5) {
41867 GemmMicrokernelTester()
41868 .mr(1)
41869 .nr(8)
41870 .kr(1)
41871 .sr(4)
41872 .m(1)
41873 .n(8)
41874 .k(k)
41875 .ks(3)
41876 .a_offset(23)
41877 .Test(xnn_f32_igemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41878 }
41879 }
41880
41881 TEST(F32_IGEMM_1X8S4__PSIMD, zero) {
41882 TEST_REQUIRES_PSIMD;
41883 for (uint32_t mz = 0; mz < 1; mz++) {
41884 for (size_t k = 1; k <= 20; k += 5) {
41885 GemmMicrokernelTester()
41886 .mr(1)
41887 .nr(8)
41888 .kr(1)
41889 .sr(4)
41890 .m(1)
41891 .n(8)
41892 .k(k)
41893 .ks(3)
41894 .a_offset(23)
41895 .zero_index(mz)
41896 .Test(xnn_f32_igemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41897 }
41898 }
41899 }
41900
41901 TEST(F32_IGEMM_1X8S4__PSIMD, qmin) {
41902 TEST_REQUIRES_PSIMD;
41903 GemmMicrokernelTester()
41904 .mr(1)
41905 .nr(8)
41906 .kr(1)
41907 .sr(4)
41908 .m(1)
41909 .n(8)
41910 .k(4)
41911 .qmin(128)
41912 .Test(xnn_f32_igemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41913 }
41914
41915 TEST(F32_IGEMM_1X8S4__PSIMD, qmax) {
41916 TEST_REQUIRES_PSIMD;
41917 GemmMicrokernelTester()
41918 .mr(1)
41919 .nr(8)
41920 .kr(1)
41921 .sr(4)
41922 .m(1)
41923 .n(8)
41924 .k(4)
41925 .qmax(128)
41926 .Test(xnn_f32_igemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41927 }
41928
41929 TEST(F32_IGEMM_1X8S4__PSIMD, strided_cm) {
41930 TEST_REQUIRES_PSIMD;
41931 GemmMicrokernelTester()
41932 .mr(1)
41933 .nr(8)
41934 .kr(1)
41935 .sr(4)
41936 .m(1)
41937 .n(8)
41938 .k(4)
41939 .cm_stride(11)
41940 .Test(xnn_f32_igemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41941 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -070041942#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
XNNPACK Teamb455b122019-09-27 18:10:33 -070041943
41944
Marat Dukhan1dadbf72019-10-01 10:46:20 -070041945#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
XNNPACK Teamb455b122019-09-27 18:10:33 -070041946 TEST(F32_IGEMM_4X8S4__PSIMD, k_eq_4) {
41947 TEST_REQUIRES_PSIMD;
41948 GemmMicrokernelTester()
41949 .mr(4)
41950 .nr(8)
41951 .kr(1)
41952 .sr(4)
41953 .m(4)
41954 .n(8)
41955 .k(4)
41956 .Test(xnn_f32_igemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41957 }
41958
41959 TEST(F32_IGEMM_4X8S4__PSIMD, strided_cn) {
41960 TEST_REQUIRES_PSIMD;
41961 GemmMicrokernelTester()
41962 .mr(4)
41963 .nr(8)
41964 .kr(1)
41965 .sr(4)
41966 .m(4)
41967 .n(8)
41968 .k(4)
41969 .cn_stride(11)
41970 .Test(xnn_f32_igemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41971 }
41972
41973 TEST(F32_IGEMM_4X8S4__PSIMD, k_eq_4_subtile) {
41974 TEST_REQUIRES_PSIMD;
41975 for (uint32_t m = 1; m <= 4; m++) {
41976 for (uint32_t n = 1; n <= 8; n++) {
41977 GemmMicrokernelTester()
41978 .mr(4)
41979 .nr(8)
41980 .kr(1)
41981 .sr(4)
41982 .m(m)
41983 .n(n)
41984 .k(4)
41985 .iterations(1)
41986 .Test(xnn_f32_igemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41987 }
41988 }
41989 }
41990
41991 TEST(F32_IGEMM_4X8S4__PSIMD, k_eq_4_subtile_m) {
41992 TEST_REQUIRES_PSIMD;
41993 for (uint32_t m = 1; m <= 4; m++) {
41994 GemmMicrokernelTester()
41995 .mr(4)
41996 .nr(8)
41997 .kr(1)
41998 .sr(4)
41999 .m(m)
42000 .n(8)
42001 .k(4)
42002 .iterations(1)
42003 .Test(xnn_f32_igemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
42004 }
42005 }
42006
42007 TEST(F32_IGEMM_4X8S4__PSIMD, k_eq_4_subtile_n) {
42008 TEST_REQUIRES_PSIMD;
42009 for (uint32_t n = 1; n <= 8; n++) {
42010 GemmMicrokernelTester()
42011 .mr(4)
42012 .nr(8)
42013 .kr(1)
42014 .sr(4)
42015 .m(4)
42016 .n(n)
42017 .k(4)
42018 .iterations(1)
42019 .Test(xnn_f32_igemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
42020 }
42021 }
42022
42023 TEST(F32_IGEMM_4X8S4__PSIMD, k_lt_4) {
42024 TEST_REQUIRES_PSIMD;
42025 for (size_t k = 1; k < 4; k++) {
42026 GemmMicrokernelTester()
42027 .mr(4)
42028 .nr(8)
42029 .kr(1)
42030 .sr(4)
42031 .m(4)
42032 .n(8)
42033 .k(k)
42034 .Test(xnn_f32_igemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
42035 }
42036 }
42037
42038 TEST(F32_IGEMM_4X8S4__PSIMD, k_lt_4_subtile) {
42039 TEST_REQUIRES_PSIMD;
42040 for (size_t k = 1; k < 4; k++) {
42041 for (uint32_t m = 1; m <= 4; m++) {
42042 for (uint32_t n = 1; n <= 8; n++) {
42043 GemmMicrokernelTester()
42044 .mr(4)
42045 .nr(8)
42046 .kr(1)
42047 .sr(4)
42048 .m(m)
42049 .n(n)
42050 .k(k)
42051 .iterations(1)
42052 .Test(xnn_f32_igemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
42053 }
42054 }
42055 }
42056 }
42057
42058 TEST(F32_IGEMM_4X8S4__PSIMD, k_gt_4) {
42059 TEST_REQUIRES_PSIMD;
42060 for (size_t k = 5; k < 8; k++) {
42061 GemmMicrokernelTester()
42062 .mr(4)
42063 .nr(8)
42064 .kr(1)
42065 .sr(4)
42066 .m(4)
42067 .n(8)
42068 .k(k)
42069 .Test(xnn_f32_igemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
42070 }
42071 }
42072
42073 TEST(F32_IGEMM_4X8S4__PSIMD, k_gt_4_subtile) {
42074 TEST_REQUIRES_PSIMD;
42075 for (size_t k = 5; k < 8; k++) {
42076 for (uint32_t m = 1; m <= 4; m++) {
42077 for (uint32_t n = 1; n <= 8; n++) {
42078 GemmMicrokernelTester()
42079 .mr(4)
42080 .nr(8)
42081 .kr(1)
42082 .sr(4)
42083 .m(m)
42084 .n(n)
42085 .k(k)
42086 .iterations(1)
42087 .Test(xnn_f32_igemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
42088 }
42089 }
42090 }
42091 }
42092
42093 TEST(F32_IGEMM_4X8S4__PSIMD, k_div_4) {
42094 TEST_REQUIRES_PSIMD;
42095 for (size_t k = 8; k <= 40; k += 4) {
42096 GemmMicrokernelTester()
42097 .mr(4)
42098 .nr(8)
42099 .kr(1)
42100 .sr(4)
42101 .m(4)
42102 .n(8)
42103 .k(k)
42104 .Test(xnn_f32_igemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
42105 }
42106 }
42107
42108 TEST(F32_IGEMM_4X8S4__PSIMD, k_div_4_subtile) {
42109 TEST_REQUIRES_PSIMD;
42110 for (size_t k = 8; k <= 40; k += 4) {
42111 for (uint32_t m = 1; m <= 4; m++) {
42112 for (uint32_t n = 1; n <= 8; n++) {
42113 GemmMicrokernelTester()
42114 .mr(4)
42115 .nr(8)
42116 .kr(1)
42117 .sr(4)
42118 .m(m)
42119 .n(n)
42120 .k(k)
42121 .iterations(1)
42122 .Test(xnn_f32_igemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
42123 }
42124 }
42125 }
42126 }
42127
42128 TEST(F32_IGEMM_4X8S4__PSIMD, n_gt_8) {
42129 TEST_REQUIRES_PSIMD;
42130 for (uint32_t n = 9; n < 16; n++) {
42131 for (size_t k = 1; k <= 20; k += 5) {
42132 GemmMicrokernelTester()
42133 .mr(4)
42134 .nr(8)
42135 .kr(1)
42136 .sr(4)
42137 .m(4)
42138 .n(8)
42139 .k(k)
42140 .Test(xnn_f32_igemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
42141 }
42142 }
42143 }
42144
42145 TEST(F32_IGEMM_4X8S4__PSIMD, n_gt_8_strided_cn) {
42146 TEST_REQUIRES_PSIMD;
42147 for (uint32_t n = 9; n < 16; n++) {
42148 for (size_t k = 1; k <= 20; k += 5) {
42149 GemmMicrokernelTester()
42150 .mr(4)
42151 .nr(8)
42152 .kr(1)
42153 .sr(4)
42154 .m(4)
42155 .n(8)
42156 .k(k)
42157 .cn_stride(11)
42158 .Test(xnn_f32_igemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
42159 }
42160 }
42161 }
42162
42163 TEST(F32_IGEMM_4X8S4__PSIMD, n_gt_8_subtile) {
42164 TEST_REQUIRES_PSIMD;
42165 for (uint32_t n = 9; n < 16; n++) {
42166 for (size_t k = 1; k <= 20; k += 5) {
42167 for (uint32_t m = 1; m <= 4; m++) {
42168 GemmMicrokernelTester()
42169 .mr(4)
42170 .nr(8)
42171 .kr(1)
42172 .sr(4)
42173 .m(m)
42174 .n(n)
42175 .k(k)
42176 .iterations(1)
42177 .Test(xnn_f32_igemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
42178 }
42179 }
42180 }
42181 }
42182
42183 TEST(F32_IGEMM_4X8S4__PSIMD, n_div_8) {
42184 TEST_REQUIRES_PSIMD;
42185 for (uint32_t n = 16; n <= 24; n += 8) {
42186 for (size_t k = 1; k <= 20; k += 5) {
42187 GemmMicrokernelTester()
42188 .mr(4)
42189 .nr(8)
42190 .kr(1)
42191 .sr(4)
42192 .m(4)
42193 .n(8)
42194 .k(k)
42195 .Test(xnn_f32_igemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
42196 }
42197 }
42198 }
42199
42200 TEST(F32_IGEMM_4X8S4__PSIMD, n_div_8_strided_cn) {
42201 TEST_REQUIRES_PSIMD;
42202 for (uint32_t n = 16; n <= 24; n += 8) {
42203 for (size_t k = 1; k <= 20; k += 5) {
42204 GemmMicrokernelTester()
42205 .mr(4)
42206 .nr(8)
42207 .kr(1)
42208 .sr(4)
42209 .m(4)
42210 .n(n)
42211 .k(k)
42212 .cn_stride(11)
42213 .Test(xnn_f32_igemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
42214 }
42215 }
42216 }
42217
42218 TEST(F32_IGEMM_4X8S4__PSIMD, n_div_8_subtile) {
42219 TEST_REQUIRES_PSIMD;
42220 for (uint32_t n = 16; n <= 24; n += 8) {
42221 for (size_t k = 1; k <= 20; k += 5) {
42222 for (uint32_t m = 1; m <= 4; m++) {
42223 GemmMicrokernelTester()
42224 .mr(4)
42225 .nr(8)
42226 .kr(1)
42227 .sr(4)
42228 .m(m)
42229 .n(n)
42230 .k(k)
42231 .iterations(1)
42232 .Test(xnn_f32_igemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
42233 }
42234 }
42235 }
42236 }
42237
42238 TEST(F32_IGEMM_4X8S4__PSIMD, small_kernel) {
42239 TEST_REQUIRES_PSIMD;
42240 for (size_t k = 1; k <= 20; k += 5) {
42241 GemmMicrokernelTester()
42242 .mr(4)
42243 .nr(8)
42244 .kr(1)
42245 .sr(4)
42246 .m(4)
42247 .n(8)
42248 .k(k)
42249 .ks(3)
42250 .Test(xnn_f32_igemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
42251 }
42252 }
42253
42254 TEST(F32_IGEMM_4X8S4__PSIMD, small_kernel_subtile) {
42255 TEST_REQUIRES_PSIMD;
42256 for (size_t k = 1; k <= 20; k += 5) {
42257 for (uint32_t m = 1; m <= 4; m++) {
42258 for (uint32_t n = 1; n <= 8; n++) {
42259 GemmMicrokernelTester()
42260 .mr(4)
42261 .nr(8)
42262 .kr(1)
42263 .sr(4)
42264 .m(m)
42265 .n(n)
42266 .k(k)
42267 .ks(3)
42268 .iterations(1)
42269 .Test(xnn_f32_igemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
42270 }
42271 }
42272 }
42273 }
42274
42275 TEST(F32_IGEMM_4X8S4__PSIMD, n_gt_8_small_kernel) {
42276 TEST_REQUIRES_PSIMD;
42277 for (uint32_t n = 9; n < 16; n++) {
42278 for (size_t k = 1; k <= 20; k += 5) {
42279 GemmMicrokernelTester()
42280 .mr(4)
42281 .nr(8)
42282 .kr(1)
42283 .sr(4)
42284 .m(4)
42285 .n(8)
42286 .k(k)
42287 .ks(3)
42288 .Test(xnn_f32_igemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
42289 }
42290 }
42291 }
42292
42293 TEST(F32_IGEMM_4X8S4__PSIMD, n_div_8_small_kernel) {
42294 TEST_REQUIRES_PSIMD;
42295 for (uint32_t n = 16; n <= 24; n += 8) {
42296 for (size_t k = 1; k <= 20; k += 5) {
42297 GemmMicrokernelTester()
42298 .mr(4)
42299 .nr(8)
42300 .kr(1)
42301 .sr(4)
42302 .m(4)
42303 .n(8)
42304 .k(k)
42305 .ks(3)
42306 .Test(xnn_f32_igemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
42307 }
42308 }
42309 }
42310
42311 TEST(F32_IGEMM_4X8S4__PSIMD, strided_cm_subtile) {
42312 TEST_REQUIRES_PSIMD;
42313 for (size_t k = 1; k <= 20; k += 5) {
42314 for (uint32_t m = 1; m <= 4; m++) {
42315 for (uint32_t n = 1; n <= 8; n++) {
42316 GemmMicrokernelTester()
42317 .mr(4)
42318 .nr(8)
42319 .kr(1)
42320 .sr(4)
42321 .m(m)
42322 .n(n)
42323 .k(k)
42324 .cm_stride(11)
42325 .iterations(1)
42326 .Test(xnn_f32_igemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
42327 }
42328 }
42329 }
42330 }
42331
42332 TEST(F32_IGEMM_4X8S4__PSIMD, a_offset) {
42333 TEST_REQUIRES_PSIMD;
42334 for (size_t k = 1; k <= 20; k += 5) {
42335 GemmMicrokernelTester()
42336 .mr(4)
42337 .nr(8)
42338 .kr(1)
42339 .sr(4)
42340 .m(4)
42341 .n(8)
42342 .k(k)
42343 .ks(3)
42344 .a_offset(83)
42345 .Test(xnn_f32_igemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
42346 }
42347 }
42348
42349 TEST(F32_IGEMM_4X8S4__PSIMD, zero) {
42350 TEST_REQUIRES_PSIMD;
42351 for (uint32_t mz = 0; mz < 4; mz++) {
42352 for (size_t k = 1; k <= 20; k += 5) {
42353 GemmMicrokernelTester()
42354 .mr(4)
42355 .nr(8)
42356 .kr(1)
42357 .sr(4)
42358 .m(4)
42359 .n(8)
42360 .k(k)
42361 .ks(3)
42362 .a_offset(83)
42363 .zero_index(mz)
42364 .Test(xnn_f32_igemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
42365 }
42366 }
42367 }
42368
42369 TEST(F32_IGEMM_4X8S4__PSIMD, qmin) {
42370 TEST_REQUIRES_PSIMD;
42371 GemmMicrokernelTester()
42372 .mr(4)
42373 .nr(8)
42374 .kr(1)
42375 .sr(4)
42376 .m(4)
42377 .n(8)
42378 .k(4)
42379 .qmin(128)
42380 .Test(xnn_f32_igemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
42381 }
42382
42383 TEST(F32_IGEMM_4X8S4__PSIMD, qmax) {
42384 TEST_REQUIRES_PSIMD;
42385 GemmMicrokernelTester()
42386 .mr(4)
42387 .nr(8)
42388 .kr(1)
42389 .sr(4)
42390 .m(4)
42391 .n(8)
42392 .k(4)
42393 .qmax(128)
42394 .Test(xnn_f32_igemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
42395 }
42396
42397 TEST(F32_IGEMM_4X8S4__PSIMD, strided_cm) {
42398 TEST_REQUIRES_PSIMD;
42399 GemmMicrokernelTester()
42400 .mr(4)
42401 .nr(8)
42402 .kr(1)
42403 .sr(4)
42404 .m(4)
42405 .n(8)
42406 .k(4)
42407 .cm_stride(11)
42408 .Test(xnn_f32_igemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
42409 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -070042410#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
XNNPACK Teamb455b122019-09-27 18:10:33 -070042411
42412
Marat Dukhan1dadbf72019-10-01 10:46:20 -070042413#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
XNNPACK Teamb455b122019-09-27 18:10:33 -070042414 TEST(F32_IGEMM_6X8S4__PSIMD, k_eq_4) {
42415 TEST_REQUIRES_PSIMD;
42416 GemmMicrokernelTester()
42417 .mr(6)
42418 .nr(8)
42419 .kr(1)
42420 .sr(4)
42421 .m(6)
42422 .n(8)
42423 .k(4)
42424 .Test(xnn_f32_igemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
42425 }
42426
42427 TEST(F32_IGEMM_6X8S4__PSIMD, strided_cn) {
42428 TEST_REQUIRES_PSIMD;
42429 GemmMicrokernelTester()
42430 .mr(6)
42431 .nr(8)
42432 .kr(1)
42433 .sr(4)
42434 .m(6)
42435 .n(8)
42436 .k(4)
42437 .cn_stride(11)
42438 .Test(xnn_f32_igemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
42439 }
42440
42441 TEST(F32_IGEMM_6X8S4__PSIMD, k_eq_4_subtile) {
42442 TEST_REQUIRES_PSIMD;
42443 for (uint32_t m = 1; m <= 6; m++) {
42444 for (uint32_t n = 1; n <= 8; n++) {
42445 GemmMicrokernelTester()
42446 .mr(6)
42447 .nr(8)
42448 .kr(1)
42449 .sr(4)
42450 .m(m)
42451 .n(n)
42452 .k(4)
42453 .iterations(1)
42454 .Test(xnn_f32_igemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
42455 }
42456 }
42457 }
42458
42459 TEST(F32_IGEMM_6X8S4__PSIMD, k_eq_4_subtile_m) {
42460 TEST_REQUIRES_PSIMD;
42461 for (uint32_t m = 1; m <= 6; m++) {
42462 GemmMicrokernelTester()
42463 .mr(6)
42464 .nr(8)
42465 .kr(1)
42466 .sr(4)
42467 .m(m)
42468 .n(8)
42469 .k(4)
42470 .iterations(1)
42471 .Test(xnn_f32_igemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
42472 }
42473 }
42474
42475 TEST(F32_IGEMM_6X8S4__PSIMD, k_eq_4_subtile_n) {
42476 TEST_REQUIRES_PSIMD;
42477 for (uint32_t n = 1; n <= 8; n++) {
42478 GemmMicrokernelTester()
42479 .mr(6)
42480 .nr(8)
42481 .kr(1)
42482 .sr(4)
42483 .m(6)
42484 .n(n)
42485 .k(4)
42486 .iterations(1)
42487 .Test(xnn_f32_igemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
42488 }
42489 }
42490
42491 TEST(F32_IGEMM_6X8S4__PSIMD, k_lt_4) {
42492 TEST_REQUIRES_PSIMD;
42493 for (size_t k = 1; k < 4; k++) {
42494 GemmMicrokernelTester()
42495 .mr(6)
42496 .nr(8)
42497 .kr(1)
42498 .sr(4)
42499 .m(6)
42500 .n(8)
42501 .k(k)
42502 .Test(xnn_f32_igemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
42503 }
42504 }
42505
42506 TEST(F32_IGEMM_6X8S4__PSIMD, k_lt_4_subtile) {
42507 TEST_REQUIRES_PSIMD;
42508 for (size_t k = 1; k < 4; k++) {
42509 for (uint32_t m = 1; m <= 6; m++) {
42510 for (uint32_t n = 1; n <= 8; n++) {
42511 GemmMicrokernelTester()
42512 .mr(6)
42513 .nr(8)
42514 .kr(1)
42515 .sr(4)
42516 .m(m)
42517 .n(n)
42518 .k(k)
42519 .iterations(1)
42520 .Test(xnn_f32_igemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
42521 }
42522 }
42523 }
42524 }
42525
42526 TEST(F32_IGEMM_6X8S4__PSIMD, k_gt_4) {
42527 TEST_REQUIRES_PSIMD;
42528 for (size_t k = 5; k < 8; k++) {
42529 GemmMicrokernelTester()
42530 .mr(6)
42531 .nr(8)
42532 .kr(1)
42533 .sr(4)
42534 .m(6)
42535 .n(8)
42536 .k(k)
42537 .Test(xnn_f32_igemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
42538 }
42539 }
42540
42541 TEST(F32_IGEMM_6X8S4__PSIMD, k_gt_4_subtile) {
42542 TEST_REQUIRES_PSIMD;
42543 for (size_t k = 5; k < 8; k++) {
42544 for (uint32_t m = 1; m <= 6; m++) {
42545 for (uint32_t n = 1; n <= 8; n++) {
42546 GemmMicrokernelTester()
42547 .mr(6)
42548 .nr(8)
42549 .kr(1)
42550 .sr(4)
42551 .m(m)
42552 .n(n)
42553 .k(k)
42554 .iterations(1)
42555 .Test(xnn_f32_igemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
42556 }
42557 }
42558 }
42559 }
42560
42561 TEST(F32_IGEMM_6X8S4__PSIMD, k_div_4) {
42562 TEST_REQUIRES_PSIMD;
42563 for (size_t k = 8; k <= 40; k += 4) {
42564 GemmMicrokernelTester()
42565 .mr(6)
42566 .nr(8)
42567 .kr(1)
42568 .sr(4)
42569 .m(6)
42570 .n(8)
42571 .k(k)
42572 .Test(xnn_f32_igemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
42573 }
42574 }
42575
42576 TEST(F32_IGEMM_6X8S4__PSIMD, k_div_4_subtile) {
42577 TEST_REQUIRES_PSIMD;
42578 for (size_t k = 8; k <= 40; k += 4) {
42579 for (uint32_t m = 1; m <= 6; m++) {
42580 for (uint32_t n = 1; n <= 8; n++) {
42581 GemmMicrokernelTester()
42582 .mr(6)
42583 .nr(8)
42584 .kr(1)
42585 .sr(4)
42586 .m(m)
42587 .n(n)
42588 .k(k)
42589 .iterations(1)
42590 .Test(xnn_f32_igemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
42591 }
42592 }
42593 }
42594 }
42595
42596 TEST(F32_IGEMM_6X8S4__PSIMD, n_gt_8) {
42597 TEST_REQUIRES_PSIMD;
42598 for (uint32_t n = 9; n < 16; n++) {
42599 for (size_t k = 1; k <= 20; k += 5) {
42600 GemmMicrokernelTester()
42601 .mr(6)
42602 .nr(8)
42603 .kr(1)
42604 .sr(4)
42605 .m(6)
42606 .n(8)
42607 .k(k)
42608 .Test(xnn_f32_igemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
42609 }
42610 }
42611 }
42612
42613 TEST(F32_IGEMM_6X8S4__PSIMD, n_gt_8_strided_cn) {
42614 TEST_REQUIRES_PSIMD;
42615 for (uint32_t n = 9; n < 16; n++) {
42616 for (size_t k = 1; k <= 20; k += 5) {
42617 GemmMicrokernelTester()
42618 .mr(6)
42619 .nr(8)
42620 .kr(1)
42621 .sr(4)
42622 .m(6)
42623 .n(8)
42624 .k(k)
42625 .cn_stride(11)
42626 .Test(xnn_f32_igemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
42627 }
42628 }
42629 }
42630
42631 TEST(F32_IGEMM_6X8S4__PSIMD, n_gt_8_subtile) {
42632 TEST_REQUIRES_PSIMD;
42633 for (uint32_t n = 9; n < 16; n++) {
42634 for (size_t k = 1; k <= 20; k += 5) {
42635 for (uint32_t m = 1; m <= 6; m++) {
42636 GemmMicrokernelTester()
42637 .mr(6)
42638 .nr(8)
42639 .kr(1)
42640 .sr(4)
42641 .m(m)
42642 .n(n)
42643 .k(k)
42644 .iterations(1)
42645 .Test(xnn_f32_igemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
42646 }
42647 }
42648 }
42649 }
42650
42651 TEST(F32_IGEMM_6X8S4__PSIMD, n_div_8) {
42652 TEST_REQUIRES_PSIMD;
42653 for (uint32_t n = 16; n <= 24; n += 8) {
42654 for (size_t k = 1; k <= 20; k += 5) {
42655 GemmMicrokernelTester()
42656 .mr(6)
42657 .nr(8)
42658 .kr(1)
42659 .sr(4)
42660 .m(6)
42661 .n(8)
42662 .k(k)
42663 .Test(xnn_f32_igemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
42664 }
42665 }
42666 }
42667
42668 TEST(F32_IGEMM_6X8S4__PSIMD, n_div_8_strided_cn) {
42669 TEST_REQUIRES_PSIMD;
42670 for (uint32_t n = 16; n <= 24; n += 8) {
42671 for (size_t k = 1; k <= 20; k += 5) {
42672 GemmMicrokernelTester()
42673 .mr(6)
42674 .nr(8)
42675 .kr(1)
42676 .sr(4)
42677 .m(6)
42678 .n(n)
42679 .k(k)
42680 .cn_stride(11)
42681 .Test(xnn_f32_igemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
42682 }
42683 }
42684 }
42685
42686 TEST(F32_IGEMM_6X8S4__PSIMD, n_div_8_subtile) {
42687 TEST_REQUIRES_PSIMD;
42688 for (uint32_t n = 16; n <= 24; n += 8) {
42689 for (size_t k = 1; k <= 20; k += 5) {
42690 for (uint32_t m = 1; m <= 6; m++) {
42691 GemmMicrokernelTester()
42692 .mr(6)
42693 .nr(8)
42694 .kr(1)
42695 .sr(4)
42696 .m(m)
42697 .n(n)
42698 .k(k)
42699 .iterations(1)
42700 .Test(xnn_f32_igemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
42701 }
42702 }
42703 }
42704 }
42705
42706 TEST(F32_IGEMM_6X8S4__PSIMD, small_kernel) {
42707 TEST_REQUIRES_PSIMD;
42708 for (size_t k = 1; k <= 20; k += 5) {
42709 GemmMicrokernelTester()
42710 .mr(6)
42711 .nr(8)
42712 .kr(1)
42713 .sr(4)
42714 .m(6)
42715 .n(8)
42716 .k(k)
42717 .ks(3)
42718 .Test(xnn_f32_igemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
42719 }
42720 }
42721
42722 TEST(F32_IGEMM_6X8S4__PSIMD, small_kernel_subtile) {
42723 TEST_REQUIRES_PSIMD;
42724 for (size_t k = 1; k <= 20; k += 5) {
42725 for (uint32_t m = 1; m <= 6; m++) {
42726 for (uint32_t n = 1; n <= 8; n++) {
42727 GemmMicrokernelTester()
42728 .mr(6)
42729 .nr(8)
42730 .kr(1)
42731 .sr(4)
42732 .m(m)
42733 .n(n)
42734 .k(k)
42735 .ks(3)
42736 .iterations(1)
42737 .Test(xnn_f32_igemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
42738 }
42739 }
42740 }
42741 }
42742
42743 TEST(F32_IGEMM_6X8S4__PSIMD, n_gt_8_small_kernel) {
42744 TEST_REQUIRES_PSIMD;
42745 for (uint32_t n = 9; n < 16; n++) {
42746 for (size_t k = 1; k <= 20; k += 5) {
42747 GemmMicrokernelTester()
42748 .mr(6)
42749 .nr(8)
42750 .kr(1)
42751 .sr(4)
42752 .m(6)
42753 .n(8)
42754 .k(k)
42755 .ks(3)
42756 .Test(xnn_f32_igemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
42757 }
42758 }
42759 }
42760
42761 TEST(F32_IGEMM_6X8S4__PSIMD, n_div_8_small_kernel) {
42762 TEST_REQUIRES_PSIMD;
42763 for (uint32_t n = 16; n <= 24; n += 8) {
42764 for (size_t k = 1; k <= 20; k += 5) {
42765 GemmMicrokernelTester()
42766 .mr(6)
42767 .nr(8)
42768 .kr(1)
42769 .sr(4)
42770 .m(6)
42771 .n(8)
42772 .k(k)
42773 .ks(3)
42774 .Test(xnn_f32_igemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
42775 }
42776 }
42777 }
42778
42779 TEST(F32_IGEMM_6X8S4__PSIMD, strided_cm_subtile) {
42780 TEST_REQUIRES_PSIMD;
42781 for (size_t k = 1; k <= 20; k += 5) {
42782 for (uint32_t m = 1; m <= 6; m++) {
42783 for (uint32_t n = 1; n <= 8; n++) {
42784 GemmMicrokernelTester()
42785 .mr(6)
42786 .nr(8)
42787 .kr(1)
42788 .sr(4)
42789 .m(m)
42790 .n(n)
42791 .k(k)
42792 .cm_stride(11)
42793 .iterations(1)
42794 .Test(xnn_f32_igemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
42795 }
42796 }
42797 }
42798 }
42799
42800 TEST(F32_IGEMM_6X8S4__PSIMD, a_offset) {
42801 TEST_REQUIRES_PSIMD;
42802 for (size_t k = 1; k <= 20; k += 5) {
42803 GemmMicrokernelTester()
42804 .mr(6)
42805 .nr(8)
42806 .kr(1)
42807 .sr(4)
42808 .m(6)
42809 .n(8)
42810 .k(k)
42811 .ks(3)
42812 .a_offset(127)
42813 .Test(xnn_f32_igemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
42814 }
42815 }
42816
42817 TEST(F32_IGEMM_6X8S4__PSIMD, zero) {
42818 TEST_REQUIRES_PSIMD;
42819 for (uint32_t mz = 0; mz < 6; mz++) {
42820 for (size_t k = 1; k <= 20; k += 5) {
42821 GemmMicrokernelTester()
42822 .mr(6)
42823 .nr(8)
42824 .kr(1)
42825 .sr(4)
42826 .m(6)
42827 .n(8)
42828 .k(k)
42829 .ks(3)
42830 .a_offset(127)
42831 .zero_index(mz)
42832 .Test(xnn_f32_igemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
42833 }
42834 }
42835 }
42836
42837 TEST(F32_IGEMM_6X8S4__PSIMD, qmin) {
42838 TEST_REQUIRES_PSIMD;
42839 GemmMicrokernelTester()
42840 .mr(6)
42841 .nr(8)
42842 .kr(1)
42843 .sr(4)
42844 .m(6)
42845 .n(8)
42846 .k(4)
42847 .qmin(128)
42848 .Test(xnn_f32_igemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
42849 }
42850
42851 TEST(F32_IGEMM_6X8S4__PSIMD, qmax) {
42852 TEST_REQUIRES_PSIMD;
42853 GemmMicrokernelTester()
42854 .mr(6)
42855 .nr(8)
42856 .kr(1)
42857 .sr(4)
42858 .m(6)
42859 .n(8)
42860 .k(4)
42861 .qmax(128)
42862 .Test(xnn_f32_igemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
42863 }
42864
42865 TEST(F32_IGEMM_6X8S4__PSIMD, strided_cm) {
42866 TEST_REQUIRES_PSIMD;
42867 GemmMicrokernelTester()
42868 .mr(6)
42869 .nr(8)
42870 .kr(1)
42871 .sr(4)
42872 .m(6)
42873 .n(8)
42874 .k(4)
42875 .cm_stride(11)
42876 .Test(xnn_f32_igemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
42877 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -070042878#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
XNNPACK Teamb455b122019-09-27 18:10:33 -070042879
42880
Marat Dukhan436ebe62019-12-04 15:10:12 -080042881#if XNN_ARCH_WASM
42882 TEST(F32_IGEMM_1X4__WASM, k_eq_1) {
42883 GemmMicrokernelTester()
42884 .mr(1)
42885 .nr(4)
42886 .kr(1)
42887 .sr(1)
42888 .m(1)
42889 .n(4)
42890 .k(1)
42891 .Test(xnn_f32_igemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42892 }
42893
42894 TEST(F32_IGEMM_1X4__WASM, strided_cn) {
42895 GemmMicrokernelTester()
42896 .mr(1)
42897 .nr(4)
42898 .kr(1)
42899 .sr(1)
42900 .m(1)
42901 .n(4)
42902 .k(1)
42903 .cn_stride(7)
42904 .Test(xnn_f32_igemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42905 }
42906
42907 TEST(F32_IGEMM_1X4__WASM, k_eq_1_subtile) {
42908 for (uint32_t m = 1; m <= 1; m++) {
42909 for (uint32_t n = 1; n <= 4; n++) {
42910 GemmMicrokernelTester()
42911 .mr(1)
42912 .nr(4)
42913 .kr(1)
42914 .sr(1)
42915 .m(m)
42916 .n(n)
42917 .k(1)
42918 .iterations(1)
42919 .Test(xnn_f32_igemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42920 }
42921 }
42922 }
42923
42924 TEST(F32_IGEMM_1X4__WASM, k_eq_1_subtile_m) {
42925 for (uint32_t m = 1; m <= 1; m++) {
42926 GemmMicrokernelTester()
42927 .mr(1)
42928 .nr(4)
42929 .kr(1)
42930 .sr(1)
42931 .m(m)
42932 .n(4)
42933 .k(1)
42934 .iterations(1)
42935 .Test(xnn_f32_igemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42936 }
42937 }
42938
42939 TEST(F32_IGEMM_1X4__WASM, k_eq_1_subtile_n) {
42940 for (uint32_t n = 1; n <= 4; n++) {
42941 GemmMicrokernelTester()
42942 .mr(1)
42943 .nr(4)
42944 .kr(1)
42945 .sr(1)
42946 .m(1)
42947 .n(n)
42948 .k(1)
42949 .iterations(1)
42950 .Test(xnn_f32_igemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42951 }
42952 }
42953
42954 TEST(F32_IGEMM_1X4__WASM, k_gt_1) {
42955 for (size_t k = 2; k < 10; k++) {
42956 GemmMicrokernelTester()
42957 .mr(1)
42958 .nr(4)
42959 .kr(1)
42960 .sr(1)
42961 .m(1)
42962 .n(4)
42963 .k(k)
42964 .Test(xnn_f32_igemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42965 }
42966 }
42967
42968 TEST(F32_IGEMM_1X4__WASM, k_gt_1_subtile) {
42969 for (size_t k = 2; k < 10; k++) {
42970 for (uint32_t m = 1; m <= 1; m++) {
42971 for (uint32_t n = 1; n <= 4; n++) {
42972 GemmMicrokernelTester()
42973 .mr(1)
42974 .nr(4)
42975 .kr(1)
42976 .sr(1)
42977 .m(m)
42978 .n(n)
42979 .k(k)
42980 .iterations(1)
42981 .Test(xnn_f32_igemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42982 }
42983 }
42984 }
42985 }
42986
42987 TEST(F32_IGEMM_1X4__WASM, n_gt_4) {
42988 for (uint32_t n = 5; n < 8; n++) {
42989 for (size_t k = 1; k <= 5; k += 2) {
42990 GemmMicrokernelTester()
42991 .mr(1)
42992 .nr(4)
42993 .kr(1)
42994 .sr(1)
42995 .m(1)
42996 .n(4)
42997 .k(k)
42998 .Test(xnn_f32_igemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42999 }
43000 }
43001 }
43002
43003 TEST(F32_IGEMM_1X4__WASM, n_gt_4_strided_cn) {
43004 for (uint32_t n = 5; n < 8; n++) {
43005 for (size_t k = 1; k <= 5; k += 2) {
43006 GemmMicrokernelTester()
43007 .mr(1)
43008 .nr(4)
43009 .kr(1)
43010 .sr(1)
43011 .m(1)
43012 .n(4)
43013 .k(k)
43014 .cn_stride(7)
43015 .Test(xnn_f32_igemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
43016 }
43017 }
43018 }
43019
43020 TEST(F32_IGEMM_1X4__WASM, n_gt_4_subtile) {
43021 for (uint32_t n = 5; n < 8; n++) {
43022 for (size_t k = 1; k <= 5; k += 2) {
43023 for (uint32_t m = 1; m <= 1; m++) {
43024 GemmMicrokernelTester()
43025 .mr(1)
43026 .nr(4)
43027 .kr(1)
43028 .sr(1)
43029 .m(m)
43030 .n(n)
43031 .k(k)
43032 .iterations(1)
43033 .Test(xnn_f32_igemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
43034 }
43035 }
43036 }
43037 }
43038
43039 TEST(F32_IGEMM_1X4__WASM, n_div_4) {
43040 for (uint32_t n = 8; n <= 12; n += 4) {
43041 for (size_t k = 1; k <= 5; k += 2) {
43042 GemmMicrokernelTester()
43043 .mr(1)
43044 .nr(4)
43045 .kr(1)
43046 .sr(1)
43047 .m(1)
43048 .n(4)
43049 .k(k)
43050 .Test(xnn_f32_igemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
43051 }
43052 }
43053 }
43054
43055 TEST(F32_IGEMM_1X4__WASM, n_div_4_strided_cn) {
43056 for (uint32_t n = 8; n <= 12; n += 4) {
43057 for (size_t k = 1; k <= 5; k += 2) {
43058 GemmMicrokernelTester()
43059 .mr(1)
43060 .nr(4)
43061 .kr(1)
43062 .sr(1)
43063 .m(1)
43064 .n(n)
43065 .k(k)
43066 .cn_stride(7)
43067 .Test(xnn_f32_igemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
43068 }
43069 }
43070 }
43071
43072 TEST(F32_IGEMM_1X4__WASM, n_div_4_subtile) {
43073 for (uint32_t n = 8; n <= 12; n += 4) {
43074 for (size_t k = 1; k <= 5; k += 2) {
43075 for (uint32_t m = 1; m <= 1; m++) {
43076 GemmMicrokernelTester()
43077 .mr(1)
43078 .nr(4)
43079 .kr(1)
43080 .sr(1)
43081 .m(m)
43082 .n(n)
43083 .k(k)
43084 .iterations(1)
43085 .Test(xnn_f32_igemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
43086 }
43087 }
43088 }
43089 }
43090
43091 TEST(F32_IGEMM_1X4__WASM, small_kernel) {
43092 for (size_t k = 1; k <= 5; k += 2) {
43093 GemmMicrokernelTester()
43094 .mr(1)
43095 .nr(4)
43096 .kr(1)
43097 .sr(1)
43098 .m(1)
43099 .n(4)
43100 .k(k)
43101 .ks(3)
43102 .Test(xnn_f32_igemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
43103 }
43104 }
43105
43106 TEST(F32_IGEMM_1X4__WASM, small_kernel_subtile) {
43107 for (size_t k = 1; k <= 5; k += 2) {
43108 for (uint32_t m = 1; m <= 1; m++) {
43109 for (uint32_t n = 1; n <= 4; n++) {
43110 GemmMicrokernelTester()
43111 .mr(1)
43112 .nr(4)
43113 .kr(1)
43114 .sr(1)
43115 .m(m)
43116 .n(n)
43117 .k(k)
43118 .ks(3)
43119 .iterations(1)
43120 .Test(xnn_f32_igemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
43121 }
43122 }
43123 }
43124 }
43125
43126 TEST(F32_IGEMM_1X4__WASM, n_gt_4_small_kernel) {
43127 for (uint32_t n = 5; n < 8; n++) {
43128 for (size_t k = 1; k <= 5; k += 2) {
43129 GemmMicrokernelTester()
43130 .mr(1)
43131 .nr(4)
43132 .kr(1)
43133 .sr(1)
43134 .m(1)
43135 .n(4)
43136 .k(k)
43137 .ks(3)
43138 .Test(xnn_f32_igemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
43139 }
43140 }
43141 }
43142
43143 TEST(F32_IGEMM_1X4__WASM, n_div_4_small_kernel) {
43144 for (uint32_t n = 8; n <= 12; n += 4) {
43145 for (size_t k = 1; k <= 5; k += 2) {
43146 GemmMicrokernelTester()
43147 .mr(1)
43148 .nr(4)
43149 .kr(1)
43150 .sr(1)
43151 .m(1)
43152 .n(4)
43153 .k(k)
43154 .ks(3)
43155 .Test(xnn_f32_igemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
43156 }
43157 }
43158 }
43159
43160 TEST(F32_IGEMM_1X4__WASM, strided_cm_subtile) {
43161 for (size_t k = 1; k <= 5; k += 2) {
43162 for (uint32_t m = 1; m <= 1; m++) {
43163 for (uint32_t n = 1; n <= 4; n++) {
43164 GemmMicrokernelTester()
43165 .mr(1)
43166 .nr(4)
43167 .kr(1)
43168 .sr(1)
43169 .m(m)
43170 .n(n)
43171 .k(k)
43172 .cm_stride(7)
43173 .iterations(1)
43174 .Test(xnn_f32_igemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
43175 }
43176 }
43177 }
43178 }
43179
43180 TEST(F32_IGEMM_1X4__WASM, a_offset) {
43181 for (size_t k = 1; k <= 5; k += 2) {
43182 GemmMicrokernelTester()
43183 .mr(1)
43184 .nr(4)
43185 .kr(1)
43186 .sr(1)
43187 .m(1)
43188 .n(4)
43189 .k(k)
43190 .ks(3)
43191 .a_offset(7)
43192 .Test(xnn_f32_igemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
43193 }
43194 }
43195
43196 TEST(F32_IGEMM_1X4__WASM, zero) {
43197 for (uint32_t mz = 0; mz < 1; mz++) {
43198 for (size_t k = 1; k <= 5; k += 2) {
43199 GemmMicrokernelTester()
43200 .mr(1)
43201 .nr(4)
43202 .kr(1)
43203 .sr(1)
43204 .m(1)
43205 .n(4)
43206 .k(k)
43207 .ks(3)
43208 .a_offset(7)
43209 .zero_index(mz)
43210 .Test(xnn_f32_igemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
43211 }
43212 }
43213 }
43214
43215 TEST(F32_IGEMM_1X4__WASM, qmin) {
43216 GemmMicrokernelTester()
43217 .mr(1)
43218 .nr(4)
43219 .kr(1)
43220 .sr(1)
43221 .m(1)
43222 .n(4)
43223 .k(1)
43224 .qmin(128)
43225 .Test(xnn_f32_igemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
43226 }
43227
43228 TEST(F32_IGEMM_1X4__WASM, qmax) {
43229 GemmMicrokernelTester()
43230 .mr(1)
43231 .nr(4)
43232 .kr(1)
43233 .sr(1)
43234 .m(1)
43235 .n(4)
43236 .k(1)
43237 .qmax(128)
43238 .Test(xnn_f32_igemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
43239 }
43240
43241 TEST(F32_IGEMM_1X4__WASM, strided_cm) {
43242 GemmMicrokernelTester()
43243 .mr(1)
43244 .nr(4)
43245 .kr(1)
43246 .sr(1)
43247 .m(1)
43248 .n(4)
43249 .k(1)
43250 .cm_stride(7)
43251 .Test(xnn_f32_igemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
43252 }
43253#endif // XNN_ARCH_WASM
43254
43255
43256#if XNN_ARCH_WASM
43257 TEST(F32_IGEMM_2X4__WASM, k_eq_1) {
43258 GemmMicrokernelTester()
43259 .mr(2)
43260 .nr(4)
43261 .kr(1)
43262 .sr(1)
43263 .m(2)
43264 .n(4)
43265 .k(1)
43266 .Test(xnn_f32_igemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
43267 }
43268
43269 TEST(F32_IGEMM_2X4__WASM, strided_cn) {
43270 GemmMicrokernelTester()
43271 .mr(2)
43272 .nr(4)
43273 .kr(1)
43274 .sr(1)
43275 .m(2)
43276 .n(4)
43277 .k(1)
43278 .cn_stride(7)
43279 .Test(xnn_f32_igemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
43280 }
43281
43282 TEST(F32_IGEMM_2X4__WASM, k_eq_1_subtile) {
43283 for (uint32_t m = 1; m <= 2; m++) {
43284 for (uint32_t n = 1; n <= 4; n++) {
43285 GemmMicrokernelTester()
43286 .mr(2)
43287 .nr(4)
43288 .kr(1)
43289 .sr(1)
43290 .m(m)
43291 .n(n)
43292 .k(1)
43293 .iterations(1)
43294 .Test(xnn_f32_igemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
43295 }
43296 }
43297 }
43298
43299 TEST(F32_IGEMM_2X4__WASM, k_eq_1_subtile_m) {
43300 for (uint32_t m = 1; m <= 2; m++) {
43301 GemmMicrokernelTester()
43302 .mr(2)
43303 .nr(4)
43304 .kr(1)
43305 .sr(1)
43306 .m(m)
43307 .n(4)
43308 .k(1)
43309 .iterations(1)
43310 .Test(xnn_f32_igemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
43311 }
43312 }
43313
43314 TEST(F32_IGEMM_2X4__WASM, k_eq_1_subtile_n) {
43315 for (uint32_t n = 1; n <= 4; n++) {
43316 GemmMicrokernelTester()
43317 .mr(2)
43318 .nr(4)
43319 .kr(1)
43320 .sr(1)
43321 .m(2)
43322 .n(n)
43323 .k(1)
43324 .iterations(1)
43325 .Test(xnn_f32_igemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
43326 }
43327 }
43328
43329 TEST(F32_IGEMM_2X4__WASM, k_gt_1) {
43330 for (size_t k = 2; k < 10; k++) {
43331 GemmMicrokernelTester()
43332 .mr(2)
43333 .nr(4)
43334 .kr(1)
43335 .sr(1)
43336 .m(2)
43337 .n(4)
43338 .k(k)
43339 .Test(xnn_f32_igemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
43340 }
43341 }
43342
43343 TEST(F32_IGEMM_2X4__WASM, k_gt_1_subtile) {
43344 for (size_t k = 2; k < 10; k++) {
43345 for (uint32_t m = 1; m <= 2; m++) {
43346 for (uint32_t n = 1; n <= 4; n++) {
43347 GemmMicrokernelTester()
43348 .mr(2)
43349 .nr(4)
43350 .kr(1)
43351 .sr(1)
43352 .m(m)
43353 .n(n)
43354 .k(k)
43355 .iterations(1)
43356 .Test(xnn_f32_igemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
43357 }
43358 }
43359 }
43360 }
43361
43362 TEST(F32_IGEMM_2X4__WASM, n_gt_4) {
43363 for (uint32_t n = 5; n < 8; n++) {
43364 for (size_t k = 1; k <= 5; k += 2) {
43365 GemmMicrokernelTester()
43366 .mr(2)
43367 .nr(4)
43368 .kr(1)
43369 .sr(1)
43370 .m(2)
43371 .n(4)
43372 .k(k)
43373 .Test(xnn_f32_igemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
43374 }
43375 }
43376 }
43377
43378 TEST(F32_IGEMM_2X4__WASM, n_gt_4_strided_cn) {
43379 for (uint32_t n = 5; n < 8; n++) {
43380 for (size_t k = 1; k <= 5; k += 2) {
43381 GemmMicrokernelTester()
43382 .mr(2)
43383 .nr(4)
43384 .kr(1)
43385 .sr(1)
43386 .m(2)
43387 .n(4)
43388 .k(k)
43389 .cn_stride(7)
43390 .Test(xnn_f32_igemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
43391 }
43392 }
43393 }
43394
43395 TEST(F32_IGEMM_2X4__WASM, n_gt_4_subtile) {
43396 for (uint32_t n = 5; n < 8; n++) {
43397 for (size_t k = 1; k <= 5; k += 2) {
43398 for (uint32_t m = 1; m <= 2; m++) {
43399 GemmMicrokernelTester()
43400 .mr(2)
43401 .nr(4)
43402 .kr(1)
43403 .sr(1)
43404 .m(m)
43405 .n(n)
43406 .k(k)
43407 .iterations(1)
43408 .Test(xnn_f32_igemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
43409 }
43410 }
43411 }
43412 }
43413
43414 TEST(F32_IGEMM_2X4__WASM, n_div_4) {
43415 for (uint32_t n = 8; n <= 12; n += 4) {
43416 for (size_t k = 1; k <= 5; k += 2) {
43417 GemmMicrokernelTester()
43418 .mr(2)
43419 .nr(4)
43420 .kr(1)
43421 .sr(1)
43422 .m(2)
43423 .n(4)
43424 .k(k)
43425 .Test(xnn_f32_igemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
43426 }
43427 }
43428 }
43429
43430 TEST(F32_IGEMM_2X4__WASM, n_div_4_strided_cn) {
43431 for (uint32_t n = 8; n <= 12; n += 4) {
43432 for (size_t k = 1; k <= 5; k += 2) {
43433 GemmMicrokernelTester()
43434 .mr(2)
43435 .nr(4)
43436 .kr(1)
43437 .sr(1)
43438 .m(2)
43439 .n(n)
43440 .k(k)
43441 .cn_stride(7)
43442 .Test(xnn_f32_igemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
43443 }
43444 }
43445 }
43446
43447 TEST(F32_IGEMM_2X4__WASM, n_div_4_subtile) {
43448 for (uint32_t n = 8; n <= 12; n += 4) {
43449 for (size_t k = 1; k <= 5; k += 2) {
43450 for (uint32_t m = 1; m <= 2; m++) {
43451 GemmMicrokernelTester()
43452 .mr(2)
43453 .nr(4)
43454 .kr(1)
43455 .sr(1)
43456 .m(m)
43457 .n(n)
43458 .k(k)
43459 .iterations(1)
43460 .Test(xnn_f32_igemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
43461 }
43462 }
43463 }
43464 }
43465
43466 TEST(F32_IGEMM_2X4__WASM, small_kernel) {
43467 for (size_t k = 1; k <= 5; k += 2) {
43468 GemmMicrokernelTester()
43469 .mr(2)
43470 .nr(4)
43471 .kr(1)
43472 .sr(1)
43473 .m(2)
43474 .n(4)
43475 .k(k)
43476 .ks(3)
43477 .Test(xnn_f32_igemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
43478 }
43479 }
43480
43481 TEST(F32_IGEMM_2X4__WASM, small_kernel_subtile) {
43482 for (size_t k = 1; k <= 5; k += 2) {
43483 for (uint32_t m = 1; m <= 2; m++) {
43484 for (uint32_t n = 1; n <= 4; n++) {
43485 GemmMicrokernelTester()
43486 .mr(2)
43487 .nr(4)
43488 .kr(1)
43489 .sr(1)
43490 .m(m)
43491 .n(n)
43492 .k(k)
43493 .ks(3)
43494 .iterations(1)
43495 .Test(xnn_f32_igemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
43496 }
43497 }
43498 }
43499 }
43500
43501 TEST(F32_IGEMM_2X4__WASM, n_gt_4_small_kernel) {
43502 for (uint32_t n = 5; n < 8; n++) {
43503 for (size_t k = 1; k <= 5; k += 2) {
43504 GemmMicrokernelTester()
43505 .mr(2)
43506 .nr(4)
43507 .kr(1)
43508 .sr(1)
43509 .m(2)
43510 .n(4)
43511 .k(k)
43512 .ks(3)
43513 .Test(xnn_f32_igemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
43514 }
43515 }
43516 }
43517
43518 TEST(F32_IGEMM_2X4__WASM, n_div_4_small_kernel) {
43519 for (uint32_t n = 8; n <= 12; n += 4) {
43520 for (size_t k = 1; k <= 5; k += 2) {
43521 GemmMicrokernelTester()
43522 .mr(2)
43523 .nr(4)
43524 .kr(1)
43525 .sr(1)
43526 .m(2)
43527 .n(4)
43528 .k(k)
43529 .ks(3)
43530 .Test(xnn_f32_igemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
43531 }
43532 }
43533 }
43534
43535 TEST(F32_IGEMM_2X4__WASM, strided_cm_subtile) {
43536 for (size_t k = 1; k <= 5; k += 2) {
43537 for (uint32_t m = 1; m <= 2; m++) {
43538 for (uint32_t n = 1; n <= 4; n++) {
43539 GemmMicrokernelTester()
43540 .mr(2)
43541 .nr(4)
43542 .kr(1)
43543 .sr(1)
43544 .m(m)
43545 .n(n)
43546 .k(k)
43547 .cm_stride(7)
43548 .iterations(1)
43549 .Test(xnn_f32_igemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
43550 }
43551 }
43552 }
43553 }
43554
43555 TEST(F32_IGEMM_2X4__WASM, a_offset) {
43556 for (size_t k = 1; k <= 5; k += 2) {
43557 GemmMicrokernelTester()
43558 .mr(2)
43559 .nr(4)
43560 .kr(1)
43561 .sr(1)
43562 .m(2)
43563 .n(4)
43564 .k(k)
43565 .ks(3)
43566 .a_offset(13)
43567 .Test(xnn_f32_igemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
43568 }
43569 }
43570
43571 TEST(F32_IGEMM_2X4__WASM, zero) {
43572 for (uint32_t mz = 0; mz < 2; mz++) {
43573 for (size_t k = 1; k <= 5; k += 2) {
43574 GemmMicrokernelTester()
43575 .mr(2)
43576 .nr(4)
43577 .kr(1)
43578 .sr(1)
43579 .m(2)
43580 .n(4)
43581 .k(k)
43582 .ks(3)
43583 .a_offset(13)
43584 .zero_index(mz)
43585 .Test(xnn_f32_igemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
43586 }
43587 }
43588 }
43589
43590 TEST(F32_IGEMM_2X4__WASM, qmin) {
43591 GemmMicrokernelTester()
43592 .mr(2)
43593 .nr(4)
43594 .kr(1)
43595 .sr(1)
43596 .m(2)
43597 .n(4)
43598 .k(1)
43599 .qmin(128)
43600 .Test(xnn_f32_igemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
43601 }
43602
43603 TEST(F32_IGEMM_2X4__WASM, qmax) {
43604 GemmMicrokernelTester()
43605 .mr(2)
43606 .nr(4)
43607 .kr(1)
43608 .sr(1)
43609 .m(2)
43610 .n(4)
43611 .k(1)
43612 .qmax(128)
43613 .Test(xnn_f32_igemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
43614 }
43615
43616 TEST(F32_IGEMM_2X4__WASM, strided_cm) {
43617 GemmMicrokernelTester()
43618 .mr(2)
43619 .nr(4)
43620 .kr(1)
43621 .sr(1)
43622 .m(2)
43623 .n(4)
43624 .k(1)
43625 .cm_stride(7)
43626 .Test(xnn_f32_igemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
43627 }
43628#endif // XNN_ARCH_WASM
43629
43630
43631#if XNN_ARCH_WASM
43632 TEST(F32_IGEMM_4X4__WASM, k_eq_1) {
43633 GemmMicrokernelTester()
43634 .mr(4)
43635 .nr(4)
43636 .kr(1)
43637 .sr(1)
43638 .m(4)
43639 .n(4)
43640 .k(1)
43641 .Test(xnn_f32_igemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
43642 }
43643
43644 TEST(F32_IGEMM_4X4__WASM, strided_cn) {
43645 GemmMicrokernelTester()
43646 .mr(4)
43647 .nr(4)
43648 .kr(1)
43649 .sr(1)
43650 .m(4)
43651 .n(4)
43652 .k(1)
43653 .cn_stride(7)
43654 .Test(xnn_f32_igemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
43655 }
43656
43657 TEST(F32_IGEMM_4X4__WASM, k_eq_1_subtile) {
43658 for (uint32_t m = 1; m <= 4; m++) {
43659 for (uint32_t n = 1; n <= 4; n++) {
43660 GemmMicrokernelTester()
43661 .mr(4)
43662 .nr(4)
43663 .kr(1)
43664 .sr(1)
43665 .m(m)
43666 .n(n)
43667 .k(1)
43668 .iterations(1)
43669 .Test(xnn_f32_igemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
43670 }
43671 }
43672 }
43673
43674 TEST(F32_IGEMM_4X4__WASM, k_eq_1_subtile_m) {
43675 for (uint32_t m = 1; m <= 4; m++) {
43676 GemmMicrokernelTester()
43677 .mr(4)
43678 .nr(4)
43679 .kr(1)
43680 .sr(1)
43681 .m(m)
43682 .n(4)
43683 .k(1)
43684 .iterations(1)
43685 .Test(xnn_f32_igemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
43686 }
43687 }
43688
43689 TEST(F32_IGEMM_4X4__WASM, k_eq_1_subtile_n) {
43690 for (uint32_t n = 1; n <= 4; n++) {
43691 GemmMicrokernelTester()
43692 .mr(4)
43693 .nr(4)
43694 .kr(1)
43695 .sr(1)
43696 .m(4)
43697 .n(n)
43698 .k(1)
43699 .iterations(1)
43700 .Test(xnn_f32_igemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
43701 }
43702 }
43703
43704 TEST(F32_IGEMM_4X4__WASM, k_gt_1) {
43705 for (size_t k = 2; k < 10; k++) {
43706 GemmMicrokernelTester()
43707 .mr(4)
43708 .nr(4)
43709 .kr(1)
43710 .sr(1)
43711 .m(4)
43712 .n(4)
43713 .k(k)
43714 .Test(xnn_f32_igemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
43715 }
43716 }
43717
43718 TEST(F32_IGEMM_4X4__WASM, k_gt_1_subtile) {
43719 for (size_t k = 2; k < 10; k++) {
43720 for (uint32_t m = 1; m <= 4; m++) {
43721 for (uint32_t n = 1; n <= 4; n++) {
43722 GemmMicrokernelTester()
43723 .mr(4)
43724 .nr(4)
43725 .kr(1)
43726 .sr(1)
43727 .m(m)
43728 .n(n)
43729 .k(k)
43730 .iterations(1)
43731 .Test(xnn_f32_igemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
43732 }
43733 }
43734 }
43735 }
43736
43737 TEST(F32_IGEMM_4X4__WASM, n_gt_4) {
43738 for (uint32_t n = 5; n < 8; n++) {
43739 for (size_t k = 1; k <= 5; k += 2) {
43740 GemmMicrokernelTester()
43741 .mr(4)
43742 .nr(4)
43743 .kr(1)
43744 .sr(1)
43745 .m(4)
43746 .n(4)
43747 .k(k)
43748 .Test(xnn_f32_igemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
43749 }
43750 }
43751 }
43752
43753 TEST(F32_IGEMM_4X4__WASM, n_gt_4_strided_cn) {
43754 for (uint32_t n = 5; n < 8; n++) {
43755 for (size_t k = 1; k <= 5; k += 2) {
43756 GemmMicrokernelTester()
43757 .mr(4)
43758 .nr(4)
43759 .kr(1)
43760 .sr(1)
43761 .m(4)
43762 .n(4)
43763 .k(k)
43764 .cn_stride(7)
43765 .Test(xnn_f32_igemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
43766 }
43767 }
43768 }
43769
43770 TEST(F32_IGEMM_4X4__WASM, n_gt_4_subtile) {
43771 for (uint32_t n = 5; n < 8; n++) {
43772 for (size_t k = 1; k <= 5; k += 2) {
43773 for (uint32_t m = 1; m <= 4; m++) {
43774 GemmMicrokernelTester()
43775 .mr(4)
43776 .nr(4)
43777 .kr(1)
43778 .sr(1)
43779 .m(m)
43780 .n(n)
43781 .k(k)
43782 .iterations(1)
43783 .Test(xnn_f32_igemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
43784 }
43785 }
43786 }
43787 }
43788
43789 TEST(F32_IGEMM_4X4__WASM, n_div_4) {
43790 for (uint32_t n = 8; n <= 12; n += 4) {
43791 for (size_t k = 1; k <= 5; k += 2) {
43792 GemmMicrokernelTester()
43793 .mr(4)
43794 .nr(4)
43795 .kr(1)
43796 .sr(1)
43797 .m(4)
43798 .n(4)
43799 .k(k)
43800 .Test(xnn_f32_igemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
43801 }
43802 }
43803 }
43804
43805 TEST(F32_IGEMM_4X4__WASM, n_div_4_strided_cn) {
43806 for (uint32_t n = 8; n <= 12; n += 4) {
43807 for (size_t k = 1; k <= 5; k += 2) {
43808 GemmMicrokernelTester()
43809 .mr(4)
43810 .nr(4)
43811 .kr(1)
43812 .sr(1)
43813 .m(4)
43814 .n(n)
43815 .k(k)
43816 .cn_stride(7)
43817 .Test(xnn_f32_igemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
43818 }
43819 }
43820 }
43821
43822 TEST(F32_IGEMM_4X4__WASM, n_div_4_subtile) {
43823 for (uint32_t n = 8; n <= 12; n += 4) {
43824 for (size_t k = 1; k <= 5; k += 2) {
43825 for (uint32_t m = 1; m <= 4; m++) {
43826 GemmMicrokernelTester()
43827 .mr(4)
43828 .nr(4)
43829 .kr(1)
43830 .sr(1)
43831 .m(m)
43832 .n(n)
43833 .k(k)
43834 .iterations(1)
43835 .Test(xnn_f32_igemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
43836 }
43837 }
43838 }
43839 }
43840
43841 TEST(F32_IGEMM_4X4__WASM, small_kernel) {
43842 for (size_t k = 1; k <= 5; k += 2) {
43843 GemmMicrokernelTester()
43844 .mr(4)
43845 .nr(4)
43846 .kr(1)
43847 .sr(1)
43848 .m(4)
43849 .n(4)
43850 .k(k)
43851 .ks(3)
43852 .Test(xnn_f32_igemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
43853 }
43854 }
43855
43856 TEST(F32_IGEMM_4X4__WASM, small_kernel_subtile) {
43857 for (size_t k = 1; k <= 5; k += 2) {
43858 for (uint32_t m = 1; m <= 4; m++) {
43859 for (uint32_t n = 1; n <= 4; n++) {
43860 GemmMicrokernelTester()
43861 .mr(4)
43862 .nr(4)
43863 .kr(1)
43864 .sr(1)
43865 .m(m)
43866 .n(n)
43867 .k(k)
43868 .ks(3)
43869 .iterations(1)
43870 .Test(xnn_f32_igemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
43871 }
43872 }
43873 }
43874 }
43875
43876 TEST(F32_IGEMM_4X4__WASM, n_gt_4_small_kernel) {
43877 for (uint32_t n = 5; n < 8; n++) {
43878 for (size_t k = 1; k <= 5; k += 2) {
43879 GemmMicrokernelTester()
43880 .mr(4)
43881 .nr(4)
43882 .kr(1)
43883 .sr(1)
43884 .m(4)
43885 .n(4)
43886 .k(k)
43887 .ks(3)
43888 .Test(xnn_f32_igemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
43889 }
43890 }
43891 }
43892
43893 TEST(F32_IGEMM_4X4__WASM, n_div_4_small_kernel) {
43894 for (uint32_t n = 8; n <= 12; n += 4) {
43895 for (size_t k = 1; k <= 5; k += 2) {
43896 GemmMicrokernelTester()
43897 .mr(4)
43898 .nr(4)
43899 .kr(1)
43900 .sr(1)
43901 .m(4)
43902 .n(4)
43903 .k(k)
43904 .ks(3)
43905 .Test(xnn_f32_igemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
43906 }
43907 }
43908 }
43909
43910 TEST(F32_IGEMM_4X4__WASM, strided_cm_subtile) {
43911 for (size_t k = 1; k <= 5; k += 2) {
43912 for (uint32_t m = 1; m <= 4; m++) {
43913 for (uint32_t n = 1; n <= 4; n++) {
43914 GemmMicrokernelTester()
43915 .mr(4)
43916 .nr(4)
43917 .kr(1)
43918 .sr(1)
43919 .m(m)
43920 .n(n)
43921 .k(k)
43922 .cm_stride(7)
43923 .iterations(1)
43924 .Test(xnn_f32_igemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
43925 }
43926 }
43927 }
43928 }
43929
43930 TEST(F32_IGEMM_4X4__WASM, a_offset) {
43931 for (size_t k = 1; k <= 5; k += 2) {
43932 GemmMicrokernelTester()
43933 .mr(4)
43934 .nr(4)
43935 .kr(1)
43936 .sr(1)
43937 .m(4)
43938 .n(4)
43939 .k(k)
43940 .ks(3)
43941 .a_offset(23)
43942 .Test(xnn_f32_igemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
43943 }
43944 }
43945
43946 TEST(F32_IGEMM_4X4__WASM, zero) {
43947 for (uint32_t mz = 0; mz < 4; mz++) {
43948 for (size_t k = 1; k <= 5; k += 2) {
43949 GemmMicrokernelTester()
43950 .mr(4)
43951 .nr(4)
43952 .kr(1)
43953 .sr(1)
43954 .m(4)
43955 .n(4)
43956 .k(k)
43957 .ks(3)
43958 .a_offset(23)
43959 .zero_index(mz)
43960 .Test(xnn_f32_igemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
43961 }
43962 }
43963 }
43964
43965 TEST(F32_IGEMM_4X4__WASM, qmin) {
43966 GemmMicrokernelTester()
43967 .mr(4)
43968 .nr(4)
43969 .kr(1)
43970 .sr(1)
43971 .m(4)
43972 .n(4)
43973 .k(1)
43974 .qmin(128)
43975 .Test(xnn_f32_igemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
43976 }
43977
43978 TEST(F32_IGEMM_4X4__WASM, qmax) {
43979 GemmMicrokernelTester()
43980 .mr(4)
43981 .nr(4)
43982 .kr(1)
43983 .sr(1)
43984 .m(4)
43985 .n(4)
43986 .k(1)
43987 .qmax(128)
43988 .Test(xnn_f32_igemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
43989 }
43990
43991 TEST(F32_IGEMM_4X4__WASM, strided_cm) {
43992 GemmMicrokernelTester()
43993 .mr(4)
43994 .nr(4)
43995 .kr(1)
43996 .sr(1)
43997 .m(4)
43998 .n(4)
43999 .k(1)
44000 .cm_stride(7)
44001 .Test(xnn_f32_igemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
44002 }
44003#endif // XNN_ARCH_WASM
44004
44005
44006#if XNN_ARCH_WASM
44007 TEST(F32_IGEMM_4X2__WASM, k_eq_1) {
44008 GemmMicrokernelTester()
44009 .mr(4)
44010 .nr(2)
44011 .kr(1)
44012 .sr(1)
44013 .m(4)
44014 .n(2)
44015 .k(1)
44016 .Test(xnn_f32_igemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
44017 }
44018
44019 TEST(F32_IGEMM_4X2__WASM, strided_cn) {
44020 GemmMicrokernelTester()
44021 .mr(4)
44022 .nr(2)
44023 .kr(1)
44024 .sr(1)
44025 .m(4)
44026 .n(2)
44027 .k(1)
44028 .cn_stride(5)
44029 .Test(xnn_f32_igemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
44030 }
44031
44032 TEST(F32_IGEMM_4X2__WASM, k_eq_1_subtile) {
44033 for (uint32_t m = 1; m <= 4; m++) {
44034 for (uint32_t n = 1; n <= 2; n++) {
44035 GemmMicrokernelTester()
44036 .mr(4)
44037 .nr(2)
44038 .kr(1)
44039 .sr(1)
44040 .m(m)
44041 .n(n)
44042 .k(1)
44043 .iterations(1)
44044 .Test(xnn_f32_igemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
44045 }
44046 }
44047 }
44048
44049 TEST(F32_IGEMM_4X2__WASM, k_eq_1_subtile_m) {
44050 for (uint32_t m = 1; m <= 4; m++) {
44051 GemmMicrokernelTester()
44052 .mr(4)
44053 .nr(2)
44054 .kr(1)
44055 .sr(1)
44056 .m(m)
44057 .n(2)
44058 .k(1)
44059 .iterations(1)
44060 .Test(xnn_f32_igemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
44061 }
44062 }
44063
44064 TEST(F32_IGEMM_4X2__WASM, k_eq_1_subtile_n) {
44065 for (uint32_t n = 1; n <= 2; n++) {
44066 GemmMicrokernelTester()
44067 .mr(4)
44068 .nr(2)
44069 .kr(1)
44070 .sr(1)
44071 .m(4)
44072 .n(n)
44073 .k(1)
44074 .iterations(1)
44075 .Test(xnn_f32_igemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
44076 }
44077 }
44078
44079 TEST(F32_IGEMM_4X2__WASM, k_gt_1) {
44080 for (size_t k = 2; k < 10; k++) {
44081 GemmMicrokernelTester()
44082 .mr(4)
44083 .nr(2)
44084 .kr(1)
44085 .sr(1)
44086 .m(4)
44087 .n(2)
44088 .k(k)
44089 .Test(xnn_f32_igemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
44090 }
44091 }
44092
44093 TEST(F32_IGEMM_4X2__WASM, k_gt_1_subtile) {
44094 for (size_t k = 2; k < 10; k++) {
44095 for (uint32_t m = 1; m <= 4; m++) {
44096 for (uint32_t n = 1; n <= 2; n++) {
44097 GemmMicrokernelTester()
44098 .mr(4)
44099 .nr(2)
44100 .kr(1)
44101 .sr(1)
44102 .m(m)
44103 .n(n)
44104 .k(k)
44105 .iterations(1)
44106 .Test(xnn_f32_igemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
44107 }
44108 }
44109 }
44110 }
44111
44112 TEST(F32_IGEMM_4X2__WASM, n_gt_2) {
44113 for (uint32_t n = 3; n < 4; n++) {
44114 for (size_t k = 1; k <= 5; k += 2) {
44115 GemmMicrokernelTester()
44116 .mr(4)
44117 .nr(2)
44118 .kr(1)
44119 .sr(1)
44120 .m(4)
44121 .n(2)
44122 .k(k)
44123 .Test(xnn_f32_igemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
44124 }
44125 }
44126 }
44127
44128 TEST(F32_IGEMM_4X2__WASM, n_gt_2_strided_cn) {
44129 for (uint32_t n = 3; n < 4; n++) {
44130 for (size_t k = 1; k <= 5; k += 2) {
44131 GemmMicrokernelTester()
44132 .mr(4)
44133 .nr(2)
44134 .kr(1)
44135 .sr(1)
44136 .m(4)
44137 .n(2)
44138 .k(k)
44139 .cn_stride(5)
44140 .Test(xnn_f32_igemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
44141 }
44142 }
44143 }
44144
44145 TEST(F32_IGEMM_4X2__WASM, n_gt_2_subtile) {
44146 for (uint32_t n = 3; n < 4; n++) {
44147 for (size_t k = 1; k <= 5; k += 2) {
44148 for (uint32_t m = 1; m <= 4; m++) {
44149 GemmMicrokernelTester()
44150 .mr(4)
44151 .nr(2)
44152 .kr(1)
44153 .sr(1)
44154 .m(m)
44155 .n(n)
44156 .k(k)
44157 .iterations(1)
44158 .Test(xnn_f32_igemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
44159 }
44160 }
44161 }
44162 }
44163
44164 TEST(F32_IGEMM_4X2__WASM, n_div_2) {
44165 for (uint32_t n = 4; n <= 6; n += 2) {
44166 for (size_t k = 1; k <= 5; k += 2) {
44167 GemmMicrokernelTester()
44168 .mr(4)
44169 .nr(2)
44170 .kr(1)
44171 .sr(1)
44172 .m(4)
44173 .n(2)
44174 .k(k)
44175 .Test(xnn_f32_igemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
44176 }
44177 }
44178 }
44179
44180 TEST(F32_IGEMM_4X2__WASM, n_div_2_strided_cn) {
44181 for (uint32_t n = 4; n <= 6; n += 2) {
44182 for (size_t k = 1; k <= 5; k += 2) {
44183 GemmMicrokernelTester()
44184 .mr(4)
44185 .nr(2)
44186 .kr(1)
44187 .sr(1)
44188 .m(4)
44189 .n(n)
44190 .k(k)
44191 .cn_stride(5)
44192 .Test(xnn_f32_igemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
44193 }
44194 }
44195 }
44196
44197 TEST(F32_IGEMM_4X2__WASM, n_div_2_subtile) {
44198 for (uint32_t n = 4; n <= 6; n += 2) {
44199 for (size_t k = 1; k <= 5; k += 2) {
44200 for (uint32_t m = 1; m <= 4; m++) {
44201 GemmMicrokernelTester()
44202 .mr(4)
44203 .nr(2)
44204 .kr(1)
44205 .sr(1)
44206 .m(m)
44207 .n(n)
44208 .k(k)
44209 .iterations(1)
44210 .Test(xnn_f32_igemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
44211 }
44212 }
44213 }
44214 }
44215
44216 TEST(F32_IGEMM_4X2__WASM, small_kernel) {
44217 for (size_t k = 1; k <= 5; k += 2) {
44218 GemmMicrokernelTester()
44219 .mr(4)
44220 .nr(2)
44221 .kr(1)
44222 .sr(1)
44223 .m(4)
44224 .n(2)
44225 .k(k)
44226 .ks(3)
44227 .Test(xnn_f32_igemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
44228 }
44229 }
44230
44231 TEST(F32_IGEMM_4X2__WASM, small_kernel_subtile) {
44232 for (size_t k = 1; k <= 5; k += 2) {
44233 for (uint32_t m = 1; m <= 4; m++) {
44234 for (uint32_t n = 1; n <= 2; n++) {
44235 GemmMicrokernelTester()
44236 .mr(4)
44237 .nr(2)
44238 .kr(1)
44239 .sr(1)
44240 .m(m)
44241 .n(n)
44242 .k(k)
44243 .ks(3)
44244 .iterations(1)
44245 .Test(xnn_f32_igemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
44246 }
44247 }
44248 }
44249 }
44250
44251 TEST(F32_IGEMM_4X2__WASM, n_gt_2_small_kernel) {
44252 for (uint32_t n = 3; n < 4; n++) {
44253 for (size_t k = 1; k <= 5; k += 2) {
44254 GemmMicrokernelTester()
44255 .mr(4)
44256 .nr(2)
44257 .kr(1)
44258 .sr(1)
44259 .m(4)
44260 .n(2)
44261 .k(k)
44262 .ks(3)
44263 .Test(xnn_f32_igemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
44264 }
44265 }
44266 }
44267
44268 TEST(F32_IGEMM_4X2__WASM, n_div_2_small_kernel) {
44269 for (uint32_t n = 4; n <= 6; n += 2) {
44270 for (size_t k = 1; k <= 5; k += 2) {
44271 GemmMicrokernelTester()
44272 .mr(4)
44273 .nr(2)
44274 .kr(1)
44275 .sr(1)
44276 .m(4)
44277 .n(2)
44278 .k(k)
44279 .ks(3)
44280 .Test(xnn_f32_igemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
44281 }
44282 }
44283 }
44284
44285 TEST(F32_IGEMM_4X2__WASM, strided_cm_subtile) {
44286 for (size_t k = 1; k <= 5; k += 2) {
44287 for (uint32_t m = 1; m <= 4; m++) {
44288 for (uint32_t n = 1; n <= 2; n++) {
44289 GemmMicrokernelTester()
44290 .mr(4)
44291 .nr(2)
44292 .kr(1)
44293 .sr(1)
44294 .m(m)
44295 .n(n)
44296 .k(k)
44297 .cm_stride(5)
44298 .iterations(1)
44299 .Test(xnn_f32_igemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
44300 }
44301 }
44302 }
44303 }
44304
44305 TEST(F32_IGEMM_4X2__WASM, a_offset) {
44306 for (size_t k = 1; k <= 5; k += 2) {
44307 GemmMicrokernelTester()
44308 .mr(4)
44309 .nr(2)
44310 .kr(1)
44311 .sr(1)
44312 .m(4)
44313 .n(2)
44314 .k(k)
44315 .ks(3)
44316 .a_offset(23)
44317 .Test(xnn_f32_igemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
44318 }
44319 }
44320
44321 TEST(F32_IGEMM_4X2__WASM, zero) {
44322 for (uint32_t mz = 0; mz < 4; mz++) {
44323 for (size_t k = 1; k <= 5; k += 2) {
44324 GemmMicrokernelTester()
44325 .mr(4)
44326 .nr(2)
44327 .kr(1)
44328 .sr(1)
44329 .m(4)
44330 .n(2)
44331 .k(k)
44332 .ks(3)
44333 .a_offset(23)
44334 .zero_index(mz)
44335 .Test(xnn_f32_igemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
44336 }
44337 }
44338 }
44339
44340 TEST(F32_IGEMM_4X2__WASM, qmin) {
44341 GemmMicrokernelTester()
44342 .mr(4)
44343 .nr(2)
44344 .kr(1)
44345 .sr(1)
44346 .m(4)
44347 .n(2)
44348 .k(1)
44349 .qmin(128)
44350 .Test(xnn_f32_igemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
44351 }
44352
44353 TEST(F32_IGEMM_4X2__WASM, qmax) {
44354 GemmMicrokernelTester()
44355 .mr(4)
44356 .nr(2)
44357 .kr(1)
44358 .sr(1)
44359 .m(4)
44360 .n(2)
44361 .k(1)
44362 .qmax(128)
44363 .Test(xnn_f32_igemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
44364 }
44365
44366 TEST(F32_IGEMM_4X2__WASM, strided_cm) {
44367 GemmMicrokernelTester()
44368 .mr(4)
44369 .nr(2)
44370 .kr(1)
44371 .sr(1)
44372 .m(4)
44373 .n(2)
44374 .k(1)
44375 .cm_stride(5)
44376 .Test(xnn_f32_igemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
44377 }
44378#endif // XNN_ARCH_WASM
44379
44380
XNNPACK Teamb455b122019-09-27 18:10:33 -070044381TEST(F32_IGEMM_1X4__SCALAR, k_eq_1) {
44382 GemmMicrokernelTester()
44383 .mr(1)
44384 .nr(4)
44385 .kr(1)
44386 .sr(1)
44387 .m(1)
44388 .n(4)
44389 .k(1)
44390 .Test(xnn_f32_igemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
44391}
44392
44393TEST(F32_IGEMM_1X4__SCALAR, strided_cn) {
44394 GemmMicrokernelTester()
44395 .mr(1)
44396 .nr(4)
44397 .kr(1)
44398 .sr(1)
44399 .m(1)
44400 .n(4)
44401 .k(1)
44402 .cn_stride(7)
44403 .Test(xnn_f32_igemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
44404}
44405
44406TEST(F32_IGEMM_1X4__SCALAR, k_eq_1_subtile) {
44407 for (uint32_t m = 1; m <= 1; m++) {
44408 for (uint32_t n = 1; n <= 4; n++) {
44409 GemmMicrokernelTester()
44410 .mr(1)
44411 .nr(4)
44412 .kr(1)
44413 .sr(1)
44414 .m(m)
44415 .n(n)
44416 .k(1)
44417 .iterations(1)
44418 .Test(xnn_f32_igemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
44419 }
44420 }
44421}
44422
44423TEST(F32_IGEMM_1X4__SCALAR, k_eq_1_subtile_m) {
44424 for (uint32_t m = 1; m <= 1; m++) {
44425 GemmMicrokernelTester()
44426 .mr(1)
44427 .nr(4)
44428 .kr(1)
44429 .sr(1)
44430 .m(m)
44431 .n(4)
44432 .k(1)
44433 .iterations(1)
44434 .Test(xnn_f32_igemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
44435 }
44436}
44437
44438TEST(F32_IGEMM_1X4__SCALAR, k_eq_1_subtile_n) {
44439 for (uint32_t n = 1; n <= 4; n++) {
44440 GemmMicrokernelTester()
44441 .mr(1)
44442 .nr(4)
44443 .kr(1)
44444 .sr(1)
44445 .m(1)
44446 .n(n)
44447 .k(1)
44448 .iterations(1)
44449 .Test(xnn_f32_igemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
44450 }
44451}
44452
44453TEST(F32_IGEMM_1X4__SCALAR, k_gt_1) {
44454 for (size_t k = 2; k < 10; k++) {
44455 GemmMicrokernelTester()
44456 .mr(1)
44457 .nr(4)
44458 .kr(1)
44459 .sr(1)
44460 .m(1)
44461 .n(4)
44462 .k(k)
44463 .Test(xnn_f32_igemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
44464 }
44465}
44466
44467TEST(F32_IGEMM_1X4__SCALAR, k_gt_1_subtile) {
44468 for (size_t k = 2; k < 10; k++) {
44469 for (uint32_t m = 1; m <= 1; m++) {
44470 for (uint32_t n = 1; n <= 4; n++) {
44471 GemmMicrokernelTester()
44472 .mr(1)
44473 .nr(4)
44474 .kr(1)
44475 .sr(1)
44476 .m(m)
44477 .n(n)
44478 .k(k)
44479 .iterations(1)
44480 .Test(xnn_f32_igemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
44481 }
44482 }
44483 }
44484}
44485
44486TEST(F32_IGEMM_1X4__SCALAR, n_gt_4) {
44487 for (uint32_t n = 5; n < 8; n++) {
44488 for (size_t k = 1; k <= 5; k += 2) {
44489 GemmMicrokernelTester()
44490 .mr(1)
44491 .nr(4)
44492 .kr(1)
44493 .sr(1)
44494 .m(1)
44495 .n(4)
44496 .k(k)
44497 .Test(xnn_f32_igemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
44498 }
44499 }
44500}
44501
44502TEST(F32_IGEMM_1X4__SCALAR, n_gt_4_strided_cn) {
44503 for (uint32_t n = 5; n < 8; n++) {
44504 for (size_t k = 1; k <= 5; k += 2) {
44505 GemmMicrokernelTester()
44506 .mr(1)
44507 .nr(4)
44508 .kr(1)
44509 .sr(1)
44510 .m(1)
44511 .n(4)
44512 .k(k)
44513 .cn_stride(7)
44514 .Test(xnn_f32_igemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
44515 }
44516 }
44517}
44518
44519TEST(F32_IGEMM_1X4__SCALAR, n_gt_4_subtile) {
44520 for (uint32_t n = 5; n < 8; n++) {
44521 for (size_t k = 1; k <= 5; k += 2) {
44522 for (uint32_t m = 1; m <= 1; m++) {
44523 GemmMicrokernelTester()
44524 .mr(1)
44525 .nr(4)
44526 .kr(1)
44527 .sr(1)
44528 .m(m)
44529 .n(n)
44530 .k(k)
44531 .iterations(1)
44532 .Test(xnn_f32_igemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
44533 }
44534 }
44535 }
44536}
44537
44538TEST(F32_IGEMM_1X4__SCALAR, n_div_4) {
44539 for (uint32_t n = 8; n <= 12; n += 4) {
44540 for (size_t k = 1; k <= 5; k += 2) {
44541 GemmMicrokernelTester()
44542 .mr(1)
44543 .nr(4)
44544 .kr(1)
44545 .sr(1)
44546 .m(1)
44547 .n(4)
44548 .k(k)
44549 .Test(xnn_f32_igemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
44550 }
44551 }
44552}
44553
44554TEST(F32_IGEMM_1X4__SCALAR, n_div_4_strided_cn) {
44555 for (uint32_t n = 8; n <= 12; n += 4) {
44556 for (size_t k = 1; k <= 5; k += 2) {
44557 GemmMicrokernelTester()
44558 .mr(1)
44559 .nr(4)
44560 .kr(1)
44561 .sr(1)
44562 .m(1)
44563 .n(n)
44564 .k(k)
44565 .cn_stride(7)
44566 .Test(xnn_f32_igemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
44567 }
44568 }
44569}
44570
44571TEST(F32_IGEMM_1X4__SCALAR, n_div_4_subtile) {
44572 for (uint32_t n = 8; n <= 12; n += 4) {
44573 for (size_t k = 1; k <= 5; k += 2) {
44574 for (uint32_t m = 1; m <= 1; m++) {
44575 GemmMicrokernelTester()
44576 .mr(1)
44577 .nr(4)
44578 .kr(1)
44579 .sr(1)
44580 .m(m)
44581 .n(n)
44582 .k(k)
44583 .iterations(1)
44584 .Test(xnn_f32_igemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
44585 }
44586 }
44587 }
44588}
44589
44590TEST(F32_IGEMM_1X4__SCALAR, small_kernel) {
44591 for (size_t k = 1; k <= 5; k += 2) {
44592 GemmMicrokernelTester()
44593 .mr(1)
44594 .nr(4)
44595 .kr(1)
44596 .sr(1)
44597 .m(1)
44598 .n(4)
44599 .k(k)
44600 .ks(3)
44601 .Test(xnn_f32_igemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
44602 }
44603}
44604
44605TEST(F32_IGEMM_1X4__SCALAR, small_kernel_subtile) {
44606 for (size_t k = 1; k <= 5; k += 2) {
44607 for (uint32_t m = 1; m <= 1; m++) {
44608 for (uint32_t n = 1; n <= 4; n++) {
44609 GemmMicrokernelTester()
44610 .mr(1)
44611 .nr(4)
44612 .kr(1)
44613 .sr(1)
44614 .m(m)
44615 .n(n)
44616 .k(k)
44617 .ks(3)
44618 .iterations(1)
44619 .Test(xnn_f32_igemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
44620 }
44621 }
44622 }
44623}
44624
44625TEST(F32_IGEMM_1X4__SCALAR, n_gt_4_small_kernel) {
44626 for (uint32_t n = 5; n < 8; n++) {
44627 for (size_t k = 1; k <= 5; k += 2) {
44628 GemmMicrokernelTester()
44629 .mr(1)
44630 .nr(4)
44631 .kr(1)
44632 .sr(1)
44633 .m(1)
44634 .n(4)
44635 .k(k)
44636 .ks(3)
44637 .Test(xnn_f32_igemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
44638 }
44639 }
44640}
44641
44642TEST(F32_IGEMM_1X4__SCALAR, n_div_4_small_kernel) {
44643 for (uint32_t n = 8; n <= 12; n += 4) {
44644 for (size_t k = 1; k <= 5; k += 2) {
44645 GemmMicrokernelTester()
44646 .mr(1)
44647 .nr(4)
44648 .kr(1)
44649 .sr(1)
44650 .m(1)
44651 .n(4)
44652 .k(k)
44653 .ks(3)
44654 .Test(xnn_f32_igemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
44655 }
44656 }
44657}
44658
44659TEST(F32_IGEMM_1X4__SCALAR, strided_cm_subtile) {
44660 for (size_t k = 1; k <= 5; k += 2) {
44661 for (uint32_t m = 1; m <= 1; m++) {
44662 for (uint32_t n = 1; n <= 4; n++) {
44663 GemmMicrokernelTester()
44664 .mr(1)
44665 .nr(4)
44666 .kr(1)
44667 .sr(1)
44668 .m(m)
44669 .n(n)
44670 .k(k)
44671 .cm_stride(7)
44672 .iterations(1)
44673 .Test(xnn_f32_igemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
44674 }
44675 }
44676 }
44677}
44678
44679TEST(F32_IGEMM_1X4__SCALAR, a_offset) {
44680 for (size_t k = 1; k <= 5; k += 2) {
44681 GemmMicrokernelTester()
44682 .mr(1)
44683 .nr(4)
44684 .kr(1)
44685 .sr(1)
44686 .m(1)
44687 .n(4)
44688 .k(k)
44689 .ks(3)
44690 .a_offset(7)
44691 .Test(xnn_f32_igemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
44692 }
44693}
44694
44695TEST(F32_IGEMM_1X4__SCALAR, zero) {
44696 for (uint32_t mz = 0; mz < 1; mz++) {
44697 for (size_t k = 1; k <= 5; k += 2) {
44698 GemmMicrokernelTester()
44699 .mr(1)
44700 .nr(4)
44701 .kr(1)
44702 .sr(1)
44703 .m(1)
44704 .n(4)
44705 .k(k)
44706 .ks(3)
44707 .a_offset(7)
44708 .zero_index(mz)
44709 .Test(xnn_f32_igemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
44710 }
44711 }
44712}
44713
44714TEST(F32_IGEMM_1X4__SCALAR, qmin) {
44715 GemmMicrokernelTester()
44716 .mr(1)
44717 .nr(4)
44718 .kr(1)
44719 .sr(1)
44720 .m(1)
44721 .n(4)
44722 .k(1)
44723 .qmin(128)
44724 .Test(xnn_f32_igemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
44725}
44726
44727TEST(F32_IGEMM_1X4__SCALAR, qmax) {
44728 GemmMicrokernelTester()
44729 .mr(1)
44730 .nr(4)
44731 .kr(1)
44732 .sr(1)
44733 .m(1)
44734 .n(4)
44735 .k(1)
44736 .qmax(128)
44737 .Test(xnn_f32_igemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
44738}
44739
44740TEST(F32_IGEMM_1X4__SCALAR, strided_cm) {
44741 GemmMicrokernelTester()
44742 .mr(1)
44743 .nr(4)
44744 .kr(1)
44745 .sr(1)
44746 .m(1)
44747 .n(4)
44748 .k(1)
44749 .cm_stride(7)
44750 .Test(xnn_f32_igemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
44751}
44752
44753
44754TEST(F32_IGEMM_2X4__SCALAR, k_eq_1) {
44755 GemmMicrokernelTester()
44756 .mr(2)
44757 .nr(4)
44758 .kr(1)
44759 .sr(1)
44760 .m(2)
44761 .n(4)
44762 .k(1)
44763 .Test(xnn_f32_igemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
44764}
44765
44766TEST(F32_IGEMM_2X4__SCALAR, strided_cn) {
44767 GemmMicrokernelTester()
44768 .mr(2)
44769 .nr(4)
44770 .kr(1)
44771 .sr(1)
44772 .m(2)
44773 .n(4)
44774 .k(1)
44775 .cn_stride(7)
44776 .Test(xnn_f32_igemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
44777}
44778
44779TEST(F32_IGEMM_2X4__SCALAR, k_eq_1_subtile) {
44780 for (uint32_t m = 1; m <= 2; m++) {
44781 for (uint32_t n = 1; n <= 4; n++) {
44782 GemmMicrokernelTester()
44783 .mr(2)
44784 .nr(4)
44785 .kr(1)
44786 .sr(1)
44787 .m(m)
44788 .n(n)
44789 .k(1)
44790 .iterations(1)
44791 .Test(xnn_f32_igemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
44792 }
44793 }
44794}
44795
44796TEST(F32_IGEMM_2X4__SCALAR, k_eq_1_subtile_m) {
44797 for (uint32_t m = 1; m <= 2; m++) {
44798 GemmMicrokernelTester()
44799 .mr(2)
44800 .nr(4)
44801 .kr(1)
44802 .sr(1)
44803 .m(m)
44804 .n(4)
44805 .k(1)
44806 .iterations(1)
44807 .Test(xnn_f32_igemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
44808 }
44809}
44810
44811TEST(F32_IGEMM_2X4__SCALAR, k_eq_1_subtile_n) {
44812 for (uint32_t n = 1; n <= 4; n++) {
44813 GemmMicrokernelTester()
44814 .mr(2)
44815 .nr(4)
44816 .kr(1)
44817 .sr(1)
44818 .m(2)
44819 .n(n)
44820 .k(1)
44821 .iterations(1)
44822 .Test(xnn_f32_igemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
44823 }
44824}
44825
44826TEST(F32_IGEMM_2X4__SCALAR, k_gt_1) {
44827 for (size_t k = 2; k < 10; k++) {
44828 GemmMicrokernelTester()
44829 .mr(2)
44830 .nr(4)
44831 .kr(1)
44832 .sr(1)
44833 .m(2)
44834 .n(4)
44835 .k(k)
44836 .Test(xnn_f32_igemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
44837 }
44838}
44839
44840TEST(F32_IGEMM_2X4__SCALAR, k_gt_1_subtile) {
44841 for (size_t k = 2; k < 10; k++) {
44842 for (uint32_t m = 1; m <= 2; m++) {
44843 for (uint32_t n = 1; n <= 4; n++) {
44844 GemmMicrokernelTester()
44845 .mr(2)
44846 .nr(4)
44847 .kr(1)
44848 .sr(1)
44849 .m(m)
44850 .n(n)
44851 .k(k)
44852 .iterations(1)
44853 .Test(xnn_f32_igemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
44854 }
44855 }
44856 }
44857}
44858
44859TEST(F32_IGEMM_2X4__SCALAR, n_gt_4) {
44860 for (uint32_t n = 5; n < 8; n++) {
44861 for (size_t k = 1; k <= 5; k += 2) {
44862 GemmMicrokernelTester()
44863 .mr(2)
44864 .nr(4)
44865 .kr(1)
44866 .sr(1)
44867 .m(2)
44868 .n(4)
44869 .k(k)
44870 .Test(xnn_f32_igemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
44871 }
44872 }
44873}
44874
44875TEST(F32_IGEMM_2X4__SCALAR, n_gt_4_strided_cn) {
44876 for (uint32_t n = 5; n < 8; n++) {
44877 for (size_t k = 1; k <= 5; k += 2) {
44878 GemmMicrokernelTester()
44879 .mr(2)
44880 .nr(4)
44881 .kr(1)
44882 .sr(1)
44883 .m(2)
44884 .n(4)
44885 .k(k)
44886 .cn_stride(7)
44887 .Test(xnn_f32_igemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
44888 }
44889 }
44890}
44891
44892TEST(F32_IGEMM_2X4__SCALAR, n_gt_4_subtile) {
44893 for (uint32_t n = 5; n < 8; n++) {
44894 for (size_t k = 1; k <= 5; k += 2) {
44895 for (uint32_t m = 1; m <= 2; m++) {
44896 GemmMicrokernelTester()
44897 .mr(2)
44898 .nr(4)
44899 .kr(1)
44900 .sr(1)
44901 .m(m)
44902 .n(n)
44903 .k(k)
44904 .iterations(1)
44905 .Test(xnn_f32_igemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
44906 }
44907 }
44908 }
44909}
44910
44911TEST(F32_IGEMM_2X4__SCALAR, n_div_4) {
44912 for (uint32_t n = 8; n <= 12; n += 4) {
44913 for (size_t k = 1; k <= 5; k += 2) {
44914 GemmMicrokernelTester()
44915 .mr(2)
44916 .nr(4)
44917 .kr(1)
44918 .sr(1)
44919 .m(2)
44920 .n(4)
44921 .k(k)
44922 .Test(xnn_f32_igemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
44923 }
44924 }
44925}
44926
44927TEST(F32_IGEMM_2X4__SCALAR, n_div_4_strided_cn) {
44928 for (uint32_t n = 8; n <= 12; n += 4) {
44929 for (size_t k = 1; k <= 5; k += 2) {
44930 GemmMicrokernelTester()
44931 .mr(2)
44932 .nr(4)
44933 .kr(1)
44934 .sr(1)
44935 .m(2)
44936 .n(n)
44937 .k(k)
44938 .cn_stride(7)
44939 .Test(xnn_f32_igemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
44940 }
44941 }
44942}
44943
44944TEST(F32_IGEMM_2X4__SCALAR, n_div_4_subtile) {
44945 for (uint32_t n = 8; n <= 12; n += 4) {
44946 for (size_t k = 1; k <= 5; k += 2) {
44947 for (uint32_t m = 1; m <= 2; m++) {
44948 GemmMicrokernelTester()
44949 .mr(2)
44950 .nr(4)
44951 .kr(1)
44952 .sr(1)
44953 .m(m)
44954 .n(n)
44955 .k(k)
44956 .iterations(1)
44957 .Test(xnn_f32_igemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
44958 }
44959 }
44960 }
44961}
44962
44963TEST(F32_IGEMM_2X4__SCALAR, small_kernel) {
44964 for (size_t k = 1; k <= 5; k += 2) {
44965 GemmMicrokernelTester()
44966 .mr(2)
44967 .nr(4)
44968 .kr(1)
44969 .sr(1)
44970 .m(2)
44971 .n(4)
44972 .k(k)
44973 .ks(3)
44974 .Test(xnn_f32_igemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
44975 }
44976}
44977
44978TEST(F32_IGEMM_2X4__SCALAR, small_kernel_subtile) {
44979 for (size_t k = 1; k <= 5; k += 2) {
44980 for (uint32_t m = 1; m <= 2; m++) {
44981 for (uint32_t n = 1; n <= 4; n++) {
44982 GemmMicrokernelTester()
44983 .mr(2)
44984 .nr(4)
44985 .kr(1)
44986 .sr(1)
44987 .m(m)
44988 .n(n)
44989 .k(k)
44990 .ks(3)
44991 .iterations(1)
44992 .Test(xnn_f32_igemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
44993 }
44994 }
44995 }
44996}
44997
44998TEST(F32_IGEMM_2X4__SCALAR, n_gt_4_small_kernel) {
44999 for (uint32_t n = 5; n < 8; n++) {
45000 for (size_t k = 1; k <= 5; k += 2) {
45001 GemmMicrokernelTester()
45002 .mr(2)
45003 .nr(4)
45004 .kr(1)
45005 .sr(1)
45006 .m(2)
45007 .n(4)
45008 .k(k)
45009 .ks(3)
45010 .Test(xnn_f32_igemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
45011 }
45012 }
45013}
45014
45015TEST(F32_IGEMM_2X4__SCALAR, n_div_4_small_kernel) {
45016 for (uint32_t n = 8; n <= 12; n += 4) {
45017 for (size_t k = 1; k <= 5; k += 2) {
45018 GemmMicrokernelTester()
45019 .mr(2)
45020 .nr(4)
45021 .kr(1)
45022 .sr(1)
45023 .m(2)
45024 .n(4)
45025 .k(k)
45026 .ks(3)
45027 .Test(xnn_f32_igemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
45028 }
45029 }
45030}
45031
45032TEST(F32_IGEMM_2X4__SCALAR, strided_cm_subtile) {
45033 for (size_t k = 1; k <= 5; k += 2) {
45034 for (uint32_t m = 1; m <= 2; m++) {
45035 for (uint32_t n = 1; n <= 4; n++) {
45036 GemmMicrokernelTester()
45037 .mr(2)
45038 .nr(4)
45039 .kr(1)
45040 .sr(1)
45041 .m(m)
45042 .n(n)
45043 .k(k)
45044 .cm_stride(7)
45045 .iterations(1)
45046 .Test(xnn_f32_igemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
45047 }
45048 }
45049 }
45050}
45051
45052TEST(F32_IGEMM_2X4__SCALAR, a_offset) {
45053 for (size_t k = 1; k <= 5; k += 2) {
45054 GemmMicrokernelTester()
45055 .mr(2)
45056 .nr(4)
45057 .kr(1)
45058 .sr(1)
45059 .m(2)
45060 .n(4)
45061 .k(k)
45062 .ks(3)
45063 .a_offset(13)
45064 .Test(xnn_f32_igemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
45065 }
45066}
45067
45068TEST(F32_IGEMM_2X4__SCALAR, zero) {
45069 for (uint32_t mz = 0; mz < 2; mz++) {
45070 for (size_t k = 1; k <= 5; k += 2) {
45071 GemmMicrokernelTester()
45072 .mr(2)
45073 .nr(4)
45074 .kr(1)
45075 .sr(1)
45076 .m(2)
45077 .n(4)
45078 .k(k)
45079 .ks(3)
45080 .a_offset(13)
45081 .zero_index(mz)
45082 .Test(xnn_f32_igemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
45083 }
45084 }
45085}
45086
45087TEST(F32_IGEMM_2X4__SCALAR, qmin) {
45088 GemmMicrokernelTester()
45089 .mr(2)
45090 .nr(4)
45091 .kr(1)
45092 .sr(1)
45093 .m(2)
45094 .n(4)
45095 .k(1)
45096 .qmin(128)
45097 .Test(xnn_f32_igemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
45098}
45099
45100TEST(F32_IGEMM_2X4__SCALAR, qmax) {
45101 GemmMicrokernelTester()
45102 .mr(2)
45103 .nr(4)
45104 .kr(1)
45105 .sr(1)
45106 .m(2)
45107 .n(4)
45108 .k(1)
45109 .qmax(128)
45110 .Test(xnn_f32_igemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
45111}
45112
45113TEST(F32_IGEMM_2X4__SCALAR, strided_cm) {
45114 GemmMicrokernelTester()
45115 .mr(2)
45116 .nr(4)
45117 .kr(1)
45118 .sr(1)
45119 .m(2)
45120 .n(4)
45121 .k(1)
45122 .cm_stride(7)
45123 .Test(xnn_f32_igemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
45124}
45125
45126
45127TEST(F32_IGEMM_4X4__SCALAR, k_eq_1) {
45128 GemmMicrokernelTester()
45129 .mr(4)
45130 .nr(4)
45131 .kr(1)
45132 .sr(1)
45133 .m(4)
45134 .n(4)
45135 .k(1)
45136 .Test(xnn_f32_igemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
45137}
45138
45139TEST(F32_IGEMM_4X4__SCALAR, strided_cn) {
45140 GemmMicrokernelTester()
45141 .mr(4)
45142 .nr(4)
45143 .kr(1)
45144 .sr(1)
45145 .m(4)
45146 .n(4)
45147 .k(1)
45148 .cn_stride(7)
45149 .Test(xnn_f32_igemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
45150}
45151
45152TEST(F32_IGEMM_4X4__SCALAR, k_eq_1_subtile) {
45153 for (uint32_t m = 1; m <= 4; m++) {
45154 for (uint32_t n = 1; n <= 4; n++) {
45155 GemmMicrokernelTester()
45156 .mr(4)
45157 .nr(4)
45158 .kr(1)
45159 .sr(1)
45160 .m(m)
45161 .n(n)
45162 .k(1)
45163 .iterations(1)
45164 .Test(xnn_f32_igemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
45165 }
45166 }
45167}
45168
45169TEST(F32_IGEMM_4X4__SCALAR, k_eq_1_subtile_m) {
45170 for (uint32_t m = 1; m <= 4; m++) {
45171 GemmMicrokernelTester()
45172 .mr(4)
45173 .nr(4)
45174 .kr(1)
45175 .sr(1)
45176 .m(m)
45177 .n(4)
45178 .k(1)
45179 .iterations(1)
45180 .Test(xnn_f32_igemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
45181 }
45182}
45183
45184TEST(F32_IGEMM_4X4__SCALAR, k_eq_1_subtile_n) {
45185 for (uint32_t n = 1; n <= 4; n++) {
45186 GemmMicrokernelTester()
45187 .mr(4)
45188 .nr(4)
45189 .kr(1)
45190 .sr(1)
45191 .m(4)
45192 .n(n)
45193 .k(1)
45194 .iterations(1)
45195 .Test(xnn_f32_igemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
45196 }
45197}
45198
45199TEST(F32_IGEMM_4X4__SCALAR, k_gt_1) {
45200 for (size_t k = 2; k < 10; k++) {
45201 GemmMicrokernelTester()
45202 .mr(4)
45203 .nr(4)
45204 .kr(1)
45205 .sr(1)
45206 .m(4)
45207 .n(4)
45208 .k(k)
45209 .Test(xnn_f32_igemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
45210 }
45211}
45212
45213TEST(F32_IGEMM_4X4__SCALAR, k_gt_1_subtile) {
45214 for (size_t k = 2; k < 10; k++) {
45215 for (uint32_t m = 1; m <= 4; m++) {
45216 for (uint32_t n = 1; n <= 4; n++) {
45217 GemmMicrokernelTester()
45218 .mr(4)
45219 .nr(4)
45220 .kr(1)
45221 .sr(1)
45222 .m(m)
45223 .n(n)
45224 .k(k)
45225 .iterations(1)
45226 .Test(xnn_f32_igemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
45227 }
45228 }
45229 }
45230}
45231
45232TEST(F32_IGEMM_4X4__SCALAR, n_gt_4) {
45233 for (uint32_t n = 5; n < 8; n++) {
45234 for (size_t k = 1; k <= 5; k += 2) {
45235 GemmMicrokernelTester()
45236 .mr(4)
45237 .nr(4)
45238 .kr(1)
45239 .sr(1)
45240 .m(4)
45241 .n(4)
45242 .k(k)
45243 .Test(xnn_f32_igemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
45244 }
45245 }
45246}
45247
45248TEST(F32_IGEMM_4X4__SCALAR, n_gt_4_strided_cn) {
45249 for (uint32_t n = 5; n < 8; n++) {
45250 for (size_t k = 1; k <= 5; k += 2) {
45251 GemmMicrokernelTester()
45252 .mr(4)
45253 .nr(4)
45254 .kr(1)
45255 .sr(1)
45256 .m(4)
45257 .n(4)
45258 .k(k)
45259 .cn_stride(7)
45260 .Test(xnn_f32_igemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
45261 }
45262 }
45263}
45264
45265TEST(F32_IGEMM_4X4__SCALAR, n_gt_4_subtile) {
45266 for (uint32_t n = 5; n < 8; n++) {
45267 for (size_t k = 1; k <= 5; k += 2) {
45268 for (uint32_t m = 1; m <= 4; m++) {
45269 GemmMicrokernelTester()
45270 .mr(4)
45271 .nr(4)
45272 .kr(1)
45273 .sr(1)
45274 .m(m)
45275 .n(n)
45276 .k(k)
45277 .iterations(1)
45278 .Test(xnn_f32_igemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
45279 }
45280 }
45281 }
45282}
45283
45284TEST(F32_IGEMM_4X4__SCALAR, n_div_4) {
45285 for (uint32_t n = 8; n <= 12; n += 4) {
45286 for (size_t k = 1; k <= 5; k += 2) {
45287 GemmMicrokernelTester()
45288 .mr(4)
45289 .nr(4)
45290 .kr(1)
45291 .sr(1)
45292 .m(4)
45293 .n(4)
45294 .k(k)
45295 .Test(xnn_f32_igemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
45296 }
45297 }
45298}
45299
45300TEST(F32_IGEMM_4X4__SCALAR, n_div_4_strided_cn) {
45301 for (uint32_t n = 8; n <= 12; n += 4) {
45302 for (size_t k = 1; k <= 5; k += 2) {
45303 GemmMicrokernelTester()
45304 .mr(4)
45305 .nr(4)
45306 .kr(1)
45307 .sr(1)
45308 .m(4)
45309 .n(n)
45310 .k(k)
45311 .cn_stride(7)
45312 .Test(xnn_f32_igemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
45313 }
45314 }
45315}
45316
45317TEST(F32_IGEMM_4X4__SCALAR, n_div_4_subtile) {
45318 for (uint32_t n = 8; n <= 12; n += 4) {
45319 for (size_t k = 1; k <= 5; k += 2) {
45320 for (uint32_t m = 1; m <= 4; m++) {
45321 GemmMicrokernelTester()
45322 .mr(4)
45323 .nr(4)
45324 .kr(1)
45325 .sr(1)
45326 .m(m)
45327 .n(n)
45328 .k(k)
45329 .iterations(1)
45330 .Test(xnn_f32_igemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
45331 }
45332 }
45333 }
45334}
45335
45336TEST(F32_IGEMM_4X4__SCALAR, small_kernel) {
45337 for (size_t k = 1; k <= 5; k += 2) {
45338 GemmMicrokernelTester()
45339 .mr(4)
45340 .nr(4)
45341 .kr(1)
45342 .sr(1)
45343 .m(4)
45344 .n(4)
45345 .k(k)
45346 .ks(3)
45347 .Test(xnn_f32_igemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
45348 }
45349}
45350
45351TEST(F32_IGEMM_4X4__SCALAR, small_kernel_subtile) {
45352 for (size_t k = 1; k <= 5; k += 2) {
45353 for (uint32_t m = 1; m <= 4; m++) {
45354 for (uint32_t n = 1; n <= 4; n++) {
45355 GemmMicrokernelTester()
45356 .mr(4)
45357 .nr(4)
45358 .kr(1)
45359 .sr(1)
45360 .m(m)
45361 .n(n)
45362 .k(k)
45363 .ks(3)
45364 .iterations(1)
45365 .Test(xnn_f32_igemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
45366 }
45367 }
45368 }
45369}
45370
45371TEST(F32_IGEMM_4X4__SCALAR, n_gt_4_small_kernel) {
45372 for (uint32_t n = 5; n < 8; n++) {
45373 for (size_t k = 1; k <= 5; k += 2) {
45374 GemmMicrokernelTester()
45375 .mr(4)
45376 .nr(4)
45377 .kr(1)
45378 .sr(1)
45379 .m(4)
45380 .n(4)
45381 .k(k)
45382 .ks(3)
45383 .Test(xnn_f32_igemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
45384 }
45385 }
45386}
45387
45388TEST(F32_IGEMM_4X4__SCALAR, n_div_4_small_kernel) {
45389 for (uint32_t n = 8; n <= 12; n += 4) {
45390 for (size_t k = 1; k <= 5; k += 2) {
45391 GemmMicrokernelTester()
45392 .mr(4)
45393 .nr(4)
45394 .kr(1)
45395 .sr(1)
45396 .m(4)
45397 .n(4)
45398 .k(k)
45399 .ks(3)
45400 .Test(xnn_f32_igemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
45401 }
45402 }
45403}
45404
45405TEST(F32_IGEMM_4X4__SCALAR, strided_cm_subtile) {
45406 for (size_t k = 1; k <= 5; k += 2) {
45407 for (uint32_t m = 1; m <= 4; m++) {
45408 for (uint32_t n = 1; n <= 4; n++) {
45409 GemmMicrokernelTester()
45410 .mr(4)
45411 .nr(4)
45412 .kr(1)
45413 .sr(1)
45414 .m(m)
45415 .n(n)
45416 .k(k)
45417 .cm_stride(7)
45418 .iterations(1)
45419 .Test(xnn_f32_igemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
45420 }
45421 }
45422 }
45423}
45424
45425TEST(F32_IGEMM_4X4__SCALAR, a_offset) {
45426 for (size_t k = 1; k <= 5; k += 2) {
45427 GemmMicrokernelTester()
45428 .mr(4)
45429 .nr(4)
45430 .kr(1)
45431 .sr(1)
45432 .m(4)
45433 .n(4)
45434 .k(k)
45435 .ks(3)
45436 .a_offset(23)
45437 .Test(xnn_f32_igemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
45438 }
45439}
45440
45441TEST(F32_IGEMM_4X4__SCALAR, zero) {
45442 for (uint32_t mz = 0; mz < 4; mz++) {
45443 for (size_t k = 1; k <= 5; k += 2) {
45444 GemmMicrokernelTester()
45445 .mr(4)
45446 .nr(4)
45447 .kr(1)
45448 .sr(1)
45449 .m(4)
45450 .n(4)
45451 .k(k)
45452 .ks(3)
45453 .a_offset(23)
45454 .zero_index(mz)
45455 .Test(xnn_f32_igemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
45456 }
45457 }
45458}
45459
45460TEST(F32_IGEMM_4X4__SCALAR, qmin) {
45461 GemmMicrokernelTester()
45462 .mr(4)
45463 .nr(4)
45464 .kr(1)
45465 .sr(1)
45466 .m(4)
45467 .n(4)
45468 .k(1)
45469 .qmin(128)
45470 .Test(xnn_f32_igemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
45471}
45472
45473TEST(F32_IGEMM_4X4__SCALAR, qmax) {
45474 GemmMicrokernelTester()
45475 .mr(4)
45476 .nr(4)
45477 .kr(1)
45478 .sr(1)
45479 .m(4)
45480 .n(4)
45481 .k(1)
45482 .qmax(128)
45483 .Test(xnn_f32_igemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
45484}
45485
45486TEST(F32_IGEMM_4X4__SCALAR, strided_cm) {
45487 GemmMicrokernelTester()
45488 .mr(4)
45489 .nr(4)
45490 .kr(1)
45491 .sr(1)
45492 .m(4)
45493 .n(4)
45494 .k(1)
45495 .cm_stride(7)
45496 .Test(xnn_f32_igemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
45497}
45498
45499
45500TEST(F32_IGEMM_4X2__SCALAR, k_eq_1) {
45501 GemmMicrokernelTester()
45502 .mr(4)
45503 .nr(2)
45504 .kr(1)
45505 .sr(1)
45506 .m(4)
45507 .n(2)
45508 .k(1)
45509 .Test(xnn_f32_igemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
45510}
45511
45512TEST(F32_IGEMM_4X2__SCALAR, strided_cn) {
45513 GemmMicrokernelTester()
45514 .mr(4)
45515 .nr(2)
45516 .kr(1)
45517 .sr(1)
45518 .m(4)
45519 .n(2)
45520 .k(1)
45521 .cn_stride(5)
45522 .Test(xnn_f32_igemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
45523}
45524
45525TEST(F32_IGEMM_4X2__SCALAR, k_eq_1_subtile) {
45526 for (uint32_t m = 1; m <= 4; m++) {
45527 for (uint32_t n = 1; n <= 2; n++) {
45528 GemmMicrokernelTester()
45529 .mr(4)
45530 .nr(2)
45531 .kr(1)
45532 .sr(1)
45533 .m(m)
45534 .n(n)
45535 .k(1)
45536 .iterations(1)
45537 .Test(xnn_f32_igemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
45538 }
45539 }
45540}
45541
45542TEST(F32_IGEMM_4X2__SCALAR, k_eq_1_subtile_m) {
45543 for (uint32_t m = 1; m <= 4; m++) {
45544 GemmMicrokernelTester()
45545 .mr(4)
45546 .nr(2)
45547 .kr(1)
45548 .sr(1)
45549 .m(m)
45550 .n(2)
45551 .k(1)
45552 .iterations(1)
45553 .Test(xnn_f32_igemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
45554 }
45555}
45556
45557TEST(F32_IGEMM_4X2__SCALAR, k_eq_1_subtile_n) {
45558 for (uint32_t n = 1; n <= 2; n++) {
45559 GemmMicrokernelTester()
45560 .mr(4)
45561 .nr(2)
45562 .kr(1)
45563 .sr(1)
45564 .m(4)
45565 .n(n)
45566 .k(1)
45567 .iterations(1)
45568 .Test(xnn_f32_igemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
45569 }
45570}
45571
45572TEST(F32_IGEMM_4X2__SCALAR, k_gt_1) {
45573 for (size_t k = 2; k < 10; k++) {
45574 GemmMicrokernelTester()
45575 .mr(4)
45576 .nr(2)
45577 .kr(1)
45578 .sr(1)
45579 .m(4)
45580 .n(2)
45581 .k(k)
45582 .Test(xnn_f32_igemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
45583 }
45584}
45585
45586TEST(F32_IGEMM_4X2__SCALAR, k_gt_1_subtile) {
45587 for (size_t k = 2; k < 10; k++) {
45588 for (uint32_t m = 1; m <= 4; m++) {
45589 for (uint32_t n = 1; n <= 2; n++) {
45590 GemmMicrokernelTester()
45591 .mr(4)
45592 .nr(2)
45593 .kr(1)
45594 .sr(1)
45595 .m(m)
45596 .n(n)
45597 .k(k)
45598 .iterations(1)
45599 .Test(xnn_f32_igemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
45600 }
45601 }
45602 }
45603}
45604
45605TEST(F32_IGEMM_4X2__SCALAR, n_gt_2) {
45606 for (uint32_t n = 3; n < 4; n++) {
45607 for (size_t k = 1; k <= 5; k += 2) {
45608 GemmMicrokernelTester()
45609 .mr(4)
45610 .nr(2)
45611 .kr(1)
45612 .sr(1)
45613 .m(4)
45614 .n(2)
45615 .k(k)
45616 .Test(xnn_f32_igemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
45617 }
45618 }
45619}
45620
45621TEST(F32_IGEMM_4X2__SCALAR, n_gt_2_strided_cn) {
45622 for (uint32_t n = 3; n < 4; n++) {
45623 for (size_t k = 1; k <= 5; k += 2) {
45624 GemmMicrokernelTester()
45625 .mr(4)
45626 .nr(2)
45627 .kr(1)
45628 .sr(1)
45629 .m(4)
45630 .n(2)
45631 .k(k)
45632 .cn_stride(5)
45633 .Test(xnn_f32_igemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
45634 }
45635 }
45636}
45637
45638TEST(F32_IGEMM_4X2__SCALAR, n_gt_2_subtile) {
45639 for (uint32_t n = 3; n < 4; n++) {
45640 for (size_t k = 1; k <= 5; k += 2) {
45641 for (uint32_t m = 1; m <= 4; m++) {
45642 GemmMicrokernelTester()
45643 .mr(4)
45644 .nr(2)
45645 .kr(1)
45646 .sr(1)
45647 .m(m)
45648 .n(n)
45649 .k(k)
45650 .iterations(1)
45651 .Test(xnn_f32_igemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
45652 }
45653 }
45654 }
45655}
45656
45657TEST(F32_IGEMM_4X2__SCALAR, n_div_2) {
45658 for (uint32_t n = 4; n <= 6; n += 2) {
45659 for (size_t k = 1; k <= 5; k += 2) {
45660 GemmMicrokernelTester()
45661 .mr(4)
45662 .nr(2)
45663 .kr(1)
45664 .sr(1)
45665 .m(4)
45666 .n(2)
45667 .k(k)
45668 .Test(xnn_f32_igemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
45669 }
45670 }
45671}
45672
45673TEST(F32_IGEMM_4X2__SCALAR, n_div_2_strided_cn) {
45674 for (uint32_t n = 4; n <= 6; n += 2) {
45675 for (size_t k = 1; k <= 5; k += 2) {
45676 GemmMicrokernelTester()
45677 .mr(4)
45678 .nr(2)
45679 .kr(1)
45680 .sr(1)
45681 .m(4)
45682 .n(n)
45683 .k(k)
45684 .cn_stride(5)
45685 .Test(xnn_f32_igemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
45686 }
45687 }
45688}
45689
45690TEST(F32_IGEMM_4X2__SCALAR, n_div_2_subtile) {
45691 for (uint32_t n = 4; n <= 6; n += 2) {
45692 for (size_t k = 1; k <= 5; k += 2) {
45693 for (uint32_t m = 1; m <= 4; m++) {
45694 GemmMicrokernelTester()
45695 .mr(4)
45696 .nr(2)
45697 .kr(1)
45698 .sr(1)
45699 .m(m)
45700 .n(n)
45701 .k(k)
45702 .iterations(1)
45703 .Test(xnn_f32_igemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
45704 }
45705 }
45706 }
45707}
45708
45709TEST(F32_IGEMM_4X2__SCALAR, small_kernel) {
45710 for (size_t k = 1; k <= 5; k += 2) {
45711 GemmMicrokernelTester()
45712 .mr(4)
45713 .nr(2)
45714 .kr(1)
45715 .sr(1)
45716 .m(4)
45717 .n(2)
45718 .k(k)
45719 .ks(3)
45720 .Test(xnn_f32_igemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
45721 }
45722}
45723
45724TEST(F32_IGEMM_4X2__SCALAR, small_kernel_subtile) {
45725 for (size_t k = 1; k <= 5; k += 2) {
45726 for (uint32_t m = 1; m <= 4; m++) {
45727 for (uint32_t n = 1; n <= 2; n++) {
45728 GemmMicrokernelTester()
45729 .mr(4)
45730 .nr(2)
45731 .kr(1)
45732 .sr(1)
45733 .m(m)
45734 .n(n)
45735 .k(k)
45736 .ks(3)
45737 .iterations(1)
45738 .Test(xnn_f32_igemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
45739 }
45740 }
45741 }
45742}
45743
45744TEST(F32_IGEMM_4X2__SCALAR, n_gt_2_small_kernel) {
45745 for (uint32_t n = 3; n < 4; n++) {
45746 for (size_t k = 1; k <= 5; k += 2) {
45747 GemmMicrokernelTester()
45748 .mr(4)
45749 .nr(2)
45750 .kr(1)
45751 .sr(1)
45752 .m(4)
45753 .n(2)
45754 .k(k)
45755 .ks(3)
45756 .Test(xnn_f32_igemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
45757 }
45758 }
45759}
45760
45761TEST(F32_IGEMM_4X2__SCALAR, n_div_2_small_kernel) {
45762 for (uint32_t n = 4; n <= 6; n += 2) {
45763 for (size_t k = 1; k <= 5; k += 2) {
45764 GemmMicrokernelTester()
45765 .mr(4)
45766 .nr(2)
45767 .kr(1)
45768 .sr(1)
45769 .m(4)
45770 .n(2)
45771 .k(k)
45772 .ks(3)
45773 .Test(xnn_f32_igemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
45774 }
45775 }
45776}
45777
45778TEST(F32_IGEMM_4X2__SCALAR, strided_cm_subtile) {
45779 for (size_t k = 1; k <= 5; k += 2) {
45780 for (uint32_t m = 1; m <= 4; m++) {
45781 for (uint32_t n = 1; n <= 2; n++) {
45782 GemmMicrokernelTester()
45783 .mr(4)
45784 .nr(2)
45785 .kr(1)
45786 .sr(1)
45787 .m(m)
45788 .n(n)
45789 .k(k)
45790 .cm_stride(5)
45791 .iterations(1)
45792 .Test(xnn_f32_igemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
45793 }
45794 }
45795 }
45796}
45797
45798TEST(F32_IGEMM_4X2__SCALAR, a_offset) {
45799 for (size_t k = 1; k <= 5; k += 2) {
45800 GemmMicrokernelTester()
45801 .mr(4)
45802 .nr(2)
45803 .kr(1)
45804 .sr(1)
45805 .m(4)
45806 .n(2)
45807 .k(k)
45808 .ks(3)
45809 .a_offset(23)
45810 .Test(xnn_f32_igemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
45811 }
45812}
45813
45814TEST(F32_IGEMM_4X2__SCALAR, zero) {
45815 for (uint32_t mz = 0; mz < 4; mz++) {
45816 for (size_t k = 1; k <= 5; k += 2) {
45817 GemmMicrokernelTester()
45818 .mr(4)
45819 .nr(2)
45820 .kr(1)
45821 .sr(1)
45822 .m(4)
45823 .n(2)
45824 .k(k)
45825 .ks(3)
45826 .a_offset(23)
45827 .zero_index(mz)
45828 .Test(xnn_f32_igemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
45829 }
45830 }
45831}
45832
45833TEST(F32_IGEMM_4X2__SCALAR, qmin) {
45834 GemmMicrokernelTester()
45835 .mr(4)
45836 .nr(2)
45837 .kr(1)
45838 .sr(1)
45839 .m(4)
45840 .n(2)
45841 .k(1)
45842 .qmin(128)
45843 .Test(xnn_f32_igemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
45844}
45845
45846TEST(F32_IGEMM_4X2__SCALAR, qmax) {
45847 GemmMicrokernelTester()
45848 .mr(4)
45849 .nr(2)
45850 .kr(1)
45851 .sr(1)
45852 .m(4)
45853 .n(2)
45854 .k(1)
45855 .qmax(128)
45856 .Test(xnn_f32_igemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
45857}
45858
45859TEST(F32_IGEMM_4X2__SCALAR, strided_cm) {
45860 GemmMicrokernelTester()
45861 .mr(4)
45862 .nr(2)
45863 .kr(1)
45864 .sr(1)
45865 .m(4)
45866 .n(2)
45867 .k(1)
45868 .cm_stride(5)
45869 .Test(xnn_f32_igemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
45870}