blob: d6636d183d16f2912c4b5894c894164d76a2630f [file] [log] [blame]
// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
//
// Auto-generated file. Do not edit!
// Specification: test/qc8-gemm-minmax-fp32.yaml
// Generator: tools/generate-gemm-test.py
12
13
14#include <gtest/gtest.h>
15
Frank Barchard447aa7b2021-12-28 14:11:40 -080016#include <xnnpack/allocator.h>
Marat Dukhan0b043742021-06-02 18:29:11 -070017#include <xnnpack/common.h>
18#include <xnnpack/isa-checks.h>
19
20#include <xnnpack/gemm.h>
21#include <xnnpack/igemm.h>
22#include <xnnpack/ppmm.h>
23#include "gemm-microkernel-tester.h"
24
25
Frank Barchardac654f12022-01-24 23:51:04 -080026#if XNN_ARCH_ARM && !XNN_PLATFORM_IOS && XNN_ENABLE_ASSEMBLY
27 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__AARCH32_NEONDOT_CORTEX_A55, k_eq_8) {
28 TEST_REQUIRES_ARM_NEON_DOT;
29 GemmMicrokernelTester()
30 .mr(4)
31 .nr(8)
32 .kr(4)
33 .sr(1)
34 .m(4)
35 .n(8)
36 .k(8)
37 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_cortex_a55, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
38 }
39
40 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__AARCH32_NEONDOT_CORTEX_A55, strided_cn) {
41 TEST_REQUIRES_ARM_NEON_DOT;
42 GemmMicrokernelTester()
43 .mr(4)
44 .nr(8)
45 .kr(4)
46 .sr(1)
47 .m(4)
48 .n(8)
49 .k(8)
50 .cn_stride(11)
51 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_cortex_a55, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
52 }
53
54 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__AARCH32_NEONDOT_CORTEX_A55, k_eq_8_strided_a) {
55 TEST_REQUIRES_ARM_NEON_DOT;
56 GemmMicrokernelTester()
57 .mr(4)
58 .nr(8)
59 .kr(4)
60 .sr(1)
61 .m(4)
62 .n(8)
63 .k(8)
64 .a_stride(11)
65 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_cortex_a55, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
66 }
67
68 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__AARCH32_NEONDOT_CORTEX_A55, k_eq_8_subtile) {
69 TEST_REQUIRES_ARM_NEON_DOT;
70 for (uint32_t n = 1; n <= 8; n++) {
71 for (uint32_t m = 1; m <= 4; m++) {
72 GemmMicrokernelTester()
73 .mr(4)
74 .nr(8)
75 .kr(4)
76 .sr(1)
77 .m(m)
78 .n(n)
79 .k(8)
80 .iterations(1)
81 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_cortex_a55, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
82 }
83 }
84 }
85
86 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__AARCH32_NEONDOT_CORTEX_A55, k_eq_8_subtile_m) {
87 TEST_REQUIRES_ARM_NEON_DOT;
88 for (uint32_t m = 1; m <= 4; m++) {
89 GemmMicrokernelTester()
90 .mr(4)
91 .nr(8)
92 .kr(4)
93 .sr(1)
94 .m(m)
95 .n(8)
96 .k(8)
97 .iterations(1)
98 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_cortex_a55, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
99 }
100 }
101
102 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__AARCH32_NEONDOT_CORTEX_A55, k_eq_8_subtile_n) {
103 TEST_REQUIRES_ARM_NEON_DOT;
104 for (uint32_t n = 1; n <= 8; n++) {
105 GemmMicrokernelTester()
106 .mr(4)
107 .nr(8)
108 .kr(4)
109 .sr(1)
110 .m(4)
111 .n(n)
112 .k(8)
113 .iterations(1)
114 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_cortex_a55, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
115 }
116 }
117
118 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__AARCH32_NEONDOT_CORTEX_A55, k_lt_8) {
119 TEST_REQUIRES_ARM_NEON_DOT;
120 for (size_t k = 1; k < 8; k++) {
121 GemmMicrokernelTester()
122 .mr(4)
123 .nr(8)
124 .kr(4)
125 .sr(1)
126 .m(4)
127 .n(8)
128 .k(k)
129 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_cortex_a55, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
130 }
131 }
132
133 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__AARCH32_NEONDOT_CORTEX_A55, k_lt_8_strided_a) {
134 TEST_REQUIRES_ARM_NEON_DOT;
135 for (size_t k = 1; k < 8; k++) {
136 GemmMicrokernelTester()
137 .mr(4)
138 .nr(8)
139 .kr(4)
140 .sr(1)
141 .m(4)
142 .n(8)
143 .k(k)
144 .a_stride(11)
145 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_cortex_a55, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
146 }
147 }
148
149 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__AARCH32_NEONDOT_CORTEX_A55, k_lt_8_subtile) {
150 TEST_REQUIRES_ARM_NEON_DOT;
151 for (size_t k = 1; k < 8; k++) {
152 for (uint32_t n = 1; n <= 8; n++) {
153 for (uint32_t m = 1; m <= 4; m++) {
154 GemmMicrokernelTester()
155 .mr(4)
156 .nr(8)
157 .kr(4)
158 .sr(1)
159 .m(m)
160 .n(n)
161 .k(k)
162 .iterations(1)
163 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_cortex_a55, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
164 }
165 }
166 }
167 }
168
169 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__AARCH32_NEONDOT_CORTEX_A55, k_gt_8) {
170 TEST_REQUIRES_ARM_NEON_DOT;
171 for (size_t k = 9; k < 16; k++) {
172 GemmMicrokernelTester()
173 .mr(4)
174 .nr(8)
175 .kr(4)
176 .sr(1)
177 .m(4)
178 .n(8)
179 .k(k)
180 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_cortex_a55, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
181 }
182 }
183
184 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__AARCH32_NEONDOT_CORTEX_A55, k_gt_8_strided_a) {
185 TEST_REQUIRES_ARM_NEON_DOT;
186 for (size_t k = 9; k < 16; k++) {
187 GemmMicrokernelTester()
188 .mr(4)
189 .nr(8)
190 .kr(4)
191 .sr(1)
192 .m(4)
193 .n(8)
194 .k(k)
195 .a_stride(19)
196 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_cortex_a55, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
197 }
198 }
199
200 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__AARCH32_NEONDOT_CORTEX_A55, k_gt_8_subtile) {
201 TEST_REQUIRES_ARM_NEON_DOT;
202 for (size_t k = 9; k < 16; k++) {
203 for (uint32_t n = 1; n <= 8; n++) {
204 for (uint32_t m = 1; m <= 4; m++) {
205 GemmMicrokernelTester()
206 .mr(4)
207 .nr(8)
208 .kr(4)
209 .sr(1)
210 .m(m)
211 .n(n)
212 .k(k)
213 .iterations(1)
214 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_cortex_a55, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
215 }
216 }
217 }
218 }
219
220 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__AARCH32_NEONDOT_CORTEX_A55, k_div_8) {
221 TEST_REQUIRES_ARM_NEON_DOT;
222 for (size_t k = 16; k <= 80; k += 8) {
223 GemmMicrokernelTester()
224 .mr(4)
225 .nr(8)
226 .kr(4)
227 .sr(1)
228 .m(4)
229 .n(8)
230 .k(k)
231 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_cortex_a55, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
232 }
233 }
234
235 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__AARCH32_NEONDOT_CORTEX_A55, k_div_8_strided_a) {
236 TEST_REQUIRES_ARM_NEON_DOT;
237 for (size_t k = 16; k <= 80; k += 8) {
238 GemmMicrokernelTester()
239 .mr(4)
240 .nr(8)
241 .kr(4)
242 .sr(1)
243 .m(4)
244 .n(8)
245 .k(k)
246 .a_stride(83)
247 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_cortex_a55, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
248 }
249 }
250
251 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__AARCH32_NEONDOT_CORTEX_A55, k_div_8_subtile) {
252 TEST_REQUIRES_ARM_NEON_DOT;
253 for (size_t k = 16; k <= 80; k += 8) {
254 for (uint32_t n = 1; n <= 8; n++) {
255 for (uint32_t m = 1; m <= 4; m++) {
256 GemmMicrokernelTester()
257 .mr(4)
258 .nr(8)
259 .kr(4)
260 .sr(1)
261 .m(m)
262 .n(n)
263 .k(k)
264 .iterations(1)
265 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_cortex_a55, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
266 }
267 }
268 }
269 }
270
271 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__AARCH32_NEONDOT_CORTEX_A55, n_gt_8) {
272 TEST_REQUIRES_ARM_NEON_DOT;
273 for (uint32_t n = 9; n < 16; n++) {
274 for (size_t k = 1; k <= 40; k += 9) {
275 GemmMicrokernelTester()
276 .mr(4)
277 .nr(8)
278 .kr(4)
279 .sr(1)
280 .m(4)
281 .n(n)
282 .k(k)
283 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_cortex_a55, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
284 }
285 }
286 }
287
288 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__AARCH32_NEONDOT_CORTEX_A55, n_gt_8_strided_cn) {
289 TEST_REQUIRES_ARM_NEON_DOT;
290 for (uint32_t n = 9; n < 16; n++) {
291 for (size_t k = 1; k <= 40; k += 9) {
292 GemmMicrokernelTester()
293 .mr(4)
294 .nr(8)
295 .kr(4)
296 .sr(1)
297 .m(4)
298 .n(n)
299 .k(k)
300 .cn_stride(11)
301 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_cortex_a55, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
302 }
303 }
304 }
305
306 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__AARCH32_NEONDOT_CORTEX_A55, n_gt_8_strided_a) {
307 TEST_REQUIRES_ARM_NEON_DOT;
308 for (uint32_t n = 9; n < 16; n++) {
309 for (size_t k = 1; k <= 40; k += 9) {
310 GemmMicrokernelTester()
311 .mr(4)
312 .nr(8)
313 .kr(4)
314 .sr(1)
315 .m(4)
316 .n(n)
317 .k(k)
318 .a_stride(43)
319 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_cortex_a55, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
320 }
321 }
322 }
323
324 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__AARCH32_NEONDOT_CORTEX_A55, n_gt_8_subtile) {
325 TEST_REQUIRES_ARM_NEON_DOT;
326 for (uint32_t n = 9; n < 16; n++) {
327 for (size_t k = 1; k <= 40; k += 9) {
328 for (uint32_t m = 1; m <= 4; m++) {
329 GemmMicrokernelTester()
330 .mr(4)
331 .nr(8)
332 .kr(4)
333 .sr(1)
334 .m(m)
335 .n(n)
336 .k(k)
337 .iterations(1)
338 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_cortex_a55, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
339 }
340 }
341 }
342 }
343
344 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__AARCH32_NEONDOT_CORTEX_A55, n_div_8) {
345 TEST_REQUIRES_ARM_NEON_DOT;
346 for (uint32_t n = 16; n <= 24; n += 8) {
347 for (size_t k = 1; k <= 40; k += 9) {
348 GemmMicrokernelTester()
349 .mr(4)
350 .nr(8)
351 .kr(4)
352 .sr(1)
353 .m(4)
354 .n(n)
355 .k(k)
356 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_cortex_a55, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
357 }
358 }
359 }
360
361 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__AARCH32_NEONDOT_CORTEX_A55, n_div_8_strided_cn) {
362 TEST_REQUIRES_ARM_NEON_DOT;
363 for (uint32_t n = 16; n <= 24; n += 8) {
364 for (size_t k = 1; k <= 40; k += 9) {
365 GemmMicrokernelTester()
366 .mr(4)
367 .nr(8)
368 .kr(4)
369 .sr(1)
370 .m(4)
371 .n(n)
372 .k(k)
373 .cn_stride(11)
374 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_cortex_a55, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
375 }
376 }
377 }
378
379 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__AARCH32_NEONDOT_CORTEX_A55, n_div_8_strided_a) {
380 TEST_REQUIRES_ARM_NEON_DOT;
381 for (uint32_t n = 16; n <= 24; n += 8) {
382 for (size_t k = 1; k <= 40; k += 9) {
383 GemmMicrokernelTester()
384 .mr(4)
385 .nr(8)
386 .kr(4)
387 .sr(1)
388 .m(4)
389 .n(n)
390 .k(k)
391 .a_stride(43)
392 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_cortex_a55, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
393 }
394 }
395 }
396
397 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__AARCH32_NEONDOT_CORTEX_A55, n_div_8_subtile) {
398 TEST_REQUIRES_ARM_NEON_DOT;
399 for (uint32_t n = 16; n <= 24; n += 8) {
400 for (size_t k = 1; k <= 40; k += 9) {
401 for (uint32_t m = 1; m <= 4; m++) {
402 GemmMicrokernelTester()
403 .mr(4)
404 .nr(8)
405 .kr(4)
406 .sr(1)
407 .m(m)
408 .n(n)
409 .k(k)
410 .iterations(1)
411 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_cortex_a55, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
412 }
413 }
414 }
415 }
416
417 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__AARCH32_NEONDOT_CORTEX_A55, strided_cm_subtile) {
418 TEST_REQUIRES_ARM_NEON_DOT;
419 for (size_t k = 1; k <= 40; k += 9) {
420 for (uint32_t n = 1; n <= 8; n++) {
421 for (uint32_t m = 1; m <= 4; m++) {
422 GemmMicrokernelTester()
423 .mr(4)
424 .nr(8)
425 .kr(4)
426 .sr(1)
427 .m(m)
428 .n(n)
429 .k(k)
430 .cm_stride(11)
431 .iterations(1)
432 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_cortex_a55, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
433 }
434 }
435 }
436 }
437
438 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__AARCH32_NEONDOT_CORTEX_A55, qmin) {
439 TEST_REQUIRES_ARM_NEON_DOT;
440 GemmMicrokernelTester()
441 .mr(4)
442 .nr(8)
443 .kr(4)
444 .sr(1)
445 .m(4)
446 .n(8)
447 .k(8)
448 .qmin(128)
449 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_cortex_a55, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
450 }
451
452 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__AARCH32_NEONDOT_CORTEX_A55, qmax) {
453 TEST_REQUIRES_ARM_NEON_DOT;
454 GemmMicrokernelTester()
455 .mr(4)
456 .nr(8)
457 .kr(4)
458 .sr(1)
459 .m(4)
460 .n(8)
461 .k(8)
462 .qmax(128)
463 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_cortex_a55, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
464 }
465
466 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__AARCH32_NEONDOT_CORTEX_A55, strided_cm) {
467 TEST_REQUIRES_ARM_NEON_DOT;
468 GemmMicrokernelTester()
469 .mr(4)
470 .nr(8)
471 .kr(4)
472 .sr(1)
473 .m(4)
474 .n(8)
475 .k(8)
476 .cm_stride(11)
477 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_cortex_a55, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
478 }
479#endif // XNN_ARCH_ARM && !XNN_PLATFORM_IOS && XNN_ENABLE_ASSEMBLY
480
481
Frank Barchard101271e2022-02-02 01:49:54 -0800482#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
483 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_eq_8) {
484 TEST_REQUIRES_ARM_NEON;
485 GemmMicrokernelTester()
486 .mr(4)
487 .nr(8)
488 .kr(1)
489 .sr(1)
490 .m(4)
491 .n(8)
492 .k(8)
493 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
494 }
495
496 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53, strided_cn) {
497 TEST_REQUIRES_ARM_NEON;
498 GemmMicrokernelTester()
499 .mr(4)
500 .nr(8)
501 .kr(1)
502 .sr(1)
503 .m(4)
504 .n(8)
505 .k(8)
506 .cn_stride(11)
507 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
508 }
509
510 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_eq_8_strided_a) {
511 TEST_REQUIRES_ARM_NEON;
512 GemmMicrokernelTester()
513 .mr(4)
514 .nr(8)
515 .kr(1)
516 .sr(1)
517 .m(4)
518 .n(8)
519 .k(8)
520 .a_stride(11)
521 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
522 }
523
524 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_eq_8_subtile) {
525 TEST_REQUIRES_ARM_NEON;
526 for (uint32_t n = 1; n <= 8; n++) {
527 for (uint32_t m = 1; m <= 4; m++) {
528 GemmMicrokernelTester()
529 .mr(4)
530 .nr(8)
531 .kr(1)
532 .sr(1)
533 .m(m)
534 .n(n)
535 .k(8)
536 .iterations(1)
537 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
538 }
539 }
540 }
541
542 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_eq_8_subtile_m) {
543 TEST_REQUIRES_ARM_NEON;
544 for (uint32_t m = 1; m <= 4; m++) {
545 GemmMicrokernelTester()
546 .mr(4)
547 .nr(8)
548 .kr(1)
549 .sr(1)
550 .m(m)
551 .n(8)
552 .k(8)
553 .iterations(1)
554 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
555 }
556 }
557
558 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_eq_8_subtile_n) {
559 TEST_REQUIRES_ARM_NEON;
560 for (uint32_t n = 1; n <= 8; n++) {
561 GemmMicrokernelTester()
562 .mr(4)
563 .nr(8)
564 .kr(1)
565 .sr(1)
566 .m(4)
567 .n(n)
568 .k(8)
569 .iterations(1)
570 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
571 }
572 }
573
574 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_lt_8) {
575 TEST_REQUIRES_ARM_NEON;
576 for (size_t k = 1; k < 8; k++) {
577 GemmMicrokernelTester()
578 .mr(4)
579 .nr(8)
580 .kr(1)
581 .sr(1)
582 .m(4)
583 .n(8)
584 .k(k)
585 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
586 }
587 }
588
589 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_lt_8_strided_a) {
590 TEST_REQUIRES_ARM_NEON;
591 for (size_t k = 1; k < 8; k++) {
592 GemmMicrokernelTester()
593 .mr(4)
594 .nr(8)
595 .kr(1)
596 .sr(1)
597 .m(4)
598 .n(8)
599 .k(k)
600 .a_stride(11)
601 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
602 }
603 }
604
605 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_lt_8_subtile) {
606 TEST_REQUIRES_ARM_NEON;
607 for (size_t k = 1; k < 8; k++) {
608 for (uint32_t n = 1; n <= 8; n++) {
609 for (uint32_t m = 1; m <= 4; m++) {
610 GemmMicrokernelTester()
611 .mr(4)
612 .nr(8)
613 .kr(1)
614 .sr(1)
615 .m(m)
616 .n(n)
617 .k(k)
618 .iterations(1)
619 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
620 }
621 }
622 }
623 }
624
625 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_gt_8) {
626 TEST_REQUIRES_ARM_NEON;
627 for (size_t k = 9; k < 16; k++) {
628 GemmMicrokernelTester()
629 .mr(4)
630 .nr(8)
631 .kr(1)
632 .sr(1)
633 .m(4)
634 .n(8)
635 .k(k)
636 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
637 }
638 }
639
640 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_gt_8_strided_a) {
641 TEST_REQUIRES_ARM_NEON;
642 for (size_t k = 9; k < 16; k++) {
643 GemmMicrokernelTester()
644 .mr(4)
645 .nr(8)
646 .kr(1)
647 .sr(1)
648 .m(4)
649 .n(8)
650 .k(k)
651 .a_stride(19)
652 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
653 }
654 }
655
656 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_gt_8_subtile) {
657 TEST_REQUIRES_ARM_NEON;
658 for (size_t k = 9; k < 16; k++) {
659 for (uint32_t n = 1; n <= 8; n++) {
660 for (uint32_t m = 1; m <= 4; m++) {
661 GemmMicrokernelTester()
662 .mr(4)
663 .nr(8)
664 .kr(1)
665 .sr(1)
666 .m(m)
667 .n(n)
668 .k(k)
669 .iterations(1)
670 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
671 }
672 }
673 }
674 }
675
676 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_div_8) {
677 TEST_REQUIRES_ARM_NEON;
678 for (size_t k = 16; k <= 80; k += 8) {
679 GemmMicrokernelTester()
680 .mr(4)
681 .nr(8)
682 .kr(1)
683 .sr(1)
684 .m(4)
685 .n(8)
686 .k(k)
687 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
688 }
689 }
690
691 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_div_8_strided_a) {
692 TEST_REQUIRES_ARM_NEON;
693 for (size_t k = 16; k <= 80; k += 8) {
694 GemmMicrokernelTester()
695 .mr(4)
696 .nr(8)
697 .kr(1)
698 .sr(1)
699 .m(4)
700 .n(8)
701 .k(k)
702 .a_stride(83)
703 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
704 }
705 }
706
707 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_div_8_subtile) {
708 TEST_REQUIRES_ARM_NEON;
709 for (size_t k = 16; k <= 80; k += 8) {
710 for (uint32_t n = 1; n <= 8; n++) {
711 for (uint32_t m = 1; m <= 4; m++) {
712 GemmMicrokernelTester()
713 .mr(4)
714 .nr(8)
715 .kr(1)
716 .sr(1)
717 .m(m)
718 .n(n)
719 .k(k)
720 .iterations(1)
721 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
722 }
723 }
724 }
725 }
726
727 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_gt_8) {
728 TEST_REQUIRES_ARM_NEON;
729 for (uint32_t n = 9; n < 16; n++) {
730 for (size_t k = 1; k <= 40; k += 9) {
731 GemmMicrokernelTester()
732 .mr(4)
733 .nr(8)
734 .kr(1)
735 .sr(1)
736 .m(4)
737 .n(n)
738 .k(k)
739 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
740 }
741 }
742 }
743
744 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_gt_8_strided_cn) {
745 TEST_REQUIRES_ARM_NEON;
746 for (uint32_t n = 9; n < 16; n++) {
747 for (size_t k = 1; k <= 40; k += 9) {
748 GemmMicrokernelTester()
749 .mr(4)
750 .nr(8)
751 .kr(1)
752 .sr(1)
753 .m(4)
754 .n(n)
755 .k(k)
756 .cn_stride(11)
757 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
758 }
759 }
760 }
761
762 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_gt_8_strided_a) {
763 TEST_REQUIRES_ARM_NEON;
764 for (uint32_t n = 9; n < 16; n++) {
765 for (size_t k = 1; k <= 40; k += 9) {
766 GemmMicrokernelTester()
767 .mr(4)
768 .nr(8)
769 .kr(1)
770 .sr(1)
771 .m(4)
772 .n(n)
773 .k(k)
774 .a_stride(43)
775 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
776 }
777 }
778 }
779
780 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_gt_8_subtile) {
781 TEST_REQUIRES_ARM_NEON;
782 for (uint32_t n = 9; n < 16; n++) {
783 for (size_t k = 1; k <= 40; k += 9) {
784 for (uint32_t m = 1; m <= 4; m++) {
785 GemmMicrokernelTester()
786 .mr(4)
787 .nr(8)
788 .kr(1)
789 .sr(1)
790 .m(m)
791 .n(n)
792 .k(k)
793 .iterations(1)
794 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
795 }
796 }
797 }
798 }
799
800 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_div_8) {
801 TEST_REQUIRES_ARM_NEON;
802 for (uint32_t n = 16; n <= 24; n += 8) {
803 for (size_t k = 1; k <= 40; k += 9) {
804 GemmMicrokernelTester()
805 .mr(4)
806 .nr(8)
807 .kr(1)
808 .sr(1)
809 .m(4)
810 .n(n)
811 .k(k)
812 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
813 }
814 }
815 }
816
817 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_div_8_strided_cn) {
818 TEST_REQUIRES_ARM_NEON;
819 for (uint32_t n = 16; n <= 24; n += 8) {
820 for (size_t k = 1; k <= 40; k += 9) {
821 GemmMicrokernelTester()
822 .mr(4)
823 .nr(8)
824 .kr(1)
825 .sr(1)
826 .m(4)
827 .n(n)
828 .k(k)
829 .cn_stride(11)
830 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
831 }
832 }
833 }
834
835 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_div_8_strided_a) {
836 TEST_REQUIRES_ARM_NEON;
837 for (uint32_t n = 16; n <= 24; n += 8) {
838 for (size_t k = 1; k <= 40; k += 9) {
839 GemmMicrokernelTester()
840 .mr(4)
841 .nr(8)
842 .kr(1)
843 .sr(1)
844 .m(4)
845 .n(n)
846 .k(k)
847 .a_stride(43)
848 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
849 }
850 }
851 }
852
853 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_div_8_subtile) {
854 TEST_REQUIRES_ARM_NEON;
855 for (uint32_t n = 16; n <= 24; n += 8) {
856 for (size_t k = 1; k <= 40; k += 9) {
857 for (uint32_t m = 1; m <= 4; m++) {
858 GemmMicrokernelTester()
859 .mr(4)
860 .nr(8)
861 .kr(1)
862 .sr(1)
863 .m(m)
864 .n(n)
865 .k(k)
866 .iterations(1)
867 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
868 }
869 }
870 }
871 }
872
873 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53, strided_cm_subtile) {
874 TEST_REQUIRES_ARM_NEON;
875 for (size_t k = 1; k <= 40; k += 9) {
876 for (uint32_t n = 1; n <= 8; n++) {
877 for (uint32_t m = 1; m <= 4; m++) {
878 GemmMicrokernelTester()
879 .mr(4)
880 .nr(8)
881 .kr(1)
882 .sr(1)
883 .m(m)
884 .n(n)
885 .k(k)
886 .cm_stride(11)
887 .iterations(1)
888 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
889 }
890 }
891 }
892 }
893
894 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53, qmin) {
895 TEST_REQUIRES_ARM_NEON;
896 GemmMicrokernelTester()
897 .mr(4)
898 .nr(8)
899 .kr(1)
900 .sr(1)
901 .m(4)
902 .n(8)
903 .k(8)
904 .qmin(128)
905 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
906 }
907
908 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53, qmax) {
909 TEST_REQUIRES_ARM_NEON;
910 GemmMicrokernelTester()
911 .mr(4)
912 .nr(8)
913 .kr(1)
914 .sr(1)
915 .m(4)
916 .n(8)
917 .k(8)
918 .qmax(128)
919 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
920 }
921
922 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53, strided_cm) {
923 TEST_REQUIRES_ARM_NEON;
924 GemmMicrokernelTester()
925 .mr(4)
926 .nr(8)
927 .kr(1)
928 .sr(1)
929 .m(4)
930 .n(8)
931 .k(8)
932 .cm_stride(11)
933 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
934 }
935#endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
936
937
938#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
939 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53, k_eq_8) {
940 TEST_REQUIRES_ARM_NEON;
941 GemmMicrokernelTester()
942 .mr(4)
943 .nr(8)
944 .kr(1)
945 .sr(1)
946 .m(4)
947 .n(8)
948 .k(8)
949 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
950 }
951
952 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53, strided_cn) {
953 TEST_REQUIRES_ARM_NEON;
954 GemmMicrokernelTester()
955 .mr(4)
956 .nr(8)
957 .kr(1)
958 .sr(1)
959 .m(4)
960 .n(8)
961 .k(8)
962 .cn_stride(11)
963 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
964 }
965
966 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53, k_eq_8_strided_a) {
967 TEST_REQUIRES_ARM_NEON;
968 GemmMicrokernelTester()
969 .mr(4)
970 .nr(8)
971 .kr(1)
972 .sr(1)
973 .m(4)
974 .n(8)
975 .k(8)
976 .a_stride(11)
977 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
978 }
979
980 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53, k_eq_8_subtile) {
981 TEST_REQUIRES_ARM_NEON;
982 for (uint32_t n = 1; n <= 8; n++) {
983 for (uint32_t m = 1; m <= 4; m++) {
984 GemmMicrokernelTester()
985 .mr(4)
986 .nr(8)
987 .kr(1)
988 .sr(1)
989 .m(m)
990 .n(n)
991 .k(8)
992 .iterations(1)
993 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
994 }
995 }
996 }
997
998 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53, k_eq_8_subtile_m) {
999 TEST_REQUIRES_ARM_NEON;
1000 for (uint32_t m = 1; m <= 4; m++) {
1001 GemmMicrokernelTester()
1002 .mr(4)
1003 .nr(8)
1004 .kr(1)
1005 .sr(1)
1006 .m(m)
1007 .n(8)
1008 .k(8)
1009 .iterations(1)
1010 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
1011 }
1012 }
1013
1014 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53, k_eq_8_subtile_n) {
1015 TEST_REQUIRES_ARM_NEON;
1016 for (uint32_t n = 1; n <= 8; n++) {
1017 GemmMicrokernelTester()
1018 .mr(4)
1019 .nr(8)
1020 .kr(1)
1021 .sr(1)
1022 .m(4)
1023 .n(n)
1024 .k(8)
1025 .iterations(1)
1026 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
1027 }
1028 }
1029
1030 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53, k_lt_8) {
1031 TEST_REQUIRES_ARM_NEON;
1032 for (size_t k = 1; k < 8; k++) {
1033 GemmMicrokernelTester()
1034 .mr(4)
1035 .nr(8)
1036 .kr(1)
1037 .sr(1)
1038 .m(4)
1039 .n(8)
1040 .k(k)
1041 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
1042 }
1043 }
1044
1045 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53, k_lt_8_strided_a) {
1046 TEST_REQUIRES_ARM_NEON;
1047 for (size_t k = 1; k < 8; k++) {
1048 GemmMicrokernelTester()
1049 .mr(4)
1050 .nr(8)
1051 .kr(1)
1052 .sr(1)
1053 .m(4)
1054 .n(8)
1055 .k(k)
1056 .a_stride(11)
1057 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
1058 }
1059 }
1060
1061 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53, k_lt_8_subtile) {
1062 TEST_REQUIRES_ARM_NEON;
1063 for (size_t k = 1; k < 8; k++) {
1064 for (uint32_t n = 1; n <= 8; n++) {
1065 for (uint32_t m = 1; m <= 4; m++) {
1066 GemmMicrokernelTester()
1067 .mr(4)
1068 .nr(8)
1069 .kr(1)
1070 .sr(1)
1071 .m(m)
1072 .n(n)
1073 .k(k)
1074 .iterations(1)
1075 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
1076 }
1077 }
1078 }
1079 }
1080
1081 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53, k_gt_8) {
1082 TEST_REQUIRES_ARM_NEON;
1083 for (size_t k = 9; k < 16; k++) {
1084 GemmMicrokernelTester()
1085 .mr(4)
1086 .nr(8)
1087 .kr(1)
1088 .sr(1)
1089 .m(4)
1090 .n(8)
1091 .k(k)
1092 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
1093 }
1094 }
1095
1096 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53, k_gt_8_strided_a) {
1097 TEST_REQUIRES_ARM_NEON;
1098 for (size_t k = 9; k < 16; k++) {
1099 GemmMicrokernelTester()
1100 .mr(4)
1101 .nr(8)
1102 .kr(1)
1103 .sr(1)
1104 .m(4)
1105 .n(8)
1106 .k(k)
1107 .a_stride(19)
1108 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
1109 }
1110 }
1111
1112 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53, k_gt_8_subtile) {
1113 TEST_REQUIRES_ARM_NEON;
1114 for (size_t k = 9; k < 16; k++) {
1115 for (uint32_t n = 1; n <= 8; n++) {
1116 for (uint32_t m = 1; m <= 4; m++) {
1117 GemmMicrokernelTester()
1118 .mr(4)
1119 .nr(8)
1120 .kr(1)
1121 .sr(1)
1122 .m(m)
1123 .n(n)
1124 .k(k)
1125 .iterations(1)
1126 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
1127 }
1128 }
1129 }
1130 }
1131
1132 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53, k_div_8) {
1133 TEST_REQUIRES_ARM_NEON;
1134 for (size_t k = 16; k <= 80; k += 8) {
1135 GemmMicrokernelTester()
1136 .mr(4)
1137 .nr(8)
1138 .kr(1)
1139 .sr(1)
1140 .m(4)
1141 .n(8)
1142 .k(k)
1143 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
1144 }
1145 }
1146
1147 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53, k_div_8_strided_a) {
1148 TEST_REQUIRES_ARM_NEON;
1149 for (size_t k = 16; k <= 80; k += 8) {
1150 GemmMicrokernelTester()
1151 .mr(4)
1152 .nr(8)
1153 .kr(1)
1154 .sr(1)
1155 .m(4)
1156 .n(8)
1157 .k(k)
1158 .a_stride(83)
1159 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
1160 }
1161 }
1162
1163 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53, k_div_8_subtile) {
1164 TEST_REQUIRES_ARM_NEON;
1165 for (size_t k = 16; k <= 80; k += 8) {
1166 for (uint32_t n = 1; n <= 8; n++) {
1167 for (uint32_t m = 1; m <= 4; m++) {
1168 GemmMicrokernelTester()
1169 .mr(4)
1170 .nr(8)
1171 .kr(1)
1172 .sr(1)
1173 .m(m)
1174 .n(n)
1175 .k(k)
1176 .iterations(1)
1177 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
1178 }
1179 }
1180 }
1181 }
1182
1183 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53, n_gt_8) {
1184 TEST_REQUIRES_ARM_NEON;
1185 for (uint32_t n = 9; n < 16; n++) {
1186 for (size_t k = 1; k <= 40; k += 9) {
1187 GemmMicrokernelTester()
1188 .mr(4)
1189 .nr(8)
1190 .kr(1)
1191 .sr(1)
1192 .m(4)
1193 .n(n)
1194 .k(k)
1195 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
1196 }
1197 }
1198 }
1199
1200 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53, n_gt_8_strided_cn) {
1201 TEST_REQUIRES_ARM_NEON;
1202 for (uint32_t n = 9; n < 16; n++) {
1203 for (size_t k = 1; k <= 40; k += 9) {
1204 GemmMicrokernelTester()
1205 .mr(4)
1206 .nr(8)
1207 .kr(1)
1208 .sr(1)
1209 .m(4)
1210 .n(n)
1211 .k(k)
1212 .cn_stride(11)
1213 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
1214 }
1215 }
1216 }
1217
1218 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53, n_gt_8_strided_a) {
1219 TEST_REQUIRES_ARM_NEON;
1220 for (uint32_t n = 9; n < 16; n++) {
1221 for (size_t k = 1; k <= 40; k += 9) {
1222 GemmMicrokernelTester()
1223 .mr(4)
1224 .nr(8)
1225 .kr(1)
1226 .sr(1)
1227 .m(4)
1228 .n(n)
1229 .k(k)
1230 .a_stride(43)
1231 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
1232 }
1233 }
1234 }
1235
1236 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53, n_gt_8_subtile) {
1237 TEST_REQUIRES_ARM_NEON;
1238 for (uint32_t n = 9; n < 16; n++) {
1239 for (size_t k = 1; k <= 40; k += 9) {
1240 for (uint32_t m = 1; m <= 4; m++) {
1241 GemmMicrokernelTester()
1242 .mr(4)
1243 .nr(8)
1244 .kr(1)
1245 .sr(1)
1246 .m(m)
1247 .n(n)
1248 .k(k)
1249 .iterations(1)
1250 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
1251 }
1252 }
1253 }
1254 }
1255
1256 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53, n_div_8) {
1257 TEST_REQUIRES_ARM_NEON;
1258 for (uint32_t n = 16; n <= 24; n += 8) {
1259 for (size_t k = 1; k <= 40; k += 9) {
1260 GemmMicrokernelTester()
1261 .mr(4)
1262 .nr(8)
1263 .kr(1)
1264 .sr(1)
1265 .m(4)
1266 .n(n)
1267 .k(k)
1268 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
1269 }
1270 }
1271 }
1272
1273 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53, n_div_8_strided_cn) {
1274 TEST_REQUIRES_ARM_NEON;
1275 for (uint32_t n = 16; n <= 24; n += 8) {
1276 for (size_t k = 1; k <= 40; k += 9) {
1277 GemmMicrokernelTester()
1278 .mr(4)
1279 .nr(8)
1280 .kr(1)
1281 .sr(1)
1282 .m(4)
1283 .n(n)
1284 .k(k)
1285 .cn_stride(11)
1286 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
1287 }
1288 }
1289 }
1290
1291 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53, n_div_8_strided_a) {
1292 TEST_REQUIRES_ARM_NEON;
1293 for (uint32_t n = 16; n <= 24; n += 8) {
1294 for (size_t k = 1; k <= 40; k += 9) {
1295 GemmMicrokernelTester()
1296 .mr(4)
1297 .nr(8)
1298 .kr(1)
1299 .sr(1)
1300 .m(4)
1301 .n(n)
1302 .k(k)
1303 .a_stride(43)
1304 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
1305 }
1306 }
1307 }
1308
1309 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53, n_div_8_subtile) {
1310 TEST_REQUIRES_ARM_NEON;
1311 for (uint32_t n = 16; n <= 24; n += 8) {
1312 for (size_t k = 1; k <= 40; k += 9) {
1313 for (uint32_t m = 1; m <= 4; m++) {
1314 GemmMicrokernelTester()
1315 .mr(4)
1316 .nr(8)
1317 .kr(1)
1318 .sr(1)
1319 .m(m)
1320 .n(n)
1321 .k(k)
1322 .iterations(1)
1323 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
1324 }
1325 }
1326 }
1327 }
1328
1329 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53, strided_cm_subtile) {
1330 TEST_REQUIRES_ARM_NEON;
1331 for (size_t k = 1; k <= 40; k += 9) {
1332 for (uint32_t n = 1; n <= 8; n++) {
1333 for (uint32_t m = 1; m <= 4; m++) {
1334 GemmMicrokernelTester()
1335 .mr(4)
1336 .nr(8)
1337 .kr(1)
1338 .sr(1)
1339 .m(m)
1340 .n(n)
1341 .k(k)
1342 .cm_stride(11)
1343 .iterations(1)
1344 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
1345 }
1346 }
1347 }
1348 }
1349
1350 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53, qmin) {
1351 TEST_REQUIRES_ARM_NEON;
1352 GemmMicrokernelTester()
1353 .mr(4)
1354 .nr(8)
1355 .kr(1)
1356 .sr(1)
1357 .m(4)
1358 .n(8)
1359 .k(8)
1360 .qmin(128)
1361 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
1362 }
1363
1364 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53, qmax) {
1365 TEST_REQUIRES_ARM_NEON;
1366 GemmMicrokernelTester()
1367 .mr(4)
1368 .nr(8)
1369 .kr(1)
1370 .sr(1)
1371 .m(4)
1372 .n(8)
1373 .k(8)
1374 .qmax(128)
1375 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
1376 }
1377
1378 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53, strided_cm) {
1379 TEST_REQUIRES_ARM_NEON;
1380 GemmMicrokernelTester()
1381 .mr(4)
1382 .nr(8)
1383 .kr(1)
1384 .sr(1)
1385 .m(4)
1386 .n(8)
1387 .k(8)
1388 .cm_stride(11)
1389 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
1390 }
1391#endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
1392
1393
Frank Barchardf6237402022-01-05 00:26:09 -08001394#if XNN_ARCH_ARM || XNN_ARCH_ARM64
1395 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, k_eq_8) {
1396 TEST_REQUIRES_ARM_NEON;
1397 GemmMicrokernelTester()
1398 .mr(1)
1399 .nr(8)
1400 .kr(1)
1401 .sr(1)
1402 .m(1)
1403 .n(8)
1404 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -08001405 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08001406 }
1407
1408 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, strided_cn) {
1409 TEST_REQUIRES_ARM_NEON;
1410 GemmMicrokernelTester()
1411 .mr(1)
1412 .nr(8)
1413 .kr(1)
1414 .sr(1)
1415 .m(1)
1416 .n(8)
1417 .k(8)
1418 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08001419 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08001420 }
1421
1422 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, k_eq_8_strided_a) {
1423 TEST_REQUIRES_ARM_NEON;
1424 GemmMicrokernelTester()
1425 .mr(1)
1426 .nr(8)
1427 .kr(1)
1428 .sr(1)
1429 .m(1)
1430 .n(8)
1431 .k(8)
1432 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08001433 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08001434 }
1435
1436 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, k_eq_8_subtile) {
1437 TEST_REQUIRES_ARM_NEON;
Zhi An Ng83844ae2022-01-14 09:52:25 -08001438 for (uint32_t n = 1; n <= 8; n++) {
1439 for (uint32_t m = 1; m <= 1; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08001440 GemmMicrokernelTester()
1441 .mr(1)
1442 .nr(8)
1443 .kr(1)
1444 .sr(1)
1445 .m(m)
1446 .n(n)
1447 .k(8)
1448 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08001449 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08001450 }
1451 }
1452 }
1453
1454 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, k_eq_8_subtile_m) {
1455 TEST_REQUIRES_ARM_NEON;
1456 for (uint32_t m = 1; m <= 1; m++) {
1457 GemmMicrokernelTester()
1458 .mr(1)
1459 .nr(8)
1460 .kr(1)
1461 .sr(1)
1462 .m(m)
1463 .n(8)
1464 .k(8)
1465 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08001466 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08001467 }
1468 }
1469
1470 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, k_eq_8_subtile_n) {
1471 TEST_REQUIRES_ARM_NEON;
1472 for (uint32_t n = 1; n <= 8; n++) {
1473 GemmMicrokernelTester()
1474 .mr(1)
1475 .nr(8)
1476 .kr(1)
1477 .sr(1)
1478 .m(1)
1479 .n(n)
1480 .k(8)
1481 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08001482 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08001483 }
1484 }
1485
1486 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, k_lt_8) {
1487 TEST_REQUIRES_ARM_NEON;
1488 for (size_t k = 1; k < 8; k++) {
1489 GemmMicrokernelTester()
1490 .mr(1)
1491 .nr(8)
1492 .kr(1)
1493 .sr(1)
1494 .m(1)
1495 .n(8)
1496 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08001497 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08001498 }
1499 }
1500
1501 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, k_lt_8_strided_a) {
1502 TEST_REQUIRES_ARM_NEON;
1503 for (size_t k = 1; k < 8; k++) {
1504 GemmMicrokernelTester()
1505 .mr(1)
1506 .nr(8)
1507 .kr(1)
1508 .sr(1)
1509 .m(1)
1510 .n(8)
1511 .k(k)
1512 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08001513 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08001514 }
1515 }
1516
1517 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, k_lt_8_subtile) {
1518 TEST_REQUIRES_ARM_NEON;
1519 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08001520 for (uint32_t n = 1; n <= 8; n++) {
1521 for (uint32_t m = 1; m <= 1; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08001522 GemmMicrokernelTester()
1523 .mr(1)
1524 .nr(8)
1525 .kr(1)
1526 .sr(1)
1527 .m(m)
1528 .n(n)
1529 .k(k)
1530 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08001531 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08001532 }
1533 }
1534 }
1535 }
1536
1537 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, k_gt_8) {
1538 TEST_REQUIRES_ARM_NEON;
1539 for (size_t k = 9; k < 16; k++) {
1540 GemmMicrokernelTester()
1541 .mr(1)
1542 .nr(8)
1543 .kr(1)
1544 .sr(1)
1545 .m(1)
1546 .n(8)
1547 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08001548 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08001549 }
1550 }
1551
1552 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, k_gt_8_strided_a) {
1553 TEST_REQUIRES_ARM_NEON;
1554 for (size_t k = 9; k < 16; k++) {
1555 GemmMicrokernelTester()
1556 .mr(1)
1557 .nr(8)
1558 .kr(1)
1559 .sr(1)
1560 .m(1)
1561 .n(8)
1562 .k(k)
1563 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08001564 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08001565 }
1566 }
1567
1568 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, k_gt_8_subtile) {
1569 TEST_REQUIRES_ARM_NEON;
1570 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08001571 for (uint32_t n = 1; n <= 8; n++) {
1572 for (uint32_t m = 1; m <= 1; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08001573 GemmMicrokernelTester()
1574 .mr(1)
1575 .nr(8)
1576 .kr(1)
1577 .sr(1)
1578 .m(m)
1579 .n(n)
1580 .k(k)
1581 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08001582 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08001583 }
1584 }
1585 }
1586 }
1587
1588 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, k_div_8) {
1589 TEST_REQUIRES_ARM_NEON;
1590 for (size_t k = 16; k <= 80; k += 8) {
1591 GemmMicrokernelTester()
1592 .mr(1)
1593 .nr(8)
1594 .kr(1)
1595 .sr(1)
1596 .m(1)
1597 .n(8)
1598 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08001599 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08001600 }
1601 }
1602
1603 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, k_div_8_strided_a) {
1604 TEST_REQUIRES_ARM_NEON;
1605 for (size_t k = 16; k <= 80; k += 8) {
1606 GemmMicrokernelTester()
1607 .mr(1)
1608 .nr(8)
1609 .kr(1)
1610 .sr(1)
1611 .m(1)
1612 .n(8)
1613 .k(k)
1614 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -08001615 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08001616 }
1617 }
1618
1619 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, k_div_8_subtile) {
1620 TEST_REQUIRES_ARM_NEON;
1621 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08001622 for (uint32_t n = 1; n <= 8; n++) {
1623 for (uint32_t m = 1; m <= 1; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08001624 GemmMicrokernelTester()
1625 .mr(1)
1626 .nr(8)
1627 .kr(1)
1628 .sr(1)
1629 .m(m)
1630 .n(n)
1631 .k(k)
1632 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08001633 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08001634 }
1635 }
1636 }
1637 }
1638
1639 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, n_gt_8) {
1640 TEST_REQUIRES_ARM_NEON;
1641 for (uint32_t n = 9; n < 16; n++) {
1642 for (size_t k = 1; k <= 40; k += 9) {
1643 GemmMicrokernelTester()
1644 .mr(1)
1645 .nr(8)
1646 .kr(1)
1647 .sr(1)
1648 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08001649 .n(n)
Frank Barchardf6237402022-01-05 00:26:09 -08001650 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08001651 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08001652 }
1653 }
1654 }
1655
1656 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, n_gt_8_strided_cn) {
1657 TEST_REQUIRES_ARM_NEON;
1658 for (uint32_t n = 9; n < 16; n++) {
1659 for (size_t k = 1; k <= 40; k += 9) {
1660 GemmMicrokernelTester()
1661 .mr(1)
1662 .nr(8)
1663 .kr(1)
1664 .sr(1)
1665 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08001666 .n(n)
Frank Barchardf6237402022-01-05 00:26:09 -08001667 .k(k)
1668 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08001669 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08001670 }
1671 }
1672 }
1673
1674 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, n_gt_8_strided_a) {
1675 TEST_REQUIRES_ARM_NEON;
1676 for (uint32_t n = 9; n < 16; n++) {
1677 for (size_t k = 1; k <= 40; k += 9) {
1678 GemmMicrokernelTester()
1679 .mr(1)
1680 .nr(8)
1681 .kr(1)
1682 .sr(1)
1683 .m(1)
1684 .n(n)
1685 .k(k)
1686 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -08001687 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08001688 }
1689 }
1690 }
1691
1692 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, n_gt_8_subtile) {
1693 TEST_REQUIRES_ARM_NEON;
1694 for (uint32_t n = 9; n < 16; n++) {
1695 for (size_t k = 1; k <= 40; k += 9) {
1696 for (uint32_t m = 1; m <= 1; m++) {
1697 GemmMicrokernelTester()
1698 .mr(1)
1699 .nr(8)
1700 .kr(1)
1701 .sr(1)
1702 .m(m)
1703 .n(n)
1704 .k(k)
1705 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08001706 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08001707 }
1708 }
1709 }
1710 }
1711
1712 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, n_div_8) {
1713 TEST_REQUIRES_ARM_NEON;
1714 for (uint32_t n = 16; n <= 24; n += 8) {
1715 for (size_t k = 1; k <= 40; k += 9) {
1716 GemmMicrokernelTester()
1717 .mr(1)
1718 .nr(8)
1719 .kr(1)
1720 .sr(1)
1721 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08001722 .n(n)
Frank Barchardf6237402022-01-05 00:26:09 -08001723 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08001724 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08001725 }
1726 }
1727 }
1728
1729 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, n_div_8_strided_cn) {
1730 TEST_REQUIRES_ARM_NEON;
1731 for (uint32_t n = 16; n <= 24; n += 8) {
1732 for (size_t k = 1; k <= 40; k += 9) {
1733 GemmMicrokernelTester()
1734 .mr(1)
1735 .nr(8)
1736 .kr(1)
1737 .sr(1)
1738 .m(1)
1739 .n(n)
1740 .k(k)
1741 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08001742 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08001743 }
1744 }
1745 }
1746
1747 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, n_div_8_strided_a) {
1748 TEST_REQUIRES_ARM_NEON;
1749 for (uint32_t n = 16; n <= 24; n += 8) {
1750 for (size_t k = 1; k <= 40; k += 9) {
1751 GemmMicrokernelTester()
1752 .mr(1)
1753 .nr(8)
1754 .kr(1)
1755 .sr(1)
1756 .m(1)
1757 .n(n)
1758 .k(k)
1759 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -08001760 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08001761 }
1762 }
1763 }
1764
1765 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, n_div_8_subtile) {
1766 TEST_REQUIRES_ARM_NEON;
1767 for (uint32_t n = 16; n <= 24; n += 8) {
1768 for (size_t k = 1; k <= 40; k += 9) {
1769 for (uint32_t m = 1; m <= 1; m++) {
1770 GemmMicrokernelTester()
1771 .mr(1)
1772 .nr(8)
1773 .kr(1)
1774 .sr(1)
1775 .m(m)
1776 .n(n)
1777 .k(k)
1778 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08001779 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08001780 }
1781 }
1782 }
1783 }
1784
1785 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, strided_cm_subtile) {
1786 TEST_REQUIRES_ARM_NEON;
1787 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08001788 for (uint32_t n = 1; n <= 8; n++) {
1789 for (uint32_t m = 1; m <= 1; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08001790 GemmMicrokernelTester()
1791 .mr(1)
1792 .nr(8)
1793 .kr(1)
1794 .sr(1)
1795 .m(m)
1796 .n(n)
1797 .k(k)
1798 .cm_stride(11)
1799 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08001800 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08001801 }
1802 }
1803 }
1804 }
1805
1806 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, qmin) {
1807 TEST_REQUIRES_ARM_NEON;
1808 GemmMicrokernelTester()
1809 .mr(1)
1810 .nr(8)
1811 .kr(1)
1812 .sr(1)
1813 .m(1)
1814 .n(8)
1815 .k(8)
1816 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08001817 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08001818 }
1819
1820 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, qmax) {
1821 TEST_REQUIRES_ARM_NEON;
1822 GemmMicrokernelTester()
1823 .mr(1)
1824 .nr(8)
1825 .kr(1)
1826 .sr(1)
1827 .m(1)
1828 .n(8)
1829 .k(8)
1830 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08001831 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08001832 }
1833
1834 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, strided_cm) {
1835 TEST_REQUIRES_ARM_NEON;
1836 GemmMicrokernelTester()
1837 .mr(1)
1838 .nr(8)
1839 .kr(1)
1840 .sr(1)
1841 .m(1)
1842 .n(8)
1843 .k(8)
1844 .cm_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08001845 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08001846 }
1847#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
1848
1849
1850#if XNN_ARCH_ARM || XNN_ARCH_ARM64
Frank Barchardf6237402022-01-05 00:26:09 -08001851 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEON_MLAL_LANE, k_eq_8) {
1852 TEST_REQUIRES_ARM_NEON;
1853 GemmMicrokernelTester()
1854 .mr(3)
1855 .nr(8)
1856 .kr(1)
1857 .sr(1)
1858 .m(3)
1859 .n(8)
1860 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -08001861 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08001862 }
1863
1864 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEON_MLAL_LANE, strided_cn) {
1865 TEST_REQUIRES_ARM_NEON;
1866 GemmMicrokernelTester()
1867 .mr(3)
1868 .nr(8)
1869 .kr(1)
1870 .sr(1)
1871 .m(3)
1872 .n(8)
1873 .k(8)
1874 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08001875 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08001876 }
1877
1878 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEON_MLAL_LANE, k_eq_8_strided_a) {
1879 TEST_REQUIRES_ARM_NEON;
1880 GemmMicrokernelTester()
1881 .mr(3)
1882 .nr(8)
1883 .kr(1)
1884 .sr(1)
1885 .m(3)
1886 .n(8)
1887 .k(8)
1888 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08001889 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08001890 }
1891
1892 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEON_MLAL_LANE, k_eq_8_subtile) {
1893 TEST_REQUIRES_ARM_NEON;
Zhi An Ng83844ae2022-01-14 09:52:25 -08001894 for (uint32_t n = 1; n <= 8; n++) {
1895 for (uint32_t m = 1; m <= 3; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08001896 GemmMicrokernelTester()
1897 .mr(3)
1898 .nr(8)
1899 .kr(1)
1900 .sr(1)
1901 .m(m)
1902 .n(n)
1903 .k(8)
1904 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08001905 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08001906 }
1907 }
1908 }
1909
1910 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEON_MLAL_LANE, k_eq_8_subtile_m) {
1911 TEST_REQUIRES_ARM_NEON;
1912 for (uint32_t m = 1; m <= 3; m++) {
1913 GemmMicrokernelTester()
1914 .mr(3)
1915 .nr(8)
1916 .kr(1)
1917 .sr(1)
1918 .m(m)
1919 .n(8)
1920 .k(8)
1921 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08001922 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08001923 }
1924 }
1925
1926 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEON_MLAL_LANE, k_eq_8_subtile_n) {
1927 TEST_REQUIRES_ARM_NEON;
1928 for (uint32_t n = 1; n <= 8; n++) {
1929 GemmMicrokernelTester()
1930 .mr(3)
1931 .nr(8)
1932 .kr(1)
1933 .sr(1)
1934 .m(3)
1935 .n(n)
1936 .k(8)
1937 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08001938 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08001939 }
1940 }
1941
1942 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEON_MLAL_LANE, k_lt_8) {
1943 TEST_REQUIRES_ARM_NEON;
1944 for (size_t k = 1; k < 8; k++) {
1945 GemmMicrokernelTester()
1946 .mr(3)
1947 .nr(8)
1948 .kr(1)
1949 .sr(1)
1950 .m(3)
1951 .n(8)
1952 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08001953 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08001954 }
1955 }
1956
1957 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEON_MLAL_LANE, k_lt_8_strided_a) {
1958 TEST_REQUIRES_ARM_NEON;
1959 for (size_t k = 1; k < 8; k++) {
1960 GemmMicrokernelTester()
1961 .mr(3)
1962 .nr(8)
1963 .kr(1)
1964 .sr(1)
1965 .m(3)
1966 .n(8)
1967 .k(k)
1968 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08001969 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08001970 }
1971 }
1972
1973 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEON_MLAL_LANE, k_lt_8_subtile) {
1974 TEST_REQUIRES_ARM_NEON;
1975 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08001976 for (uint32_t n = 1; n <= 8; n++) {
1977 for (uint32_t m = 1; m <= 3; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08001978 GemmMicrokernelTester()
1979 .mr(3)
1980 .nr(8)
1981 .kr(1)
1982 .sr(1)
1983 .m(m)
1984 .n(n)
1985 .k(k)
1986 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08001987 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08001988 }
1989 }
1990 }
1991 }
1992
1993 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEON_MLAL_LANE, k_gt_8) {
1994 TEST_REQUIRES_ARM_NEON;
1995 for (size_t k = 9; k < 16; k++) {
1996 GemmMicrokernelTester()
1997 .mr(3)
1998 .nr(8)
1999 .kr(1)
2000 .sr(1)
2001 .m(3)
2002 .n(8)
2003 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08002004 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08002005 }
2006 }
2007
2008 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEON_MLAL_LANE, k_gt_8_strided_a) {
2009 TEST_REQUIRES_ARM_NEON;
2010 for (size_t k = 9; k < 16; k++) {
2011 GemmMicrokernelTester()
2012 .mr(3)
2013 .nr(8)
2014 .kr(1)
2015 .sr(1)
2016 .m(3)
2017 .n(8)
2018 .k(k)
2019 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08002020 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08002021 }
2022 }
2023
2024 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEON_MLAL_LANE, k_gt_8_subtile) {
2025 TEST_REQUIRES_ARM_NEON;
2026 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08002027 for (uint32_t n = 1; n <= 8; n++) {
2028 for (uint32_t m = 1; m <= 3; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08002029 GemmMicrokernelTester()
2030 .mr(3)
2031 .nr(8)
2032 .kr(1)
2033 .sr(1)
2034 .m(m)
2035 .n(n)
2036 .k(k)
2037 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08002038 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08002039 }
2040 }
2041 }
2042 }
2043
2044 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEON_MLAL_LANE, k_div_8) {
2045 TEST_REQUIRES_ARM_NEON;
2046 for (size_t k = 16; k <= 80; k += 8) {
2047 GemmMicrokernelTester()
2048 .mr(3)
2049 .nr(8)
2050 .kr(1)
2051 .sr(1)
2052 .m(3)
2053 .n(8)
2054 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08002055 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08002056 }
2057 }
2058
2059 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEON_MLAL_LANE, k_div_8_strided_a) {
2060 TEST_REQUIRES_ARM_NEON;
2061 for (size_t k = 16; k <= 80; k += 8) {
2062 GemmMicrokernelTester()
2063 .mr(3)
2064 .nr(8)
2065 .kr(1)
2066 .sr(1)
2067 .m(3)
2068 .n(8)
2069 .k(k)
2070 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -08002071 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08002072 }
2073 }
2074
2075 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEON_MLAL_LANE, k_div_8_subtile) {
2076 TEST_REQUIRES_ARM_NEON;
2077 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08002078 for (uint32_t n = 1; n <= 8; n++) {
2079 for (uint32_t m = 1; m <= 3; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08002080 GemmMicrokernelTester()
2081 .mr(3)
2082 .nr(8)
2083 .kr(1)
2084 .sr(1)
2085 .m(m)
2086 .n(n)
2087 .k(k)
2088 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08002089 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08002090 }
2091 }
2092 }
2093 }
2094
2095 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEON_MLAL_LANE, n_gt_8) {
2096 TEST_REQUIRES_ARM_NEON;
2097 for (uint32_t n = 9; n < 16; n++) {
2098 for (size_t k = 1; k <= 40; k += 9) {
2099 GemmMicrokernelTester()
2100 .mr(3)
2101 .nr(8)
2102 .kr(1)
2103 .sr(1)
2104 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08002105 .n(n)
Frank Barchardf6237402022-01-05 00:26:09 -08002106 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08002107 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08002108 }
2109 }
2110 }
2111
2112 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEON_MLAL_LANE, n_gt_8_strided_cn) {
2113 TEST_REQUIRES_ARM_NEON;
2114 for (uint32_t n = 9; n < 16; n++) {
2115 for (size_t k = 1; k <= 40; k += 9) {
2116 GemmMicrokernelTester()
2117 .mr(3)
2118 .nr(8)
2119 .kr(1)
2120 .sr(1)
2121 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08002122 .n(n)
Frank Barchardf6237402022-01-05 00:26:09 -08002123 .k(k)
2124 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08002125 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08002126 }
2127 }
2128 }
2129
2130 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEON_MLAL_LANE, n_gt_8_strided_a) {
2131 TEST_REQUIRES_ARM_NEON;
2132 for (uint32_t n = 9; n < 16; n++) {
2133 for (size_t k = 1; k <= 40; k += 9) {
2134 GemmMicrokernelTester()
2135 .mr(3)
2136 .nr(8)
2137 .kr(1)
2138 .sr(1)
2139 .m(3)
2140 .n(n)
2141 .k(k)
2142 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -08002143 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08002144 }
2145 }
2146 }
2147
2148 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEON_MLAL_LANE, n_gt_8_subtile) {
2149 TEST_REQUIRES_ARM_NEON;
2150 for (uint32_t n = 9; n < 16; n++) {
2151 for (size_t k = 1; k <= 40; k += 9) {
2152 for (uint32_t m = 1; m <= 3; m++) {
2153 GemmMicrokernelTester()
2154 .mr(3)
2155 .nr(8)
2156 .kr(1)
2157 .sr(1)
2158 .m(m)
2159 .n(n)
2160 .k(k)
2161 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08002162 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08002163 }
2164 }
2165 }
2166 }
2167
2168 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEON_MLAL_LANE, n_div_8) {
2169 TEST_REQUIRES_ARM_NEON;
2170 for (uint32_t n = 16; n <= 24; n += 8) {
2171 for (size_t k = 1; k <= 40; k += 9) {
2172 GemmMicrokernelTester()
2173 .mr(3)
2174 .nr(8)
2175 .kr(1)
2176 .sr(1)
2177 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08002178 .n(n)
Frank Barchardf6237402022-01-05 00:26:09 -08002179 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08002180 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08002181 }
2182 }
2183 }
2184
2185 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEON_MLAL_LANE, n_div_8_strided_cn) {
2186 TEST_REQUIRES_ARM_NEON;
2187 for (uint32_t n = 16; n <= 24; n += 8) {
2188 for (size_t k = 1; k <= 40; k += 9) {
2189 GemmMicrokernelTester()
2190 .mr(3)
2191 .nr(8)
2192 .kr(1)
2193 .sr(1)
2194 .m(3)
2195 .n(n)
2196 .k(k)
2197 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08002198 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08002199 }
2200 }
2201 }
2202
2203 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEON_MLAL_LANE, n_div_8_strided_a) {
2204 TEST_REQUIRES_ARM_NEON;
2205 for (uint32_t n = 16; n <= 24; n += 8) {
2206 for (size_t k = 1; k <= 40; k += 9) {
2207 GemmMicrokernelTester()
2208 .mr(3)
2209 .nr(8)
2210 .kr(1)
2211 .sr(1)
2212 .m(3)
2213 .n(n)
2214 .k(k)
2215 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -08002216 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08002217 }
2218 }
2219 }
2220
2221 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEON_MLAL_LANE, n_div_8_subtile) {
2222 TEST_REQUIRES_ARM_NEON;
2223 for (uint32_t n = 16; n <= 24; n += 8) {
2224 for (size_t k = 1; k <= 40; k += 9) {
2225 for (uint32_t m = 1; m <= 3; m++) {
2226 GemmMicrokernelTester()
2227 .mr(3)
2228 .nr(8)
2229 .kr(1)
2230 .sr(1)
2231 .m(m)
2232 .n(n)
2233 .k(k)
2234 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08002235 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08002236 }
2237 }
2238 }
2239 }
2240
2241 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEON_MLAL_LANE, strided_cm_subtile) {
2242 TEST_REQUIRES_ARM_NEON;
2243 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08002244 for (uint32_t n = 1; n <= 8; n++) {
2245 for (uint32_t m = 1; m <= 3; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08002246 GemmMicrokernelTester()
2247 .mr(3)
2248 .nr(8)
2249 .kr(1)
2250 .sr(1)
2251 .m(m)
2252 .n(n)
2253 .k(k)
2254 .cm_stride(11)
2255 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08002256 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08002257 }
2258 }
2259 }
2260 }
2261
2262 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEON_MLAL_LANE, qmin) {
2263 TEST_REQUIRES_ARM_NEON;
2264 GemmMicrokernelTester()
2265 .mr(3)
2266 .nr(8)
2267 .kr(1)
2268 .sr(1)
2269 .m(3)
2270 .n(8)
2271 .k(8)
2272 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08002273 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08002274 }
2275
2276 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEON_MLAL_LANE, qmax) {
2277 TEST_REQUIRES_ARM_NEON;
2278 GemmMicrokernelTester()
2279 .mr(3)
2280 .nr(8)
2281 .kr(1)
2282 .sr(1)
2283 .m(3)
2284 .n(8)
2285 .k(8)
2286 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08002287 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08002288 }
2289
2290 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEON_MLAL_LANE, strided_cm) {
2291 TEST_REQUIRES_ARM_NEON;
2292 GemmMicrokernelTester()
2293 .mr(3)
2294 .nr(8)
2295 .kr(1)
2296 .sr(1)
2297 .m(3)
2298 .n(8)
2299 .k(8)
2300 .cm_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08002301 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08002302 }
2303#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
2304
2305
2306#if XNN_ARCH_ARM || XNN_ARCH_ARM64
Frank Barchardf6237402022-01-05 00:26:09 -08002307 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEON_MLAL_LANE, k_eq_8) {
2308 TEST_REQUIRES_ARM_NEON;
2309 GemmMicrokernelTester()
2310 .mr(3)
2311 .nr(16)
2312 .kr(1)
2313 .sr(1)
2314 .m(3)
2315 .n(16)
2316 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -08002317 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08002318 }
2319
2320 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEON_MLAL_LANE, strided_cn) {
2321 TEST_REQUIRES_ARM_NEON;
2322 GemmMicrokernelTester()
2323 .mr(3)
2324 .nr(16)
2325 .kr(1)
2326 .sr(1)
2327 .m(3)
2328 .n(16)
2329 .k(8)
2330 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08002331 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08002332 }
2333
2334 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEON_MLAL_LANE, k_eq_8_strided_a) {
2335 TEST_REQUIRES_ARM_NEON;
2336 GemmMicrokernelTester()
2337 .mr(3)
2338 .nr(16)
2339 .kr(1)
2340 .sr(1)
2341 .m(3)
2342 .n(16)
2343 .k(8)
2344 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08002345 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08002346 }
2347
2348 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEON_MLAL_LANE, k_eq_8_subtile) {
2349 TEST_REQUIRES_ARM_NEON;
Zhi An Ng83844ae2022-01-14 09:52:25 -08002350 for (uint32_t n = 1; n <= 16; n++) {
2351 for (uint32_t m = 1; m <= 3; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08002352 GemmMicrokernelTester()
2353 .mr(3)
2354 .nr(16)
2355 .kr(1)
2356 .sr(1)
2357 .m(m)
2358 .n(n)
2359 .k(8)
2360 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08002361 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08002362 }
2363 }
2364 }
2365
2366 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEON_MLAL_LANE, k_eq_8_subtile_m) {
2367 TEST_REQUIRES_ARM_NEON;
2368 for (uint32_t m = 1; m <= 3; m++) {
2369 GemmMicrokernelTester()
2370 .mr(3)
2371 .nr(16)
2372 .kr(1)
2373 .sr(1)
2374 .m(m)
2375 .n(16)
2376 .k(8)
2377 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08002378 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08002379 }
2380 }
2381
2382 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEON_MLAL_LANE, k_eq_8_subtile_n) {
2383 TEST_REQUIRES_ARM_NEON;
2384 for (uint32_t n = 1; n <= 16; n++) {
2385 GemmMicrokernelTester()
2386 .mr(3)
2387 .nr(16)
2388 .kr(1)
2389 .sr(1)
2390 .m(3)
2391 .n(n)
2392 .k(8)
2393 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08002394 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08002395 }
2396 }
2397
2398 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEON_MLAL_LANE, k_lt_8) {
2399 TEST_REQUIRES_ARM_NEON;
2400 for (size_t k = 1; k < 8; k++) {
2401 GemmMicrokernelTester()
2402 .mr(3)
2403 .nr(16)
2404 .kr(1)
2405 .sr(1)
2406 .m(3)
2407 .n(16)
2408 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08002409 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08002410 }
2411 }
2412
2413 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEON_MLAL_LANE, k_lt_8_strided_a) {
2414 TEST_REQUIRES_ARM_NEON;
2415 for (size_t k = 1; k < 8; k++) {
2416 GemmMicrokernelTester()
2417 .mr(3)
2418 .nr(16)
2419 .kr(1)
2420 .sr(1)
2421 .m(3)
2422 .n(16)
2423 .k(k)
2424 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08002425 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08002426 }
2427 }
2428
2429 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEON_MLAL_LANE, k_lt_8_subtile) {
2430 TEST_REQUIRES_ARM_NEON;
2431 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08002432 for (uint32_t n = 1; n <= 16; n++) {
2433 for (uint32_t m = 1; m <= 3; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08002434 GemmMicrokernelTester()
2435 .mr(3)
2436 .nr(16)
2437 .kr(1)
2438 .sr(1)
2439 .m(m)
2440 .n(n)
2441 .k(k)
2442 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08002443 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08002444 }
2445 }
2446 }
2447 }
2448
2449 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEON_MLAL_LANE, k_gt_8) {
2450 TEST_REQUIRES_ARM_NEON;
2451 for (size_t k = 9; k < 16; k++) {
2452 GemmMicrokernelTester()
2453 .mr(3)
2454 .nr(16)
2455 .kr(1)
2456 .sr(1)
2457 .m(3)
2458 .n(16)
2459 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08002460 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08002461 }
2462 }
2463
2464 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEON_MLAL_LANE, k_gt_8_strided_a) {
2465 TEST_REQUIRES_ARM_NEON;
2466 for (size_t k = 9; k < 16; k++) {
2467 GemmMicrokernelTester()
2468 .mr(3)
2469 .nr(16)
2470 .kr(1)
2471 .sr(1)
2472 .m(3)
2473 .n(16)
2474 .k(k)
2475 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08002476 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08002477 }
2478 }
2479
2480 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEON_MLAL_LANE, k_gt_8_subtile) {
2481 TEST_REQUIRES_ARM_NEON;
2482 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08002483 for (uint32_t n = 1; n <= 16; n++) {
2484 for (uint32_t m = 1; m <= 3; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08002485 GemmMicrokernelTester()
2486 .mr(3)
2487 .nr(16)
2488 .kr(1)
2489 .sr(1)
2490 .m(m)
2491 .n(n)
2492 .k(k)
2493 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08002494 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08002495 }
2496 }
2497 }
2498 }
2499
2500 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEON_MLAL_LANE, k_div_8) {
2501 TEST_REQUIRES_ARM_NEON;
2502 for (size_t k = 16; k <= 80; k += 8) {
2503 GemmMicrokernelTester()
2504 .mr(3)
2505 .nr(16)
2506 .kr(1)
2507 .sr(1)
2508 .m(3)
2509 .n(16)
2510 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08002511 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08002512 }
2513 }
2514
2515 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEON_MLAL_LANE, k_div_8_strided_a) {
2516 TEST_REQUIRES_ARM_NEON;
2517 for (size_t k = 16; k <= 80; k += 8) {
2518 GemmMicrokernelTester()
2519 .mr(3)
2520 .nr(16)
2521 .kr(1)
2522 .sr(1)
2523 .m(3)
2524 .n(16)
2525 .k(k)
2526 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -08002527 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08002528 }
2529 }
2530
2531 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEON_MLAL_LANE, k_div_8_subtile) {
2532 TEST_REQUIRES_ARM_NEON;
2533 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08002534 for (uint32_t n = 1; n <= 16; n++) {
2535 for (uint32_t m = 1; m <= 3; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08002536 GemmMicrokernelTester()
2537 .mr(3)
2538 .nr(16)
2539 .kr(1)
2540 .sr(1)
2541 .m(m)
2542 .n(n)
2543 .k(k)
2544 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08002545 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08002546 }
2547 }
2548 }
2549 }
2550
2551 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEON_MLAL_LANE, n_gt_16) {
2552 TEST_REQUIRES_ARM_NEON;
2553 for (uint32_t n = 17; n < 32; n++) {
2554 for (size_t k = 1; k <= 40; k += 9) {
2555 GemmMicrokernelTester()
2556 .mr(3)
2557 .nr(16)
2558 .kr(1)
2559 .sr(1)
2560 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08002561 .n(n)
Frank Barchardf6237402022-01-05 00:26:09 -08002562 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08002563 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08002564 }
2565 }
2566 }
2567
2568 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEON_MLAL_LANE, n_gt_16_strided_cn) {
2569 TEST_REQUIRES_ARM_NEON;
2570 for (uint32_t n = 17; n < 32; n++) {
2571 for (size_t k = 1; k <= 40; k += 9) {
2572 GemmMicrokernelTester()
2573 .mr(3)
2574 .nr(16)
2575 .kr(1)
2576 .sr(1)
2577 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08002578 .n(n)
Frank Barchardf6237402022-01-05 00:26:09 -08002579 .k(k)
2580 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08002581 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08002582 }
2583 }
2584 }
2585
2586 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEON_MLAL_LANE, n_gt_16_strided_a) {
2587 TEST_REQUIRES_ARM_NEON;
2588 for (uint32_t n = 17; n < 32; n++) {
2589 for (size_t k = 1; k <= 40; k += 9) {
2590 GemmMicrokernelTester()
2591 .mr(3)
2592 .nr(16)
2593 .kr(1)
2594 .sr(1)
2595 .m(3)
2596 .n(n)
2597 .k(k)
2598 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -08002599 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08002600 }
2601 }
2602 }
2603
2604 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEON_MLAL_LANE, n_gt_16_subtile) {
2605 TEST_REQUIRES_ARM_NEON;
2606 for (uint32_t n = 17; n < 32; n++) {
2607 for (size_t k = 1; k <= 40; k += 9) {
2608 for (uint32_t m = 1; m <= 3; m++) {
2609 GemmMicrokernelTester()
2610 .mr(3)
2611 .nr(16)
2612 .kr(1)
2613 .sr(1)
2614 .m(m)
2615 .n(n)
2616 .k(k)
2617 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08002618 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08002619 }
2620 }
2621 }
2622 }
2623
2624 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEON_MLAL_LANE, n_div_16) {
2625 TEST_REQUIRES_ARM_NEON;
2626 for (uint32_t n = 32; n <= 48; n += 16) {
2627 for (size_t k = 1; k <= 40; k += 9) {
2628 GemmMicrokernelTester()
2629 .mr(3)
2630 .nr(16)
2631 .kr(1)
2632 .sr(1)
2633 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08002634 .n(n)
Frank Barchardf6237402022-01-05 00:26:09 -08002635 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08002636 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08002637 }
2638 }
2639 }
2640
2641 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEON_MLAL_LANE, n_div_16_strided_cn) {
2642 TEST_REQUIRES_ARM_NEON;
2643 for (uint32_t n = 32; n <= 48; n += 16) {
2644 for (size_t k = 1; k <= 40; k += 9) {
2645 GemmMicrokernelTester()
2646 .mr(3)
2647 .nr(16)
2648 .kr(1)
2649 .sr(1)
2650 .m(3)
2651 .n(n)
2652 .k(k)
2653 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08002654 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08002655 }
2656 }
2657 }
2658
2659 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEON_MLAL_LANE, n_div_16_strided_a) {
2660 TEST_REQUIRES_ARM_NEON;
2661 for (uint32_t n = 32; n <= 48; n += 16) {
2662 for (size_t k = 1; k <= 40; k += 9) {
2663 GemmMicrokernelTester()
2664 .mr(3)
2665 .nr(16)
2666 .kr(1)
2667 .sr(1)
2668 .m(3)
2669 .n(n)
2670 .k(k)
2671 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -08002672 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08002673 }
2674 }
2675 }
2676
2677 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEON_MLAL_LANE, n_div_16_subtile) {
2678 TEST_REQUIRES_ARM_NEON;
2679 for (uint32_t n = 32; n <= 48; n += 16) {
2680 for (size_t k = 1; k <= 40; k += 9) {
2681 for (uint32_t m = 1; m <= 3; m++) {
2682 GemmMicrokernelTester()
2683 .mr(3)
2684 .nr(16)
2685 .kr(1)
2686 .sr(1)
2687 .m(m)
2688 .n(n)
2689 .k(k)
2690 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08002691 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08002692 }
2693 }
2694 }
2695 }
2696
2697 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEON_MLAL_LANE, strided_cm_subtile) {
2698 TEST_REQUIRES_ARM_NEON;
2699 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08002700 for (uint32_t n = 1; n <= 16; n++) {
2701 for (uint32_t m = 1; m <= 3; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08002702 GemmMicrokernelTester()
2703 .mr(3)
2704 .nr(16)
2705 .kr(1)
2706 .sr(1)
2707 .m(m)
2708 .n(n)
2709 .k(k)
2710 .cm_stride(19)
2711 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08002712 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08002713 }
2714 }
2715 }
2716 }
2717
2718 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEON_MLAL_LANE, qmin) {
2719 TEST_REQUIRES_ARM_NEON;
2720 GemmMicrokernelTester()
2721 .mr(3)
2722 .nr(16)
2723 .kr(1)
2724 .sr(1)
2725 .m(3)
2726 .n(16)
2727 .k(8)
2728 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08002729 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08002730 }
2731
2732 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEON_MLAL_LANE, qmax) {
2733 TEST_REQUIRES_ARM_NEON;
2734 GemmMicrokernelTester()
2735 .mr(3)
2736 .nr(16)
2737 .kr(1)
2738 .sr(1)
2739 .m(3)
2740 .n(16)
2741 .k(8)
2742 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08002743 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08002744 }
2745
2746 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEON_MLAL_LANE, strided_cm) {
2747 TEST_REQUIRES_ARM_NEON;
2748 GemmMicrokernelTester()
2749 .mr(3)
2750 .nr(16)
2751 .kr(1)
2752 .sr(1)
2753 .m(3)
2754 .n(16)
2755 .k(8)
2756 .cm_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08002757 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08002758 }
2759#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
2760
2761
2762#if XNN_ARCH_ARM || XNN_ARCH_ARM64
Frank Barchardf6237402022-01-05 00:26:09 -08002763 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEON_MLAL_LANE_PRFM, k_eq_8) {
2764 TEST_REQUIRES_ARM_NEON;
2765 GemmMicrokernelTester()
2766 .mr(2)
2767 .nr(16)
2768 .kr(1)
2769 .sr(1)
2770 .m(2)
2771 .n(16)
2772 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -08002773 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08002774 }
2775
2776 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEON_MLAL_LANE_PRFM, strided_cn) {
2777 TEST_REQUIRES_ARM_NEON;
2778 GemmMicrokernelTester()
2779 .mr(2)
2780 .nr(16)
2781 .kr(1)
2782 .sr(1)
2783 .m(2)
2784 .n(16)
2785 .k(8)
2786 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08002787 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08002788 }
2789
2790 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEON_MLAL_LANE_PRFM, k_eq_8_strided_a) {
2791 TEST_REQUIRES_ARM_NEON;
2792 GemmMicrokernelTester()
2793 .mr(2)
2794 .nr(16)
2795 .kr(1)
2796 .sr(1)
2797 .m(2)
2798 .n(16)
2799 .k(8)
2800 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08002801 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08002802 }
2803
2804 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEON_MLAL_LANE_PRFM, k_eq_8_subtile) {
2805 TEST_REQUIRES_ARM_NEON;
Zhi An Ng83844ae2022-01-14 09:52:25 -08002806 for (uint32_t n = 1; n <= 16; n++) {
2807 for (uint32_t m = 1; m <= 2; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08002808 GemmMicrokernelTester()
2809 .mr(2)
2810 .nr(16)
2811 .kr(1)
2812 .sr(1)
2813 .m(m)
2814 .n(n)
2815 .k(8)
2816 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08002817 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08002818 }
2819 }
2820 }
2821
2822 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEON_MLAL_LANE_PRFM, k_eq_8_subtile_m) {
2823 TEST_REQUIRES_ARM_NEON;
2824 for (uint32_t m = 1; m <= 2; m++) {
2825 GemmMicrokernelTester()
2826 .mr(2)
2827 .nr(16)
2828 .kr(1)
2829 .sr(1)
2830 .m(m)
2831 .n(16)
2832 .k(8)
2833 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08002834 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08002835 }
2836 }
2837
2838 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEON_MLAL_LANE_PRFM, k_eq_8_subtile_n) {
2839 TEST_REQUIRES_ARM_NEON;
2840 for (uint32_t n = 1; n <= 16; n++) {
2841 GemmMicrokernelTester()
2842 .mr(2)
2843 .nr(16)
2844 .kr(1)
2845 .sr(1)
2846 .m(2)
2847 .n(n)
2848 .k(8)
2849 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08002850 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08002851 }
2852 }
2853
2854 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEON_MLAL_LANE_PRFM, k_lt_8) {
2855 TEST_REQUIRES_ARM_NEON;
2856 for (size_t k = 1; k < 8; k++) {
2857 GemmMicrokernelTester()
2858 .mr(2)
2859 .nr(16)
2860 .kr(1)
2861 .sr(1)
2862 .m(2)
2863 .n(16)
2864 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08002865 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08002866 }
2867 }
2868
2869 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEON_MLAL_LANE_PRFM, k_lt_8_strided_a) {
2870 TEST_REQUIRES_ARM_NEON;
2871 for (size_t k = 1; k < 8; k++) {
2872 GemmMicrokernelTester()
2873 .mr(2)
2874 .nr(16)
2875 .kr(1)
2876 .sr(1)
2877 .m(2)
2878 .n(16)
2879 .k(k)
2880 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08002881 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08002882 }
2883 }
2884
2885 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEON_MLAL_LANE_PRFM, k_lt_8_subtile) {
2886 TEST_REQUIRES_ARM_NEON;
2887 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08002888 for (uint32_t n = 1; n <= 16; n++) {
2889 for (uint32_t m = 1; m <= 2; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08002890 GemmMicrokernelTester()
2891 .mr(2)
2892 .nr(16)
2893 .kr(1)
2894 .sr(1)
2895 .m(m)
2896 .n(n)
2897 .k(k)
2898 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08002899 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08002900 }
2901 }
2902 }
2903 }
2904
2905 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEON_MLAL_LANE_PRFM, k_gt_8) {
2906 TEST_REQUIRES_ARM_NEON;
2907 for (size_t k = 9; k < 16; k++) {
2908 GemmMicrokernelTester()
2909 .mr(2)
2910 .nr(16)
2911 .kr(1)
2912 .sr(1)
2913 .m(2)
2914 .n(16)
2915 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08002916 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08002917 }
2918 }
2919
2920 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEON_MLAL_LANE_PRFM, k_gt_8_strided_a) {
2921 TEST_REQUIRES_ARM_NEON;
2922 for (size_t k = 9; k < 16; k++) {
2923 GemmMicrokernelTester()
2924 .mr(2)
2925 .nr(16)
2926 .kr(1)
2927 .sr(1)
2928 .m(2)
2929 .n(16)
2930 .k(k)
2931 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08002932 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08002933 }
2934 }
2935
2936 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEON_MLAL_LANE_PRFM, k_gt_8_subtile) {
2937 TEST_REQUIRES_ARM_NEON;
2938 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08002939 for (uint32_t n = 1; n <= 16; n++) {
2940 for (uint32_t m = 1; m <= 2; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08002941 GemmMicrokernelTester()
2942 .mr(2)
2943 .nr(16)
2944 .kr(1)
2945 .sr(1)
2946 .m(m)
2947 .n(n)
2948 .k(k)
2949 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08002950 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08002951 }
2952 }
2953 }
2954 }
2955
2956 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEON_MLAL_LANE_PRFM, k_div_8) {
2957 TEST_REQUIRES_ARM_NEON;
2958 for (size_t k = 16; k <= 80; k += 8) {
2959 GemmMicrokernelTester()
2960 .mr(2)
2961 .nr(16)
2962 .kr(1)
2963 .sr(1)
2964 .m(2)
2965 .n(16)
2966 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08002967 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08002968 }
2969 }
2970
2971 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEON_MLAL_LANE_PRFM, k_div_8_strided_a) {
2972 TEST_REQUIRES_ARM_NEON;
2973 for (size_t k = 16; k <= 80; k += 8) {
2974 GemmMicrokernelTester()
2975 .mr(2)
2976 .nr(16)
2977 .kr(1)
2978 .sr(1)
2979 .m(2)
2980 .n(16)
2981 .k(k)
2982 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -08002983 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08002984 }
2985 }
2986
2987 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEON_MLAL_LANE_PRFM, k_div_8_subtile) {
2988 TEST_REQUIRES_ARM_NEON;
2989 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08002990 for (uint32_t n = 1; n <= 16; n++) {
2991 for (uint32_t m = 1; m <= 2; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08002992 GemmMicrokernelTester()
2993 .mr(2)
2994 .nr(16)
2995 .kr(1)
2996 .sr(1)
2997 .m(m)
2998 .n(n)
2999 .k(k)
3000 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08003001 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08003002 }
3003 }
3004 }
3005 }
3006
3007 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEON_MLAL_LANE_PRFM, n_gt_16) {
3008 TEST_REQUIRES_ARM_NEON;
3009 for (uint32_t n = 17; n < 32; n++) {
3010 for (size_t k = 1; k <= 40; k += 9) {
3011 GemmMicrokernelTester()
3012 .mr(2)
3013 .nr(16)
3014 .kr(1)
3015 .sr(1)
3016 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08003017 .n(n)
Frank Barchardf6237402022-01-05 00:26:09 -08003018 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08003019 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08003020 }
3021 }
3022 }
3023
3024 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEON_MLAL_LANE_PRFM, n_gt_16_strided_cn) {
3025 TEST_REQUIRES_ARM_NEON;
3026 for (uint32_t n = 17; n < 32; n++) {
3027 for (size_t k = 1; k <= 40; k += 9) {
3028 GemmMicrokernelTester()
3029 .mr(2)
3030 .nr(16)
3031 .kr(1)
3032 .sr(1)
3033 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08003034 .n(n)
Frank Barchardf6237402022-01-05 00:26:09 -08003035 .k(k)
3036 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08003037 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08003038 }
3039 }
3040 }
3041
3042 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEON_MLAL_LANE_PRFM, n_gt_16_strided_a) {
3043 TEST_REQUIRES_ARM_NEON;
3044 for (uint32_t n = 17; n < 32; n++) {
3045 for (size_t k = 1; k <= 40; k += 9) {
3046 GemmMicrokernelTester()
3047 .mr(2)
3048 .nr(16)
3049 .kr(1)
3050 .sr(1)
3051 .m(2)
3052 .n(n)
3053 .k(k)
3054 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -08003055 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08003056 }
3057 }
3058 }
3059
3060 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEON_MLAL_LANE_PRFM, n_gt_16_subtile) {
3061 TEST_REQUIRES_ARM_NEON;
3062 for (uint32_t n = 17; n < 32; n++) {
3063 for (size_t k = 1; k <= 40; k += 9) {
3064 for (uint32_t m = 1; m <= 2; m++) {
3065 GemmMicrokernelTester()
3066 .mr(2)
3067 .nr(16)
3068 .kr(1)
3069 .sr(1)
3070 .m(m)
3071 .n(n)
3072 .k(k)
3073 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08003074 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08003075 }
3076 }
3077 }
3078 }
3079
3080 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEON_MLAL_LANE_PRFM, n_div_16) {
3081 TEST_REQUIRES_ARM_NEON;
3082 for (uint32_t n = 32; n <= 48; n += 16) {
3083 for (size_t k = 1; k <= 40; k += 9) {
3084 GemmMicrokernelTester()
3085 .mr(2)
3086 .nr(16)
3087 .kr(1)
3088 .sr(1)
3089 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08003090 .n(n)
Frank Barchardf6237402022-01-05 00:26:09 -08003091 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08003092 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08003093 }
3094 }
3095 }
3096
3097 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEON_MLAL_LANE_PRFM, n_div_16_strided_cn) {
3098 TEST_REQUIRES_ARM_NEON;
3099 for (uint32_t n = 32; n <= 48; n += 16) {
3100 for (size_t k = 1; k <= 40; k += 9) {
3101 GemmMicrokernelTester()
3102 .mr(2)
3103 .nr(16)
3104 .kr(1)
3105 .sr(1)
3106 .m(2)
3107 .n(n)
3108 .k(k)
3109 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08003110 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08003111 }
3112 }
3113 }
3114
3115 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEON_MLAL_LANE_PRFM, n_div_16_strided_a) {
3116 TEST_REQUIRES_ARM_NEON;
3117 for (uint32_t n = 32; n <= 48; n += 16) {
3118 for (size_t k = 1; k <= 40; k += 9) {
3119 GemmMicrokernelTester()
3120 .mr(2)
3121 .nr(16)
3122 .kr(1)
3123 .sr(1)
3124 .m(2)
3125 .n(n)
3126 .k(k)
3127 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -08003128 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08003129 }
3130 }
3131 }
3132
3133 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEON_MLAL_LANE_PRFM, n_div_16_subtile) {
3134 TEST_REQUIRES_ARM_NEON;
3135 for (uint32_t n = 32; n <= 48; n += 16) {
3136 for (size_t k = 1; k <= 40; k += 9) {
3137 for (uint32_t m = 1; m <= 2; m++) {
3138 GemmMicrokernelTester()
3139 .mr(2)
3140 .nr(16)
3141 .kr(1)
3142 .sr(1)
3143 .m(m)
3144 .n(n)
3145 .k(k)
3146 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08003147 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08003148 }
3149 }
3150 }
3151 }
3152
3153 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEON_MLAL_LANE_PRFM, strided_cm_subtile) {
3154 TEST_REQUIRES_ARM_NEON;
3155 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08003156 for (uint32_t n = 1; n <= 16; n++) {
3157 for (uint32_t m = 1; m <= 2; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08003158 GemmMicrokernelTester()
3159 .mr(2)
3160 .nr(16)
3161 .kr(1)
3162 .sr(1)
3163 .m(m)
3164 .n(n)
3165 .k(k)
3166 .cm_stride(19)
3167 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08003168 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08003169 }
3170 }
3171 }
3172 }
3173
3174 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEON_MLAL_LANE_PRFM, qmin) {
3175 TEST_REQUIRES_ARM_NEON;
3176 GemmMicrokernelTester()
3177 .mr(2)
3178 .nr(16)
3179 .kr(1)
3180 .sr(1)
3181 .m(2)
3182 .n(16)
3183 .k(8)
3184 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08003185 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08003186 }
3187
3188 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEON_MLAL_LANE_PRFM, qmax) {
3189 TEST_REQUIRES_ARM_NEON;
3190 GemmMicrokernelTester()
3191 .mr(2)
3192 .nr(16)
3193 .kr(1)
3194 .sr(1)
3195 .m(2)
3196 .n(16)
3197 .k(8)
3198 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08003199 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08003200 }
3201
3202 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEON_MLAL_LANE_PRFM, strided_cm) {
3203 TEST_REQUIRES_ARM_NEON;
3204 GemmMicrokernelTester()
3205 .mr(2)
3206 .nr(16)
3207 .kr(1)
3208 .sr(1)
3209 .m(2)
3210 .n(16)
3211 .k(8)
3212 .cm_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08003213 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08003214 }
3215#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
3216
3217
3218#if XNN_ARCH_ARM || XNN_ARCH_ARM64
Frank Barchardf6237402022-01-05 00:26:09 -08003219 TEST(QC8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, k_eq_8) {
3220 TEST_REQUIRES_ARM_NEON_V8;
3221 GemmMicrokernelTester()
3222 .mr(1)
3223 .nr(16)
3224 .kr(1)
3225 .sr(1)
3226 .m(1)
3227 .n(16)
3228 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -08003229 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08003230 }
3231
3232 TEST(QC8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, strided_cn) {
3233 TEST_REQUIRES_ARM_NEON_V8;
3234 GemmMicrokernelTester()
3235 .mr(1)
3236 .nr(16)
3237 .kr(1)
3238 .sr(1)
3239 .m(1)
3240 .n(16)
3241 .k(8)
3242 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08003243 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08003244 }
3245
3246 TEST(QC8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, k_eq_8_strided_a) {
3247 TEST_REQUIRES_ARM_NEON_V8;
3248 GemmMicrokernelTester()
3249 .mr(1)
3250 .nr(16)
3251 .kr(1)
3252 .sr(1)
3253 .m(1)
3254 .n(16)
3255 .k(8)
3256 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08003257 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08003258 }
3259
3260 TEST(QC8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, k_eq_8_subtile) {
3261 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng83844ae2022-01-14 09:52:25 -08003262 for (uint32_t n = 1; n <= 16; n++) {
3263 for (uint32_t m = 1; m <= 1; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08003264 GemmMicrokernelTester()
3265 .mr(1)
3266 .nr(16)
3267 .kr(1)
3268 .sr(1)
3269 .m(m)
3270 .n(n)
3271 .k(8)
3272 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08003273 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08003274 }
3275 }
3276 }
3277
3278 TEST(QC8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, k_eq_8_subtile_m) {
3279 TEST_REQUIRES_ARM_NEON_V8;
3280 for (uint32_t m = 1; m <= 1; m++) {
3281 GemmMicrokernelTester()
3282 .mr(1)
3283 .nr(16)
3284 .kr(1)
3285 .sr(1)
3286 .m(m)
3287 .n(16)
3288 .k(8)
3289 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08003290 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08003291 }
3292 }
3293
3294 TEST(QC8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, k_eq_8_subtile_n) {
3295 TEST_REQUIRES_ARM_NEON_V8;
3296 for (uint32_t n = 1; n <= 16; n++) {
3297 GemmMicrokernelTester()
3298 .mr(1)
3299 .nr(16)
3300 .kr(1)
3301 .sr(1)
3302 .m(1)
3303 .n(n)
3304 .k(8)
3305 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08003306 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08003307 }
3308 }
3309
3310 TEST(QC8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, k_lt_8) {
3311 TEST_REQUIRES_ARM_NEON_V8;
3312 for (size_t k = 1; k < 8; k++) {
3313 GemmMicrokernelTester()
3314 .mr(1)
3315 .nr(16)
3316 .kr(1)
3317 .sr(1)
3318 .m(1)
3319 .n(16)
3320 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08003321 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08003322 }
3323 }
3324
3325 TEST(QC8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, k_lt_8_strided_a) {
3326 TEST_REQUIRES_ARM_NEON_V8;
3327 for (size_t k = 1; k < 8; k++) {
3328 GemmMicrokernelTester()
3329 .mr(1)
3330 .nr(16)
3331 .kr(1)
3332 .sr(1)
3333 .m(1)
3334 .n(16)
3335 .k(k)
3336 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08003337 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08003338 }
3339 }
3340
3341 TEST(QC8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, k_lt_8_subtile) {
3342 TEST_REQUIRES_ARM_NEON_V8;
3343 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08003344 for (uint32_t n = 1; n <= 16; n++) {
3345 for (uint32_t m = 1; m <= 1; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08003346 GemmMicrokernelTester()
3347 .mr(1)
3348 .nr(16)
3349 .kr(1)
3350 .sr(1)
3351 .m(m)
3352 .n(n)
3353 .k(k)
3354 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08003355 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08003356 }
3357 }
3358 }
3359 }
3360
3361 TEST(QC8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, k_gt_8) {
3362 TEST_REQUIRES_ARM_NEON_V8;
3363 for (size_t k = 9; k < 16; k++) {
3364 GemmMicrokernelTester()
3365 .mr(1)
3366 .nr(16)
3367 .kr(1)
3368 .sr(1)
3369 .m(1)
3370 .n(16)
3371 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08003372 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08003373 }
3374 }
3375
3376 TEST(QC8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, k_gt_8_strided_a) {
3377 TEST_REQUIRES_ARM_NEON_V8;
3378 for (size_t k = 9; k < 16; k++) {
3379 GemmMicrokernelTester()
3380 .mr(1)
3381 .nr(16)
3382 .kr(1)
3383 .sr(1)
3384 .m(1)
3385 .n(16)
3386 .k(k)
3387 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08003388 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08003389 }
3390 }
3391
3392 TEST(QC8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, k_gt_8_subtile) {
3393 TEST_REQUIRES_ARM_NEON_V8;
3394 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08003395 for (uint32_t n = 1; n <= 16; n++) {
3396 for (uint32_t m = 1; m <= 1; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08003397 GemmMicrokernelTester()
3398 .mr(1)
3399 .nr(16)
3400 .kr(1)
3401 .sr(1)
3402 .m(m)
3403 .n(n)
3404 .k(k)
3405 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08003406 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08003407 }
3408 }
3409 }
3410 }
3411
3412 TEST(QC8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, k_div_8) {
3413 TEST_REQUIRES_ARM_NEON_V8;
3414 for (size_t k = 16; k <= 80; k += 8) {
3415 GemmMicrokernelTester()
3416 .mr(1)
3417 .nr(16)
3418 .kr(1)
3419 .sr(1)
3420 .m(1)
3421 .n(16)
3422 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08003423 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08003424 }
3425 }
3426
3427 TEST(QC8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, k_div_8_strided_a) {
3428 TEST_REQUIRES_ARM_NEON_V8;
3429 for (size_t k = 16; k <= 80; k += 8) {
3430 GemmMicrokernelTester()
3431 .mr(1)
3432 .nr(16)
3433 .kr(1)
3434 .sr(1)
3435 .m(1)
3436 .n(16)
3437 .k(k)
3438 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -08003439 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08003440 }
3441 }
3442
3443 TEST(QC8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, k_div_8_subtile) {
3444 TEST_REQUIRES_ARM_NEON_V8;
3445 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08003446 for (uint32_t n = 1; n <= 16; n++) {
3447 for (uint32_t m = 1; m <= 1; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08003448 GemmMicrokernelTester()
3449 .mr(1)
3450 .nr(16)
3451 .kr(1)
3452 .sr(1)
3453 .m(m)
3454 .n(n)
3455 .k(k)
3456 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08003457 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08003458 }
3459 }
3460 }
3461 }
3462
3463 TEST(QC8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, n_gt_16) {
3464 TEST_REQUIRES_ARM_NEON_V8;
3465 for (uint32_t n = 17; n < 32; n++) {
3466 for (size_t k = 1; k <= 40; k += 9) {
3467 GemmMicrokernelTester()
3468 .mr(1)
3469 .nr(16)
3470 .kr(1)
3471 .sr(1)
3472 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08003473 .n(n)
Frank Barchardf6237402022-01-05 00:26:09 -08003474 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08003475 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08003476 }
3477 }
3478 }
3479
3480 TEST(QC8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, n_gt_16_strided_cn) {
3481 TEST_REQUIRES_ARM_NEON_V8;
3482 for (uint32_t n = 17; n < 32; n++) {
3483 for (size_t k = 1; k <= 40; k += 9) {
3484 GemmMicrokernelTester()
3485 .mr(1)
3486 .nr(16)
3487 .kr(1)
3488 .sr(1)
3489 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08003490 .n(n)
Frank Barchardf6237402022-01-05 00:26:09 -08003491 .k(k)
3492 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08003493 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08003494 }
3495 }
3496 }
3497
3498 TEST(QC8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, n_gt_16_strided_a) {
3499 TEST_REQUIRES_ARM_NEON_V8;
3500 for (uint32_t n = 17; n < 32; n++) {
3501 for (size_t k = 1; k <= 40; k += 9) {
3502 GemmMicrokernelTester()
3503 .mr(1)
3504 .nr(16)
3505 .kr(1)
3506 .sr(1)
3507 .m(1)
3508 .n(n)
3509 .k(k)
3510 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -08003511 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08003512 }
3513 }
3514 }
3515
3516 TEST(QC8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, n_gt_16_subtile) {
3517 TEST_REQUIRES_ARM_NEON_V8;
3518 for (uint32_t n = 17; n < 32; n++) {
3519 for (size_t k = 1; k <= 40; k += 9) {
3520 for (uint32_t m = 1; m <= 1; m++) {
3521 GemmMicrokernelTester()
3522 .mr(1)
3523 .nr(16)
3524 .kr(1)
3525 .sr(1)
3526 .m(m)
3527 .n(n)
3528 .k(k)
3529 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08003530 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08003531 }
3532 }
3533 }
3534 }
3535
3536 TEST(QC8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, n_div_16) {
3537 TEST_REQUIRES_ARM_NEON_V8;
3538 for (uint32_t n = 32; n <= 48; n += 16) {
3539 for (size_t k = 1; k <= 40; k += 9) {
3540 GemmMicrokernelTester()
3541 .mr(1)
3542 .nr(16)
3543 .kr(1)
3544 .sr(1)
3545 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08003546 .n(n)
Frank Barchardf6237402022-01-05 00:26:09 -08003547 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08003548 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08003549 }
3550 }
3551 }
3552
3553 TEST(QC8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, n_div_16_strided_cn) {
3554 TEST_REQUIRES_ARM_NEON_V8;
3555 for (uint32_t n = 32; n <= 48; n += 16) {
3556 for (size_t k = 1; k <= 40; k += 9) {
3557 GemmMicrokernelTester()
3558 .mr(1)
3559 .nr(16)
3560 .kr(1)
3561 .sr(1)
3562 .m(1)
3563 .n(n)
3564 .k(k)
3565 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08003566 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08003567 }
3568 }
3569 }
3570
3571 TEST(QC8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, n_div_16_strided_a) {
3572 TEST_REQUIRES_ARM_NEON_V8;
3573 for (uint32_t n = 32; n <= 48; n += 16) {
3574 for (size_t k = 1; k <= 40; k += 9) {
3575 GemmMicrokernelTester()
3576 .mr(1)
3577 .nr(16)
3578 .kr(1)
3579 .sr(1)
3580 .m(1)
3581 .n(n)
3582 .k(k)
3583 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -08003584 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08003585 }
3586 }
3587 }
3588
3589 TEST(QC8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, n_div_16_subtile) {
3590 TEST_REQUIRES_ARM_NEON_V8;
3591 for (uint32_t n = 32; n <= 48; n += 16) {
3592 for (size_t k = 1; k <= 40; k += 9) {
3593 for (uint32_t m = 1; m <= 1; m++) {
3594 GemmMicrokernelTester()
3595 .mr(1)
3596 .nr(16)
3597 .kr(1)
3598 .sr(1)
3599 .m(m)
3600 .n(n)
3601 .k(k)
3602 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08003603 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08003604 }
3605 }
3606 }
3607 }
3608
3609 TEST(QC8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, strided_cm_subtile) {
3610 TEST_REQUIRES_ARM_NEON_V8;
3611 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08003612 for (uint32_t n = 1; n <= 16; n++) {
3613 for (uint32_t m = 1; m <= 1; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08003614 GemmMicrokernelTester()
3615 .mr(1)
3616 .nr(16)
3617 .kr(1)
3618 .sr(1)
3619 .m(m)
3620 .n(n)
3621 .k(k)
3622 .cm_stride(19)
3623 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08003624 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08003625 }
3626 }
3627 }
3628 }
3629
3630 TEST(QC8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, qmin) {
3631 TEST_REQUIRES_ARM_NEON_V8;
3632 GemmMicrokernelTester()
3633 .mr(1)
3634 .nr(16)
3635 .kr(1)
3636 .sr(1)
3637 .m(1)
3638 .n(16)
3639 .k(8)
3640 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08003641 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08003642 }
3643
3644 TEST(QC8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, qmax) {
3645 TEST_REQUIRES_ARM_NEON_V8;
3646 GemmMicrokernelTester()
3647 .mr(1)
3648 .nr(16)
3649 .kr(1)
3650 .sr(1)
3651 .m(1)
3652 .n(16)
3653 .k(8)
3654 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08003655 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08003656 }
3657
3658 TEST(QC8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, strided_cm) {
3659 TEST_REQUIRES_ARM_NEON_V8;
3660 GemmMicrokernelTester()
3661 .mr(1)
3662 .nr(16)
3663 .kr(1)
3664 .sr(1)
3665 .m(1)
3666 .n(16)
3667 .k(8)
3668 .cm_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08003669 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08003670 }
3671#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
3672
3673
3674#if XNN_ARCH_ARM || XNN_ARCH_ARM64
3675 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEONV8_MLAL_LANE, k_eq_8) {
3676 TEST_REQUIRES_ARM_NEON_V8;
3677 GemmMicrokernelTester()
3678 .mr(2)
3679 .nr(16)
3680 .kr(1)
3681 .sr(1)
3682 .m(2)
3683 .n(16)
3684 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -08003685 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08003686 }
3687
3688 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEONV8_MLAL_LANE, strided_cn) {
3689 TEST_REQUIRES_ARM_NEON_V8;
3690 GemmMicrokernelTester()
3691 .mr(2)
3692 .nr(16)
3693 .kr(1)
3694 .sr(1)
3695 .m(2)
3696 .n(16)
3697 .k(8)
3698 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08003699 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08003700 }
3701
3702 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEONV8_MLAL_LANE, k_eq_8_strided_a) {
3703 TEST_REQUIRES_ARM_NEON_V8;
3704 GemmMicrokernelTester()
3705 .mr(2)
3706 .nr(16)
3707 .kr(1)
3708 .sr(1)
3709 .m(2)
3710 .n(16)
3711 .k(8)
3712 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08003713 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08003714 }
3715
3716 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEONV8_MLAL_LANE, k_eq_8_subtile) {
3717 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng83844ae2022-01-14 09:52:25 -08003718 for (uint32_t n = 1; n <= 16; n++) {
3719 for (uint32_t m = 1; m <= 2; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08003720 GemmMicrokernelTester()
3721 .mr(2)
3722 .nr(16)
3723 .kr(1)
3724 .sr(1)
3725 .m(m)
3726 .n(n)
3727 .k(8)
3728 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08003729 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08003730 }
3731 }
3732 }
3733
3734 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEONV8_MLAL_LANE, k_eq_8_subtile_m) {
3735 TEST_REQUIRES_ARM_NEON_V8;
3736 for (uint32_t m = 1; m <= 2; m++) {
3737 GemmMicrokernelTester()
3738 .mr(2)
3739 .nr(16)
3740 .kr(1)
3741 .sr(1)
3742 .m(m)
3743 .n(16)
3744 .k(8)
3745 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08003746 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08003747 }
3748 }
3749
3750 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEONV8_MLAL_LANE, k_eq_8_subtile_n) {
3751 TEST_REQUIRES_ARM_NEON_V8;
3752 for (uint32_t n = 1; n <= 16; n++) {
3753 GemmMicrokernelTester()
3754 .mr(2)
3755 .nr(16)
3756 .kr(1)
3757 .sr(1)
3758 .m(2)
3759 .n(n)
3760 .k(8)
3761 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08003762 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08003763 }
3764 }
3765
3766 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEONV8_MLAL_LANE, k_lt_8) {
3767 TEST_REQUIRES_ARM_NEON_V8;
3768 for (size_t k = 1; k < 8; k++) {
3769 GemmMicrokernelTester()
3770 .mr(2)
3771 .nr(16)
3772 .kr(1)
3773 .sr(1)
3774 .m(2)
3775 .n(16)
3776 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08003777 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08003778 }
3779 }
3780
3781 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEONV8_MLAL_LANE, k_lt_8_strided_a) {
3782 TEST_REQUIRES_ARM_NEON_V8;
3783 for (size_t k = 1; k < 8; k++) {
3784 GemmMicrokernelTester()
3785 .mr(2)
3786 .nr(16)
3787 .kr(1)
3788 .sr(1)
3789 .m(2)
3790 .n(16)
3791 .k(k)
3792 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08003793 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08003794 }
3795 }
3796
3797 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEONV8_MLAL_LANE, k_lt_8_subtile) {
3798 TEST_REQUIRES_ARM_NEON_V8;
3799 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08003800 for (uint32_t n = 1; n <= 16; n++) {
3801 for (uint32_t m = 1; m <= 2; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08003802 GemmMicrokernelTester()
3803 .mr(2)
3804 .nr(16)
3805 .kr(1)
3806 .sr(1)
3807 .m(m)
3808 .n(n)
3809 .k(k)
3810 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08003811 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08003812 }
3813 }
3814 }
3815 }
3816
3817 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEONV8_MLAL_LANE, k_gt_8) {
3818 TEST_REQUIRES_ARM_NEON_V8;
3819 for (size_t k = 9; k < 16; k++) {
3820 GemmMicrokernelTester()
3821 .mr(2)
3822 .nr(16)
3823 .kr(1)
3824 .sr(1)
3825 .m(2)
3826 .n(16)
3827 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08003828 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08003829 }
3830 }
3831
3832 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEONV8_MLAL_LANE, k_gt_8_strided_a) {
3833 TEST_REQUIRES_ARM_NEON_V8;
3834 for (size_t k = 9; k < 16; k++) {
3835 GemmMicrokernelTester()
3836 .mr(2)
3837 .nr(16)
3838 .kr(1)
3839 .sr(1)
3840 .m(2)
3841 .n(16)
3842 .k(k)
3843 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08003844 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08003845 }
3846 }
3847
3848 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEONV8_MLAL_LANE, k_gt_8_subtile) {
3849 TEST_REQUIRES_ARM_NEON_V8;
3850 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08003851 for (uint32_t n = 1; n <= 16; n++) {
3852 for (uint32_t m = 1; m <= 2; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08003853 GemmMicrokernelTester()
3854 .mr(2)
3855 .nr(16)
3856 .kr(1)
3857 .sr(1)
3858 .m(m)
3859 .n(n)
3860 .k(k)
3861 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08003862 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08003863 }
3864 }
3865 }
3866 }
3867
3868 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEONV8_MLAL_LANE, k_div_8) {
3869 TEST_REQUIRES_ARM_NEON_V8;
3870 for (size_t k = 16; k <= 80; k += 8) {
3871 GemmMicrokernelTester()
3872 .mr(2)
3873 .nr(16)
3874 .kr(1)
3875 .sr(1)
3876 .m(2)
3877 .n(16)
3878 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08003879 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08003880 }
3881 }
3882
3883 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEONV8_MLAL_LANE, k_div_8_strided_a) {
3884 TEST_REQUIRES_ARM_NEON_V8;
3885 for (size_t k = 16; k <= 80; k += 8) {
3886 GemmMicrokernelTester()
3887 .mr(2)
3888 .nr(16)
3889 .kr(1)
3890 .sr(1)
3891 .m(2)
3892 .n(16)
3893 .k(k)
3894 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -08003895 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08003896 }
3897 }
3898
3899 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEONV8_MLAL_LANE, k_div_8_subtile) {
3900 TEST_REQUIRES_ARM_NEON_V8;
3901 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08003902 for (uint32_t n = 1; n <= 16; n++) {
3903 for (uint32_t m = 1; m <= 2; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08003904 GemmMicrokernelTester()
3905 .mr(2)
3906 .nr(16)
3907 .kr(1)
3908 .sr(1)
3909 .m(m)
3910 .n(n)
3911 .k(k)
3912 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08003913 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08003914 }
3915 }
3916 }
3917 }
3918
3919 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEONV8_MLAL_LANE, n_gt_16) {
3920 TEST_REQUIRES_ARM_NEON_V8;
3921 for (uint32_t n = 17; n < 32; n++) {
3922 for (size_t k = 1; k <= 40; k += 9) {
3923 GemmMicrokernelTester()
3924 .mr(2)
3925 .nr(16)
3926 .kr(1)
3927 .sr(1)
3928 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08003929 .n(n)
Frank Barchardf6237402022-01-05 00:26:09 -08003930 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08003931 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08003932 }
3933 }
3934 }
3935
3936 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEONV8_MLAL_LANE, n_gt_16_strided_cn) {
3937 TEST_REQUIRES_ARM_NEON_V8;
3938 for (uint32_t n = 17; n < 32; n++) {
3939 for (size_t k = 1; k <= 40; k += 9) {
3940 GemmMicrokernelTester()
3941 .mr(2)
3942 .nr(16)
3943 .kr(1)
3944 .sr(1)
3945 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08003946 .n(n)
Frank Barchardf6237402022-01-05 00:26:09 -08003947 .k(k)
3948 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08003949 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08003950 }
3951 }
3952 }
3953
3954 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEONV8_MLAL_LANE, n_gt_16_strided_a) {
3955 TEST_REQUIRES_ARM_NEON_V8;
3956 for (uint32_t n = 17; n < 32; n++) {
3957 for (size_t k = 1; k <= 40; k += 9) {
3958 GemmMicrokernelTester()
3959 .mr(2)
3960 .nr(16)
3961 .kr(1)
3962 .sr(1)
3963 .m(2)
3964 .n(n)
3965 .k(k)
3966 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -08003967 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08003968 }
3969 }
3970 }
3971
3972 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEONV8_MLAL_LANE, n_gt_16_subtile) {
3973 TEST_REQUIRES_ARM_NEON_V8;
3974 for (uint32_t n = 17; n < 32; n++) {
3975 for (size_t k = 1; k <= 40; k += 9) {
3976 for (uint32_t m = 1; m <= 2; m++) {
3977 GemmMicrokernelTester()
3978 .mr(2)
3979 .nr(16)
3980 .kr(1)
3981 .sr(1)
3982 .m(m)
3983 .n(n)
3984 .k(k)
3985 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08003986 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08003987 }
3988 }
3989 }
3990 }
3991
3992 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEONV8_MLAL_LANE, n_div_16) {
3993 TEST_REQUIRES_ARM_NEON_V8;
3994 for (uint32_t n = 32; n <= 48; n += 16) {
3995 for (size_t k = 1; k <= 40; k += 9) {
3996 GemmMicrokernelTester()
3997 .mr(2)
3998 .nr(16)
3999 .kr(1)
4000 .sr(1)
4001 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08004002 .n(n)
Frank Barchardf6237402022-01-05 00:26:09 -08004003 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08004004 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08004005 }
4006 }
4007 }
4008
4009 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEONV8_MLAL_LANE, n_div_16_strided_cn) {
4010 TEST_REQUIRES_ARM_NEON_V8;
4011 for (uint32_t n = 32; n <= 48; n += 16) {
4012 for (size_t k = 1; k <= 40; k += 9) {
4013 GemmMicrokernelTester()
4014 .mr(2)
4015 .nr(16)
4016 .kr(1)
4017 .sr(1)
4018 .m(2)
4019 .n(n)
4020 .k(k)
4021 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08004022 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08004023 }
4024 }
4025 }
4026
4027 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEONV8_MLAL_LANE, n_div_16_strided_a) {
4028 TEST_REQUIRES_ARM_NEON_V8;
4029 for (uint32_t n = 32; n <= 48; n += 16) {
4030 for (size_t k = 1; k <= 40; k += 9) {
4031 GemmMicrokernelTester()
4032 .mr(2)
4033 .nr(16)
4034 .kr(1)
4035 .sr(1)
4036 .m(2)
4037 .n(n)
4038 .k(k)
4039 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -08004040 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08004041 }
4042 }
4043 }
4044
4045 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEONV8_MLAL_LANE, n_div_16_subtile) {
4046 TEST_REQUIRES_ARM_NEON_V8;
4047 for (uint32_t n = 32; n <= 48; n += 16) {
4048 for (size_t k = 1; k <= 40; k += 9) {
4049 for (uint32_t m = 1; m <= 2; m++) {
4050 GemmMicrokernelTester()
4051 .mr(2)
4052 .nr(16)
4053 .kr(1)
4054 .sr(1)
4055 .m(m)
4056 .n(n)
4057 .k(k)
4058 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08004059 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08004060 }
4061 }
4062 }
4063 }
4064
4065 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEONV8_MLAL_LANE, strided_cm_subtile) {
4066 TEST_REQUIRES_ARM_NEON_V8;
4067 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08004068 for (uint32_t n = 1; n <= 16; n++) {
4069 for (uint32_t m = 1; m <= 2; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08004070 GemmMicrokernelTester()
4071 .mr(2)
4072 .nr(16)
4073 .kr(1)
4074 .sr(1)
4075 .m(m)
4076 .n(n)
4077 .k(k)
4078 .cm_stride(19)
4079 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08004080 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08004081 }
4082 }
4083 }
4084 }
4085
4086 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEONV8_MLAL_LANE, qmin) {
4087 TEST_REQUIRES_ARM_NEON_V8;
4088 GemmMicrokernelTester()
4089 .mr(2)
4090 .nr(16)
4091 .kr(1)
4092 .sr(1)
4093 .m(2)
4094 .n(16)
4095 .k(8)
4096 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08004097 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08004098 }
4099
4100 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEONV8_MLAL_LANE, qmax) {
4101 TEST_REQUIRES_ARM_NEON_V8;
4102 GemmMicrokernelTester()
4103 .mr(2)
4104 .nr(16)
4105 .kr(1)
4106 .sr(1)
4107 .m(2)
4108 .n(16)
4109 .k(8)
4110 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08004111 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08004112 }
4113
4114 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEONV8_MLAL_LANE, strided_cm) {
4115 TEST_REQUIRES_ARM_NEON_V8;
4116 GemmMicrokernelTester()
4117 .mr(2)
4118 .nr(16)
4119 .kr(1)
4120 .sr(1)
4121 .m(2)
4122 .n(16)
4123 .k(8)
4124 .cm_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08004125 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08004126 }
4127#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
4128
4129
4130#if XNN_ARCH_ARM || XNN_ARCH_ARM64
4131 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEONV8_MLAL_LANE, k_eq_8) {
4132 TEST_REQUIRES_ARM_NEON_V8;
4133 GemmMicrokernelTester()
4134 .mr(3)
4135 .nr(16)
4136 .kr(1)
4137 .sr(1)
4138 .m(3)
4139 .n(16)
4140 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -08004141 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08004142 }
4143
4144 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEONV8_MLAL_LANE, strided_cn) {
4145 TEST_REQUIRES_ARM_NEON_V8;
4146 GemmMicrokernelTester()
4147 .mr(3)
4148 .nr(16)
4149 .kr(1)
4150 .sr(1)
4151 .m(3)
4152 .n(16)
4153 .k(8)
4154 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08004155 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08004156 }
4157
4158 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEONV8_MLAL_LANE, k_eq_8_strided_a) {
4159 TEST_REQUIRES_ARM_NEON_V8;
4160 GemmMicrokernelTester()
4161 .mr(3)
4162 .nr(16)
4163 .kr(1)
4164 .sr(1)
4165 .m(3)
4166 .n(16)
4167 .k(8)
4168 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08004169 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08004170 }
4171
4172 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEONV8_MLAL_LANE, k_eq_8_subtile) {
4173 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng83844ae2022-01-14 09:52:25 -08004174 for (uint32_t n = 1; n <= 16; n++) {
4175 for (uint32_t m = 1; m <= 3; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08004176 GemmMicrokernelTester()
4177 .mr(3)
4178 .nr(16)
4179 .kr(1)
4180 .sr(1)
4181 .m(m)
4182 .n(n)
4183 .k(8)
4184 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08004185 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08004186 }
4187 }
4188 }
4189
4190 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEONV8_MLAL_LANE, k_eq_8_subtile_m) {
4191 TEST_REQUIRES_ARM_NEON_V8;
4192 for (uint32_t m = 1; m <= 3; m++) {
4193 GemmMicrokernelTester()
4194 .mr(3)
4195 .nr(16)
4196 .kr(1)
4197 .sr(1)
4198 .m(m)
4199 .n(16)
4200 .k(8)
4201 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08004202 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08004203 }
4204 }
4205
4206 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEONV8_MLAL_LANE, k_eq_8_subtile_n) {
4207 TEST_REQUIRES_ARM_NEON_V8;
4208 for (uint32_t n = 1; n <= 16; n++) {
4209 GemmMicrokernelTester()
4210 .mr(3)
4211 .nr(16)
4212 .kr(1)
4213 .sr(1)
4214 .m(3)
4215 .n(n)
4216 .k(8)
4217 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08004218 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08004219 }
4220 }
4221
4222 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEONV8_MLAL_LANE, k_lt_8) {
4223 TEST_REQUIRES_ARM_NEON_V8;
4224 for (size_t k = 1; k < 8; k++) {
4225 GemmMicrokernelTester()
4226 .mr(3)
4227 .nr(16)
4228 .kr(1)
4229 .sr(1)
4230 .m(3)
4231 .n(16)
4232 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08004233 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08004234 }
4235 }
4236
4237 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEONV8_MLAL_LANE, k_lt_8_strided_a) {
4238 TEST_REQUIRES_ARM_NEON_V8;
4239 for (size_t k = 1; k < 8; k++) {
4240 GemmMicrokernelTester()
4241 .mr(3)
4242 .nr(16)
4243 .kr(1)
4244 .sr(1)
4245 .m(3)
4246 .n(16)
4247 .k(k)
4248 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08004249 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08004250 }
4251 }
4252
4253 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEONV8_MLAL_LANE, k_lt_8_subtile) {
4254 TEST_REQUIRES_ARM_NEON_V8;
4255 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08004256 for (uint32_t n = 1; n <= 16; n++) {
4257 for (uint32_t m = 1; m <= 3; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08004258 GemmMicrokernelTester()
4259 .mr(3)
4260 .nr(16)
4261 .kr(1)
4262 .sr(1)
4263 .m(m)
4264 .n(n)
4265 .k(k)
4266 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08004267 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08004268 }
4269 }
4270 }
4271 }
4272
4273 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEONV8_MLAL_LANE, k_gt_8) {
4274 TEST_REQUIRES_ARM_NEON_V8;
4275 for (size_t k = 9; k < 16; k++) {
4276 GemmMicrokernelTester()
4277 .mr(3)
4278 .nr(16)
4279 .kr(1)
4280 .sr(1)
4281 .m(3)
4282 .n(16)
4283 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08004284 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08004285 }
4286 }
4287
4288 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEONV8_MLAL_LANE, k_gt_8_strided_a) {
4289 TEST_REQUIRES_ARM_NEON_V8;
4290 for (size_t k = 9; k < 16; k++) {
4291 GemmMicrokernelTester()
4292 .mr(3)
4293 .nr(16)
4294 .kr(1)
4295 .sr(1)
4296 .m(3)
4297 .n(16)
4298 .k(k)
4299 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08004300 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08004301 }
4302 }
4303
4304 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEONV8_MLAL_LANE, k_gt_8_subtile) {
4305 TEST_REQUIRES_ARM_NEON_V8;
4306 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08004307 for (uint32_t n = 1; n <= 16; n++) {
4308 for (uint32_t m = 1; m <= 3; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08004309 GemmMicrokernelTester()
4310 .mr(3)
4311 .nr(16)
4312 .kr(1)
4313 .sr(1)
4314 .m(m)
4315 .n(n)
4316 .k(k)
4317 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08004318 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08004319 }
4320 }
4321 }
4322 }
4323
4324 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEONV8_MLAL_LANE, k_div_8) {
4325 TEST_REQUIRES_ARM_NEON_V8;
4326 for (size_t k = 16; k <= 80; k += 8) {
4327 GemmMicrokernelTester()
4328 .mr(3)
4329 .nr(16)
4330 .kr(1)
4331 .sr(1)
4332 .m(3)
4333 .n(16)
4334 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08004335 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08004336 }
4337 }
4338
4339 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEONV8_MLAL_LANE, k_div_8_strided_a) {
4340 TEST_REQUIRES_ARM_NEON_V8;
4341 for (size_t k = 16; k <= 80; k += 8) {
4342 GemmMicrokernelTester()
4343 .mr(3)
4344 .nr(16)
4345 .kr(1)
4346 .sr(1)
4347 .m(3)
4348 .n(16)
4349 .k(k)
4350 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -08004351 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08004352 }
4353 }
4354
4355 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEONV8_MLAL_LANE, k_div_8_subtile) {
4356 TEST_REQUIRES_ARM_NEON_V8;
4357 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08004358 for (uint32_t n = 1; n <= 16; n++) {
4359 for (uint32_t m = 1; m <= 3; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08004360 GemmMicrokernelTester()
4361 .mr(3)
4362 .nr(16)
4363 .kr(1)
4364 .sr(1)
4365 .m(m)
4366 .n(n)
4367 .k(k)
4368 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08004369 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08004370 }
4371 }
4372 }
4373 }
4374
4375 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEONV8_MLAL_LANE, n_gt_16) {
4376 TEST_REQUIRES_ARM_NEON_V8;
4377 for (uint32_t n = 17; n < 32; n++) {
4378 for (size_t k = 1; k <= 40; k += 9) {
4379 GemmMicrokernelTester()
4380 .mr(3)
4381 .nr(16)
4382 .kr(1)
4383 .sr(1)
4384 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08004385 .n(n)
Frank Barchardf6237402022-01-05 00:26:09 -08004386 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08004387 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08004388 }
4389 }
4390 }
4391
4392 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEONV8_MLAL_LANE, n_gt_16_strided_cn) {
4393 TEST_REQUIRES_ARM_NEON_V8;
4394 for (uint32_t n = 17; n < 32; n++) {
4395 for (size_t k = 1; k <= 40; k += 9) {
4396 GemmMicrokernelTester()
4397 .mr(3)
4398 .nr(16)
4399 .kr(1)
4400 .sr(1)
4401 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08004402 .n(n)
Frank Barchardf6237402022-01-05 00:26:09 -08004403 .k(k)
4404 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08004405 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08004406 }
4407 }
4408 }
4409
4410 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEONV8_MLAL_LANE, n_gt_16_strided_a) {
4411 TEST_REQUIRES_ARM_NEON_V8;
4412 for (uint32_t n = 17; n < 32; n++) {
4413 for (size_t k = 1; k <= 40; k += 9) {
4414 GemmMicrokernelTester()
4415 .mr(3)
4416 .nr(16)
4417 .kr(1)
4418 .sr(1)
4419 .m(3)
4420 .n(n)
4421 .k(k)
4422 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -08004423 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08004424 }
4425 }
4426 }
4427
4428 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEONV8_MLAL_LANE, n_gt_16_subtile) {
4429 TEST_REQUIRES_ARM_NEON_V8;
4430 for (uint32_t n = 17; n < 32; n++) {
4431 for (size_t k = 1; k <= 40; k += 9) {
4432 for (uint32_t m = 1; m <= 3; m++) {
4433 GemmMicrokernelTester()
4434 .mr(3)
4435 .nr(16)
4436 .kr(1)
4437 .sr(1)
4438 .m(m)
4439 .n(n)
4440 .k(k)
4441 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08004442 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08004443 }
4444 }
4445 }
4446 }
4447
4448 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEONV8_MLAL_LANE, n_div_16) {
4449 TEST_REQUIRES_ARM_NEON_V8;
4450 for (uint32_t n = 32; n <= 48; n += 16) {
4451 for (size_t k = 1; k <= 40; k += 9) {
4452 GemmMicrokernelTester()
4453 .mr(3)
4454 .nr(16)
4455 .kr(1)
4456 .sr(1)
4457 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08004458 .n(n)
Frank Barchardf6237402022-01-05 00:26:09 -08004459 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08004460 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08004461 }
4462 }
4463 }
4464
4465 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEONV8_MLAL_LANE, n_div_16_strided_cn) {
4466 TEST_REQUIRES_ARM_NEON_V8;
4467 for (uint32_t n = 32; n <= 48; n += 16) {
4468 for (size_t k = 1; k <= 40; k += 9) {
4469 GemmMicrokernelTester()
4470 .mr(3)
4471 .nr(16)
4472 .kr(1)
4473 .sr(1)
4474 .m(3)
4475 .n(n)
4476 .k(k)
4477 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08004478 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08004479 }
4480 }
4481 }
4482
4483 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEONV8_MLAL_LANE, n_div_16_strided_a) {
4484 TEST_REQUIRES_ARM_NEON_V8;
4485 for (uint32_t n = 32; n <= 48; n += 16) {
4486 for (size_t k = 1; k <= 40; k += 9) {
4487 GemmMicrokernelTester()
4488 .mr(3)
4489 .nr(16)
4490 .kr(1)
4491 .sr(1)
4492 .m(3)
4493 .n(n)
4494 .k(k)
4495 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -08004496 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08004497 }
4498 }
4499 }
4500
4501 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEONV8_MLAL_LANE, n_div_16_subtile) {
4502 TEST_REQUIRES_ARM_NEON_V8;
4503 for (uint32_t n = 32; n <= 48; n += 16) {
4504 for (size_t k = 1; k <= 40; k += 9) {
4505 for (uint32_t m = 1; m <= 3; m++) {
4506 GemmMicrokernelTester()
4507 .mr(3)
4508 .nr(16)
4509 .kr(1)
4510 .sr(1)
4511 .m(m)
4512 .n(n)
4513 .k(k)
4514 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08004515 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08004516 }
4517 }
4518 }
4519 }
4520
4521 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEONV8_MLAL_LANE, strided_cm_subtile) {
4522 TEST_REQUIRES_ARM_NEON_V8;
4523 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08004524 for (uint32_t n = 1; n <= 16; n++) {
4525 for (uint32_t m = 1; m <= 3; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08004526 GemmMicrokernelTester()
4527 .mr(3)
4528 .nr(16)
4529 .kr(1)
4530 .sr(1)
4531 .m(m)
4532 .n(n)
4533 .k(k)
4534 .cm_stride(19)
4535 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08004536 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08004537 }
4538 }
4539 }
4540 }
4541
4542 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEONV8_MLAL_LANE, qmin) {
4543 TEST_REQUIRES_ARM_NEON_V8;
4544 GemmMicrokernelTester()
4545 .mr(3)
4546 .nr(16)
4547 .kr(1)
4548 .sr(1)
4549 .m(3)
4550 .n(16)
4551 .k(8)
4552 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08004553 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08004554 }
4555
4556 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEONV8_MLAL_LANE, qmax) {
4557 TEST_REQUIRES_ARM_NEON_V8;
4558 GemmMicrokernelTester()
4559 .mr(3)
4560 .nr(16)
4561 .kr(1)
4562 .sr(1)
4563 .m(3)
4564 .n(16)
4565 .k(8)
4566 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08004567 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08004568 }
4569
4570 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEONV8_MLAL_LANE, strided_cm) {
4571 TEST_REQUIRES_ARM_NEON_V8;
4572 GemmMicrokernelTester()
4573 .mr(3)
4574 .nr(16)
4575 .kr(1)
4576 .sr(1)
4577 .m(3)
4578 .n(16)
4579 .k(8)
4580 .cm_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08004581 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08004582 }
4583#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
4584
4585
4586#if XNN_ARCH_ARM || XNN_ARCH_ARM64
Frank Barchardf6237402022-01-05 00:26:09 -08004587 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEONV8_MLAL_LANE_PRFM, k_eq_8) {
4588 TEST_REQUIRES_ARM_NEON_V8;
4589 GemmMicrokernelTester()
4590 .mr(1)
4591 .nr(8)
4592 .kr(1)
4593 .sr(1)
4594 .m(1)
4595 .n(8)
4596 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -08004597 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08004598 }
4599
4600 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEONV8_MLAL_LANE_PRFM, strided_cn) {
4601 TEST_REQUIRES_ARM_NEON_V8;
4602 GemmMicrokernelTester()
4603 .mr(1)
4604 .nr(8)
4605 .kr(1)
4606 .sr(1)
4607 .m(1)
4608 .n(8)
4609 .k(8)
4610 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08004611 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08004612 }
4613
4614 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEONV8_MLAL_LANE_PRFM, k_eq_8_strided_a) {
4615 TEST_REQUIRES_ARM_NEON_V8;
4616 GemmMicrokernelTester()
4617 .mr(1)
4618 .nr(8)
4619 .kr(1)
4620 .sr(1)
4621 .m(1)
4622 .n(8)
4623 .k(8)
4624 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08004625 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08004626 }
4627
4628 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEONV8_MLAL_LANE_PRFM, k_eq_8_subtile) {
4629 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng83844ae2022-01-14 09:52:25 -08004630 for (uint32_t n = 1; n <= 8; n++) {
4631 for (uint32_t m = 1; m <= 1; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08004632 GemmMicrokernelTester()
4633 .mr(1)
4634 .nr(8)
4635 .kr(1)
4636 .sr(1)
4637 .m(m)
4638 .n(n)
4639 .k(8)
4640 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08004641 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08004642 }
4643 }
4644 }
4645
4646 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEONV8_MLAL_LANE_PRFM, k_eq_8_subtile_m) {
4647 TEST_REQUIRES_ARM_NEON_V8;
4648 for (uint32_t m = 1; m <= 1; m++) {
4649 GemmMicrokernelTester()
4650 .mr(1)
4651 .nr(8)
4652 .kr(1)
4653 .sr(1)
4654 .m(m)
4655 .n(8)
4656 .k(8)
4657 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08004658 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08004659 }
4660 }
4661
4662 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEONV8_MLAL_LANE_PRFM, k_eq_8_subtile_n) {
4663 TEST_REQUIRES_ARM_NEON_V8;
4664 for (uint32_t n = 1; n <= 8; n++) {
4665 GemmMicrokernelTester()
4666 .mr(1)
4667 .nr(8)
4668 .kr(1)
4669 .sr(1)
4670 .m(1)
4671 .n(n)
4672 .k(8)
4673 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08004674 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08004675 }
4676 }
4677
4678 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEONV8_MLAL_LANE_PRFM, k_lt_8) {
4679 TEST_REQUIRES_ARM_NEON_V8;
4680 for (size_t k = 1; k < 8; k++) {
4681 GemmMicrokernelTester()
4682 .mr(1)
4683 .nr(8)
4684 .kr(1)
4685 .sr(1)
4686 .m(1)
4687 .n(8)
4688 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08004689 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08004690 }
4691 }
4692
4693 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEONV8_MLAL_LANE_PRFM, k_lt_8_strided_a) {
4694 TEST_REQUIRES_ARM_NEON_V8;
4695 for (size_t k = 1; k < 8; k++) {
4696 GemmMicrokernelTester()
4697 .mr(1)
4698 .nr(8)
4699 .kr(1)
4700 .sr(1)
4701 .m(1)
4702 .n(8)
4703 .k(k)
4704 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08004705 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08004706 }
4707 }
4708
4709 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEONV8_MLAL_LANE_PRFM, k_lt_8_subtile) {
4710 TEST_REQUIRES_ARM_NEON_V8;
4711 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08004712 for (uint32_t n = 1; n <= 8; n++) {
4713 for (uint32_t m = 1; m <= 1; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08004714 GemmMicrokernelTester()
4715 .mr(1)
4716 .nr(8)
4717 .kr(1)
4718 .sr(1)
4719 .m(m)
4720 .n(n)
4721 .k(k)
4722 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08004723 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08004724 }
4725 }
4726 }
4727 }
4728
4729 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEONV8_MLAL_LANE_PRFM, k_gt_8) {
4730 TEST_REQUIRES_ARM_NEON_V8;
4731 for (size_t k = 9; k < 16; k++) {
4732 GemmMicrokernelTester()
4733 .mr(1)
4734 .nr(8)
4735 .kr(1)
4736 .sr(1)
4737 .m(1)
4738 .n(8)
4739 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08004740 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08004741 }
4742 }
4743
4744 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEONV8_MLAL_LANE_PRFM, k_gt_8_strided_a) {
4745 TEST_REQUIRES_ARM_NEON_V8;
4746 for (size_t k = 9; k < 16; k++) {
4747 GemmMicrokernelTester()
4748 .mr(1)
4749 .nr(8)
4750 .kr(1)
4751 .sr(1)
4752 .m(1)
4753 .n(8)
4754 .k(k)
4755 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08004756 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08004757 }
4758 }
4759
4760 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEONV8_MLAL_LANE_PRFM, k_gt_8_subtile) {
4761 TEST_REQUIRES_ARM_NEON_V8;
4762 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08004763 for (uint32_t n = 1; n <= 8; n++) {
4764 for (uint32_t m = 1; m <= 1; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08004765 GemmMicrokernelTester()
4766 .mr(1)
4767 .nr(8)
4768 .kr(1)
4769 .sr(1)
4770 .m(m)
4771 .n(n)
4772 .k(k)
4773 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08004774 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08004775 }
4776 }
4777 }
4778 }
4779
4780 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEONV8_MLAL_LANE_PRFM, k_div_8) {
4781 TEST_REQUIRES_ARM_NEON_V8;
4782 for (size_t k = 16; k <= 80; k += 8) {
4783 GemmMicrokernelTester()
4784 .mr(1)
4785 .nr(8)
4786 .kr(1)
4787 .sr(1)
4788 .m(1)
4789 .n(8)
4790 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08004791 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08004792 }
4793 }
4794
4795 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEONV8_MLAL_LANE_PRFM, k_div_8_strided_a) {
4796 TEST_REQUIRES_ARM_NEON_V8;
4797 for (size_t k = 16; k <= 80; k += 8) {
4798 GemmMicrokernelTester()
4799 .mr(1)
4800 .nr(8)
4801 .kr(1)
4802 .sr(1)
4803 .m(1)
4804 .n(8)
4805 .k(k)
4806 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -08004807 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08004808 }
4809 }
4810
4811 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEONV8_MLAL_LANE_PRFM, k_div_8_subtile) {
4812 TEST_REQUIRES_ARM_NEON_V8;
4813 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08004814 for (uint32_t n = 1; n <= 8; n++) {
4815 for (uint32_t m = 1; m <= 1; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08004816 GemmMicrokernelTester()
4817 .mr(1)
4818 .nr(8)
4819 .kr(1)
4820 .sr(1)
4821 .m(m)
4822 .n(n)
4823 .k(k)
4824 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08004825 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08004826 }
4827 }
4828 }
4829 }
4830
4831 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEONV8_MLAL_LANE_PRFM, n_gt_8) {
4832 TEST_REQUIRES_ARM_NEON_V8;
4833 for (uint32_t n = 9; n < 16; n++) {
4834 for (size_t k = 1; k <= 40; k += 9) {
4835 GemmMicrokernelTester()
4836 .mr(1)
4837 .nr(8)
4838 .kr(1)
4839 .sr(1)
4840 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08004841 .n(n)
Frank Barchardf6237402022-01-05 00:26:09 -08004842 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08004843 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08004844 }
4845 }
4846 }
4847
4848 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEONV8_MLAL_LANE_PRFM, n_gt_8_strided_cn) {
4849 TEST_REQUIRES_ARM_NEON_V8;
4850 for (uint32_t n = 9; n < 16; n++) {
4851 for (size_t k = 1; k <= 40; k += 9) {
4852 GemmMicrokernelTester()
4853 .mr(1)
4854 .nr(8)
4855 .kr(1)
4856 .sr(1)
4857 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08004858 .n(n)
Frank Barchardf6237402022-01-05 00:26:09 -08004859 .k(k)
4860 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08004861 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08004862 }
4863 }
4864 }
4865
4866 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEONV8_MLAL_LANE_PRFM, n_gt_8_strided_a) {
4867 TEST_REQUIRES_ARM_NEON_V8;
4868 for (uint32_t n = 9; n < 16; n++) {
4869 for (size_t k = 1; k <= 40; k += 9) {
4870 GemmMicrokernelTester()
4871 .mr(1)
4872 .nr(8)
4873 .kr(1)
4874 .sr(1)
4875 .m(1)
4876 .n(n)
4877 .k(k)
4878 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -08004879 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08004880 }
4881 }
4882 }
4883
4884 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEONV8_MLAL_LANE_PRFM, n_gt_8_subtile) {
4885 TEST_REQUIRES_ARM_NEON_V8;
4886 for (uint32_t n = 9; n < 16; n++) {
4887 for (size_t k = 1; k <= 40; k += 9) {
4888 for (uint32_t m = 1; m <= 1; m++) {
4889 GemmMicrokernelTester()
4890 .mr(1)
4891 .nr(8)
4892 .kr(1)
4893 .sr(1)
4894 .m(m)
4895 .n(n)
4896 .k(k)
4897 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08004898 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08004899 }
4900 }
4901 }
4902 }
4903
4904 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEONV8_MLAL_LANE_PRFM, n_div_8) {
4905 TEST_REQUIRES_ARM_NEON_V8;
4906 for (uint32_t n = 16; n <= 24; n += 8) {
4907 for (size_t k = 1; k <= 40; k += 9) {
4908 GemmMicrokernelTester()
4909 .mr(1)
4910 .nr(8)
4911 .kr(1)
4912 .sr(1)
4913 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08004914 .n(n)
Frank Barchardf6237402022-01-05 00:26:09 -08004915 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08004916 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08004917 }
4918 }
4919 }
4920
4921 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEONV8_MLAL_LANE_PRFM, n_div_8_strided_cn) {
4922 TEST_REQUIRES_ARM_NEON_V8;
4923 for (uint32_t n = 16; n <= 24; n += 8) {
4924 for (size_t k = 1; k <= 40; k += 9) {
4925 GemmMicrokernelTester()
4926 .mr(1)
4927 .nr(8)
4928 .kr(1)
4929 .sr(1)
4930 .m(1)
4931 .n(n)
4932 .k(k)
4933 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08004934 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08004935 }
4936 }
4937 }
4938
4939 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEONV8_MLAL_LANE_PRFM, n_div_8_strided_a) {
4940 TEST_REQUIRES_ARM_NEON_V8;
4941 for (uint32_t n = 16; n <= 24; n += 8) {
4942 for (size_t k = 1; k <= 40; k += 9) {
4943 GemmMicrokernelTester()
4944 .mr(1)
4945 .nr(8)
4946 .kr(1)
4947 .sr(1)
4948 .m(1)
4949 .n(n)
4950 .k(k)
4951 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -08004952 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08004953 }
4954 }
4955 }
4956
4957 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEONV8_MLAL_LANE_PRFM, n_div_8_subtile) {
4958 TEST_REQUIRES_ARM_NEON_V8;
4959 for (uint32_t n = 16; n <= 24; n += 8) {
4960 for (size_t k = 1; k <= 40; k += 9) {
4961 for (uint32_t m = 1; m <= 1; m++) {
4962 GemmMicrokernelTester()
4963 .mr(1)
4964 .nr(8)
4965 .kr(1)
4966 .sr(1)
4967 .m(m)
4968 .n(n)
4969 .k(k)
4970 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08004971 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08004972 }
4973 }
4974 }
4975 }
4976
4977 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEONV8_MLAL_LANE_PRFM, strided_cm_subtile) {
4978 TEST_REQUIRES_ARM_NEON_V8;
4979 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08004980 for (uint32_t n = 1; n <= 8; n++) {
4981 for (uint32_t m = 1; m <= 1; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08004982 GemmMicrokernelTester()
4983 .mr(1)
4984 .nr(8)
4985 .kr(1)
4986 .sr(1)
4987 .m(m)
4988 .n(n)
4989 .k(k)
4990 .cm_stride(11)
4991 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08004992 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08004993 }
4994 }
4995 }
4996 }
4997
4998 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEONV8_MLAL_LANE_PRFM, qmin) {
4999 TEST_REQUIRES_ARM_NEON_V8;
5000 GemmMicrokernelTester()
5001 .mr(1)
5002 .nr(8)
5003 .kr(1)
5004 .sr(1)
5005 .m(1)
5006 .n(8)
5007 .k(8)
5008 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08005009 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08005010 }
5011
5012 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEONV8_MLAL_LANE_PRFM, qmax) {
5013 TEST_REQUIRES_ARM_NEON_V8;
5014 GemmMicrokernelTester()
5015 .mr(1)
5016 .nr(8)
5017 .kr(1)
5018 .sr(1)
5019 .m(1)
5020 .n(8)
5021 .k(8)
5022 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08005023 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08005024 }
5025
5026 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEONV8_MLAL_LANE_PRFM, strided_cm) {
5027 TEST_REQUIRES_ARM_NEON_V8;
5028 GemmMicrokernelTester()
5029 .mr(1)
5030 .nr(8)
5031 .kr(1)
5032 .sr(1)
5033 .m(1)
5034 .n(8)
5035 .k(8)
5036 .cm_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08005037 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08005038 }
5039#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
5040
5041
5042#if XNN_ARCH_ARM || XNN_ARCH_ARM64
5043 TEST(QC8_GEMM_MINMAX_FP32_2X8__NEONV8_MLAL_LANE_PRFM, k_eq_8) {
5044 TEST_REQUIRES_ARM_NEON_V8;
5045 GemmMicrokernelTester()
5046 .mr(2)
5047 .nr(8)
5048 .kr(1)
5049 .sr(1)
5050 .m(2)
5051 .n(8)
5052 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -08005053 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08005054 }
5055
5056 TEST(QC8_GEMM_MINMAX_FP32_2X8__NEONV8_MLAL_LANE_PRFM, strided_cn) {
5057 TEST_REQUIRES_ARM_NEON_V8;
5058 GemmMicrokernelTester()
5059 .mr(2)
5060 .nr(8)
5061 .kr(1)
5062 .sr(1)
5063 .m(2)
5064 .n(8)
5065 .k(8)
5066 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08005067 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08005068 }
5069
5070 TEST(QC8_GEMM_MINMAX_FP32_2X8__NEONV8_MLAL_LANE_PRFM, k_eq_8_strided_a) {
5071 TEST_REQUIRES_ARM_NEON_V8;
5072 GemmMicrokernelTester()
5073 .mr(2)
5074 .nr(8)
5075 .kr(1)
5076 .sr(1)
5077 .m(2)
5078 .n(8)
5079 .k(8)
5080 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08005081 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08005082 }
5083
5084 TEST(QC8_GEMM_MINMAX_FP32_2X8__NEONV8_MLAL_LANE_PRFM, k_eq_8_subtile) {
5085 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng83844ae2022-01-14 09:52:25 -08005086 for (uint32_t n = 1; n <= 8; n++) {
5087 for (uint32_t m = 1; m <= 2; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08005088 GemmMicrokernelTester()
5089 .mr(2)
5090 .nr(8)
5091 .kr(1)
5092 .sr(1)
5093 .m(m)
5094 .n(n)
5095 .k(8)
5096 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08005097 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08005098 }
5099 }
5100 }
5101
5102 TEST(QC8_GEMM_MINMAX_FP32_2X8__NEONV8_MLAL_LANE_PRFM, k_eq_8_subtile_m) {
5103 TEST_REQUIRES_ARM_NEON_V8;
5104 for (uint32_t m = 1; m <= 2; m++) {
5105 GemmMicrokernelTester()
5106 .mr(2)
5107 .nr(8)
5108 .kr(1)
5109 .sr(1)
5110 .m(m)
5111 .n(8)
5112 .k(8)
5113 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08005114 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08005115 }
5116 }
5117
5118 TEST(QC8_GEMM_MINMAX_FP32_2X8__NEONV8_MLAL_LANE_PRFM, k_eq_8_subtile_n) {
5119 TEST_REQUIRES_ARM_NEON_V8;
5120 for (uint32_t n = 1; n <= 8; n++) {
5121 GemmMicrokernelTester()
5122 .mr(2)
5123 .nr(8)
5124 .kr(1)
5125 .sr(1)
5126 .m(2)
5127 .n(n)
5128 .k(8)
5129 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08005130 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08005131 }
5132 }
5133
5134 TEST(QC8_GEMM_MINMAX_FP32_2X8__NEONV8_MLAL_LANE_PRFM, k_lt_8) {
5135 TEST_REQUIRES_ARM_NEON_V8;
5136 for (size_t k = 1; k < 8; k++) {
5137 GemmMicrokernelTester()
5138 .mr(2)
5139 .nr(8)
5140 .kr(1)
5141 .sr(1)
5142 .m(2)
5143 .n(8)
5144 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08005145 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08005146 }
5147 }
5148
5149 TEST(QC8_GEMM_MINMAX_FP32_2X8__NEONV8_MLAL_LANE_PRFM, k_lt_8_strided_a) {
5150 TEST_REQUIRES_ARM_NEON_V8;
5151 for (size_t k = 1; k < 8; k++) {
5152 GemmMicrokernelTester()
5153 .mr(2)
5154 .nr(8)
5155 .kr(1)
5156 .sr(1)
5157 .m(2)
5158 .n(8)
5159 .k(k)
5160 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08005161 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08005162 }
5163 }
5164
5165 TEST(QC8_GEMM_MINMAX_FP32_2X8__NEONV8_MLAL_LANE_PRFM, k_lt_8_subtile) {
5166 TEST_REQUIRES_ARM_NEON_V8;
5167 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08005168 for (uint32_t n = 1; n <= 8; n++) {
5169 for (uint32_t m = 1; m <= 2; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08005170 GemmMicrokernelTester()
5171 .mr(2)
5172 .nr(8)
5173 .kr(1)
5174 .sr(1)
5175 .m(m)
5176 .n(n)
5177 .k(k)
5178 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08005179 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08005180 }
5181 }
5182 }
5183 }
5184
5185 TEST(QC8_GEMM_MINMAX_FP32_2X8__NEONV8_MLAL_LANE_PRFM, k_gt_8) {
5186 TEST_REQUIRES_ARM_NEON_V8;
5187 for (size_t k = 9; k < 16; k++) {
5188 GemmMicrokernelTester()
5189 .mr(2)
5190 .nr(8)
5191 .kr(1)
5192 .sr(1)
5193 .m(2)
5194 .n(8)
5195 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08005196 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08005197 }
5198 }
5199
5200 TEST(QC8_GEMM_MINMAX_FP32_2X8__NEONV8_MLAL_LANE_PRFM, k_gt_8_strided_a) {
5201 TEST_REQUIRES_ARM_NEON_V8;
5202 for (size_t k = 9; k < 16; k++) {
5203 GemmMicrokernelTester()
5204 .mr(2)
5205 .nr(8)
5206 .kr(1)
5207 .sr(1)
5208 .m(2)
5209 .n(8)
5210 .k(k)
5211 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08005212 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08005213 }
5214 }
5215
5216 TEST(QC8_GEMM_MINMAX_FP32_2X8__NEONV8_MLAL_LANE_PRFM, k_gt_8_subtile) {
5217 TEST_REQUIRES_ARM_NEON_V8;
5218 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08005219 for (uint32_t n = 1; n <= 8; n++) {
5220 for (uint32_t m = 1; m <= 2; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08005221 GemmMicrokernelTester()
5222 .mr(2)
5223 .nr(8)
5224 .kr(1)
5225 .sr(1)
5226 .m(m)
5227 .n(n)
5228 .k(k)
5229 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08005230 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08005231 }
5232 }
5233 }
5234 }
5235
5236 TEST(QC8_GEMM_MINMAX_FP32_2X8__NEONV8_MLAL_LANE_PRFM, k_div_8) {
5237 TEST_REQUIRES_ARM_NEON_V8;
5238 for (size_t k = 16; k <= 80; k += 8) {
5239 GemmMicrokernelTester()
5240 .mr(2)
5241 .nr(8)
5242 .kr(1)
5243 .sr(1)
5244 .m(2)
5245 .n(8)
5246 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08005247 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08005248 }
5249 }
5250
5251 TEST(QC8_GEMM_MINMAX_FP32_2X8__NEONV8_MLAL_LANE_PRFM, k_div_8_strided_a) {
5252 TEST_REQUIRES_ARM_NEON_V8;
5253 for (size_t k = 16; k <= 80; k += 8) {
5254 GemmMicrokernelTester()
5255 .mr(2)
5256 .nr(8)
5257 .kr(1)
5258 .sr(1)
5259 .m(2)
5260 .n(8)
5261 .k(k)
5262 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -08005263 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08005264 }
5265 }
5266
5267 TEST(QC8_GEMM_MINMAX_FP32_2X8__NEONV8_MLAL_LANE_PRFM, k_div_8_subtile) {
5268 TEST_REQUIRES_ARM_NEON_V8;
5269 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08005270 for (uint32_t n = 1; n <= 8; n++) {
5271 for (uint32_t m = 1; m <= 2; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08005272 GemmMicrokernelTester()
5273 .mr(2)
5274 .nr(8)
5275 .kr(1)
5276 .sr(1)
5277 .m(m)
5278 .n(n)
5279 .k(k)
5280 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08005281 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08005282 }
5283 }
5284 }
5285 }
5286
5287 TEST(QC8_GEMM_MINMAX_FP32_2X8__NEONV8_MLAL_LANE_PRFM, n_gt_8) {
5288 TEST_REQUIRES_ARM_NEON_V8;
5289 for (uint32_t n = 9; n < 16; n++) {
5290 for (size_t k = 1; k <= 40; k += 9) {
5291 GemmMicrokernelTester()
5292 .mr(2)
5293 .nr(8)
5294 .kr(1)
5295 .sr(1)
5296 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08005297 .n(n)
Frank Barchardf6237402022-01-05 00:26:09 -08005298 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08005299 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08005300 }
5301 }
5302 }
5303
5304 TEST(QC8_GEMM_MINMAX_FP32_2X8__NEONV8_MLAL_LANE_PRFM, n_gt_8_strided_cn) {
5305 TEST_REQUIRES_ARM_NEON_V8;
5306 for (uint32_t n = 9; n < 16; n++) {
5307 for (size_t k = 1; k <= 40; k += 9) {
5308 GemmMicrokernelTester()
5309 .mr(2)
5310 .nr(8)
5311 .kr(1)
5312 .sr(1)
5313 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08005314 .n(n)
Frank Barchardf6237402022-01-05 00:26:09 -08005315 .k(k)
5316 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08005317 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08005318 }
5319 }
5320 }
5321
5322 TEST(QC8_GEMM_MINMAX_FP32_2X8__NEONV8_MLAL_LANE_PRFM, n_gt_8_strided_a) {
5323 TEST_REQUIRES_ARM_NEON_V8;
5324 for (uint32_t n = 9; n < 16; n++) {
5325 for (size_t k = 1; k <= 40; k += 9) {
5326 GemmMicrokernelTester()
5327 .mr(2)
5328 .nr(8)
5329 .kr(1)
5330 .sr(1)
5331 .m(2)
5332 .n(n)
5333 .k(k)
5334 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -08005335 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08005336 }
5337 }
5338 }
5339
5340 TEST(QC8_GEMM_MINMAX_FP32_2X8__NEONV8_MLAL_LANE_PRFM, n_gt_8_subtile) {
5341 TEST_REQUIRES_ARM_NEON_V8;
5342 for (uint32_t n = 9; n < 16; n++) {
5343 for (size_t k = 1; k <= 40; k += 9) {
5344 for (uint32_t m = 1; m <= 2; m++) {
5345 GemmMicrokernelTester()
5346 .mr(2)
5347 .nr(8)
5348 .kr(1)
5349 .sr(1)
5350 .m(m)
5351 .n(n)
5352 .k(k)
5353 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08005354 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08005355 }
5356 }
5357 }
5358 }
5359
5360 TEST(QC8_GEMM_MINMAX_FP32_2X8__NEONV8_MLAL_LANE_PRFM, n_div_8) {
5361 TEST_REQUIRES_ARM_NEON_V8;
5362 for (uint32_t n = 16; n <= 24; n += 8) {
5363 for (size_t k = 1; k <= 40; k += 9) {
5364 GemmMicrokernelTester()
5365 .mr(2)
5366 .nr(8)
5367 .kr(1)
5368 .sr(1)
5369 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08005370 .n(n)
Frank Barchardf6237402022-01-05 00:26:09 -08005371 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08005372 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08005373 }
5374 }
5375 }
5376
5377 TEST(QC8_GEMM_MINMAX_FP32_2X8__NEONV8_MLAL_LANE_PRFM, n_div_8_strided_cn) {
5378 TEST_REQUIRES_ARM_NEON_V8;
5379 for (uint32_t n = 16; n <= 24; n += 8) {
5380 for (size_t k = 1; k <= 40; k += 9) {
5381 GemmMicrokernelTester()
5382 .mr(2)
5383 .nr(8)
5384 .kr(1)
5385 .sr(1)
5386 .m(2)
5387 .n(n)
5388 .k(k)
5389 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08005390 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08005391 }
5392 }
5393 }
5394
5395 TEST(QC8_GEMM_MINMAX_FP32_2X8__NEONV8_MLAL_LANE_PRFM, n_div_8_strided_a) {
5396 TEST_REQUIRES_ARM_NEON_V8;
5397 for (uint32_t n = 16; n <= 24; n += 8) {
5398 for (size_t k = 1; k <= 40; k += 9) {
5399 GemmMicrokernelTester()
5400 .mr(2)
5401 .nr(8)
5402 .kr(1)
5403 .sr(1)
5404 .m(2)
5405 .n(n)
5406 .k(k)
5407 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -08005408 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08005409 }
5410 }
5411 }
5412
5413 TEST(QC8_GEMM_MINMAX_FP32_2X8__NEONV8_MLAL_LANE_PRFM, n_div_8_subtile) {
5414 TEST_REQUIRES_ARM_NEON_V8;
5415 for (uint32_t n = 16; n <= 24; n += 8) {
5416 for (size_t k = 1; k <= 40; k += 9) {
5417 for (uint32_t m = 1; m <= 2; m++) {
5418 GemmMicrokernelTester()
5419 .mr(2)
5420 .nr(8)
5421 .kr(1)
5422 .sr(1)
5423 .m(m)
5424 .n(n)
5425 .k(k)
5426 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08005427 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08005428 }
5429 }
5430 }
5431 }
5432
5433 TEST(QC8_GEMM_MINMAX_FP32_2X8__NEONV8_MLAL_LANE_PRFM, strided_cm_subtile) {
5434 TEST_REQUIRES_ARM_NEON_V8;
5435 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08005436 for (uint32_t n = 1; n <= 8; n++) {
5437 for (uint32_t m = 1; m <= 2; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08005438 GemmMicrokernelTester()
5439 .mr(2)
5440 .nr(8)
5441 .kr(1)
5442 .sr(1)
5443 .m(m)
5444 .n(n)
5445 .k(k)
5446 .cm_stride(11)
5447 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08005448 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08005449 }
5450 }
5451 }
5452 }
5453
5454 TEST(QC8_GEMM_MINMAX_FP32_2X8__NEONV8_MLAL_LANE_PRFM, qmin) {
5455 TEST_REQUIRES_ARM_NEON_V8;
5456 GemmMicrokernelTester()
5457 .mr(2)
5458 .nr(8)
5459 .kr(1)
5460 .sr(1)
5461 .m(2)
5462 .n(8)
5463 .k(8)
5464 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08005465 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08005466 }
5467
5468 TEST(QC8_GEMM_MINMAX_FP32_2X8__NEONV8_MLAL_LANE_PRFM, qmax) {
5469 TEST_REQUIRES_ARM_NEON_V8;
5470 GemmMicrokernelTester()
5471 .mr(2)
5472 .nr(8)
5473 .kr(1)
5474 .sr(1)
5475 .m(2)
5476 .n(8)
5477 .k(8)
5478 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08005479 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08005480 }
5481
5482 TEST(QC8_GEMM_MINMAX_FP32_2X8__NEONV8_MLAL_LANE_PRFM, strided_cm) {
5483 TEST_REQUIRES_ARM_NEON_V8;
5484 GemmMicrokernelTester()
5485 .mr(2)
5486 .nr(8)
5487 .kr(1)
5488 .sr(1)
5489 .m(2)
5490 .n(8)
5491 .k(8)
5492 .cm_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08005493 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08005494 }
5495#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
5496
5497
5498#if XNN_ARCH_ARM || XNN_ARCH_ARM64
5499 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEONV8_MLAL_LANE_PRFM, k_eq_8) {
5500 TEST_REQUIRES_ARM_NEON_V8;
5501 GemmMicrokernelTester()
5502 .mr(3)
5503 .nr(8)
5504 .kr(1)
5505 .sr(1)
5506 .m(3)
5507 .n(8)
5508 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -08005509 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08005510 }
5511
5512 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEONV8_MLAL_LANE_PRFM, strided_cn) {
5513 TEST_REQUIRES_ARM_NEON_V8;
5514 GemmMicrokernelTester()
5515 .mr(3)
5516 .nr(8)
5517 .kr(1)
5518 .sr(1)
5519 .m(3)
5520 .n(8)
5521 .k(8)
5522 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08005523 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08005524 }
5525
5526 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEONV8_MLAL_LANE_PRFM, k_eq_8_strided_a) {
5527 TEST_REQUIRES_ARM_NEON_V8;
5528 GemmMicrokernelTester()
5529 .mr(3)
5530 .nr(8)
5531 .kr(1)
5532 .sr(1)
5533 .m(3)
5534 .n(8)
5535 .k(8)
5536 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08005537 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08005538 }
5539
5540 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEONV8_MLAL_LANE_PRFM, k_eq_8_subtile) {
5541 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng83844ae2022-01-14 09:52:25 -08005542 for (uint32_t n = 1; n <= 8; n++) {
5543 for (uint32_t m = 1; m <= 3; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08005544 GemmMicrokernelTester()
5545 .mr(3)
5546 .nr(8)
5547 .kr(1)
5548 .sr(1)
5549 .m(m)
5550 .n(n)
5551 .k(8)
5552 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08005553 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08005554 }
5555 }
5556 }
5557
5558 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEONV8_MLAL_LANE_PRFM, k_eq_8_subtile_m) {
5559 TEST_REQUIRES_ARM_NEON_V8;
5560 for (uint32_t m = 1; m <= 3; m++) {
5561 GemmMicrokernelTester()
5562 .mr(3)
5563 .nr(8)
5564 .kr(1)
5565 .sr(1)
5566 .m(m)
5567 .n(8)
5568 .k(8)
5569 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08005570 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08005571 }
5572 }
5573
5574 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEONV8_MLAL_LANE_PRFM, k_eq_8_subtile_n) {
5575 TEST_REQUIRES_ARM_NEON_V8;
5576 for (uint32_t n = 1; n <= 8; n++) {
5577 GemmMicrokernelTester()
5578 .mr(3)
5579 .nr(8)
5580 .kr(1)
5581 .sr(1)
5582 .m(3)
5583 .n(n)
5584 .k(8)
5585 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08005586 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08005587 }
5588 }
5589
5590 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEONV8_MLAL_LANE_PRFM, k_lt_8) {
5591 TEST_REQUIRES_ARM_NEON_V8;
5592 for (size_t k = 1; k < 8; k++) {
5593 GemmMicrokernelTester()
5594 .mr(3)
5595 .nr(8)
5596 .kr(1)
5597 .sr(1)
5598 .m(3)
5599 .n(8)
5600 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08005601 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08005602 }
5603 }
5604
5605 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEONV8_MLAL_LANE_PRFM, k_lt_8_strided_a) {
5606 TEST_REQUIRES_ARM_NEON_V8;
5607 for (size_t k = 1; k < 8; k++) {
5608 GemmMicrokernelTester()
5609 .mr(3)
5610 .nr(8)
5611 .kr(1)
5612 .sr(1)
5613 .m(3)
5614 .n(8)
5615 .k(k)
5616 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08005617 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08005618 }
5619 }
5620
5621 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEONV8_MLAL_LANE_PRFM, k_lt_8_subtile) {
5622 TEST_REQUIRES_ARM_NEON_V8;
5623 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08005624 for (uint32_t n = 1; n <= 8; n++) {
5625 for (uint32_t m = 1; m <= 3; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08005626 GemmMicrokernelTester()
5627 .mr(3)
5628 .nr(8)
5629 .kr(1)
5630 .sr(1)
5631 .m(m)
5632 .n(n)
5633 .k(k)
5634 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08005635 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08005636 }
5637 }
5638 }
5639 }
5640
5641 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEONV8_MLAL_LANE_PRFM, k_gt_8) {
5642 TEST_REQUIRES_ARM_NEON_V8;
5643 for (size_t k = 9; k < 16; k++) {
5644 GemmMicrokernelTester()
5645 .mr(3)
5646 .nr(8)
5647 .kr(1)
5648 .sr(1)
5649 .m(3)
5650 .n(8)
5651 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08005652 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08005653 }
5654 }
5655
5656 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEONV8_MLAL_LANE_PRFM, k_gt_8_strided_a) {
5657 TEST_REQUIRES_ARM_NEON_V8;
5658 for (size_t k = 9; k < 16; k++) {
5659 GemmMicrokernelTester()
5660 .mr(3)
5661 .nr(8)
5662 .kr(1)
5663 .sr(1)
5664 .m(3)
5665 .n(8)
5666 .k(k)
5667 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08005668 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08005669 }
5670 }
5671
5672 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEONV8_MLAL_LANE_PRFM, k_gt_8_subtile) {
5673 TEST_REQUIRES_ARM_NEON_V8;
5674 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08005675 for (uint32_t n = 1; n <= 8; n++) {
5676 for (uint32_t m = 1; m <= 3; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08005677 GemmMicrokernelTester()
5678 .mr(3)
5679 .nr(8)
5680 .kr(1)
5681 .sr(1)
5682 .m(m)
5683 .n(n)
5684 .k(k)
5685 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08005686 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08005687 }
5688 }
5689 }
5690 }
5691
5692 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEONV8_MLAL_LANE_PRFM, k_div_8) {
5693 TEST_REQUIRES_ARM_NEON_V8;
5694 for (size_t k = 16; k <= 80; k += 8) {
5695 GemmMicrokernelTester()
5696 .mr(3)
5697 .nr(8)
5698 .kr(1)
5699 .sr(1)
5700 .m(3)
5701 .n(8)
5702 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08005703 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08005704 }
5705 }
5706
5707 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEONV8_MLAL_LANE_PRFM, k_div_8_strided_a) {
5708 TEST_REQUIRES_ARM_NEON_V8;
5709 for (size_t k = 16; k <= 80; k += 8) {
5710 GemmMicrokernelTester()
5711 .mr(3)
5712 .nr(8)
5713 .kr(1)
5714 .sr(1)
5715 .m(3)
5716 .n(8)
5717 .k(k)
5718 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -08005719 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08005720 }
5721 }
5722
5723 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEONV8_MLAL_LANE_PRFM, k_div_8_subtile) {
5724 TEST_REQUIRES_ARM_NEON_V8;
5725 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08005726 for (uint32_t n = 1; n <= 8; n++) {
5727 for (uint32_t m = 1; m <= 3; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08005728 GemmMicrokernelTester()
5729 .mr(3)
5730 .nr(8)
5731 .kr(1)
5732 .sr(1)
5733 .m(m)
5734 .n(n)
5735 .k(k)
5736 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08005737 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08005738 }
5739 }
5740 }
5741 }
5742
5743 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEONV8_MLAL_LANE_PRFM, n_gt_8) {
5744 TEST_REQUIRES_ARM_NEON_V8;
5745 for (uint32_t n = 9; n < 16; n++) {
5746 for (size_t k = 1; k <= 40; k += 9) {
5747 GemmMicrokernelTester()
5748 .mr(3)
5749 .nr(8)
5750 .kr(1)
5751 .sr(1)
5752 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08005753 .n(n)
Frank Barchardf6237402022-01-05 00:26:09 -08005754 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08005755 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08005756 }
5757 }
5758 }
5759
5760 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEONV8_MLAL_LANE_PRFM, n_gt_8_strided_cn) {
5761 TEST_REQUIRES_ARM_NEON_V8;
5762 for (uint32_t n = 9; n < 16; n++) {
5763 for (size_t k = 1; k <= 40; k += 9) {
5764 GemmMicrokernelTester()
5765 .mr(3)
5766 .nr(8)
5767 .kr(1)
5768 .sr(1)
5769 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08005770 .n(n)
Frank Barchardf6237402022-01-05 00:26:09 -08005771 .k(k)
5772 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08005773 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08005774 }
5775 }
5776 }
5777
5778 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEONV8_MLAL_LANE_PRFM, n_gt_8_strided_a) {
5779 TEST_REQUIRES_ARM_NEON_V8;
5780 for (uint32_t n = 9; n < 16; n++) {
5781 for (size_t k = 1; k <= 40; k += 9) {
5782 GemmMicrokernelTester()
5783 .mr(3)
5784 .nr(8)
5785 .kr(1)
5786 .sr(1)
5787 .m(3)
5788 .n(n)
5789 .k(k)
5790 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -08005791 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08005792 }
5793 }
5794 }
5795
5796 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEONV8_MLAL_LANE_PRFM, n_gt_8_subtile) {
5797 TEST_REQUIRES_ARM_NEON_V8;
5798 for (uint32_t n = 9; n < 16; n++) {
5799 for (size_t k = 1; k <= 40; k += 9) {
5800 for (uint32_t m = 1; m <= 3; m++) {
5801 GemmMicrokernelTester()
5802 .mr(3)
5803 .nr(8)
5804 .kr(1)
5805 .sr(1)
5806 .m(m)
5807 .n(n)
5808 .k(k)
5809 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08005810 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08005811 }
5812 }
5813 }
5814 }
5815
5816 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEONV8_MLAL_LANE_PRFM, n_div_8) {
5817 TEST_REQUIRES_ARM_NEON_V8;
5818 for (uint32_t n = 16; n <= 24; n += 8) {
5819 for (size_t k = 1; k <= 40; k += 9) {
5820 GemmMicrokernelTester()
5821 .mr(3)
5822 .nr(8)
5823 .kr(1)
5824 .sr(1)
5825 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08005826 .n(n)
Frank Barchardf6237402022-01-05 00:26:09 -08005827 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08005828 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08005829 }
5830 }
5831 }
5832
5833 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEONV8_MLAL_LANE_PRFM, n_div_8_strided_cn) {
5834 TEST_REQUIRES_ARM_NEON_V8;
5835 for (uint32_t n = 16; n <= 24; n += 8) {
5836 for (size_t k = 1; k <= 40; k += 9) {
5837 GemmMicrokernelTester()
5838 .mr(3)
5839 .nr(8)
5840 .kr(1)
5841 .sr(1)
5842 .m(3)
5843 .n(n)
5844 .k(k)
5845 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08005846 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08005847 }
5848 }
5849 }
5850
5851 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEONV8_MLAL_LANE_PRFM, n_div_8_strided_a) {
5852 TEST_REQUIRES_ARM_NEON_V8;
5853 for (uint32_t n = 16; n <= 24; n += 8) {
5854 for (size_t k = 1; k <= 40; k += 9) {
5855 GemmMicrokernelTester()
5856 .mr(3)
5857 .nr(8)
5858 .kr(1)
5859 .sr(1)
5860 .m(3)
5861 .n(n)
5862 .k(k)
5863 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -08005864 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08005865 }
5866 }
5867 }
5868
5869 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEONV8_MLAL_LANE_PRFM, n_div_8_subtile) {
5870 TEST_REQUIRES_ARM_NEON_V8;
5871 for (uint32_t n = 16; n <= 24; n += 8) {
5872 for (size_t k = 1; k <= 40; k += 9) {
5873 for (uint32_t m = 1; m <= 3; m++) {
5874 GemmMicrokernelTester()
5875 .mr(3)
5876 .nr(8)
5877 .kr(1)
5878 .sr(1)
5879 .m(m)
5880 .n(n)
5881 .k(k)
5882 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08005883 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08005884 }
5885 }
5886 }
5887 }
5888
5889 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEONV8_MLAL_LANE_PRFM, strided_cm_subtile) {
5890 TEST_REQUIRES_ARM_NEON_V8;
5891 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08005892 for (uint32_t n = 1; n <= 8; n++) {
5893 for (uint32_t m = 1; m <= 3; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08005894 GemmMicrokernelTester()
5895 .mr(3)
5896 .nr(8)
5897 .kr(1)
5898 .sr(1)
5899 .m(m)
5900 .n(n)
5901 .k(k)
5902 .cm_stride(11)
5903 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08005904 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08005905 }
5906 }
5907 }
5908 }
5909
5910 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEONV8_MLAL_LANE_PRFM, qmin) {
5911 TEST_REQUIRES_ARM_NEON_V8;
5912 GemmMicrokernelTester()
5913 .mr(3)
5914 .nr(8)
5915 .kr(1)
5916 .sr(1)
5917 .m(3)
5918 .n(8)
5919 .k(8)
5920 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08005921 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08005922 }
5923
5924 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEONV8_MLAL_LANE_PRFM, qmax) {
5925 TEST_REQUIRES_ARM_NEON_V8;
5926 GemmMicrokernelTester()
5927 .mr(3)
5928 .nr(8)
5929 .kr(1)
5930 .sr(1)
5931 .m(3)
5932 .n(8)
5933 .k(8)
5934 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08005935 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08005936 }
5937
5938 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEONV8_MLAL_LANE_PRFM, strided_cm) {
5939 TEST_REQUIRES_ARM_NEON_V8;
5940 GemmMicrokernelTester()
5941 .mr(3)
5942 .nr(8)
5943 .kr(1)
5944 .sr(1)
5945 .m(3)
5946 .n(8)
5947 .k(8)
5948 .cm_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08005949 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08005950 }
5951#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
5952
5953
5954#if XNN_ARCH_ARM || XNN_ARCH_ARM64
Frank Barchardf6237402022-01-05 00:26:09 -08005955 TEST(QC8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE_PRFM, k_eq_8) {
5956 TEST_REQUIRES_ARM_NEON_V8;
5957 GemmMicrokernelTester()
5958 .mr(1)
5959 .nr(16)
5960 .kr(1)
5961 .sr(1)
5962 .m(1)
5963 .n(16)
5964 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -08005965 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08005966 }
5967
5968 TEST(QC8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE_PRFM, strided_cn) {
5969 TEST_REQUIRES_ARM_NEON_V8;
5970 GemmMicrokernelTester()
5971 .mr(1)
5972 .nr(16)
5973 .kr(1)
5974 .sr(1)
5975 .m(1)
5976 .n(16)
5977 .k(8)
5978 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08005979 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08005980 }
5981
5982 TEST(QC8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE_PRFM, k_eq_8_strided_a) {
5983 TEST_REQUIRES_ARM_NEON_V8;
5984 GemmMicrokernelTester()
5985 .mr(1)
5986 .nr(16)
5987 .kr(1)
5988 .sr(1)
5989 .m(1)
5990 .n(16)
5991 .k(8)
5992 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08005993 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08005994 }
5995
5996 TEST(QC8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE_PRFM, k_eq_8_subtile) {
5997 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng83844ae2022-01-14 09:52:25 -08005998 for (uint32_t n = 1; n <= 16; n++) {
5999 for (uint32_t m = 1; m <= 1; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08006000 GemmMicrokernelTester()
6001 .mr(1)
6002 .nr(16)
6003 .kr(1)
6004 .sr(1)
6005 .m(m)
6006 .n(n)
6007 .k(8)
6008 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08006009 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08006010 }
6011 }
6012 }
6013
6014 TEST(QC8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE_PRFM, k_eq_8_subtile_m) {
6015 TEST_REQUIRES_ARM_NEON_V8;
6016 for (uint32_t m = 1; m <= 1; m++) {
6017 GemmMicrokernelTester()
6018 .mr(1)
6019 .nr(16)
6020 .kr(1)
6021 .sr(1)
6022 .m(m)
6023 .n(16)
6024 .k(8)
6025 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08006026 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08006027 }
6028 }
6029
6030 TEST(QC8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE_PRFM, k_eq_8_subtile_n) {
6031 TEST_REQUIRES_ARM_NEON_V8;
6032 for (uint32_t n = 1; n <= 16; n++) {
6033 GemmMicrokernelTester()
6034 .mr(1)
6035 .nr(16)
6036 .kr(1)
6037 .sr(1)
6038 .m(1)
6039 .n(n)
6040 .k(8)
6041 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08006042 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08006043 }
6044 }
6045
6046 TEST(QC8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE_PRFM, k_lt_8) {
6047 TEST_REQUIRES_ARM_NEON_V8;
6048 for (size_t k = 1; k < 8; k++) {
6049 GemmMicrokernelTester()
6050 .mr(1)
6051 .nr(16)
6052 .kr(1)
6053 .sr(1)
6054 .m(1)
6055 .n(16)
6056 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08006057 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08006058 }
6059 }
6060
6061 TEST(QC8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE_PRFM, k_lt_8_strided_a) {
6062 TEST_REQUIRES_ARM_NEON_V8;
6063 for (size_t k = 1; k < 8; k++) {
6064 GemmMicrokernelTester()
6065 .mr(1)
6066 .nr(16)
6067 .kr(1)
6068 .sr(1)
6069 .m(1)
6070 .n(16)
6071 .k(k)
6072 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08006073 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08006074 }
6075 }
6076
6077 TEST(QC8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE_PRFM, k_lt_8_subtile) {
6078 TEST_REQUIRES_ARM_NEON_V8;
6079 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08006080 for (uint32_t n = 1; n <= 16; n++) {
6081 for (uint32_t m = 1; m <= 1; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08006082 GemmMicrokernelTester()
6083 .mr(1)
6084 .nr(16)
6085 .kr(1)
6086 .sr(1)
6087 .m(m)
6088 .n(n)
6089 .k(k)
6090 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08006091 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08006092 }
6093 }
6094 }
6095 }
6096
6097 TEST(QC8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE_PRFM, k_gt_8) {
6098 TEST_REQUIRES_ARM_NEON_V8;
6099 for (size_t k = 9; k < 16; k++) {
6100 GemmMicrokernelTester()
6101 .mr(1)
6102 .nr(16)
6103 .kr(1)
6104 .sr(1)
6105 .m(1)
6106 .n(16)
6107 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08006108 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08006109 }
6110 }
6111
6112 TEST(QC8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE_PRFM, k_gt_8_strided_a) {
6113 TEST_REQUIRES_ARM_NEON_V8;
6114 for (size_t k = 9; k < 16; k++) {
6115 GemmMicrokernelTester()
6116 .mr(1)
6117 .nr(16)
6118 .kr(1)
6119 .sr(1)
6120 .m(1)
6121 .n(16)
6122 .k(k)
6123 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08006124 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08006125 }
6126 }
6127
6128 TEST(QC8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE_PRFM, k_gt_8_subtile) {
6129 TEST_REQUIRES_ARM_NEON_V8;
6130 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08006131 for (uint32_t n = 1; n <= 16; n++) {
6132 for (uint32_t m = 1; m <= 1; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08006133 GemmMicrokernelTester()
6134 .mr(1)
6135 .nr(16)
6136 .kr(1)
6137 .sr(1)
6138 .m(m)
6139 .n(n)
6140 .k(k)
6141 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08006142 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08006143 }
6144 }
6145 }
6146 }
6147
6148 TEST(QC8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE_PRFM, k_div_8) {
6149 TEST_REQUIRES_ARM_NEON_V8;
6150 for (size_t k = 16; k <= 80; k += 8) {
6151 GemmMicrokernelTester()
6152 .mr(1)
6153 .nr(16)
6154 .kr(1)
6155 .sr(1)
6156 .m(1)
6157 .n(16)
6158 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08006159 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08006160 }
6161 }
6162
6163 TEST(QC8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE_PRFM, k_div_8_strided_a) {
6164 TEST_REQUIRES_ARM_NEON_V8;
6165 for (size_t k = 16; k <= 80; k += 8) {
6166 GemmMicrokernelTester()
6167 .mr(1)
6168 .nr(16)
6169 .kr(1)
6170 .sr(1)
6171 .m(1)
6172 .n(16)
6173 .k(k)
6174 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -08006175 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08006176 }
6177 }
6178
6179 TEST(QC8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE_PRFM, k_div_8_subtile) {
6180 TEST_REQUIRES_ARM_NEON_V8;
6181 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08006182 for (uint32_t n = 1; n <= 16; n++) {
6183 for (uint32_t m = 1; m <= 1; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08006184 GemmMicrokernelTester()
6185 .mr(1)
6186 .nr(16)
6187 .kr(1)
6188 .sr(1)
6189 .m(m)
6190 .n(n)
6191 .k(k)
6192 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08006193 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08006194 }
6195 }
6196 }
6197 }
6198
6199 TEST(QC8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE_PRFM, n_gt_16) {
6200 TEST_REQUIRES_ARM_NEON_V8;
6201 for (uint32_t n = 17; n < 32; n++) {
6202 for (size_t k = 1; k <= 40; k += 9) {
6203 GemmMicrokernelTester()
6204 .mr(1)
6205 .nr(16)
6206 .kr(1)
6207 .sr(1)
6208 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08006209 .n(n)
Frank Barchardf6237402022-01-05 00:26:09 -08006210 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08006211 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08006212 }
6213 }
6214 }
6215
6216 TEST(QC8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE_PRFM, n_gt_16_strided_cn) {
6217 TEST_REQUIRES_ARM_NEON_V8;
6218 for (uint32_t n = 17; n < 32; n++) {
6219 for (size_t k = 1; k <= 40; k += 9) {
6220 GemmMicrokernelTester()
6221 .mr(1)
6222 .nr(16)
6223 .kr(1)
6224 .sr(1)
6225 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08006226 .n(n)
Frank Barchardf6237402022-01-05 00:26:09 -08006227 .k(k)
6228 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08006229 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08006230 }
6231 }
6232 }
6233
6234 TEST(QC8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE_PRFM, n_gt_16_strided_a) {
6235 TEST_REQUIRES_ARM_NEON_V8;
6236 for (uint32_t n = 17; n < 32; n++) {
6237 for (size_t k = 1; k <= 40; k += 9) {
6238 GemmMicrokernelTester()
6239 .mr(1)
6240 .nr(16)
6241 .kr(1)
6242 .sr(1)
6243 .m(1)
6244 .n(n)
6245 .k(k)
6246 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -08006247 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08006248 }
6249 }
6250 }
6251
6252 TEST(QC8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE_PRFM, n_gt_16_subtile) {
6253 TEST_REQUIRES_ARM_NEON_V8;
6254 for (uint32_t n = 17; n < 32; n++) {
6255 for (size_t k = 1; k <= 40; k += 9) {
6256 for (uint32_t m = 1; m <= 1; m++) {
6257 GemmMicrokernelTester()
6258 .mr(1)
6259 .nr(16)
6260 .kr(1)
6261 .sr(1)
6262 .m(m)
6263 .n(n)
6264 .k(k)
6265 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08006266 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08006267 }
6268 }
6269 }
6270 }
6271
6272 TEST(QC8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE_PRFM, n_div_16) {
6273 TEST_REQUIRES_ARM_NEON_V8;
6274 for (uint32_t n = 32; n <= 48; n += 16) {
6275 for (size_t k = 1; k <= 40; k += 9) {
6276 GemmMicrokernelTester()
6277 .mr(1)
6278 .nr(16)
6279 .kr(1)
6280 .sr(1)
6281 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08006282 .n(n)
Frank Barchardf6237402022-01-05 00:26:09 -08006283 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08006284 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08006285 }
6286 }
6287 }
6288
6289 TEST(QC8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE_PRFM, n_div_16_strided_cn) {
6290 TEST_REQUIRES_ARM_NEON_V8;
6291 for (uint32_t n = 32; n <= 48; n += 16) {
6292 for (size_t k = 1; k <= 40; k += 9) {
6293 GemmMicrokernelTester()
6294 .mr(1)
6295 .nr(16)
6296 .kr(1)
6297 .sr(1)
6298 .m(1)
6299 .n(n)
6300 .k(k)
6301 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08006302 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08006303 }
6304 }
6305 }
6306
6307 TEST(QC8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE_PRFM, n_div_16_strided_a) {
6308 TEST_REQUIRES_ARM_NEON_V8;
6309 for (uint32_t n = 32; n <= 48; n += 16) {
6310 for (size_t k = 1; k <= 40; k += 9) {
6311 GemmMicrokernelTester()
6312 .mr(1)
6313 .nr(16)
6314 .kr(1)
6315 .sr(1)
6316 .m(1)
6317 .n(n)
6318 .k(k)
6319 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -08006320 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08006321 }
6322 }
6323 }
6324
6325 TEST(QC8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE_PRFM, n_div_16_subtile) {
6326 TEST_REQUIRES_ARM_NEON_V8;
6327 for (uint32_t n = 32; n <= 48; n += 16) {
6328 for (size_t k = 1; k <= 40; k += 9) {
6329 for (uint32_t m = 1; m <= 1; m++) {
6330 GemmMicrokernelTester()
6331 .mr(1)
6332 .nr(16)
6333 .kr(1)
6334 .sr(1)
6335 .m(m)
6336 .n(n)
6337 .k(k)
6338 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08006339 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08006340 }
6341 }
6342 }
6343 }
6344
6345 TEST(QC8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE_PRFM, strided_cm_subtile) {
6346 TEST_REQUIRES_ARM_NEON_V8;
6347 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08006348 for (uint32_t n = 1; n <= 16; n++) {
6349 for (uint32_t m = 1; m <= 1; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08006350 GemmMicrokernelTester()
6351 .mr(1)
6352 .nr(16)
6353 .kr(1)
6354 .sr(1)
6355 .m(m)
6356 .n(n)
6357 .k(k)
6358 .cm_stride(19)
6359 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08006360 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08006361 }
6362 }
6363 }
6364 }
6365
6366 TEST(QC8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE_PRFM, qmin) {
6367 TEST_REQUIRES_ARM_NEON_V8;
6368 GemmMicrokernelTester()
6369 .mr(1)
6370 .nr(16)
6371 .kr(1)
6372 .sr(1)
6373 .m(1)
6374 .n(16)
6375 .k(8)
6376 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08006377 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08006378 }
6379
6380 TEST(QC8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE_PRFM, qmax) {
6381 TEST_REQUIRES_ARM_NEON_V8;
6382 GemmMicrokernelTester()
6383 .mr(1)
6384 .nr(16)
6385 .kr(1)
6386 .sr(1)
6387 .m(1)
6388 .n(16)
6389 .k(8)
6390 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08006391 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08006392 }
6393
6394 TEST(QC8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE_PRFM, strided_cm) {
6395 TEST_REQUIRES_ARM_NEON_V8;
6396 GemmMicrokernelTester()
6397 .mr(1)
6398 .nr(16)
6399 .kr(1)
6400 .sr(1)
6401 .m(1)
6402 .n(16)
6403 .k(8)
6404 .cm_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08006405 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08006406 }
6407#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
6408
6409
6410#if XNN_ARCH_ARM || XNN_ARCH_ARM64
6411 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEONV8_MLAL_LANE_PRFM, k_eq_8) {
6412 TEST_REQUIRES_ARM_NEON_V8;
6413 GemmMicrokernelTester()
6414 .mr(2)
6415 .nr(16)
6416 .kr(1)
6417 .sr(1)
6418 .m(2)
6419 .n(16)
6420 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -08006421 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08006422 }
6423
6424 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEONV8_MLAL_LANE_PRFM, strided_cn) {
6425 TEST_REQUIRES_ARM_NEON_V8;
6426 GemmMicrokernelTester()
6427 .mr(2)
6428 .nr(16)
6429 .kr(1)
6430 .sr(1)
6431 .m(2)
6432 .n(16)
6433 .k(8)
6434 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08006435 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08006436 }
6437
6438 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEONV8_MLAL_LANE_PRFM, k_eq_8_strided_a) {
6439 TEST_REQUIRES_ARM_NEON_V8;
6440 GemmMicrokernelTester()
6441 .mr(2)
6442 .nr(16)
6443 .kr(1)
6444 .sr(1)
6445 .m(2)
6446 .n(16)
6447 .k(8)
6448 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08006449 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08006450 }
6451
6452 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEONV8_MLAL_LANE_PRFM, k_eq_8_subtile) {
6453 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng83844ae2022-01-14 09:52:25 -08006454 for (uint32_t n = 1; n <= 16; n++) {
6455 for (uint32_t m = 1; m <= 2; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08006456 GemmMicrokernelTester()
6457 .mr(2)
6458 .nr(16)
6459 .kr(1)
6460 .sr(1)
6461 .m(m)
6462 .n(n)
6463 .k(8)
6464 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08006465 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08006466 }
6467 }
6468 }
6469
6470 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEONV8_MLAL_LANE_PRFM, k_eq_8_subtile_m) {
6471 TEST_REQUIRES_ARM_NEON_V8;
6472 for (uint32_t m = 1; m <= 2; m++) {
6473 GemmMicrokernelTester()
6474 .mr(2)
6475 .nr(16)
6476 .kr(1)
6477 .sr(1)
6478 .m(m)
6479 .n(16)
6480 .k(8)
6481 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08006482 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08006483 }
6484 }
6485
6486 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEONV8_MLAL_LANE_PRFM, k_eq_8_subtile_n) {
6487 TEST_REQUIRES_ARM_NEON_V8;
6488 for (uint32_t n = 1; n <= 16; n++) {
6489 GemmMicrokernelTester()
6490 .mr(2)
6491 .nr(16)
6492 .kr(1)
6493 .sr(1)
6494 .m(2)
6495 .n(n)
6496 .k(8)
6497 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08006498 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08006499 }
6500 }
6501
6502 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEONV8_MLAL_LANE_PRFM, k_lt_8) {
6503 TEST_REQUIRES_ARM_NEON_V8;
6504 for (size_t k = 1; k < 8; k++) {
6505 GemmMicrokernelTester()
6506 .mr(2)
6507 .nr(16)
6508 .kr(1)
6509 .sr(1)
6510 .m(2)
6511 .n(16)
6512 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08006513 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08006514 }
6515 }
6516
6517 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEONV8_MLAL_LANE_PRFM, k_lt_8_strided_a) {
6518 TEST_REQUIRES_ARM_NEON_V8;
6519 for (size_t k = 1; k < 8; k++) {
6520 GemmMicrokernelTester()
6521 .mr(2)
6522 .nr(16)
6523 .kr(1)
6524 .sr(1)
6525 .m(2)
6526 .n(16)
6527 .k(k)
6528 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08006529 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08006530 }
6531 }
6532
6533 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEONV8_MLAL_LANE_PRFM, k_lt_8_subtile) {
6534 TEST_REQUIRES_ARM_NEON_V8;
6535 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08006536 for (uint32_t n = 1; n <= 16; n++) {
6537 for (uint32_t m = 1; m <= 2; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08006538 GemmMicrokernelTester()
6539 .mr(2)
6540 .nr(16)
6541 .kr(1)
6542 .sr(1)
6543 .m(m)
6544 .n(n)
6545 .k(k)
6546 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08006547 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08006548 }
6549 }
6550 }
6551 }
6552
6553 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEONV8_MLAL_LANE_PRFM, k_gt_8) {
6554 TEST_REQUIRES_ARM_NEON_V8;
6555 for (size_t k = 9; k < 16; k++) {
6556 GemmMicrokernelTester()
6557 .mr(2)
6558 .nr(16)
6559 .kr(1)
6560 .sr(1)
6561 .m(2)
6562 .n(16)
6563 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08006564 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08006565 }
6566 }
6567
6568 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEONV8_MLAL_LANE_PRFM, k_gt_8_strided_a) {
6569 TEST_REQUIRES_ARM_NEON_V8;
6570 for (size_t k = 9; k < 16; k++) {
6571 GemmMicrokernelTester()
6572 .mr(2)
6573 .nr(16)
6574 .kr(1)
6575 .sr(1)
6576 .m(2)
6577 .n(16)
6578 .k(k)
6579 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08006580 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08006581 }
6582 }
6583
6584 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEONV8_MLAL_LANE_PRFM, k_gt_8_subtile) {
6585 TEST_REQUIRES_ARM_NEON_V8;
6586 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08006587 for (uint32_t n = 1; n <= 16; n++) {
6588 for (uint32_t m = 1; m <= 2; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08006589 GemmMicrokernelTester()
6590 .mr(2)
6591 .nr(16)
6592 .kr(1)
6593 .sr(1)
6594 .m(m)
6595 .n(n)
6596 .k(k)
6597 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08006598 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08006599 }
6600 }
6601 }
6602 }
6603
6604 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEONV8_MLAL_LANE_PRFM, k_div_8) {
6605 TEST_REQUIRES_ARM_NEON_V8;
6606 for (size_t k = 16; k <= 80; k += 8) {
6607 GemmMicrokernelTester()
6608 .mr(2)
6609 .nr(16)
6610 .kr(1)
6611 .sr(1)
6612 .m(2)
6613 .n(16)
6614 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08006615 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08006616 }
6617 }
6618
6619 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEONV8_MLAL_LANE_PRFM, k_div_8_strided_a) {
6620 TEST_REQUIRES_ARM_NEON_V8;
6621 for (size_t k = 16; k <= 80; k += 8) {
6622 GemmMicrokernelTester()
6623 .mr(2)
6624 .nr(16)
6625 .kr(1)
6626 .sr(1)
6627 .m(2)
6628 .n(16)
6629 .k(k)
6630 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -08006631 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08006632 }
6633 }
6634
6635 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEONV8_MLAL_LANE_PRFM, k_div_8_subtile) {
6636 TEST_REQUIRES_ARM_NEON_V8;
6637 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08006638 for (uint32_t n = 1; n <= 16; n++) {
6639 for (uint32_t m = 1; m <= 2; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08006640 GemmMicrokernelTester()
6641 .mr(2)
6642 .nr(16)
6643 .kr(1)
6644 .sr(1)
6645 .m(m)
6646 .n(n)
6647 .k(k)
6648 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08006649 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08006650 }
6651 }
6652 }
6653 }
6654
6655 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEONV8_MLAL_LANE_PRFM, n_gt_16) {
6656 TEST_REQUIRES_ARM_NEON_V8;
6657 for (uint32_t n = 17; n < 32; n++) {
6658 for (size_t k = 1; k <= 40; k += 9) {
6659 GemmMicrokernelTester()
6660 .mr(2)
6661 .nr(16)
6662 .kr(1)
6663 .sr(1)
6664 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08006665 .n(n)
Frank Barchardf6237402022-01-05 00:26:09 -08006666 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08006667 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08006668 }
6669 }
6670 }
6671
6672 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEONV8_MLAL_LANE_PRFM, n_gt_16_strided_cn) {
6673 TEST_REQUIRES_ARM_NEON_V8;
6674 for (uint32_t n = 17; n < 32; n++) {
6675 for (size_t k = 1; k <= 40; k += 9) {
6676 GemmMicrokernelTester()
6677 .mr(2)
6678 .nr(16)
6679 .kr(1)
6680 .sr(1)
6681 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08006682 .n(n)
Frank Barchardf6237402022-01-05 00:26:09 -08006683 .k(k)
6684 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08006685 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08006686 }
6687 }
6688 }
6689
6690 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEONV8_MLAL_LANE_PRFM, n_gt_16_strided_a) {
6691 TEST_REQUIRES_ARM_NEON_V8;
6692 for (uint32_t n = 17; n < 32; n++) {
6693 for (size_t k = 1; k <= 40; k += 9) {
6694 GemmMicrokernelTester()
6695 .mr(2)
6696 .nr(16)
6697 .kr(1)
6698 .sr(1)
6699 .m(2)
6700 .n(n)
6701 .k(k)
6702 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -08006703 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08006704 }
6705 }
6706 }
6707
6708 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEONV8_MLAL_LANE_PRFM, n_gt_16_subtile) {
6709 TEST_REQUIRES_ARM_NEON_V8;
6710 for (uint32_t n = 17; n < 32; n++) {
6711 for (size_t k = 1; k <= 40; k += 9) {
6712 for (uint32_t m = 1; m <= 2; m++) {
6713 GemmMicrokernelTester()
6714 .mr(2)
6715 .nr(16)
6716 .kr(1)
6717 .sr(1)
6718 .m(m)
6719 .n(n)
6720 .k(k)
6721 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08006722 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08006723 }
6724 }
6725 }
6726 }
6727
6728 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEONV8_MLAL_LANE_PRFM, n_div_16) {
6729 TEST_REQUIRES_ARM_NEON_V8;
6730 for (uint32_t n = 32; n <= 48; n += 16) {
6731 for (size_t k = 1; k <= 40; k += 9) {
6732 GemmMicrokernelTester()
6733 .mr(2)
6734 .nr(16)
6735 .kr(1)
6736 .sr(1)
6737 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08006738 .n(n)
Frank Barchardf6237402022-01-05 00:26:09 -08006739 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08006740 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08006741 }
6742 }
6743 }
6744
6745 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEONV8_MLAL_LANE_PRFM, n_div_16_strided_cn) {
6746 TEST_REQUIRES_ARM_NEON_V8;
6747 for (uint32_t n = 32; n <= 48; n += 16) {
6748 for (size_t k = 1; k <= 40; k += 9) {
6749 GemmMicrokernelTester()
6750 .mr(2)
6751 .nr(16)
6752 .kr(1)
6753 .sr(1)
6754 .m(2)
6755 .n(n)
6756 .k(k)
6757 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08006758 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08006759 }
6760 }
6761 }
6762
6763 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEONV8_MLAL_LANE_PRFM, n_div_16_strided_a) {
6764 TEST_REQUIRES_ARM_NEON_V8;
6765 for (uint32_t n = 32; n <= 48; n += 16) {
6766 for (size_t k = 1; k <= 40; k += 9) {
6767 GemmMicrokernelTester()
6768 .mr(2)
6769 .nr(16)
6770 .kr(1)
6771 .sr(1)
6772 .m(2)
6773 .n(n)
6774 .k(k)
6775 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -08006776 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08006777 }
6778 }
6779 }
6780
6781 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEONV8_MLAL_LANE_PRFM, n_div_16_subtile) {
6782 TEST_REQUIRES_ARM_NEON_V8;
6783 for (uint32_t n = 32; n <= 48; n += 16) {
6784 for (size_t k = 1; k <= 40; k += 9) {
6785 for (uint32_t m = 1; m <= 2; m++) {
6786 GemmMicrokernelTester()
6787 .mr(2)
6788 .nr(16)
6789 .kr(1)
6790 .sr(1)
6791 .m(m)
6792 .n(n)
6793 .k(k)
6794 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08006795 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08006796 }
6797 }
6798 }
6799 }
6800
6801 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEONV8_MLAL_LANE_PRFM, strided_cm_subtile) {
6802 TEST_REQUIRES_ARM_NEON_V8;
6803 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08006804 for (uint32_t n = 1; n <= 16; n++) {
6805 for (uint32_t m = 1; m <= 2; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08006806 GemmMicrokernelTester()
6807 .mr(2)
6808 .nr(16)
6809 .kr(1)
6810 .sr(1)
6811 .m(m)
6812 .n(n)
6813 .k(k)
6814 .cm_stride(19)
6815 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08006816 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08006817 }
6818 }
6819 }
6820 }
6821
6822 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEONV8_MLAL_LANE_PRFM, qmin) {
6823 TEST_REQUIRES_ARM_NEON_V8;
6824 GemmMicrokernelTester()
6825 .mr(2)
6826 .nr(16)
6827 .kr(1)
6828 .sr(1)
6829 .m(2)
6830 .n(16)
6831 .k(8)
6832 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08006833 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08006834 }
6835
6836 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEONV8_MLAL_LANE_PRFM, qmax) {
6837 TEST_REQUIRES_ARM_NEON_V8;
6838 GemmMicrokernelTester()
6839 .mr(2)
6840 .nr(16)
6841 .kr(1)
6842 .sr(1)
6843 .m(2)
6844 .n(16)
6845 .k(8)
6846 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08006847 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08006848 }
6849
6850 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEONV8_MLAL_LANE_PRFM, strided_cm) {
6851 TEST_REQUIRES_ARM_NEON_V8;
6852 GemmMicrokernelTester()
6853 .mr(2)
6854 .nr(16)
6855 .kr(1)
6856 .sr(1)
6857 .m(2)
6858 .n(16)
6859 .k(8)
6860 .cm_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08006861 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08006862 }
6863#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
6864
6865
6866#if XNN_ARCH_ARM || XNN_ARCH_ARM64
6867 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEONV8_MLAL_LANE_PRFM, k_eq_8) {
6868 TEST_REQUIRES_ARM_NEON_V8;
6869 GemmMicrokernelTester()
6870 .mr(3)
6871 .nr(16)
6872 .kr(1)
6873 .sr(1)
6874 .m(3)
6875 .n(16)
6876 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -08006877 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08006878 }
6879
6880 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEONV8_MLAL_LANE_PRFM, strided_cn) {
6881 TEST_REQUIRES_ARM_NEON_V8;
6882 GemmMicrokernelTester()
6883 .mr(3)
6884 .nr(16)
6885 .kr(1)
6886 .sr(1)
6887 .m(3)
6888 .n(16)
6889 .k(8)
6890 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08006891 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08006892 }
6893
6894 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEONV8_MLAL_LANE_PRFM, k_eq_8_strided_a) {
6895 TEST_REQUIRES_ARM_NEON_V8;
6896 GemmMicrokernelTester()
6897 .mr(3)
6898 .nr(16)
6899 .kr(1)
6900 .sr(1)
6901 .m(3)
6902 .n(16)
6903 .k(8)
6904 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08006905 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08006906 }
6907
6908 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEONV8_MLAL_LANE_PRFM, k_eq_8_subtile) {
6909 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng83844ae2022-01-14 09:52:25 -08006910 for (uint32_t n = 1; n <= 16; n++) {
6911 for (uint32_t m = 1; m <= 3; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08006912 GemmMicrokernelTester()
6913 .mr(3)
6914 .nr(16)
6915 .kr(1)
6916 .sr(1)
6917 .m(m)
6918 .n(n)
6919 .k(8)
6920 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08006921 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08006922 }
6923 }
6924 }
6925
6926 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEONV8_MLAL_LANE_PRFM, k_eq_8_subtile_m) {
6927 TEST_REQUIRES_ARM_NEON_V8;
6928 for (uint32_t m = 1; m <= 3; m++) {
6929 GemmMicrokernelTester()
6930 .mr(3)
6931 .nr(16)
6932 .kr(1)
6933 .sr(1)
6934 .m(m)
6935 .n(16)
6936 .k(8)
6937 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08006938 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08006939 }
6940 }
6941
6942 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEONV8_MLAL_LANE_PRFM, k_eq_8_subtile_n) {
6943 TEST_REQUIRES_ARM_NEON_V8;
6944 for (uint32_t n = 1; n <= 16; n++) {
6945 GemmMicrokernelTester()
6946 .mr(3)
6947 .nr(16)
6948 .kr(1)
6949 .sr(1)
6950 .m(3)
6951 .n(n)
6952 .k(8)
6953 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08006954 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08006955 }
6956 }
6957
6958 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEONV8_MLAL_LANE_PRFM, k_lt_8) {
6959 TEST_REQUIRES_ARM_NEON_V8;
6960 for (size_t k = 1; k < 8; k++) {
6961 GemmMicrokernelTester()
6962 .mr(3)
6963 .nr(16)
6964 .kr(1)
6965 .sr(1)
6966 .m(3)
6967 .n(16)
6968 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08006969 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08006970 }
6971 }
6972
6973 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEONV8_MLAL_LANE_PRFM, k_lt_8_strided_a) {
6974 TEST_REQUIRES_ARM_NEON_V8;
6975 for (size_t k = 1; k < 8; k++) {
6976 GemmMicrokernelTester()
6977 .mr(3)
6978 .nr(16)
6979 .kr(1)
6980 .sr(1)
6981 .m(3)
6982 .n(16)
6983 .k(k)
6984 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08006985 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08006986 }
6987 }
6988
6989 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEONV8_MLAL_LANE_PRFM, k_lt_8_subtile) {
6990 TEST_REQUIRES_ARM_NEON_V8;
6991 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08006992 for (uint32_t n = 1; n <= 16; n++) {
6993 for (uint32_t m = 1; m <= 3; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08006994 GemmMicrokernelTester()
6995 .mr(3)
6996 .nr(16)
6997 .kr(1)
6998 .sr(1)
6999 .m(m)
7000 .n(n)
7001 .k(k)
7002 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08007003 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08007004 }
7005 }
7006 }
7007 }
7008
7009 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEONV8_MLAL_LANE_PRFM, k_gt_8) {
7010 TEST_REQUIRES_ARM_NEON_V8;
7011 for (size_t k = 9; k < 16; k++) {
7012 GemmMicrokernelTester()
7013 .mr(3)
7014 .nr(16)
7015 .kr(1)
7016 .sr(1)
7017 .m(3)
7018 .n(16)
7019 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08007020 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08007021 }
7022 }
7023
7024 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEONV8_MLAL_LANE_PRFM, k_gt_8_strided_a) {
7025 TEST_REQUIRES_ARM_NEON_V8;
7026 for (size_t k = 9; k < 16; k++) {
7027 GemmMicrokernelTester()
7028 .mr(3)
7029 .nr(16)
7030 .kr(1)
7031 .sr(1)
7032 .m(3)
7033 .n(16)
7034 .k(k)
7035 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08007036 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08007037 }
7038 }
7039
7040 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEONV8_MLAL_LANE_PRFM, k_gt_8_subtile) {
7041 TEST_REQUIRES_ARM_NEON_V8;
7042 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08007043 for (uint32_t n = 1; n <= 16; n++) {
7044 for (uint32_t m = 1; m <= 3; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08007045 GemmMicrokernelTester()
7046 .mr(3)
7047 .nr(16)
7048 .kr(1)
7049 .sr(1)
7050 .m(m)
7051 .n(n)
7052 .k(k)
7053 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08007054 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08007055 }
7056 }
7057 }
7058 }
7059
7060 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEONV8_MLAL_LANE_PRFM, k_div_8) {
7061 TEST_REQUIRES_ARM_NEON_V8;
7062 for (size_t k = 16; k <= 80; k += 8) {
7063 GemmMicrokernelTester()
7064 .mr(3)
7065 .nr(16)
7066 .kr(1)
7067 .sr(1)
7068 .m(3)
7069 .n(16)
7070 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08007071 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08007072 }
7073 }
7074
7075 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEONV8_MLAL_LANE_PRFM, k_div_8_strided_a) {
7076 TEST_REQUIRES_ARM_NEON_V8;
7077 for (size_t k = 16; k <= 80; k += 8) {
7078 GemmMicrokernelTester()
7079 .mr(3)
7080 .nr(16)
7081 .kr(1)
7082 .sr(1)
7083 .m(3)
7084 .n(16)
7085 .k(k)
7086 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -08007087 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08007088 }
7089 }
7090
7091 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEONV8_MLAL_LANE_PRFM, k_div_8_subtile) {
7092 TEST_REQUIRES_ARM_NEON_V8;
7093 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08007094 for (uint32_t n = 1; n <= 16; n++) {
7095 for (uint32_t m = 1; m <= 3; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08007096 GemmMicrokernelTester()
7097 .mr(3)
7098 .nr(16)
7099 .kr(1)
7100 .sr(1)
7101 .m(m)
7102 .n(n)
7103 .k(k)
7104 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08007105 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08007106 }
7107 }
7108 }
7109 }
7110
7111 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEONV8_MLAL_LANE_PRFM, n_gt_16) {
7112 TEST_REQUIRES_ARM_NEON_V8;
7113 for (uint32_t n = 17; n < 32; n++) {
7114 for (size_t k = 1; k <= 40; k += 9) {
7115 GemmMicrokernelTester()
7116 .mr(3)
7117 .nr(16)
7118 .kr(1)
7119 .sr(1)
7120 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08007121 .n(n)
Frank Barchardf6237402022-01-05 00:26:09 -08007122 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08007123 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08007124 }
7125 }
7126 }
7127
7128 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEONV8_MLAL_LANE_PRFM, n_gt_16_strided_cn) {
7129 TEST_REQUIRES_ARM_NEON_V8;
7130 for (uint32_t n = 17; n < 32; n++) {
7131 for (size_t k = 1; k <= 40; k += 9) {
7132 GemmMicrokernelTester()
7133 .mr(3)
7134 .nr(16)
7135 .kr(1)
7136 .sr(1)
7137 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08007138 .n(n)
Frank Barchardf6237402022-01-05 00:26:09 -08007139 .k(k)
7140 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08007141 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08007142 }
7143 }
7144 }
7145
7146 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEONV8_MLAL_LANE_PRFM, n_gt_16_strided_a) {
7147 TEST_REQUIRES_ARM_NEON_V8;
7148 for (uint32_t n = 17; n < 32; n++) {
7149 for (size_t k = 1; k <= 40; k += 9) {
7150 GemmMicrokernelTester()
7151 .mr(3)
7152 .nr(16)
7153 .kr(1)
7154 .sr(1)
7155 .m(3)
7156 .n(n)
7157 .k(k)
7158 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -08007159 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08007160 }
7161 }
7162 }
7163
7164 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEONV8_MLAL_LANE_PRFM, n_gt_16_subtile) {
7165 TEST_REQUIRES_ARM_NEON_V8;
7166 for (uint32_t n = 17; n < 32; n++) {
7167 for (size_t k = 1; k <= 40; k += 9) {
7168 for (uint32_t m = 1; m <= 3; m++) {
7169 GemmMicrokernelTester()
7170 .mr(3)
7171 .nr(16)
7172 .kr(1)
7173 .sr(1)
7174 .m(m)
7175 .n(n)
7176 .k(k)
7177 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08007178 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08007179 }
7180 }
7181 }
7182 }
7183
7184 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEONV8_MLAL_LANE_PRFM, n_div_16) {
7185 TEST_REQUIRES_ARM_NEON_V8;
7186 for (uint32_t n = 32; n <= 48; n += 16) {
7187 for (size_t k = 1; k <= 40; k += 9) {
7188 GemmMicrokernelTester()
7189 .mr(3)
7190 .nr(16)
7191 .kr(1)
7192 .sr(1)
7193 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08007194 .n(n)
Frank Barchardf6237402022-01-05 00:26:09 -08007195 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08007196 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08007197 }
7198 }
7199 }
7200
7201 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEONV8_MLAL_LANE_PRFM, n_div_16_strided_cn) {
7202 TEST_REQUIRES_ARM_NEON_V8;
7203 for (uint32_t n = 32; n <= 48; n += 16) {
7204 for (size_t k = 1; k <= 40; k += 9) {
7205 GemmMicrokernelTester()
7206 .mr(3)
7207 .nr(16)
7208 .kr(1)
7209 .sr(1)
7210 .m(3)
7211 .n(n)
7212 .k(k)
7213 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08007214 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08007215 }
7216 }
7217 }
7218
7219 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEONV8_MLAL_LANE_PRFM, n_div_16_strided_a) {
7220 TEST_REQUIRES_ARM_NEON_V8;
7221 for (uint32_t n = 32; n <= 48; n += 16) {
7222 for (size_t k = 1; k <= 40; k += 9) {
7223 GemmMicrokernelTester()
7224 .mr(3)
7225 .nr(16)
7226 .kr(1)
7227 .sr(1)
7228 .m(3)
7229 .n(n)
7230 .k(k)
7231 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -08007232 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08007233 }
7234 }
7235 }
7236
7237 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEONV8_MLAL_LANE_PRFM, n_div_16_subtile) {
7238 TEST_REQUIRES_ARM_NEON_V8;
7239 for (uint32_t n = 32; n <= 48; n += 16) {
7240 for (size_t k = 1; k <= 40; k += 9) {
7241 for (uint32_t m = 1; m <= 3; m++) {
7242 GemmMicrokernelTester()
7243 .mr(3)
7244 .nr(16)
7245 .kr(1)
7246 .sr(1)
7247 .m(m)
7248 .n(n)
7249 .k(k)
7250 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08007251 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08007252 }
7253 }
7254 }
7255 }
7256
7257 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEONV8_MLAL_LANE_PRFM, strided_cm_subtile) {
7258 TEST_REQUIRES_ARM_NEON_V8;
7259 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08007260 for (uint32_t n = 1; n <= 16; n++) {
7261 for (uint32_t m = 1; m <= 3; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08007262 GemmMicrokernelTester()
7263 .mr(3)
7264 .nr(16)
7265 .kr(1)
7266 .sr(1)
7267 .m(m)
7268 .n(n)
7269 .k(k)
7270 .cm_stride(19)
7271 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08007272 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08007273 }
7274 }
7275 }
7276 }
7277
7278 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEONV8_MLAL_LANE_PRFM, qmin) {
7279 TEST_REQUIRES_ARM_NEON_V8;
7280 GemmMicrokernelTester()
7281 .mr(3)
7282 .nr(16)
7283 .kr(1)
7284 .sr(1)
7285 .m(3)
7286 .n(16)
7287 .k(8)
7288 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08007289 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08007290 }
7291
7292 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEONV8_MLAL_LANE_PRFM, qmax) {
7293 TEST_REQUIRES_ARM_NEON_V8;
7294 GemmMicrokernelTester()
7295 .mr(3)
7296 .nr(16)
7297 .kr(1)
7298 .sr(1)
7299 .m(3)
7300 .n(16)
7301 .k(8)
7302 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08007303 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08007304 }
7305
7306 TEST(QC8_GEMM_MINMAX_FP32_3X16__NEONV8_MLAL_LANE_PRFM, strided_cm) {
7307 TEST_REQUIRES_ARM_NEON_V8;
7308 GemmMicrokernelTester()
7309 .mr(3)
7310 .nr(16)
7311 .kr(1)
7312 .sr(1)
7313 .m(3)
7314 .n(16)
7315 .k(8)
7316 .cm_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08007317 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08007318 }
7319#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
7320
7321
7322#if XNN_ARCH_ARM || XNN_ARCH_ARM64
7323 TEST(QC8_GEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE_PRFM, k_eq_8) {
7324 TEST_REQUIRES_ARM_NEON_V8;
7325 GemmMicrokernelTester()
7326 .mr(4)
7327 .nr(16)
7328 .kr(1)
7329 .sr(1)
7330 .m(4)
7331 .n(16)
7332 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -08007333 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08007334 }
7335
7336 TEST(QC8_GEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE_PRFM, strided_cn) {
7337 TEST_REQUIRES_ARM_NEON_V8;
7338 GemmMicrokernelTester()
7339 .mr(4)
7340 .nr(16)
7341 .kr(1)
7342 .sr(1)
7343 .m(4)
7344 .n(16)
7345 .k(8)
7346 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08007347 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08007348 }
7349
7350 TEST(QC8_GEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE_PRFM, k_eq_8_strided_a) {
7351 TEST_REQUIRES_ARM_NEON_V8;
7352 GemmMicrokernelTester()
7353 .mr(4)
7354 .nr(16)
7355 .kr(1)
7356 .sr(1)
7357 .m(4)
7358 .n(16)
7359 .k(8)
7360 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08007361 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08007362 }
7363
7364 TEST(QC8_GEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE_PRFM, k_eq_8_subtile) {
7365 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng83844ae2022-01-14 09:52:25 -08007366 for (uint32_t n = 1; n <= 16; n++) {
7367 for (uint32_t m = 1; m <= 4; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08007368 GemmMicrokernelTester()
7369 .mr(4)
7370 .nr(16)
7371 .kr(1)
7372 .sr(1)
7373 .m(m)
7374 .n(n)
7375 .k(8)
7376 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08007377 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08007378 }
7379 }
7380 }
7381
7382 TEST(QC8_GEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE_PRFM, k_eq_8_subtile_m) {
7383 TEST_REQUIRES_ARM_NEON_V8;
7384 for (uint32_t m = 1; m <= 4; m++) {
7385 GemmMicrokernelTester()
7386 .mr(4)
7387 .nr(16)
7388 .kr(1)
7389 .sr(1)
7390 .m(m)
7391 .n(16)
7392 .k(8)
7393 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08007394 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08007395 }
7396 }
7397
7398 TEST(QC8_GEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE_PRFM, k_eq_8_subtile_n) {
7399 TEST_REQUIRES_ARM_NEON_V8;
7400 for (uint32_t n = 1; n <= 16; n++) {
7401 GemmMicrokernelTester()
7402 .mr(4)
7403 .nr(16)
7404 .kr(1)
7405 .sr(1)
7406 .m(4)
7407 .n(n)
7408 .k(8)
7409 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08007410 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08007411 }
7412 }
7413
7414 TEST(QC8_GEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE_PRFM, k_lt_8) {
7415 TEST_REQUIRES_ARM_NEON_V8;
7416 for (size_t k = 1; k < 8; k++) {
7417 GemmMicrokernelTester()
7418 .mr(4)
7419 .nr(16)
7420 .kr(1)
7421 .sr(1)
7422 .m(4)
7423 .n(16)
7424 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08007425 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08007426 }
7427 }
7428
7429 TEST(QC8_GEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE_PRFM, k_lt_8_strided_a) {
7430 TEST_REQUIRES_ARM_NEON_V8;
7431 for (size_t k = 1; k < 8; k++) {
7432 GemmMicrokernelTester()
7433 .mr(4)
7434 .nr(16)
7435 .kr(1)
7436 .sr(1)
7437 .m(4)
7438 .n(16)
7439 .k(k)
7440 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08007441 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08007442 }
7443 }
7444
7445 TEST(QC8_GEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE_PRFM, k_lt_8_subtile) {
7446 TEST_REQUIRES_ARM_NEON_V8;
7447 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08007448 for (uint32_t n = 1; n <= 16; n++) {
7449 for (uint32_t m = 1; m <= 4; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08007450 GemmMicrokernelTester()
7451 .mr(4)
7452 .nr(16)
7453 .kr(1)
7454 .sr(1)
7455 .m(m)
7456 .n(n)
7457 .k(k)
7458 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08007459 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08007460 }
7461 }
7462 }
7463 }
7464
7465 TEST(QC8_GEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE_PRFM, k_gt_8) {
7466 TEST_REQUIRES_ARM_NEON_V8;
7467 for (size_t k = 9; k < 16; k++) {
7468 GemmMicrokernelTester()
7469 .mr(4)
7470 .nr(16)
7471 .kr(1)
7472 .sr(1)
7473 .m(4)
7474 .n(16)
7475 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08007476 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08007477 }
7478 }
7479
7480 TEST(QC8_GEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE_PRFM, k_gt_8_strided_a) {
7481 TEST_REQUIRES_ARM_NEON_V8;
7482 for (size_t k = 9; k < 16; k++) {
7483 GemmMicrokernelTester()
7484 .mr(4)
7485 .nr(16)
7486 .kr(1)
7487 .sr(1)
7488 .m(4)
7489 .n(16)
7490 .k(k)
7491 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08007492 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08007493 }
7494 }
7495
7496 TEST(QC8_GEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE_PRFM, k_gt_8_subtile) {
7497 TEST_REQUIRES_ARM_NEON_V8;
7498 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08007499 for (uint32_t n = 1; n <= 16; n++) {
7500 for (uint32_t m = 1; m <= 4; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08007501 GemmMicrokernelTester()
7502 .mr(4)
7503 .nr(16)
7504 .kr(1)
7505 .sr(1)
7506 .m(m)
7507 .n(n)
7508 .k(k)
7509 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08007510 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08007511 }
7512 }
7513 }
7514 }
7515
7516 TEST(QC8_GEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE_PRFM, k_div_8) {
7517 TEST_REQUIRES_ARM_NEON_V8;
7518 for (size_t k = 16; k <= 80; k += 8) {
7519 GemmMicrokernelTester()
7520 .mr(4)
7521 .nr(16)
7522 .kr(1)
7523 .sr(1)
7524 .m(4)
7525 .n(16)
7526 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08007527 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08007528 }
7529 }
7530
7531 TEST(QC8_GEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE_PRFM, k_div_8_strided_a) {
7532 TEST_REQUIRES_ARM_NEON_V8;
7533 for (size_t k = 16; k <= 80; k += 8) {
7534 GemmMicrokernelTester()
7535 .mr(4)
7536 .nr(16)
7537 .kr(1)
7538 .sr(1)
7539 .m(4)
7540 .n(16)
7541 .k(k)
7542 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -08007543 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08007544 }
7545 }
7546
7547 TEST(QC8_GEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE_PRFM, k_div_8_subtile) {
7548 TEST_REQUIRES_ARM_NEON_V8;
7549 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08007550 for (uint32_t n = 1; n <= 16; n++) {
7551 for (uint32_t m = 1; m <= 4; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08007552 GemmMicrokernelTester()
7553 .mr(4)
7554 .nr(16)
7555 .kr(1)
7556 .sr(1)
7557 .m(m)
7558 .n(n)
7559 .k(k)
7560 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08007561 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08007562 }
7563 }
7564 }
7565 }
7566
7567 TEST(QC8_GEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE_PRFM, n_gt_16) {
7568 TEST_REQUIRES_ARM_NEON_V8;
7569 for (uint32_t n = 17; n < 32; n++) {
7570 for (size_t k = 1; k <= 40; k += 9) {
7571 GemmMicrokernelTester()
7572 .mr(4)
7573 .nr(16)
7574 .kr(1)
7575 .sr(1)
7576 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08007577 .n(n)
Frank Barchardf6237402022-01-05 00:26:09 -08007578 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08007579 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08007580 }
7581 }
7582 }
7583
7584 TEST(QC8_GEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE_PRFM, n_gt_16_strided_cn) {
7585 TEST_REQUIRES_ARM_NEON_V8;
7586 for (uint32_t n = 17; n < 32; n++) {
7587 for (size_t k = 1; k <= 40; k += 9) {
7588 GemmMicrokernelTester()
7589 .mr(4)
7590 .nr(16)
7591 .kr(1)
7592 .sr(1)
7593 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08007594 .n(n)
Frank Barchardf6237402022-01-05 00:26:09 -08007595 .k(k)
7596 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08007597 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08007598 }
7599 }
7600 }
7601
7602 TEST(QC8_GEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE_PRFM, n_gt_16_strided_a) {
7603 TEST_REQUIRES_ARM_NEON_V8;
7604 for (uint32_t n = 17; n < 32; n++) {
7605 for (size_t k = 1; k <= 40; k += 9) {
7606 GemmMicrokernelTester()
7607 .mr(4)
7608 .nr(16)
7609 .kr(1)
7610 .sr(1)
7611 .m(4)
7612 .n(n)
7613 .k(k)
7614 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -08007615 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08007616 }
7617 }
7618 }
7619
7620 TEST(QC8_GEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE_PRFM, n_gt_16_subtile) {
7621 TEST_REQUIRES_ARM_NEON_V8;
7622 for (uint32_t n = 17; n < 32; n++) {
7623 for (size_t k = 1; k <= 40; k += 9) {
7624 for (uint32_t m = 1; m <= 4; m++) {
7625 GemmMicrokernelTester()
7626 .mr(4)
7627 .nr(16)
7628 .kr(1)
7629 .sr(1)
7630 .m(m)
7631 .n(n)
7632 .k(k)
7633 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08007634 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08007635 }
7636 }
7637 }
7638 }
7639
7640 TEST(QC8_GEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE_PRFM, n_div_16) {
7641 TEST_REQUIRES_ARM_NEON_V8;
7642 for (uint32_t n = 32; n <= 48; n += 16) {
7643 for (size_t k = 1; k <= 40; k += 9) {
7644 GemmMicrokernelTester()
7645 .mr(4)
7646 .nr(16)
7647 .kr(1)
7648 .sr(1)
7649 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08007650 .n(n)
Frank Barchardf6237402022-01-05 00:26:09 -08007651 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08007652 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08007653 }
7654 }
7655 }
7656
7657 TEST(QC8_GEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE_PRFM, n_div_16_strided_cn) {
7658 TEST_REQUIRES_ARM_NEON_V8;
7659 for (uint32_t n = 32; n <= 48; n += 16) {
7660 for (size_t k = 1; k <= 40; k += 9) {
7661 GemmMicrokernelTester()
7662 .mr(4)
7663 .nr(16)
7664 .kr(1)
7665 .sr(1)
7666 .m(4)
7667 .n(n)
7668 .k(k)
7669 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08007670 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08007671 }
7672 }
7673 }
7674
7675 TEST(QC8_GEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE_PRFM, n_div_16_strided_a) {
7676 TEST_REQUIRES_ARM_NEON_V8;
7677 for (uint32_t n = 32; n <= 48; n += 16) {
7678 for (size_t k = 1; k <= 40; k += 9) {
7679 GemmMicrokernelTester()
7680 .mr(4)
7681 .nr(16)
7682 .kr(1)
7683 .sr(1)
7684 .m(4)
7685 .n(n)
7686 .k(k)
7687 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -08007688 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08007689 }
7690 }
7691 }
7692
7693 TEST(QC8_GEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE_PRFM, n_div_16_subtile) {
7694 TEST_REQUIRES_ARM_NEON_V8;
7695 for (uint32_t n = 32; n <= 48; n += 16) {
7696 for (size_t k = 1; k <= 40; k += 9) {
7697 for (uint32_t m = 1; m <= 4; m++) {
7698 GemmMicrokernelTester()
7699 .mr(4)
7700 .nr(16)
7701 .kr(1)
7702 .sr(1)
7703 .m(m)
7704 .n(n)
7705 .k(k)
7706 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08007707 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08007708 }
7709 }
7710 }
7711 }
7712
7713 TEST(QC8_GEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE_PRFM, strided_cm_subtile) {
7714 TEST_REQUIRES_ARM_NEON_V8;
7715 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08007716 for (uint32_t n = 1; n <= 16; n++) {
7717 for (uint32_t m = 1; m <= 4; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08007718 GemmMicrokernelTester()
7719 .mr(4)
7720 .nr(16)
7721 .kr(1)
7722 .sr(1)
7723 .m(m)
7724 .n(n)
7725 .k(k)
7726 .cm_stride(19)
7727 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08007728 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08007729 }
7730 }
7731 }
7732 }
7733
7734 TEST(QC8_GEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE_PRFM, qmin) {
7735 TEST_REQUIRES_ARM_NEON_V8;
7736 GemmMicrokernelTester()
7737 .mr(4)
7738 .nr(16)
7739 .kr(1)
7740 .sr(1)
7741 .m(4)
7742 .n(16)
7743 .k(8)
7744 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08007745 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08007746 }
7747
7748 TEST(QC8_GEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE_PRFM, qmax) {
7749 TEST_REQUIRES_ARM_NEON_V8;
7750 GemmMicrokernelTester()
7751 .mr(4)
7752 .nr(16)
7753 .kr(1)
7754 .sr(1)
7755 .m(4)
7756 .n(16)
7757 .k(8)
7758 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08007759 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08007760 }
7761
7762 TEST(QC8_GEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE_PRFM, strided_cm) {
7763 TEST_REQUIRES_ARM_NEON_V8;
7764 GemmMicrokernelTester()
7765 .mr(4)
7766 .nr(16)
7767 .kr(1)
7768 .sr(1)
7769 .m(4)
7770 .n(16)
7771 .k(8)
7772 .cm_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08007773 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08007774 }
7775#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
7776
7777
7778#if XNN_ARCH_ARM || XNN_ARCH_ARM64
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007779 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_LD1R, k_eq_16) {
Frank Barchardf6237402022-01-05 00:26:09 -08007780 TEST_REQUIRES_ARM_NEON_V8;
7781 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007782 .mr(1)
7783 .nr(8)
7784 .kr(2)
Frank Barchardf6237402022-01-05 00:26:09 -08007785 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007786 .m(1)
7787 .n(8)
7788 .k(16)
Marat Dukhan50323b82022-01-11 00:12:01 -08007789 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08007790 }
7791
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007792 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_LD1R, strided_cn) {
Frank Barchardf6237402022-01-05 00:26:09 -08007793 TEST_REQUIRES_ARM_NEON_V8;
7794 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007795 .mr(1)
7796 .nr(8)
7797 .kr(2)
Frank Barchardf6237402022-01-05 00:26:09 -08007798 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007799 .m(1)
7800 .n(8)
7801 .k(16)
7802 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08007803 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08007804 }
7805
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007806 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_LD1R, k_eq_16_strided_a) {
Frank Barchardf6237402022-01-05 00:26:09 -08007807 TEST_REQUIRES_ARM_NEON_V8;
7808 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007809 .mr(1)
7810 .nr(8)
7811 .kr(2)
Frank Barchardf6237402022-01-05 00:26:09 -08007812 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007813 .m(1)
7814 .n(8)
7815 .k(16)
7816 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08007817 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08007818 }
7819
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007820 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_LD1R, k_eq_16_subtile) {
Frank Barchardf6237402022-01-05 00:26:09 -08007821 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng83844ae2022-01-14 09:52:25 -08007822 for (uint32_t n = 1; n <= 8; n++) {
7823 for (uint32_t m = 1; m <= 1; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08007824 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007825 .mr(1)
7826 .nr(8)
7827 .kr(2)
Frank Barchardf6237402022-01-05 00:26:09 -08007828 .sr(1)
7829 .m(m)
7830 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007831 .k(16)
Frank Barchardf6237402022-01-05 00:26:09 -08007832 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08007833 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08007834 }
7835 }
7836 }
7837
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007838 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_LD1R, k_eq_16_subtile_m) {
Frank Barchardf6237402022-01-05 00:26:09 -08007839 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007840 for (uint32_t m = 1; m <= 1; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08007841 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007842 .mr(1)
7843 .nr(8)
7844 .kr(2)
Frank Barchardf6237402022-01-05 00:26:09 -08007845 .sr(1)
7846 .m(m)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007847 .n(8)
7848 .k(16)
Frank Barchardf6237402022-01-05 00:26:09 -08007849 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08007850 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08007851 }
7852 }
7853
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007854 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_LD1R, k_eq_16_subtile_n) {
Frank Barchardf6237402022-01-05 00:26:09 -08007855 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007856 for (uint32_t n = 1; n <= 8; n++) {
Frank Barchardf6237402022-01-05 00:26:09 -08007857 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007858 .mr(1)
7859 .nr(8)
7860 .kr(2)
Frank Barchardf6237402022-01-05 00:26:09 -08007861 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007862 .m(1)
Frank Barchardf6237402022-01-05 00:26:09 -08007863 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007864 .k(16)
Frank Barchardf6237402022-01-05 00:26:09 -08007865 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08007866 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08007867 }
7868 }
7869
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007870 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_LD1R, k_lt_16) {
Frank Barchardf6237402022-01-05 00:26:09 -08007871 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007872 for (size_t k = 1; k < 16; k++) {
Frank Barchardf6237402022-01-05 00:26:09 -08007873 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007874 .mr(1)
7875 .nr(8)
7876 .kr(2)
Frank Barchardf6237402022-01-05 00:26:09 -08007877 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007878 .m(1)
7879 .n(8)
Frank Barchardf6237402022-01-05 00:26:09 -08007880 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08007881 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08007882 }
7883 }
7884
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007885 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_LD1R, k_lt_16_strided_a) {
Frank Barchardf6237402022-01-05 00:26:09 -08007886 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007887 for (size_t k = 1; k < 16; k++) {
Frank Barchardf6237402022-01-05 00:26:09 -08007888 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007889 .mr(1)
7890 .nr(8)
7891 .kr(2)
Frank Barchardf6237402022-01-05 00:26:09 -08007892 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007893 .m(1)
7894 .n(8)
Frank Barchardf6237402022-01-05 00:26:09 -08007895 .k(k)
7896 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08007897 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08007898 }
7899 }
7900
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007901 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_LD1R, k_lt_16_subtile) {
Frank Barchardf6237402022-01-05 00:26:09 -08007902 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007903 for (size_t k = 1; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08007904 for (uint32_t n = 1; n <= 8; n++) {
7905 for (uint32_t m = 1; m <= 1; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08007906 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007907 .mr(1)
7908 .nr(8)
7909 .kr(2)
Frank Barchardf6237402022-01-05 00:26:09 -08007910 .sr(1)
7911 .m(m)
7912 .n(n)
7913 .k(k)
7914 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08007915 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08007916 }
7917 }
7918 }
7919 }
7920
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007921 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_LD1R, k_gt_16) {
Frank Barchardf6237402022-01-05 00:26:09 -08007922 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007923 for (size_t k = 17; k < 32; k++) {
Frank Barchardf6237402022-01-05 00:26:09 -08007924 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007925 .mr(1)
7926 .nr(8)
7927 .kr(2)
Frank Barchardf6237402022-01-05 00:26:09 -08007928 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007929 .m(1)
7930 .n(8)
Frank Barchardf6237402022-01-05 00:26:09 -08007931 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08007932 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08007933 }
7934 }
7935
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007936 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_LD1R, k_gt_16_strided_a) {
Frank Barchardf6237402022-01-05 00:26:09 -08007937 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007938 for (size_t k = 17; k < 32; k++) {
Frank Barchardf6237402022-01-05 00:26:09 -08007939 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007940 .mr(1)
7941 .nr(8)
7942 .kr(2)
Frank Barchardf6237402022-01-05 00:26:09 -08007943 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007944 .m(1)
7945 .n(8)
Frank Barchardf6237402022-01-05 00:26:09 -08007946 .k(k)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007947 .a_stride(37)
Marat Dukhan50323b82022-01-11 00:12:01 -08007948 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08007949 }
7950 }
7951
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007952 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_LD1R, k_gt_16_subtile) {
Frank Barchardf6237402022-01-05 00:26:09 -08007953 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007954 for (size_t k = 17; k < 32; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08007955 for (uint32_t n = 1; n <= 8; n++) {
7956 for (uint32_t m = 1; m <= 1; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08007957 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007958 .mr(1)
7959 .nr(8)
7960 .kr(2)
Frank Barchardf6237402022-01-05 00:26:09 -08007961 .sr(1)
7962 .m(m)
7963 .n(n)
7964 .k(k)
7965 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08007966 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08007967 }
7968 }
7969 }
7970 }
7971
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007972 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_LD1R, k_div_16) {
Frank Barchardf6237402022-01-05 00:26:09 -08007973 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007974 for (size_t k = 32; k <= 160; k += 16) {
7975 GemmMicrokernelTester()
7976 .mr(1)
7977 .nr(8)
7978 .kr(2)
7979 .sr(1)
7980 .m(1)
7981 .n(8)
7982 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08007983 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007984 }
7985 }
7986
7987 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_LD1R, k_div_16_strided_a) {
7988 TEST_REQUIRES_ARM_NEON_V8;
7989 for (size_t k = 32; k <= 160; k += 16) {
7990 GemmMicrokernelTester()
7991 .mr(1)
7992 .nr(8)
7993 .kr(2)
7994 .sr(1)
7995 .m(1)
7996 .n(8)
7997 .k(k)
7998 .a_stride(163)
Marat Dukhan50323b82022-01-11 00:12:01 -08007999 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008000 }
8001 }
8002
8003 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_LD1R, k_div_16_subtile) {
8004 TEST_REQUIRES_ARM_NEON_V8;
8005 for (size_t k = 32; k <= 160; k += 16) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08008006 for (uint32_t n = 1; n <= 8; n++) {
8007 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008008 GemmMicrokernelTester()
8009 .mr(1)
8010 .nr(8)
8011 .kr(2)
8012 .sr(1)
8013 .m(m)
8014 .n(n)
8015 .k(k)
8016 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08008017 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008018 }
Frank Barchardf6237402022-01-05 00:26:09 -08008019 }
8020 }
8021 }
8022
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008023 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_LD1R, n_gt_8) {
Frank Barchardf6237402022-01-05 00:26:09 -08008024 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008025 for (uint32_t n = 9; n < 16; n++) {
8026 for (size_t k = 1; k <= 80; k += 17) {
Frank Barchardf6237402022-01-05 00:26:09 -08008027 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008028 .mr(1)
8029 .nr(8)
8030 .kr(2)
Frank Barchardf6237402022-01-05 00:26:09 -08008031 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008032 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08008033 .n(n)
Frank Barchardf6237402022-01-05 00:26:09 -08008034 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08008035 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08008036 }
8037 }
8038 }
8039
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008040 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_LD1R, n_gt_8_strided_cn) {
Frank Barchardf6237402022-01-05 00:26:09 -08008041 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008042 for (uint32_t n = 9; n < 16; n++) {
8043 for (size_t k = 1; k <= 80; k += 17) {
Frank Barchardf6237402022-01-05 00:26:09 -08008044 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008045 .mr(1)
8046 .nr(8)
8047 .kr(2)
Frank Barchardf6237402022-01-05 00:26:09 -08008048 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008049 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08008050 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008051 .k(k)
8052 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08008053 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008054 }
8055 }
8056 }
8057
8058 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_LD1R, n_gt_8_strided_a) {
8059 TEST_REQUIRES_ARM_NEON_V8;
8060 for (uint32_t n = 9; n < 16; n++) {
8061 for (size_t k = 1; k <= 80; k += 17) {
8062 GemmMicrokernelTester()
8063 .mr(1)
8064 .nr(8)
8065 .kr(2)
8066 .sr(1)
8067 .m(1)
Frank Barchardf6237402022-01-05 00:26:09 -08008068 .n(n)
8069 .k(k)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008070 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -08008071 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08008072 }
8073 }
8074 }
8075
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008076 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_LD1R, n_gt_8_subtile) {
Frank Barchardf6237402022-01-05 00:26:09 -08008077 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008078 for (uint32_t n = 9; n < 16; n++) {
8079 for (size_t k = 1; k <= 80; k += 17) {
8080 for (uint32_t m = 1; m <= 1; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08008081 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008082 .mr(1)
8083 .nr(8)
8084 .kr(2)
Frank Barchardf6237402022-01-05 00:26:09 -08008085 .sr(1)
8086 .m(m)
8087 .n(n)
8088 .k(k)
8089 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08008090 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08008091 }
8092 }
8093 }
8094 }
8095
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008096 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_LD1R, n_div_8) {
Frank Barchardf6237402022-01-05 00:26:09 -08008097 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008098 for (uint32_t n = 16; n <= 24; n += 8) {
8099 for (size_t k = 1; k <= 80; k += 17) {
Frank Barchardf6237402022-01-05 00:26:09 -08008100 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008101 .mr(1)
8102 .nr(8)
8103 .kr(2)
Frank Barchardf6237402022-01-05 00:26:09 -08008104 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008105 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08008106 .n(n)
Frank Barchardf6237402022-01-05 00:26:09 -08008107 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08008108 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08008109 }
8110 }
8111 }
8112
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008113 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_LD1R, n_div_8_strided_cn) {
Frank Barchardf6237402022-01-05 00:26:09 -08008114 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008115 for (uint32_t n = 16; n <= 24; n += 8) {
8116 for (size_t k = 1; k <= 80; k += 17) {
Frank Barchardf6237402022-01-05 00:26:09 -08008117 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008118 .mr(1)
8119 .nr(8)
8120 .kr(2)
Frank Barchardf6237402022-01-05 00:26:09 -08008121 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008122 .m(1)
Frank Barchardf6237402022-01-05 00:26:09 -08008123 .n(n)
8124 .k(k)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008125 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08008126 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08008127 }
8128 }
8129 }
8130
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008131 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_LD1R, n_div_8_strided_a) {
Frank Barchardf6237402022-01-05 00:26:09 -08008132 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008133 for (uint32_t n = 16; n <= 24; n += 8) {
8134 for (size_t k = 1; k <= 80; k += 17) {
Frank Barchardf6237402022-01-05 00:26:09 -08008135 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008136 .mr(1)
8137 .nr(8)
8138 .kr(2)
Frank Barchardf6237402022-01-05 00:26:09 -08008139 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008140 .m(1)
Frank Barchardf6237402022-01-05 00:26:09 -08008141 .n(n)
8142 .k(k)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008143 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -08008144 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08008145 }
8146 }
8147 }
8148
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008149 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_LD1R, n_div_8_subtile) {
Frank Barchardf6237402022-01-05 00:26:09 -08008150 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008151 for (uint32_t n = 16; n <= 24; n += 8) {
8152 for (size_t k = 1; k <= 80; k += 17) {
8153 for (uint32_t m = 1; m <= 1; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08008154 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008155 .mr(1)
8156 .nr(8)
8157 .kr(2)
Frank Barchardf6237402022-01-05 00:26:09 -08008158 .sr(1)
8159 .m(m)
8160 .n(n)
8161 .k(k)
8162 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08008163 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08008164 }
8165 }
8166 }
8167 }
8168
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008169 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_LD1R, strided_cm_subtile) {
Frank Barchardf6237402022-01-05 00:26:09 -08008170 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008171 for (size_t k = 1; k <= 80; k += 17) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08008172 for (uint32_t n = 1; n <= 8; n++) {
8173 for (uint32_t m = 1; m <= 1; m++) {
Frank Barchardf6237402022-01-05 00:26:09 -08008174 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008175 .mr(1)
8176 .nr(8)
8177 .kr(2)
Frank Barchardf6237402022-01-05 00:26:09 -08008178 .sr(1)
8179 .m(m)
8180 .n(n)
8181 .k(k)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008182 .cm_stride(11)
Frank Barchardf6237402022-01-05 00:26:09 -08008183 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08008184 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08008185 }
8186 }
8187 }
8188 }
8189
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008190 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_LD1R, qmin) {
Frank Barchardf6237402022-01-05 00:26:09 -08008191 TEST_REQUIRES_ARM_NEON_V8;
8192 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008193 .mr(1)
8194 .nr(8)
8195 .kr(2)
Frank Barchardf6237402022-01-05 00:26:09 -08008196 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008197 .m(1)
8198 .n(8)
8199 .k(16)
Frank Barchardf6237402022-01-05 00:26:09 -08008200 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08008201 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08008202 }
8203
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008204 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_LD1R, qmax) {
Frank Barchardf6237402022-01-05 00:26:09 -08008205 TEST_REQUIRES_ARM_NEON_V8;
8206 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008207 .mr(1)
8208 .nr(8)
8209 .kr(2)
Frank Barchardf6237402022-01-05 00:26:09 -08008210 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008211 .m(1)
8212 .n(8)
8213 .k(16)
Frank Barchardf6237402022-01-05 00:26:09 -08008214 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08008215 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08008216 }
8217
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008218 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_LD1R, strided_cm) {
Frank Barchardf6237402022-01-05 00:26:09 -08008219 TEST_REQUIRES_ARM_NEON_V8;
8220 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008221 .mr(1)
8222 .nr(8)
8223 .kr(2)
Frank Barchardf6237402022-01-05 00:26:09 -08008224 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008225 .m(1)
8226 .n(8)
8227 .k(16)
8228 .cm_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08008229 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchardf6237402022-01-05 00:26:09 -08008230 }
8231#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
8232
8233
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008234#if XNN_ARCH_ARM || XNN_ARCH_ARM64
8235 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD2R, k_eq_16) {
8236 TEST_REQUIRES_ARM_NEON;
Frank Barchard87fe4102021-12-28 14:42:23 -08008237 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008238 .mr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008239 .nr(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008240 .kr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008241 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008242 .m(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008243 .n(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008244 .k(16)
Marat Dukhan50323b82022-01-11 00:12:01 -08008245 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchard87fe4102021-12-28 14:42:23 -08008246 }
8247
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008248 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD2R, strided_cn) {
8249 TEST_REQUIRES_ARM_NEON;
Frank Barchard87fe4102021-12-28 14:42:23 -08008250 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008251 .mr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008252 .nr(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008253 .kr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008254 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008255 .m(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008256 .n(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008257 .k(16)
Frank Barchard87fe4102021-12-28 14:42:23 -08008258 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08008259 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchard87fe4102021-12-28 14:42:23 -08008260 }
8261
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008262 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD2R, k_eq_16_strided_a) {
8263 TEST_REQUIRES_ARM_NEON;
Frank Barchard87fe4102021-12-28 14:42:23 -08008264 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008265 .mr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008266 .nr(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008267 .kr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008268 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008269 .m(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008270 .n(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008271 .k(16)
8272 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08008273 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchard87fe4102021-12-28 14:42:23 -08008274 }
8275
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008276 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD2R, k_eq_16_subtile) {
8277 TEST_REQUIRES_ARM_NEON;
Zhi An Ng83844ae2022-01-14 09:52:25 -08008278 for (uint32_t n = 1; n <= 8; n++) {
8279 for (uint32_t m = 1; m <= 2; m++) {
Frank Barchard87fe4102021-12-28 14:42:23 -08008280 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008281 .mr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008282 .nr(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008283 .kr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008284 .sr(1)
8285 .m(m)
8286 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008287 .k(16)
Frank Barchard87fe4102021-12-28 14:42:23 -08008288 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08008289 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchard87fe4102021-12-28 14:42:23 -08008290 }
8291 }
8292 }
8293
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008294 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD2R, k_eq_16_subtile_m) {
8295 TEST_REQUIRES_ARM_NEON;
8296 for (uint32_t m = 1; m <= 2; m++) {
Frank Barchard87fe4102021-12-28 14:42:23 -08008297 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008298 .mr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008299 .nr(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008300 .kr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008301 .sr(1)
8302 .m(m)
8303 .n(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008304 .k(16)
Frank Barchard87fe4102021-12-28 14:42:23 -08008305 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08008306 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchard87fe4102021-12-28 14:42:23 -08008307 }
8308 }
8309
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008310 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD2R, k_eq_16_subtile_n) {
8311 TEST_REQUIRES_ARM_NEON;
Frank Barchard87fe4102021-12-28 14:42:23 -08008312 for (uint32_t n = 1; n <= 8; n++) {
8313 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008314 .mr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008315 .nr(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008316 .kr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008317 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008318 .m(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008319 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008320 .k(16)
Frank Barchard87fe4102021-12-28 14:42:23 -08008321 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08008322 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchard87fe4102021-12-28 14:42:23 -08008323 }
8324 }
8325
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008326 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD2R, k_lt_16) {
8327 TEST_REQUIRES_ARM_NEON;
8328 for (size_t k = 1; k < 16; k++) {
Frank Barchard87fe4102021-12-28 14:42:23 -08008329 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008330 .mr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008331 .nr(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008332 .kr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008333 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008334 .m(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008335 .n(8)
8336 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08008337 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchard87fe4102021-12-28 14:42:23 -08008338 }
8339 }
8340
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008341 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD2R, k_lt_16_strided_a) {
8342 TEST_REQUIRES_ARM_NEON;
8343 for (size_t k = 1; k < 16; k++) {
Frank Barchard87fe4102021-12-28 14:42:23 -08008344 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008345 .mr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008346 .nr(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008347 .kr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008348 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008349 .m(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008350 .n(8)
8351 .k(k)
8352 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08008353 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchard87fe4102021-12-28 14:42:23 -08008354 }
8355 }
8356
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008357 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD2R, k_lt_16_subtile) {
8358 TEST_REQUIRES_ARM_NEON;
8359 for (size_t k = 1; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08008360 for (uint32_t n = 1; n <= 8; n++) {
8361 for (uint32_t m = 1; m <= 2; m++) {
Frank Barchard87fe4102021-12-28 14:42:23 -08008362 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008363 .mr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008364 .nr(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008365 .kr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008366 .sr(1)
8367 .m(m)
8368 .n(n)
8369 .k(k)
8370 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08008371 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchard87fe4102021-12-28 14:42:23 -08008372 }
8373 }
8374 }
8375 }
8376
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008377 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD2R, k_gt_16) {
8378 TEST_REQUIRES_ARM_NEON;
8379 for (size_t k = 17; k < 32; k++) {
Frank Barchard87fe4102021-12-28 14:42:23 -08008380 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008381 .mr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008382 .nr(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008383 .kr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008384 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008385 .m(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008386 .n(8)
8387 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08008388 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchard87fe4102021-12-28 14:42:23 -08008389 }
8390 }
8391
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008392 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD2R, k_gt_16_strided_a) {
8393 TEST_REQUIRES_ARM_NEON;
8394 for (size_t k = 17; k < 32; k++) {
Frank Barchard87fe4102021-12-28 14:42:23 -08008395 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008396 .mr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008397 .nr(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008398 .kr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008399 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008400 .m(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008401 .n(8)
8402 .k(k)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008403 .a_stride(37)
Marat Dukhan50323b82022-01-11 00:12:01 -08008404 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchard87fe4102021-12-28 14:42:23 -08008405 }
8406 }
8407
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008408 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD2R, k_gt_16_subtile) {
8409 TEST_REQUIRES_ARM_NEON;
8410 for (size_t k = 17; k < 32; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08008411 for (uint32_t n = 1; n <= 8; n++) {
8412 for (uint32_t m = 1; m <= 2; m++) {
Frank Barchard87fe4102021-12-28 14:42:23 -08008413 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008414 .mr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008415 .nr(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008416 .kr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008417 .sr(1)
8418 .m(m)
8419 .n(n)
8420 .k(k)
8421 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08008422 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchard87fe4102021-12-28 14:42:23 -08008423 }
8424 }
8425 }
8426 }
8427
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008428 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD2R, k_div_16) {
8429 TEST_REQUIRES_ARM_NEON;
8430 for (size_t k = 32; k <= 160; k += 16) {
8431 GemmMicrokernelTester()
8432 .mr(2)
8433 .nr(8)
8434 .kr(2)
8435 .sr(1)
8436 .m(2)
8437 .n(8)
8438 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08008439 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008440 }
8441 }
8442
8443 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD2R, k_div_16_strided_a) {
8444 TEST_REQUIRES_ARM_NEON;
8445 for (size_t k = 32; k <= 160; k += 16) {
8446 GemmMicrokernelTester()
8447 .mr(2)
8448 .nr(8)
8449 .kr(2)
8450 .sr(1)
8451 .m(2)
8452 .n(8)
8453 .k(k)
8454 .a_stride(163)
Marat Dukhan50323b82022-01-11 00:12:01 -08008455 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008456 }
8457 }
8458
8459 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD2R, k_div_16_subtile) {
8460 TEST_REQUIRES_ARM_NEON;
8461 for (size_t k = 32; k <= 160; k += 16) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08008462 for (uint32_t n = 1; n <= 8; n++) {
8463 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008464 GemmMicrokernelTester()
8465 .mr(2)
8466 .nr(8)
8467 .kr(2)
8468 .sr(1)
8469 .m(m)
8470 .n(n)
8471 .k(k)
8472 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08008473 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008474 }
8475 }
8476 }
8477 }
8478
8479 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD2R, n_gt_8) {
8480 TEST_REQUIRES_ARM_NEON;
Frank Barchard87fe4102021-12-28 14:42:23 -08008481 for (uint32_t n = 9; n < 16; n++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008482 for (size_t k = 1; k <= 80; k += 17) {
Frank Barchard87fe4102021-12-28 14:42:23 -08008483 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008484 .mr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008485 .nr(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008486 .kr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008487 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008488 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08008489 .n(n)
Frank Barchard87fe4102021-12-28 14:42:23 -08008490 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08008491 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchard87fe4102021-12-28 14:42:23 -08008492 }
8493 }
8494 }
8495
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008496 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD2R, n_gt_8_strided_cn) {
8497 TEST_REQUIRES_ARM_NEON;
Frank Barchard87fe4102021-12-28 14:42:23 -08008498 for (uint32_t n = 9; n < 16; n++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008499 for (size_t k = 1; k <= 80; k += 17) {
Frank Barchard87fe4102021-12-28 14:42:23 -08008500 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008501 .mr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008502 .nr(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008503 .kr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008504 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008505 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08008506 .n(n)
Frank Barchard87fe4102021-12-28 14:42:23 -08008507 .k(k)
8508 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08008509 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchard87fe4102021-12-28 14:42:23 -08008510 }
8511 }
8512 }
8513
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008514 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD2R, n_gt_8_strided_a) {
8515 TEST_REQUIRES_ARM_NEON;
Frank Barchard87fe4102021-12-28 14:42:23 -08008516 for (uint32_t n = 9; n < 16; n++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008517 for (size_t k = 1; k <= 80; k += 17) {
Frank Barchard87fe4102021-12-28 14:42:23 -08008518 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008519 .mr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008520 .nr(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008521 .kr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008522 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008523 .m(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008524 .n(n)
8525 .k(k)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008526 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -08008527 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchard87fe4102021-12-28 14:42:23 -08008528 }
8529 }
8530 }
8531
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008532 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD2R, n_gt_8_subtile) {
8533 TEST_REQUIRES_ARM_NEON;
Frank Barchard87fe4102021-12-28 14:42:23 -08008534 for (uint32_t n = 9; n < 16; n++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008535 for (size_t k = 1; k <= 80; k += 17) {
8536 for (uint32_t m = 1; m <= 2; m++) {
Frank Barchard87fe4102021-12-28 14:42:23 -08008537 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008538 .mr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008539 .nr(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008540 .kr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008541 .sr(1)
8542 .m(m)
8543 .n(n)
8544 .k(k)
8545 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08008546 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchard87fe4102021-12-28 14:42:23 -08008547 }
8548 }
8549 }
8550 }
8551
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008552 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD2R, n_div_8) {
8553 TEST_REQUIRES_ARM_NEON;
Frank Barchard87fe4102021-12-28 14:42:23 -08008554 for (uint32_t n = 16; n <= 24; n += 8) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008555 for (size_t k = 1; k <= 80; k += 17) {
Frank Barchard87fe4102021-12-28 14:42:23 -08008556 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008557 .mr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008558 .nr(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008559 .kr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008560 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008561 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08008562 .n(n)
Frank Barchard87fe4102021-12-28 14:42:23 -08008563 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08008564 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchard87fe4102021-12-28 14:42:23 -08008565 }
8566 }
8567 }
8568
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008569 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD2R, n_div_8_strided_cn) {
8570 TEST_REQUIRES_ARM_NEON;
Frank Barchard87fe4102021-12-28 14:42:23 -08008571 for (uint32_t n = 16; n <= 24; n += 8) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008572 for (size_t k = 1; k <= 80; k += 17) {
Frank Barchard87fe4102021-12-28 14:42:23 -08008573 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008574 .mr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008575 .nr(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008576 .kr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008577 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008578 .m(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008579 .n(n)
8580 .k(k)
8581 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08008582 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchard87fe4102021-12-28 14:42:23 -08008583 }
8584 }
8585 }
8586
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008587 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD2R, n_div_8_strided_a) {
8588 TEST_REQUIRES_ARM_NEON;
Frank Barchard87fe4102021-12-28 14:42:23 -08008589 for (uint32_t n = 16; n <= 24; n += 8) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008590 for (size_t k = 1; k <= 80; k += 17) {
Frank Barchard87fe4102021-12-28 14:42:23 -08008591 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008592 .mr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008593 .nr(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008594 .kr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008595 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008596 .m(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008597 .n(n)
8598 .k(k)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008599 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -08008600 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchard87fe4102021-12-28 14:42:23 -08008601 }
8602 }
8603 }
8604
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008605 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD2R, n_div_8_subtile) {
8606 TEST_REQUIRES_ARM_NEON;
Frank Barchard87fe4102021-12-28 14:42:23 -08008607 for (uint32_t n = 16; n <= 24; n += 8) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008608 for (size_t k = 1; k <= 80; k += 17) {
8609 for (uint32_t m = 1; m <= 2; m++) {
Frank Barchard87fe4102021-12-28 14:42:23 -08008610 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008611 .mr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008612 .nr(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008613 .kr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008614 .sr(1)
8615 .m(m)
8616 .n(n)
8617 .k(k)
8618 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08008619 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchard87fe4102021-12-28 14:42:23 -08008620 }
8621 }
8622 }
8623 }
8624
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008625 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD2R, strided_cm_subtile) {
8626 TEST_REQUIRES_ARM_NEON;
8627 for (size_t k = 1; k <= 80; k += 17) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08008628 for (uint32_t n = 1; n <= 8; n++) {
8629 for (uint32_t m = 1; m <= 2; m++) {
Frank Barchard87fe4102021-12-28 14:42:23 -08008630 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008631 .mr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008632 .nr(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008633 .kr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008634 .sr(1)
8635 .m(m)
8636 .n(n)
8637 .k(k)
8638 .cm_stride(11)
8639 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08008640 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchard87fe4102021-12-28 14:42:23 -08008641 }
8642 }
8643 }
8644 }
8645
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008646 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD2R, qmin) {
8647 TEST_REQUIRES_ARM_NEON;
Frank Barchard87fe4102021-12-28 14:42:23 -08008648 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008649 .mr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008650 .nr(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008651 .kr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008652 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008653 .m(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008654 .n(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008655 .k(16)
Frank Barchard87fe4102021-12-28 14:42:23 -08008656 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08008657 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchard87fe4102021-12-28 14:42:23 -08008658 }
8659
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008660 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD2R, qmax) {
8661 TEST_REQUIRES_ARM_NEON;
Frank Barchard87fe4102021-12-28 14:42:23 -08008662 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008663 .mr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008664 .nr(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008665 .kr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008666 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008667 .m(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008668 .n(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008669 .k(16)
Frank Barchard87fe4102021-12-28 14:42:23 -08008670 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08008671 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchard87fe4102021-12-28 14:42:23 -08008672 }
8673
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008674 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD2R, strided_cm) {
8675 TEST_REQUIRES_ARM_NEON;
Frank Barchard87fe4102021-12-28 14:42:23 -08008676 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008677 .mr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008678 .nr(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008679 .kr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008680 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008681 .m(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008682 .n(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008683 .k(16)
Frank Barchard87fe4102021-12-28 14:42:23 -08008684 .cm_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08008685 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Frank Barchard87fe4102021-12-28 14:42:23 -08008686 }
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008687#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
Frank Barchard87fe4102021-12-28 14:42:23 -08008688
8689
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008690#if XNN_ARCH_ARM || XNN_ARCH_ARM64
8691 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_LD2R, k_eq_16) {
Frank Barchard87fe4102021-12-28 14:42:23 -08008692 TEST_REQUIRES_ARM_NEON_V8;
8693 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008694 .mr(1)
Frank Barchard87fe4102021-12-28 14:42:23 -08008695 .nr(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008696 .kr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008697 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008698 .m(1)
Frank Barchard87fe4102021-12-28 14:42:23 -08008699 .n(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008700 .k(16)
Marat Dukhan50323b82022-01-11 00:12:01 -08008701 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_ld2r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard87fe4102021-12-28 14:42:23 -08008702 }
8703
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008704 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_LD2R, strided_cn) {
Frank Barchard87fe4102021-12-28 14:42:23 -08008705 TEST_REQUIRES_ARM_NEON_V8;
8706 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008707 .mr(1)
Frank Barchard87fe4102021-12-28 14:42:23 -08008708 .nr(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008709 .kr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008710 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008711 .m(1)
Frank Barchard87fe4102021-12-28 14:42:23 -08008712 .n(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008713 .k(16)
Frank Barchard87fe4102021-12-28 14:42:23 -08008714 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08008715 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_ld2r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard87fe4102021-12-28 14:42:23 -08008716 }
8717
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008718 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_LD2R, k_eq_16_strided_a) {
Frank Barchard87fe4102021-12-28 14:42:23 -08008719 TEST_REQUIRES_ARM_NEON_V8;
8720 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008721 .mr(1)
Frank Barchard87fe4102021-12-28 14:42:23 -08008722 .nr(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008723 .kr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008724 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008725 .m(1)
Frank Barchard87fe4102021-12-28 14:42:23 -08008726 .n(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008727 .k(16)
8728 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08008729 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_ld2r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard87fe4102021-12-28 14:42:23 -08008730 }
8731
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008732 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_LD2R, k_eq_16_subtile) {
Frank Barchard87fe4102021-12-28 14:42:23 -08008733 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng83844ae2022-01-14 09:52:25 -08008734 for (uint32_t n = 1; n <= 8; n++) {
8735 for (uint32_t m = 1; m <= 1; m++) {
Frank Barchard87fe4102021-12-28 14:42:23 -08008736 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008737 .mr(1)
Frank Barchard87fe4102021-12-28 14:42:23 -08008738 .nr(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008739 .kr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008740 .sr(1)
8741 .m(m)
8742 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008743 .k(16)
Frank Barchard87fe4102021-12-28 14:42:23 -08008744 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08008745 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_ld2r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard87fe4102021-12-28 14:42:23 -08008746 }
8747 }
8748 }
8749
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008750 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_LD2R, k_eq_16_subtile_m) {
Frank Barchard87fe4102021-12-28 14:42:23 -08008751 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008752 for (uint32_t m = 1; m <= 1; m++) {
Frank Barchard87fe4102021-12-28 14:42:23 -08008753 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008754 .mr(1)
Frank Barchard87fe4102021-12-28 14:42:23 -08008755 .nr(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008756 .kr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008757 .sr(1)
8758 .m(m)
8759 .n(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008760 .k(16)
Frank Barchard87fe4102021-12-28 14:42:23 -08008761 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08008762 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_ld2r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard87fe4102021-12-28 14:42:23 -08008763 }
8764 }
8765
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008766 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_LD2R, k_eq_16_subtile_n) {
Frank Barchard87fe4102021-12-28 14:42:23 -08008767 TEST_REQUIRES_ARM_NEON_V8;
8768 for (uint32_t n = 1; n <= 8; n++) {
8769 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008770 .mr(1)
Frank Barchard87fe4102021-12-28 14:42:23 -08008771 .nr(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008772 .kr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008773 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008774 .m(1)
Frank Barchard87fe4102021-12-28 14:42:23 -08008775 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008776 .k(16)
Frank Barchard87fe4102021-12-28 14:42:23 -08008777 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08008778 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_ld2r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard87fe4102021-12-28 14:42:23 -08008779 }
8780 }
8781
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008782 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_LD2R, k_lt_16) {
Frank Barchard87fe4102021-12-28 14:42:23 -08008783 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008784 for (size_t k = 1; k < 16; k++) {
Frank Barchard87fe4102021-12-28 14:42:23 -08008785 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008786 .mr(1)
Frank Barchard87fe4102021-12-28 14:42:23 -08008787 .nr(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008788 .kr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008789 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008790 .m(1)
Frank Barchard87fe4102021-12-28 14:42:23 -08008791 .n(8)
8792 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08008793 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_ld2r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard87fe4102021-12-28 14:42:23 -08008794 }
8795 }
8796
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008797 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_LD2R, k_lt_16_strided_a) {
Frank Barchard87fe4102021-12-28 14:42:23 -08008798 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008799 for (size_t k = 1; k < 16; k++) {
Frank Barchard87fe4102021-12-28 14:42:23 -08008800 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008801 .mr(1)
Frank Barchard87fe4102021-12-28 14:42:23 -08008802 .nr(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008803 .kr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008804 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008805 .m(1)
Frank Barchard87fe4102021-12-28 14:42:23 -08008806 .n(8)
8807 .k(k)
8808 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08008809 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_ld2r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard87fe4102021-12-28 14:42:23 -08008810 }
8811 }
8812
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008813 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_LD2R, k_lt_16_subtile) {
Frank Barchard87fe4102021-12-28 14:42:23 -08008814 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008815 for (size_t k = 1; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08008816 for (uint32_t n = 1; n <= 8; n++) {
8817 for (uint32_t m = 1; m <= 1; m++) {
Frank Barchard87fe4102021-12-28 14:42:23 -08008818 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008819 .mr(1)
Frank Barchard87fe4102021-12-28 14:42:23 -08008820 .nr(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008821 .kr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008822 .sr(1)
8823 .m(m)
8824 .n(n)
8825 .k(k)
8826 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08008827 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_ld2r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard87fe4102021-12-28 14:42:23 -08008828 }
8829 }
8830 }
8831 }
8832
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008833 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_LD2R, k_gt_16) {
Frank Barchard87fe4102021-12-28 14:42:23 -08008834 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008835 for (size_t k = 17; k < 32; k++) {
Frank Barchard87fe4102021-12-28 14:42:23 -08008836 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008837 .mr(1)
Frank Barchard87fe4102021-12-28 14:42:23 -08008838 .nr(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008839 .kr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008840 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008841 .m(1)
Frank Barchard87fe4102021-12-28 14:42:23 -08008842 .n(8)
8843 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08008844 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_ld2r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard87fe4102021-12-28 14:42:23 -08008845 }
8846 }
8847
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008848 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_LD2R, k_gt_16_strided_a) {
Frank Barchard87fe4102021-12-28 14:42:23 -08008849 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008850 for (size_t k = 17; k < 32; k++) {
Frank Barchard87fe4102021-12-28 14:42:23 -08008851 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008852 .mr(1)
Frank Barchard87fe4102021-12-28 14:42:23 -08008853 .nr(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008854 .kr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008855 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008856 .m(1)
Frank Barchard87fe4102021-12-28 14:42:23 -08008857 .n(8)
8858 .k(k)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008859 .a_stride(37)
Marat Dukhan50323b82022-01-11 00:12:01 -08008860 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_ld2r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard87fe4102021-12-28 14:42:23 -08008861 }
8862 }
8863
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008864 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_LD2R, k_gt_16_subtile) {
Frank Barchard87fe4102021-12-28 14:42:23 -08008865 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008866 for (size_t k = 17; k < 32; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08008867 for (uint32_t n = 1; n <= 8; n++) {
8868 for (uint32_t m = 1; m <= 1; m++) {
Frank Barchard87fe4102021-12-28 14:42:23 -08008869 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008870 .mr(1)
Frank Barchard87fe4102021-12-28 14:42:23 -08008871 .nr(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008872 .kr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008873 .sr(1)
8874 .m(m)
8875 .n(n)
8876 .k(k)
8877 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08008878 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_ld2r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard87fe4102021-12-28 14:42:23 -08008879 }
8880 }
8881 }
8882 }
8883
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008884 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_LD2R, k_div_16) {
8885 TEST_REQUIRES_ARM_NEON_V8;
8886 for (size_t k = 32; k <= 160; k += 16) {
8887 GemmMicrokernelTester()
8888 .mr(1)
8889 .nr(8)
8890 .kr(2)
8891 .sr(1)
8892 .m(1)
8893 .n(8)
8894 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08008895 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_ld2r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008896 }
8897 }
8898
8899 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_LD2R, k_div_16_strided_a) {
8900 TEST_REQUIRES_ARM_NEON_V8;
8901 for (size_t k = 32; k <= 160; k += 16) {
8902 GemmMicrokernelTester()
8903 .mr(1)
8904 .nr(8)
8905 .kr(2)
8906 .sr(1)
8907 .m(1)
8908 .n(8)
8909 .k(k)
8910 .a_stride(163)
Marat Dukhan50323b82022-01-11 00:12:01 -08008911 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_ld2r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008912 }
8913 }
8914
8915 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_LD2R, k_div_16_subtile) {
8916 TEST_REQUIRES_ARM_NEON_V8;
8917 for (size_t k = 32; k <= 160; k += 16) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08008918 for (uint32_t n = 1; n <= 8; n++) {
8919 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008920 GemmMicrokernelTester()
8921 .mr(1)
8922 .nr(8)
8923 .kr(2)
8924 .sr(1)
8925 .m(m)
8926 .n(n)
8927 .k(k)
8928 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08008929 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_ld2r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008930 }
8931 }
8932 }
8933 }
8934
8935 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_LD2R, n_gt_8) {
Frank Barchard87fe4102021-12-28 14:42:23 -08008936 TEST_REQUIRES_ARM_NEON_V8;
8937 for (uint32_t n = 9; n < 16; n++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008938 for (size_t k = 1; k <= 80; k += 17) {
Frank Barchard87fe4102021-12-28 14:42:23 -08008939 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008940 .mr(1)
Frank Barchard87fe4102021-12-28 14:42:23 -08008941 .nr(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008942 .kr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008943 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008944 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08008945 .n(n)
Frank Barchard87fe4102021-12-28 14:42:23 -08008946 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08008947 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_ld2r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard87fe4102021-12-28 14:42:23 -08008948 }
8949 }
8950 }
8951
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008952 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_LD2R, n_gt_8_strided_cn) {
Frank Barchard87fe4102021-12-28 14:42:23 -08008953 TEST_REQUIRES_ARM_NEON_V8;
8954 for (uint32_t n = 9; n < 16; n++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008955 for (size_t k = 1; k <= 80; k += 17) {
Frank Barchard87fe4102021-12-28 14:42:23 -08008956 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008957 .mr(1)
Frank Barchard87fe4102021-12-28 14:42:23 -08008958 .nr(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008959 .kr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008960 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008961 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08008962 .n(n)
Frank Barchard87fe4102021-12-28 14:42:23 -08008963 .k(k)
8964 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08008965 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_ld2r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard87fe4102021-12-28 14:42:23 -08008966 }
8967 }
8968 }
8969
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008970 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_LD2R, n_gt_8_strided_a) {
Frank Barchard87fe4102021-12-28 14:42:23 -08008971 TEST_REQUIRES_ARM_NEON_V8;
8972 for (uint32_t n = 9; n < 16; n++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008973 for (size_t k = 1; k <= 80; k += 17) {
Frank Barchard87fe4102021-12-28 14:42:23 -08008974 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008975 .mr(1)
Frank Barchard87fe4102021-12-28 14:42:23 -08008976 .nr(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008977 .kr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008978 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008979 .m(1)
Frank Barchard87fe4102021-12-28 14:42:23 -08008980 .n(n)
8981 .k(k)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008982 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -08008983 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_ld2r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard87fe4102021-12-28 14:42:23 -08008984 }
8985 }
8986 }
8987
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008988 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_LD2R, n_gt_8_subtile) {
Frank Barchard87fe4102021-12-28 14:42:23 -08008989 TEST_REQUIRES_ARM_NEON_V8;
8990 for (uint32_t n = 9; n < 16; n++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008991 for (size_t k = 1; k <= 80; k += 17) {
8992 for (uint32_t m = 1; m <= 1; m++) {
Frank Barchard87fe4102021-12-28 14:42:23 -08008993 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008994 .mr(1)
Frank Barchard87fe4102021-12-28 14:42:23 -08008995 .nr(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008996 .kr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08008997 .sr(1)
8998 .m(m)
8999 .n(n)
9000 .k(k)
9001 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08009002 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_ld2r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard87fe4102021-12-28 14:42:23 -08009003 }
9004 }
9005 }
9006 }
9007
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009008 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_LD2R, n_div_8) {
Frank Barchard87fe4102021-12-28 14:42:23 -08009009 TEST_REQUIRES_ARM_NEON_V8;
9010 for (uint32_t n = 16; n <= 24; n += 8) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009011 for (size_t k = 1; k <= 80; k += 17) {
Frank Barchard87fe4102021-12-28 14:42:23 -08009012 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009013 .mr(1)
Frank Barchard87fe4102021-12-28 14:42:23 -08009014 .nr(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009015 .kr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08009016 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009017 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08009018 .n(n)
Frank Barchard87fe4102021-12-28 14:42:23 -08009019 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08009020 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_ld2r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard87fe4102021-12-28 14:42:23 -08009021 }
9022 }
9023 }
9024
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009025 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_LD2R, n_div_8_strided_cn) {
Frank Barchard87fe4102021-12-28 14:42:23 -08009026 TEST_REQUIRES_ARM_NEON_V8;
9027 for (uint32_t n = 16; n <= 24; n += 8) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009028 for (size_t k = 1; k <= 80; k += 17) {
Frank Barchard87fe4102021-12-28 14:42:23 -08009029 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009030 .mr(1)
Frank Barchard87fe4102021-12-28 14:42:23 -08009031 .nr(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009032 .kr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08009033 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009034 .m(1)
Frank Barchard87fe4102021-12-28 14:42:23 -08009035 .n(n)
9036 .k(k)
9037 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08009038 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_ld2r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard87fe4102021-12-28 14:42:23 -08009039 }
9040 }
9041 }
9042
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009043 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_LD2R, n_div_8_strided_a) {
Frank Barchard87fe4102021-12-28 14:42:23 -08009044 TEST_REQUIRES_ARM_NEON_V8;
9045 for (uint32_t n = 16; n <= 24; n += 8) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009046 for (size_t k = 1; k <= 80; k += 17) {
Frank Barchard87fe4102021-12-28 14:42:23 -08009047 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009048 .mr(1)
Frank Barchard87fe4102021-12-28 14:42:23 -08009049 .nr(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009050 .kr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08009051 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009052 .m(1)
Frank Barchard87fe4102021-12-28 14:42:23 -08009053 .n(n)
9054 .k(k)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009055 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -08009056 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_ld2r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard87fe4102021-12-28 14:42:23 -08009057 }
9058 }
9059 }
9060
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009061 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_LD2R, n_div_8_subtile) {
Frank Barchard87fe4102021-12-28 14:42:23 -08009062 TEST_REQUIRES_ARM_NEON_V8;
9063 for (uint32_t n = 16; n <= 24; n += 8) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009064 for (size_t k = 1; k <= 80; k += 17) {
9065 for (uint32_t m = 1; m <= 1; m++) {
Frank Barchard87fe4102021-12-28 14:42:23 -08009066 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009067 .mr(1)
Frank Barchard87fe4102021-12-28 14:42:23 -08009068 .nr(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009069 .kr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08009070 .sr(1)
9071 .m(m)
9072 .n(n)
9073 .k(k)
9074 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08009075 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_ld2r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard87fe4102021-12-28 14:42:23 -08009076 }
9077 }
9078 }
9079 }
9080
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009081 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_LD2R, strided_cm_subtile) {
Frank Barchard87fe4102021-12-28 14:42:23 -08009082 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009083 for (size_t k = 1; k <= 80; k += 17) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08009084 for (uint32_t n = 1; n <= 8; n++) {
9085 for (uint32_t m = 1; m <= 1; m++) {
Frank Barchard87fe4102021-12-28 14:42:23 -08009086 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009087 .mr(1)
Frank Barchard87fe4102021-12-28 14:42:23 -08009088 .nr(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009089 .kr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08009090 .sr(1)
9091 .m(m)
9092 .n(n)
9093 .k(k)
9094 .cm_stride(11)
9095 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08009096 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_ld2r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard87fe4102021-12-28 14:42:23 -08009097 }
9098 }
9099 }
9100 }
9101
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009102 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_LD2R, qmin) {
Frank Barchard87fe4102021-12-28 14:42:23 -08009103 TEST_REQUIRES_ARM_NEON_V8;
9104 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009105 .mr(1)
Frank Barchard87fe4102021-12-28 14:42:23 -08009106 .nr(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009107 .kr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08009108 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009109 .m(1)
Frank Barchard87fe4102021-12-28 14:42:23 -08009110 .n(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009111 .k(16)
Frank Barchard87fe4102021-12-28 14:42:23 -08009112 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08009113 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_ld2r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard87fe4102021-12-28 14:42:23 -08009114 }
9115
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009116 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_LD2R, qmax) {
Frank Barchard87fe4102021-12-28 14:42:23 -08009117 TEST_REQUIRES_ARM_NEON_V8;
9118 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009119 .mr(1)
Frank Barchard87fe4102021-12-28 14:42:23 -08009120 .nr(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009121 .kr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08009122 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009123 .m(1)
Frank Barchard87fe4102021-12-28 14:42:23 -08009124 .n(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009125 .k(16)
Frank Barchard87fe4102021-12-28 14:42:23 -08009126 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08009127 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_ld2r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard87fe4102021-12-28 14:42:23 -08009128 }
9129
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009130 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEONV8_MLAL_LD2R, strided_cm) {
Frank Barchard87fe4102021-12-28 14:42:23 -08009131 TEST_REQUIRES_ARM_NEON_V8;
9132 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009133 .mr(1)
Frank Barchard87fe4102021-12-28 14:42:23 -08009134 .nr(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009135 .kr(2)
Frank Barchard87fe4102021-12-28 14:42:23 -08009136 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009137 .m(1)
Frank Barchard87fe4102021-12-28 14:42:23 -08009138 .n(8)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009139 .k(16)
Frank Barchard87fe4102021-12-28 14:42:23 -08009140 .cm_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08009141 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_ld2r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard87fe4102021-12-28 14:42:23 -08009142 }
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009143#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
9144
9145
9146#if XNN_ARCH_ARM || XNN_ARCH_ARM64
9147 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_LD2R, k_eq_16) {
9148 TEST_REQUIRES_ARM_NEON_V8;
9149 GemmMicrokernelTester()
9150 .mr(2)
9151 .nr(8)
9152 .kr(2)
9153 .sr(1)
9154 .m(2)
9155 .n(8)
9156 .k(16)
Marat Dukhan50323b82022-01-11 00:12:01 -08009157 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_ld2r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009158 }
9159
9160 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_LD2R, strided_cn) {
9161 TEST_REQUIRES_ARM_NEON_V8;
9162 GemmMicrokernelTester()
9163 .mr(2)
9164 .nr(8)
9165 .kr(2)
9166 .sr(1)
9167 .m(2)
9168 .n(8)
9169 .k(16)
9170 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08009171 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_ld2r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009172 }
9173
9174 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_LD2R, k_eq_16_strided_a) {
9175 TEST_REQUIRES_ARM_NEON_V8;
9176 GemmMicrokernelTester()
9177 .mr(2)
9178 .nr(8)
9179 .kr(2)
9180 .sr(1)
9181 .m(2)
9182 .n(8)
9183 .k(16)
9184 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08009185 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_ld2r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009186 }
9187
9188 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_LD2R, k_eq_16_subtile) {
9189 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng83844ae2022-01-14 09:52:25 -08009190 for (uint32_t n = 1; n <= 8; n++) {
9191 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009192 GemmMicrokernelTester()
9193 .mr(2)
9194 .nr(8)
9195 .kr(2)
9196 .sr(1)
9197 .m(m)
9198 .n(n)
9199 .k(16)
9200 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08009201 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_ld2r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009202 }
9203 }
9204 }
9205
9206 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_LD2R, k_eq_16_subtile_m) {
9207 TEST_REQUIRES_ARM_NEON_V8;
9208 for (uint32_t m = 1; m <= 2; m++) {
9209 GemmMicrokernelTester()
9210 .mr(2)
9211 .nr(8)
9212 .kr(2)
9213 .sr(1)
9214 .m(m)
9215 .n(8)
9216 .k(16)
9217 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08009218 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_ld2r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009219 }
9220 }
9221
9222 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_LD2R, k_eq_16_subtile_n) {
9223 TEST_REQUIRES_ARM_NEON_V8;
9224 for (uint32_t n = 1; n <= 8; n++) {
9225 GemmMicrokernelTester()
9226 .mr(2)
9227 .nr(8)
9228 .kr(2)
9229 .sr(1)
9230 .m(2)
9231 .n(n)
9232 .k(16)
9233 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08009234 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_ld2r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009235 }
9236 }
9237
9238 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_LD2R, k_lt_16) {
9239 TEST_REQUIRES_ARM_NEON_V8;
9240 for (size_t k = 1; k < 16; k++) {
9241 GemmMicrokernelTester()
9242 .mr(2)
9243 .nr(8)
9244 .kr(2)
9245 .sr(1)
9246 .m(2)
9247 .n(8)
9248 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08009249 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_ld2r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009250 }
9251 }
9252
9253 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_LD2R, k_lt_16_strided_a) {
9254 TEST_REQUIRES_ARM_NEON_V8;
9255 for (size_t k = 1; k < 16; k++) {
9256 GemmMicrokernelTester()
9257 .mr(2)
9258 .nr(8)
9259 .kr(2)
9260 .sr(1)
9261 .m(2)
9262 .n(8)
9263 .k(k)
9264 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08009265 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_ld2r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009266 }
9267 }
9268
9269 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_LD2R, k_lt_16_subtile) {
9270 TEST_REQUIRES_ARM_NEON_V8;
9271 for (size_t k = 1; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08009272 for (uint32_t n = 1; n <= 8; n++) {
9273 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009274 GemmMicrokernelTester()
9275 .mr(2)
9276 .nr(8)
9277 .kr(2)
9278 .sr(1)
9279 .m(m)
9280 .n(n)
9281 .k(k)
9282 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08009283 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_ld2r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009284 }
9285 }
9286 }
9287 }
9288
9289 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_LD2R, k_gt_16) {
9290 TEST_REQUIRES_ARM_NEON_V8;
9291 for (size_t k = 17; k < 32; k++) {
9292 GemmMicrokernelTester()
9293 .mr(2)
9294 .nr(8)
9295 .kr(2)
9296 .sr(1)
9297 .m(2)
9298 .n(8)
9299 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08009300 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_ld2r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009301 }
9302 }
9303
9304 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_LD2R, k_gt_16_strided_a) {
9305 TEST_REQUIRES_ARM_NEON_V8;
9306 for (size_t k = 17; k < 32; k++) {
9307 GemmMicrokernelTester()
9308 .mr(2)
9309 .nr(8)
9310 .kr(2)
9311 .sr(1)
9312 .m(2)
9313 .n(8)
9314 .k(k)
9315 .a_stride(37)
Marat Dukhan50323b82022-01-11 00:12:01 -08009316 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_ld2r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009317 }
9318 }
9319
9320 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_LD2R, k_gt_16_subtile) {
9321 TEST_REQUIRES_ARM_NEON_V8;
9322 for (size_t k = 17; k < 32; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08009323 for (uint32_t n = 1; n <= 8; n++) {
9324 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009325 GemmMicrokernelTester()
9326 .mr(2)
9327 .nr(8)
9328 .kr(2)
9329 .sr(1)
9330 .m(m)
9331 .n(n)
9332 .k(k)
9333 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08009334 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_ld2r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009335 }
9336 }
9337 }
9338 }
9339
9340 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_LD2R, k_div_16) {
9341 TEST_REQUIRES_ARM_NEON_V8;
9342 for (size_t k = 32; k <= 160; k += 16) {
9343 GemmMicrokernelTester()
9344 .mr(2)
9345 .nr(8)
9346 .kr(2)
9347 .sr(1)
9348 .m(2)
9349 .n(8)
9350 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08009351 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_ld2r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009352 }
9353 }
9354
9355 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_LD2R, k_div_16_strided_a) {
9356 TEST_REQUIRES_ARM_NEON_V8;
9357 for (size_t k = 32; k <= 160; k += 16) {
9358 GemmMicrokernelTester()
9359 .mr(2)
9360 .nr(8)
9361 .kr(2)
9362 .sr(1)
9363 .m(2)
9364 .n(8)
9365 .k(k)
9366 .a_stride(163)
Marat Dukhan50323b82022-01-11 00:12:01 -08009367 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_ld2r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009368 }
9369 }
9370
9371 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_LD2R, k_div_16_subtile) {
9372 TEST_REQUIRES_ARM_NEON_V8;
9373 for (size_t k = 32; k <= 160; k += 16) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08009374 for (uint32_t n = 1; n <= 8; n++) {
9375 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009376 GemmMicrokernelTester()
9377 .mr(2)
9378 .nr(8)
9379 .kr(2)
9380 .sr(1)
9381 .m(m)
9382 .n(n)
9383 .k(k)
9384 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08009385 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_ld2r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009386 }
9387 }
9388 }
9389 }
9390
9391 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_LD2R, n_gt_8) {
9392 TEST_REQUIRES_ARM_NEON_V8;
9393 for (uint32_t n = 9; n < 16; n++) {
9394 for (size_t k = 1; k <= 80; k += 17) {
9395 GemmMicrokernelTester()
9396 .mr(2)
9397 .nr(8)
9398 .kr(2)
9399 .sr(1)
9400 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08009401 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009402 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08009403 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_ld2r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009404 }
9405 }
9406 }
9407
9408 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_LD2R, n_gt_8_strided_cn) {
9409 TEST_REQUIRES_ARM_NEON_V8;
9410 for (uint32_t n = 9; n < 16; n++) {
9411 for (size_t k = 1; k <= 80; k += 17) {
9412 GemmMicrokernelTester()
9413 .mr(2)
9414 .nr(8)
9415 .kr(2)
9416 .sr(1)
9417 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08009418 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009419 .k(k)
9420 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08009421 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_ld2r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009422 }
9423 }
9424 }
9425
9426 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_LD2R, n_gt_8_strided_a) {
9427 TEST_REQUIRES_ARM_NEON_V8;
9428 for (uint32_t n = 9; n < 16; n++) {
9429 for (size_t k = 1; k <= 80; k += 17) {
9430 GemmMicrokernelTester()
9431 .mr(2)
9432 .nr(8)
9433 .kr(2)
9434 .sr(1)
9435 .m(2)
9436 .n(n)
9437 .k(k)
9438 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -08009439 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_ld2r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009440 }
9441 }
9442 }
9443
9444 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_LD2R, n_gt_8_subtile) {
9445 TEST_REQUIRES_ARM_NEON_V8;
9446 for (uint32_t n = 9; n < 16; n++) {
9447 for (size_t k = 1; k <= 80; k += 17) {
9448 for (uint32_t m = 1; m <= 2; m++) {
9449 GemmMicrokernelTester()
9450 .mr(2)
9451 .nr(8)
9452 .kr(2)
9453 .sr(1)
9454 .m(m)
9455 .n(n)
9456 .k(k)
9457 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08009458 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_ld2r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009459 }
9460 }
9461 }
9462 }
9463
9464 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_LD2R, n_div_8) {
9465 TEST_REQUIRES_ARM_NEON_V8;
9466 for (uint32_t n = 16; n <= 24; n += 8) {
9467 for (size_t k = 1; k <= 80; k += 17) {
9468 GemmMicrokernelTester()
9469 .mr(2)
9470 .nr(8)
9471 .kr(2)
9472 .sr(1)
9473 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08009474 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009475 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08009476 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_ld2r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009477 }
9478 }
9479 }
9480
9481 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_LD2R, n_div_8_strided_cn) {
9482 TEST_REQUIRES_ARM_NEON_V8;
9483 for (uint32_t n = 16; n <= 24; n += 8) {
9484 for (size_t k = 1; k <= 80; k += 17) {
9485 GemmMicrokernelTester()
9486 .mr(2)
9487 .nr(8)
9488 .kr(2)
9489 .sr(1)
9490 .m(2)
9491 .n(n)
9492 .k(k)
9493 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08009494 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_ld2r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009495 }
9496 }
9497 }
9498
9499 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_LD2R, n_div_8_strided_a) {
9500 TEST_REQUIRES_ARM_NEON_V8;
9501 for (uint32_t n = 16; n <= 24; n += 8) {
9502 for (size_t k = 1; k <= 80; k += 17) {
9503 GemmMicrokernelTester()
9504 .mr(2)
9505 .nr(8)
9506 .kr(2)
9507 .sr(1)
9508 .m(2)
9509 .n(n)
9510 .k(k)
9511 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -08009512 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_ld2r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009513 }
9514 }
9515 }
9516
9517 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_LD2R, n_div_8_subtile) {
9518 TEST_REQUIRES_ARM_NEON_V8;
9519 for (uint32_t n = 16; n <= 24; n += 8) {
9520 for (size_t k = 1; k <= 80; k += 17) {
9521 for (uint32_t m = 1; m <= 2; m++) {
9522 GemmMicrokernelTester()
9523 .mr(2)
9524 .nr(8)
9525 .kr(2)
9526 .sr(1)
9527 .m(m)
9528 .n(n)
9529 .k(k)
9530 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08009531 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_ld2r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009532 }
9533 }
9534 }
9535 }
9536
9537 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_LD2R, strided_cm_subtile) {
9538 TEST_REQUIRES_ARM_NEON_V8;
9539 for (size_t k = 1; k <= 80; k += 17) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08009540 for (uint32_t n = 1; n <= 8; n++) {
9541 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009542 GemmMicrokernelTester()
9543 .mr(2)
9544 .nr(8)
9545 .kr(2)
9546 .sr(1)
9547 .m(m)
9548 .n(n)
9549 .k(k)
9550 .cm_stride(11)
9551 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08009552 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_ld2r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009553 }
9554 }
9555 }
9556 }
9557
9558 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_LD2R, qmin) {
9559 TEST_REQUIRES_ARM_NEON_V8;
9560 GemmMicrokernelTester()
9561 .mr(2)
9562 .nr(8)
9563 .kr(2)
9564 .sr(1)
9565 .m(2)
9566 .n(8)
9567 .k(16)
9568 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08009569 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_ld2r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009570 }
9571
9572 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_LD2R, qmax) {
9573 TEST_REQUIRES_ARM_NEON_V8;
9574 GemmMicrokernelTester()
9575 .mr(2)
9576 .nr(8)
9577 .kr(2)
9578 .sr(1)
9579 .m(2)
9580 .n(8)
9581 .k(16)
9582 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08009583 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_ld2r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009584 }
9585
9586 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_LD2R, strided_cm) {
9587 TEST_REQUIRES_ARM_NEON_V8;
9588 GemmMicrokernelTester()
9589 .mr(2)
9590 .nr(8)
9591 .kr(2)
9592 .sr(1)
9593 .m(2)
9594 .n(8)
9595 .k(16)
9596 .cm_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08009597 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_ld2r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009598 }
9599#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
9600
9601
9602#if XNN_ARCH_ARM || XNN_ARCH_ARM64
9603 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_LD4R, k_eq_16) {
9604 TEST_REQUIRES_ARM_NEON;
9605 GemmMicrokernelTester()
9606 .mr(1)
9607 .nr(8)
9608 .kr(2)
9609 .sr(1)
9610 .m(1)
9611 .n(8)
9612 .k(16)
Marat Dukhan50323b82022-01-11 00:12:01 -08009613 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_ld4r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009614 }
9615
9616 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_LD4R, strided_cn) {
9617 TEST_REQUIRES_ARM_NEON;
9618 GemmMicrokernelTester()
9619 .mr(1)
9620 .nr(8)
9621 .kr(2)
9622 .sr(1)
9623 .m(1)
9624 .n(8)
9625 .k(16)
9626 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08009627 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_ld4r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009628 }
9629
9630 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_LD4R, k_eq_16_strided_a) {
9631 TEST_REQUIRES_ARM_NEON;
9632 GemmMicrokernelTester()
9633 .mr(1)
9634 .nr(8)
9635 .kr(2)
9636 .sr(1)
9637 .m(1)
9638 .n(8)
9639 .k(16)
9640 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08009641 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_ld4r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009642 }
9643
9644 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_LD4R, k_eq_16_subtile) {
9645 TEST_REQUIRES_ARM_NEON;
Zhi An Ng83844ae2022-01-14 09:52:25 -08009646 for (uint32_t n = 1; n <= 8; n++) {
9647 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009648 GemmMicrokernelTester()
9649 .mr(1)
9650 .nr(8)
9651 .kr(2)
9652 .sr(1)
9653 .m(m)
9654 .n(n)
9655 .k(16)
9656 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08009657 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_ld4r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009658 }
9659 }
9660 }
9661
9662 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_LD4R, k_eq_16_subtile_m) {
9663 TEST_REQUIRES_ARM_NEON;
9664 for (uint32_t m = 1; m <= 1; m++) {
9665 GemmMicrokernelTester()
9666 .mr(1)
9667 .nr(8)
9668 .kr(2)
9669 .sr(1)
9670 .m(m)
9671 .n(8)
9672 .k(16)
9673 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08009674 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_ld4r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009675 }
9676 }
9677
9678 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_LD4R, k_eq_16_subtile_n) {
9679 TEST_REQUIRES_ARM_NEON;
9680 for (uint32_t n = 1; n <= 8; n++) {
9681 GemmMicrokernelTester()
9682 .mr(1)
9683 .nr(8)
9684 .kr(2)
9685 .sr(1)
9686 .m(1)
9687 .n(n)
9688 .k(16)
9689 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08009690 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_ld4r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009691 }
9692 }
9693
9694 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_LD4R, k_lt_16) {
9695 TEST_REQUIRES_ARM_NEON;
9696 for (size_t k = 1; k < 16; k++) {
9697 GemmMicrokernelTester()
9698 .mr(1)
9699 .nr(8)
9700 .kr(2)
9701 .sr(1)
9702 .m(1)
9703 .n(8)
9704 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08009705 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_ld4r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009706 }
9707 }
9708
9709 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_LD4R, k_lt_16_strided_a) {
9710 TEST_REQUIRES_ARM_NEON;
9711 for (size_t k = 1; k < 16; k++) {
9712 GemmMicrokernelTester()
9713 .mr(1)
9714 .nr(8)
9715 .kr(2)
9716 .sr(1)
9717 .m(1)
9718 .n(8)
9719 .k(k)
9720 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08009721 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_ld4r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009722 }
9723 }
9724
9725 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_LD4R, k_lt_16_subtile) {
9726 TEST_REQUIRES_ARM_NEON;
9727 for (size_t k = 1; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08009728 for (uint32_t n = 1; n <= 8; n++) {
9729 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009730 GemmMicrokernelTester()
9731 .mr(1)
9732 .nr(8)
9733 .kr(2)
9734 .sr(1)
9735 .m(m)
9736 .n(n)
9737 .k(k)
9738 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08009739 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_ld4r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009740 }
9741 }
9742 }
9743 }
9744
9745 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_LD4R, k_gt_16) {
9746 TEST_REQUIRES_ARM_NEON;
9747 for (size_t k = 17; k < 32; k++) {
9748 GemmMicrokernelTester()
9749 .mr(1)
9750 .nr(8)
9751 .kr(2)
9752 .sr(1)
9753 .m(1)
9754 .n(8)
9755 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08009756 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_ld4r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009757 }
9758 }
9759
9760 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_LD4R, k_gt_16_strided_a) {
9761 TEST_REQUIRES_ARM_NEON;
9762 for (size_t k = 17; k < 32; k++) {
9763 GemmMicrokernelTester()
9764 .mr(1)
9765 .nr(8)
9766 .kr(2)
9767 .sr(1)
9768 .m(1)
9769 .n(8)
9770 .k(k)
9771 .a_stride(37)
Marat Dukhan50323b82022-01-11 00:12:01 -08009772 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_ld4r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009773 }
9774 }
9775
9776 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_LD4R, k_gt_16_subtile) {
9777 TEST_REQUIRES_ARM_NEON;
9778 for (size_t k = 17; k < 32; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08009779 for (uint32_t n = 1; n <= 8; n++) {
9780 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009781 GemmMicrokernelTester()
9782 .mr(1)
9783 .nr(8)
9784 .kr(2)
9785 .sr(1)
9786 .m(m)
9787 .n(n)
9788 .k(k)
9789 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08009790 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_ld4r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009791 }
9792 }
9793 }
9794 }
9795
9796 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_LD4R, k_div_16) {
9797 TEST_REQUIRES_ARM_NEON;
9798 for (size_t k = 32; k <= 160; k += 16) {
9799 GemmMicrokernelTester()
9800 .mr(1)
9801 .nr(8)
9802 .kr(2)
9803 .sr(1)
9804 .m(1)
9805 .n(8)
9806 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08009807 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_ld4r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009808 }
9809 }
9810
9811 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_LD4R, k_div_16_strided_a) {
9812 TEST_REQUIRES_ARM_NEON;
9813 for (size_t k = 32; k <= 160; k += 16) {
9814 GemmMicrokernelTester()
9815 .mr(1)
9816 .nr(8)
9817 .kr(2)
9818 .sr(1)
9819 .m(1)
9820 .n(8)
9821 .k(k)
9822 .a_stride(163)
Marat Dukhan50323b82022-01-11 00:12:01 -08009823 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_ld4r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009824 }
9825 }
9826
9827 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_LD4R, k_div_16_subtile) {
9828 TEST_REQUIRES_ARM_NEON;
9829 for (size_t k = 32; k <= 160; k += 16) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08009830 for (uint32_t n = 1; n <= 8; n++) {
9831 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009832 GemmMicrokernelTester()
9833 .mr(1)
9834 .nr(8)
9835 .kr(2)
9836 .sr(1)
9837 .m(m)
9838 .n(n)
9839 .k(k)
9840 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08009841 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_ld4r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009842 }
9843 }
9844 }
9845 }
9846
9847 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_LD4R, n_gt_8) {
9848 TEST_REQUIRES_ARM_NEON;
9849 for (uint32_t n = 9; n < 16; n++) {
9850 for (size_t k = 1; k <= 80; k += 17) {
9851 GemmMicrokernelTester()
9852 .mr(1)
9853 .nr(8)
9854 .kr(2)
9855 .sr(1)
9856 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08009857 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009858 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08009859 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_ld4r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009860 }
9861 }
9862 }
9863
9864 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_LD4R, n_gt_8_strided_cn) {
9865 TEST_REQUIRES_ARM_NEON;
9866 for (uint32_t n = 9; n < 16; n++) {
9867 for (size_t k = 1; k <= 80; k += 17) {
9868 GemmMicrokernelTester()
9869 .mr(1)
9870 .nr(8)
9871 .kr(2)
9872 .sr(1)
9873 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08009874 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009875 .k(k)
9876 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08009877 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_ld4r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009878 }
9879 }
9880 }
9881
9882 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_LD4R, n_gt_8_strided_a) {
9883 TEST_REQUIRES_ARM_NEON;
9884 for (uint32_t n = 9; n < 16; n++) {
9885 for (size_t k = 1; k <= 80; k += 17) {
9886 GemmMicrokernelTester()
9887 .mr(1)
9888 .nr(8)
9889 .kr(2)
9890 .sr(1)
9891 .m(1)
9892 .n(n)
9893 .k(k)
9894 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -08009895 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_ld4r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009896 }
9897 }
9898 }
9899
9900 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_LD4R, n_gt_8_subtile) {
9901 TEST_REQUIRES_ARM_NEON;
9902 for (uint32_t n = 9; n < 16; n++) {
9903 for (size_t k = 1; k <= 80; k += 17) {
9904 for (uint32_t m = 1; m <= 1; m++) {
9905 GemmMicrokernelTester()
9906 .mr(1)
9907 .nr(8)
9908 .kr(2)
9909 .sr(1)
9910 .m(m)
9911 .n(n)
9912 .k(k)
9913 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08009914 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_ld4r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009915 }
9916 }
9917 }
9918 }
9919
9920 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_LD4R, n_div_8) {
9921 TEST_REQUIRES_ARM_NEON;
9922 for (uint32_t n = 16; n <= 24; n += 8) {
9923 for (size_t k = 1; k <= 80; k += 17) {
9924 GemmMicrokernelTester()
9925 .mr(1)
9926 .nr(8)
9927 .kr(2)
9928 .sr(1)
9929 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08009930 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009931 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08009932 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_ld4r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009933 }
9934 }
9935 }
9936
9937 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_LD4R, n_div_8_strided_cn) {
9938 TEST_REQUIRES_ARM_NEON;
9939 for (uint32_t n = 16; n <= 24; n += 8) {
9940 for (size_t k = 1; k <= 80; k += 17) {
9941 GemmMicrokernelTester()
9942 .mr(1)
9943 .nr(8)
9944 .kr(2)
9945 .sr(1)
9946 .m(1)
9947 .n(n)
9948 .k(k)
9949 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08009950 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_ld4r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009951 }
9952 }
9953 }
9954
9955 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_LD4R, n_div_8_strided_a) {
9956 TEST_REQUIRES_ARM_NEON;
9957 for (uint32_t n = 16; n <= 24; n += 8) {
9958 for (size_t k = 1; k <= 80; k += 17) {
9959 GemmMicrokernelTester()
9960 .mr(1)
9961 .nr(8)
9962 .kr(2)
9963 .sr(1)
9964 .m(1)
9965 .n(n)
9966 .k(k)
9967 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -08009968 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_ld4r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009969 }
9970 }
9971 }
9972
9973 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_LD4R, n_div_8_subtile) {
9974 TEST_REQUIRES_ARM_NEON;
9975 for (uint32_t n = 16; n <= 24; n += 8) {
9976 for (size_t k = 1; k <= 80; k += 17) {
9977 for (uint32_t m = 1; m <= 1; m++) {
9978 GemmMicrokernelTester()
9979 .mr(1)
9980 .nr(8)
9981 .kr(2)
9982 .sr(1)
9983 .m(m)
9984 .n(n)
9985 .k(k)
9986 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08009987 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_ld4r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009988 }
9989 }
9990 }
9991 }
9992
9993 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_LD4R, strided_cm_subtile) {
9994 TEST_REQUIRES_ARM_NEON;
9995 for (size_t k = 1; k <= 80; k += 17) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08009996 for (uint32_t n = 1; n <= 8; n++) {
9997 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009998 GemmMicrokernelTester()
9999 .mr(1)
10000 .nr(8)
10001 .kr(2)
10002 .sr(1)
10003 .m(m)
10004 .n(n)
10005 .k(k)
10006 .cm_stride(11)
10007 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080010008 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_ld4r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010009 }
10010 }
10011 }
10012 }
10013
10014 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_LD4R, qmin) {
10015 TEST_REQUIRES_ARM_NEON;
10016 GemmMicrokernelTester()
10017 .mr(1)
10018 .nr(8)
10019 .kr(2)
10020 .sr(1)
10021 .m(1)
10022 .n(8)
10023 .k(16)
10024 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080010025 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_ld4r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010026 }
10027
10028 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_LD4R, qmax) {
10029 TEST_REQUIRES_ARM_NEON;
10030 GemmMicrokernelTester()
10031 .mr(1)
10032 .nr(8)
10033 .kr(2)
10034 .sr(1)
10035 .m(1)
10036 .n(8)
10037 .k(16)
10038 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080010039 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_ld4r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010040 }
10041
10042 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_LD4R, strided_cm) {
10043 TEST_REQUIRES_ARM_NEON;
10044 GemmMicrokernelTester()
10045 .mr(1)
10046 .nr(8)
10047 .kr(2)
10048 .sr(1)
10049 .m(1)
10050 .n(8)
10051 .k(16)
10052 .cm_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080010053 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_ld4r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010054 }
10055#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
10056
10057
10058#if XNN_ARCH_ARM || XNN_ARCH_ARM64
10059 TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_eq_16) {
10060 TEST_REQUIRES_ARM_NEON;
10061 GemmMicrokernelTester()
10062 .mr(2)
10063 .nr(8)
10064 .kr(4)
10065 .sr(2)
10066 .m(2)
10067 .n(8)
10068 .k(16)
Marat Dukhan50323b82022-01-11 00:12:01 -080010069 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010070 }
10071
10072 TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, strided_cn) {
10073 TEST_REQUIRES_ARM_NEON;
10074 GemmMicrokernelTester()
10075 .mr(2)
10076 .nr(8)
10077 .kr(4)
10078 .sr(2)
10079 .m(2)
10080 .n(8)
10081 .k(16)
10082 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080010083 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010084 }
10085
10086 TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_eq_16_strided_a) {
10087 TEST_REQUIRES_ARM_NEON;
10088 GemmMicrokernelTester()
10089 .mr(2)
10090 .nr(8)
10091 .kr(4)
10092 .sr(2)
10093 .m(2)
10094 .n(8)
10095 .k(16)
10096 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080010097 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010098 }
10099
10100 TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_eq_16_subtile) {
10101 TEST_REQUIRES_ARM_NEON;
Zhi An Ng83844ae2022-01-14 09:52:25 -080010102 for (uint32_t n = 1; n <= 8; n++) {
10103 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010104 GemmMicrokernelTester()
10105 .mr(2)
10106 .nr(8)
10107 .kr(4)
10108 .sr(2)
10109 .m(m)
10110 .n(n)
10111 .k(16)
10112 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080010113 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010114 }
10115 }
10116 }
10117
10118 TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_eq_16_subtile_m) {
10119 TEST_REQUIRES_ARM_NEON;
10120 for (uint32_t m = 1; m <= 2; m++) {
10121 GemmMicrokernelTester()
10122 .mr(2)
10123 .nr(8)
10124 .kr(4)
10125 .sr(2)
10126 .m(m)
10127 .n(8)
10128 .k(16)
10129 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080010130 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010131 }
10132 }
10133
10134 TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_eq_16_subtile_n) {
10135 TEST_REQUIRES_ARM_NEON;
10136 for (uint32_t n = 1; n <= 8; n++) {
10137 GemmMicrokernelTester()
10138 .mr(2)
10139 .nr(8)
10140 .kr(4)
10141 .sr(2)
10142 .m(2)
10143 .n(n)
10144 .k(16)
10145 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080010146 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010147 }
10148 }
10149
10150 TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_lt_16) {
10151 TEST_REQUIRES_ARM_NEON;
10152 for (size_t k = 1; k < 16; k++) {
10153 GemmMicrokernelTester()
10154 .mr(2)
10155 .nr(8)
10156 .kr(4)
10157 .sr(2)
10158 .m(2)
10159 .n(8)
10160 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080010161 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010162 }
10163 }
10164
10165 TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_lt_16_strided_a) {
10166 TEST_REQUIRES_ARM_NEON;
10167 for (size_t k = 1; k < 16; k++) {
10168 GemmMicrokernelTester()
10169 .mr(2)
10170 .nr(8)
10171 .kr(4)
10172 .sr(2)
10173 .m(2)
10174 .n(8)
10175 .k(k)
10176 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080010177 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010178 }
10179 }
10180
10181 TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_lt_16_subtile) {
10182 TEST_REQUIRES_ARM_NEON;
10183 for (size_t k = 1; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080010184 for (uint32_t n = 1; n <= 8; n++) {
10185 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010186 GemmMicrokernelTester()
10187 .mr(2)
10188 .nr(8)
10189 .kr(4)
10190 .sr(2)
10191 .m(m)
10192 .n(n)
10193 .k(k)
10194 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080010195 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010196 }
10197 }
10198 }
10199 }
10200
10201 TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_gt_16) {
10202 TEST_REQUIRES_ARM_NEON;
10203 for (size_t k = 17; k < 32; k++) {
10204 GemmMicrokernelTester()
10205 .mr(2)
10206 .nr(8)
10207 .kr(4)
10208 .sr(2)
10209 .m(2)
10210 .n(8)
10211 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080010212 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010213 }
10214 }
10215
10216 TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_gt_16_strided_a) {
10217 TEST_REQUIRES_ARM_NEON;
10218 for (size_t k = 17; k < 32; k++) {
10219 GemmMicrokernelTester()
10220 .mr(2)
10221 .nr(8)
10222 .kr(4)
10223 .sr(2)
10224 .m(2)
10225 .n(8)
10226 .k(k)
10227 .a_stride(37)
Marat Dukhan50323b82022-01-11 00:12:01 -080010228 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010229 }
10230 }
10231
10232 TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_gt_16_subtile) {
10233 TEST_REQUIRES_ARM_NEON;
10234 for (size_t k = 17; k < 32; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080010235 for (uint32_t n = 1; n <= 8; n++) {
10236 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010237 GemmMicrokernelTester()
10238 .mr(2)
10239 .nr(8)
10240 .kr(4)
10241 .sr(2)
10242 .m(m)
10243 .n(n)
10244 .k(k)
10245 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080010246 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010247 }
10248 }
10249 }
10250 }
10251
10252 TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_div_16) {
10253 TEST_REQUIRES_ARM_NEON;
10254 for (size_t k = 32; k <= 160; k += 16) {
10255 GemmMicrokernelTester()
10256 .mr(2)
10257 .nr(8)
10258 .kr(4)
10259 .sr(2)
10260 .m(2)
10261 .n(8)
10262 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080010263 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010264 }
10265 }
10266
10267 TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_div_16_strided_a) {
10268 TEST_REQUIRES_ARM_NEON;
10269 for (size_t k = 32; k <= 160; k += 16) {
10270 GemmMicrokernelTester()
10271 .mr(2)
10272 .nr(8)
10273 .kr(4)
10274 .sr(2)
10275 .m(2)
10276 .n(8)
10277 .k(k)
10278 .a_stride(163)
Marat Dukhan50323b82022-01-11 00:12:01 -080010279 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010280 }
10281 }
10282
10283 TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, k_div_16_subtile) {
10284 TEST_REQUIRES_ARM_NEON;
10285 for (size_t k = 32; k <= 160; k += 16) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080010286 for (uint32_t n = 1; n <= 8; n++) {
10287 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010288 GemmMicrokernelTester()
10289 .mr(2)
10290 .nr(8)
10291 .kr(4)
10292 .sr(2)
10293 .m(m)
10294 .n(n)
10295 .k(k)
10296 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080010297 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010298 }
10299 }
10300 }
10301 }
10302
10303 TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, n_gt_8) {
10304 TEST_REQUIRES_ARM_NEON;
10305 for (uint32_t n = 9; n < 16; n++) {
10306 for (size_t k = 1; k <= 80; k += 17) {
10307 GemmMicrokernelTester()
10308 .mr(2)
10309 .nr(8)
10310 .kr(4)
10311 .sr(2)
10312 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080010313 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010314 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080010315 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010316 }
10317 }
10318 }
10319
10320 TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, n_gt_8_strided_cn) {
10321 TEST_REQUIRES_ARM_NEON;
10322 for (uint32_t n = 9; n < 16; n++) {
10323 for (size_t k = 1; k <= 80; k += 17) {
10324 GemmMicrokernelTester()
10325 .mr(2)
10326 .nr(8)
10327 .kr(4)
10328 .sr(2)
10329 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080010330 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010331 .k(k)
10332 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080010333 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010334 }
10335 }
10336 }
10337
10338 TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, n_gt_8_strided_a) {
10339 TEST_REQUIRES_ARM_NEON;
10340 for (uint32_t n = 9; n < 16; n++) {
10341 for (size_t k = 1; k <= 80; k += 17) {
10342 GemmMicrokernelTester()
10343 .mr(2)
10344 .nr(8)
10345 .kr(4)
10346 .sr(2)
10347 .m(2)
10348 .n(n)
10349 .k(k)
10350 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080010351 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010352 }
10353 }
10354 }
10355
10356 TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, n_gt_8_subtile) {
10357 TEST_REQUIRES_ARM_NEON;
10358 for (uint32_t n = 9; n < 16; n++) {
10359 for (size_t k = 1; k <= 80; k += 17) {
10360 for (uint32_t m = 1; m <= 2; m++) {
10361 GemmMicrokernelTester()
10362 .mr(2)
10363 .nr(8)
10364 .kr(4)
10365 .sr(2)
10366 .m(m)
10367 .n(n)
10368 .k(k)
10369 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080010370 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010371 }
10372 }
10373 }
10374 }
10375
10376 TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, n_div_8) {
10377 TEST_REQUIRES_ARM_NEON;
10378 for (uint32_t n = 16; n <= 24; n += 8) {
10379 for (size_t k = 1; k <= 80; k += 17) {
10380 GemmMicrokernelTester()
10381 .mr(2)
10382 .nr(8)
10383 .kr(4)
10384 .sr(2)
10385 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080010386 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010387 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080010388 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010389 }
10390 }
10391 }
10392
10393 TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, n_div_8_strided_cn) {
10394 TEST_REQUIRES_ARM_NEON;
10395 for (uint32_t n = 16; n <= 24; n += 8) {
10396 for (size_t k = 1; k <= 80; k += 17) {
10397 GemmMicrokernelTester()
10398 .mr(2)
10399 .nr(8)
10400 .kr(4)
10401 .sr(2)
10402 .m(2)
10403 .n(n)
10404 .k(k)
10405 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080010406 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010407 }
10408 }
10409 }
10410
10411 TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, n_div_8_strided_a) {
10412 TEST_REQUIRES_ARM_NEON;
10413 for (uint32_t n = 16; n <= 24; n += 8) {
10414 for (size_t k = 1; k <= 80; k += 17) {
10415 GemmMicrokernelTester()
10416 .mr(2)
10417 .nr(8)
10418 .kr(4)
10419 .sr(2)
10420 .m(2)
10421 .n(n)
10422 .k(k)
10423 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080010424 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010425 }
10426 }
10427 }
10428
10429 TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, n_div_8_subtile) {
10430 TEST_REQUIRES_ARM_NEON;
10431 for (uint32_t n = 16; n <= 24; n += 8) {
10432 for (size_t k = 1; k <= 80; k += 17) {
10433 for (uint32_t m = 1; m <= 2; m++) {
10434 GemmMicrokernelTester()
10435 .mr(2)
10436 .nr(8)
10437 .kr(4)
10438 .sr(2)
10439 .m(m)
10440 .n(n)
10441 .k(k)
10442 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080010443 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010444 }
10445 }
10446 }
10447 }
10448
10449 TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, strided_cm_subtile) {
10450 TEST_REQUIRES_ARM_NEON;
10451 for (size_t k = 1; k <= 80; k += 17) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080010452 for (uint32_t n = 1; n <= 8; n++) {
10453 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010454 GemmMicrokernelTester()
10455 .mr(2)
10456 .nr(8)
10457 .kr(4)
10458 .sr(2)
10459 .m(m)
10460 .n(n)
10461 .k(k)
10462 .cm_stride(11)
10463 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080010464 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010465 }
10466 }
10467 }
10468 }
10469
10470 TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, qmin) {
10471 TEST_REQUIRES_ARM_NEON;
10472 GemmMicrokernelTester()
10473 .mr(2)
10474 .nr(8)
10475 .kr(4)
10476 .sr(2)
10477 .m(2)
10478 .n(8)
10479 .k(16)
10480 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080010481 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010482 }
10483
10484 TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, qmax) {
10485 TEST_REQUIRES_ARM_NEON;
10486 GemmMicrokernelTester()
10487 .mr(2)
10488 .nr(8)
10489 .kr(4)
10490 .sr(2)
10491 .m(2)
10492 .n(8)
10493 .k(16)
10494 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080010495 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010496 }
10497
10498 TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEON_MLAL, strided_cm) {
10499 TEST_REQUIRES_ARM_NEON;
10500 GemmMicrokernelTester()
10501 .mr(2)
10502 .nr(8)
10503 .kr(4)
10504 .sr(2)
10505 .m(2)
10506 .n(8)
10507 .k(16)
10508 .cm_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080010509 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010510 }
10511#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
10512
10513
10514#if XNN_ARCH_ARM || XNN_ARCH_ARM64
10515 TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_eq_16) {
10516 TEST_REQUIRES_ARM_NEON_V8;
10517 GemmMicrokernelTester()
10518 .mr(2)
10519 .nr(8)
10520 .kr(4)
10521 .sr(2)
10522 .m(2)
10523 .n(8)
10524 .k(16)
Marat Dukhan50323b82022-01-11 00:12:01 -080010525 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010526 }
10527
10528 TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, strided_cn) {
10529 TEST_REQUIRES_ARM_NEON_V8;
10530 GemmMicrokernelTester()
10531 .mr(2)
10532 .nr(8)
10533 .kr(4)
10534 .sr(2)
10535 .m(2)
10536 .n(8)
10537 .k(16)
10538 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080010539 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010540 }
10541
10542 TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_eq_16_strided_a) {
10543 TEST_REQUIRES_ARM_NEON_V8;
10544 GemmMicrokernelTester()
10545 .mr(2)
10546 .nr(8)
10547 .kr(4)
10548 .sr(2)
10549 .m(2)
10550 .n(8)
10551 .k(16)
10552 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080010553 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010554 }
10555
10556 TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_eq_16_subtile) {
10557 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng83844ae2022-01-14 09:52:25 -080010558 for (uint32_t n = 1; n <= 8; n++) {
10559 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010560 GemmMicrokernelTester()
10561 .mr(2)
10562 .nr(8)
10563 .kr(4)
10564 .sr(2)
10565 .m(m)
10566 .n(n)
10567 .k(16)
10568 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080010569 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010570 }
10571 }
10572 }
10573
10574 TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_eq_16_subtile_m) {
10575 TEST_REQUIRES_ARM_NEON_V8;
10576 for (uint32_t m = 1; m <= 2; m++) {
10577 GemmMicrokernelTester()
10578 .mr(2)
10579 .nr(8)
10580 .kr(4)
10581 .sr(2)
10582 .m(m)
10583 .n(8)
10584 .k(16)
10585 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080010586 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010587 }
10588 }
10589
10590 TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_eq_16_subtile_n) {
10591 TEST_REQUIRES_ARM_NEON_V8;
10592 for (uint32_t n = 1; n <= 8; n++) {
10593 GemmMicrokernelTester()
10594 .mr(2)
10595 .nr(8)
10596 .kr(4)
10597 .sr(2)
10598 .m(2)
10599 .n(n)
10600 .k(16)
10601 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080010602 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010603 }
10604 }
10605
10606 TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_lt_16) {
10607 TEST_REQUIRES_ARM_NEON_V8;
10608 for (size_t k = 1; k < 16; k++) {
10609 GemmMicrokernelTester()
10610 .mr(2)
10611 .nr(8)
10612 .kr(4)
10613 .sr(2)
10614 .m(2)
10615 .n(8)
10616 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080010617 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010618 }
10619 }
10620
10621 TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_lt_16_strided_a) {
10622 TEST_REQUIRES_ARM_NEON_V8;
10623 for (size_t k = 1; k < 16; k++) {
10624 GemmMicrokernelTester()
10625 .mr(2)
10626 .nr(8)
10627 .kr(4)
10628 .sr(2)
10629 .m(2)
10630 .n(8)
10631 .k(k)
10632 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080010633 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010634 }
10635 }
10636
10637 TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_lt_16_subtile) {
10638 TEST_REQUIRES_ARM_NEON_V8;
10639 for (size_t k = 1; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080010640 for (uint32_t n = 1; n <= 8; n++) {
10641 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010642 GemmMicrokernelTester()
10643 .mr(2)
10644 .nr(8)
10645 .kr(4)
10646 .sr(2)
10647 .m(m)
10648 .n(n)
10649 .k(k)
10650 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080010651 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010652 }
10653 }
10654 }
10655 }
10656
10657 TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_gt_16) {
10658 TEST_REQUIRES_ARM_NEON_V8;
10659 for (size_t k = 17; k < 32; k++) {
10660 GemmMicrokernelTester()
10661 .mr(2)
10662 .nr(8)
10663 .kr(4)
10664 .sr(2)
10665 .m(2)
10666 .n(8)
10667 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080010668 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010669 }
10670 }
10671
10672 TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_gt_16_strided_a) {
10673 TEST_REQUIRES_ARM_NEON_V8;
10674 for (size_t k = 17; k < 32; k++) {
10675 GemmMicrokernelTester()
10676 .mr(2)
10677 .nr(8)
10678 .kr(4)
10679 .sr(2)
10680 .m(2)
10681 .n(8)
10682 .k(k)
10683 .a_stride(37)
Marat Dukhan50323b82022-01-11 00:12:01 -080010684 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010685 }
10686 }
10687
10688 TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_gt_16_subtile) {
10689 TEST_REQUIRES_ARM_NEON_V8;
10690 for (size_t k = 17; k < 32; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080010691 for (uint32_t n = 1; n <= 8; n++) {
10692 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010693 GemmMicrokernelTester()
10694 .mr(2)
10695 .nr(8)
10696 .kr(4)
10697 .sr(2)
10698 .m(m)
10699 .n(n)
10700 .k(k)
10701 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080010702 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010703 }
10704 }
10705 }
10706 }
10707
10708 TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_div_16) {
10709 TEST_REQUIRES_ARM_NEON_V8;
10710 for (size_t k = 32; k <= 160; k += 16) {
10711 GemmMicrokernelTester()
10712 .mr(2)
10713 .nr(8)
10714 .kr(4)
10715 .sr(2)
10716 .m(2)
10717 .n(8)
10718 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080010719 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010720 }
10721 }
10722
10723 TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_div_16_strided_a) {
10724 TEST_REQUIRES_ARM_NEON_V8;
10725 for (size_t k = 32; k <= 160; k += 16) {
10726 GemmMicrokernelTester()
10727 .mr(2)
10728 .nr(8)
10729 .kr(4)
10730 .sr(2)
10731 .m(2)
10732 .n(8)
10733 .k(k)
10734 .a_stride(163)
Marat Dukhan50323b82022-01-11 00:12:01 -080010735 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010736 }
10737 }
10738
10739 TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, k_div_16_subtile) {
10740 TEST_REQUIRES_ARM_NEON_V8;
10741 for (size_t k = 32; k <= 160; k += 16) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080010742 for (uint32_t n = 1; n <= 8; n++) {
10743 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010744 GemmMicrokernelTester()
10745 .mr(2)
10746 .nr(8)
10747 .kr(4)
10748 .sr(2)
10749 .m(m)
10750 .n(n)
10751 .k(k)
10752 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080010753 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010754 }
10755 }
10756 }
10757 }
10758
10759 TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, n_gt_8) {
10760 TEST_REQUIRES_ARM_NEON_V8;
10761 for (uint32_t n = 9; n < 16; n++) {
10762 for (size_t k = 1; k <= 80; k += 17) {
10763 GemmMicrokernelTester()
10764 .mr(2)
10765 .nr(8)
10766 .kr(4)
10767 .sr(2)
10768 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080010769 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010770 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080010771 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010772 }
10773 }
10774 }
10775
10776 TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, n_gt_8_strided_cn) {
10777 TEST_REQUIRES_ARM_NEON_V8;
10778 for (uint32_t n = 9; n < 16; n++) {
10779 for (size_t k = 1; k <= 80; k += 17) {
10780 GemmMicrokernelTester()
10781 .mr(2)
10782 .nr(8)
10783 .kr(4)
10784 .sr(2)
10785 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080010786 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010787 .k(k)
10788 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080010789 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010790 }
10791 }
10792 }
10793
10794 TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, n_gt_8_strided_a) {
10795 TEST_REQUIRES_ARM_NEON_V8;
10796 for (uint32_t n = 9; n < 16; n++) {
10797 for (size_t k = 1; k <= 80; k += 17) {
10798 GemmMicrokernelTester()
10799 .mr(2)
10800 .nr(8)
10801 .kr(4)
10802 .sr(2)
10803 .m(2)
10804 .n(n)
10805 .k(k)
10806 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080010807 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010808 }
10809 }
10810 }
10811
10812 TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, n_gt_8_subtile) {
10813 TEST_REQUIRES_ARM_NEON_V8;
10814 for (uint32_t n = 9; n < 16; n++) {
10815 for (size_t k = 1; k <= 80; k += 17) {
10816 for (uint32_t m = 1; m <= 2; m++) {
10817 GemmMicrokernelTester()
10818 .mr(2)
10819 .nr(8)
10820 .kr(4)
10821 .sr(2)
10822 .m(m)
10823 .n(n)
10824 .k(k)
10825 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080010826 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010827 }
10828 }
10829 }
10830 }
10831
10832 TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, n_div_8) {
10833 TEST_REQUIRES_ARM_NEON_V8;
10834 for (uint32_t n = 16; n <= 24; n += 8) {
10835 for (size_t k = 1; k <= 80; k += 17) {
10836 GemmMicrokernelTester()
10837 .mr(2)
10838 .nr(8)
10839 .kr(4)
10840 .sr(2)
10841 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080010842 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010843 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080010844 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010845 }
10846 }
10847 }
10848
10849 TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, n_div_8_strided_cn) {
10850 TEST_REQUIRES_ARM_NEON_V8;
10851 for (uint32_t n = 16; n <= 24; n += 8) {
10852 for (size_t k = 1; k <= 80; k += 17) {
10853 GemmMicrokernelTester()
10854 .mr(2)
10855 .nr(8)
10856 .kr(4)
10857 .sr(2)
10858 .m(2)
10859 .n(n)
10860 .k(k)
10861 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080010862 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010863 }
10864 }
10865 }
10866
10867 TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, n_div_8_strided_a) {
10868 TEST_REQUIRES_ARM_NEON_V8;
10869 for (uint32_t n = 16; n <= 24; n += 8) {
10870 for (size_t k = 1; k <= 80; k += 17) {
10871 GemmMicrokernelTester()
10872 .mr(2)
10873 .nr(8)
10874 .kr(4)
10875 .sr(2)
10876 .m(2)
10877 .n(n)
10878 .k(k)
10879 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080010880 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010881 }
10882 }
10883 }
10884
10885 TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, n_div_8_subtile) {
10886 TEST_REQUIRES_ARM_NEON_V8;
10887 for (uint32_t n = 16; n <= 24; n += 8) {
10888 for (size_t k = 1; k <= 80; k += 17) {
10889 for (uint32_t m = 1; m <= 2; m++) {
10890 GemmMicrokernelTester()
10891 .mr(2)
10892 .nr(8)
10893 .kr(4)
10894 .sr(2)
10895 .m(m)
10896 .n(n)
10897 .k(k)
10898 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080010899 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010900 }
10901 }
10902 }
10903 }
10904
10905 TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, strided_cm_subtile) {
10906 TEST_REQUIRES_ARM_NEON_V8;
10907 for (size_t k = 1; k <= 80; k += 17) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080010908 for (uint32_t n = 1; n <= 8; n++) {
10909 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010910 GemmMicrokernelTester()
10911 .mr(2)
10912 .nr(8)
10913 .kr(4)
10914 .sr(2)
10915 .m(m)
10916 .n(n)
10917 .k(k)
10918 .cm_stride(11)
10919 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080010920 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010921 }
10922 }
10923 }
10924 }
10925
10926 TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, qmin) {
10927 TEST_REQUIRES_ARM_NEON_V8;
10928 GemmMicrokernelTester()
10929 .mr(2)
10930 .nr(8)
10931 .kr(4)
10932 .sr(2)
10933 .m(2)
10934 .n(8)
10935 .k(16)
10936 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080010937 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010938 }
10939
10940 TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, qmax) {
10941 TEST_REQUIRES_ARM_NEON_V8;
10942 GemmMicrokernelTester()
10943 .mr(2)
10944 .nr(8)
10945 .kr(4)
10946 .sr(2)
10947 .m(2)
10948 .n(8)
10949 .k(16)
10950 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080010951 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010952 }
10953
10954 TEST(QC8_GEMM_MINMAX_FP32_2X8C4S2__NEONV8_MLAL, strided_cm) {
10955 TEST_REQUIRES_ARM_NEON_V8;
10956 GemmMicrokernelTester()
10957 .mr(2)
10958 .nr(8)
10959 .kr(4)
10960 .sr(2)
10961 .m(2)
10962 .n(8)
10963 .k(16)
10964 .cm_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080010965 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4s2__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010966 }
10967#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
10968
10969
10970#if XNN_ARCH_ARM || XNN_ARCH_ARM64
10971 TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_eq_16) {
10972 TEST_REQUIRES_ARM_NEON;
10973 GemmMicrokernelTester()
10974 .mr(1)
10975 .nr(8)
10976 .kr(2)
10977 .sr(4)
10978 .m(1)
10979 .n(8)
10980 .k(16)
Marat Dukhan50323b82022-01-11 00:12:01 -080010981 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010982 }
10983
10984 TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, strided_cn) {
10985 TEST_REQUIRES_ARM_NEON;
10986 GemmMicrokernelTester()
10987 .mr(1)
10988 .nr(8)
10989 .kr(2)
10990 .sr(4)
10991 .m(1)
10992 .n(8)
10993 .k(16)
10994 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080010995 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010996 }
10997
10998 TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_eq_16_strided_a) {
10999 TEST_REQUIRES_ARM_NEON;
11000 GemmMicrokernelTester()
11001 .mr(1)
11002 .nr(8)
11003 .kr(2)
11004 .sr(4)
11005 .m(1)
11006 .n(8)
11007 .k(16)
11008 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080011009 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011010 }
11011
11012 TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_eq_16_subtile) {
11013 TEST_REQUIRES_ARM_NEON;
Zhi An Ng83844ae2022-01-14 09:52:25 -080011014 for (uint32_t n = 1; n <= 8; n++) {
11015 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011016 GemmMicrokernelTester()
11017 .mr(1)
11018 .nr(8)
11019 .kr(2)
11020 .sr(4)
11021 .m(m)
11022 .n(n)
11023 .k(16)
11024 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080011025 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011026 }
11027 }
11028 }
11029
11030 TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_eq_16_subtile_m) {
11031 TEST_REQUIRES_ARM_NEON;
11032 for (uint32_t m = 1; m <= 1; m++) {
11033 GemmMicrokernelTester()
11034 .mr(1)
11035 .nr(8)
11036 .kr(2)
11037 .sr(4)
11038 .m(m)
11039 .n(8)
11040 .k(16)
11041 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080011042 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011043 }
11044 }
11045
11046 TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_eq_16_subtile_n) {
11047 TEST_REQUIRES_ARM_NEON;
11048 for (uint32_t n = 1; n <= 8; n++) {
11049 GemmMicrokernelTester()
11050 .mr(1)
11051 .nr(8)
11052 .kr(2)
11053 .sr(4)
11054 .m(1)
11055 .n(n)
11056 .k(16)
11057 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080011058 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011059 }
11060 }
11061
11062 TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_lt_16) {
11063 TEST_REQUIRES_ARM_NEON;
11064 for (size_t k = 1; k < 16; k++) {
11065 GemmMicrokernelTester()
11066 .mr(1)
11067 .nr(8)
11068 .kr(2)
11069 .sr(4)
11070 .m(1)
11071 .n(8)
11072 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080011073 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011074 }
11075 }
11076
11077 TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_lt_16_strided_a) {
11078 TEST_REQUIRES_ARM_NEON;
11079 for (size_t k = 1; k < 16; k++) {
11080 GemmMicrokernelTester()
11081 .mr(1)
11082 .nr(8)
11083 .kr(2)
11084 .sr(4)
11085 .m(1)
11086 .n(8)
11087 .k(k)
11088 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080011089 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011090 }
11091 }
11092
11093 TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_lt_16_subtile) {
11094 TEST_REQUIRES_ARM_NEON;
11095 for (size_t k = 1; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080011096 for (uint32_t n = 1; n <= 8; n++) {
11097 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011098 GemmMicrokernelTester()
11099 .mr(1)
11100 .nr(8)
11101 .kr(2)
11102 .sr(4)
11103 .m(m)
11104 .n(n)
11105 .k(k)
11106 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080011107 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011108 }
11109 }
11110 }
11111 }
11112
11113 TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_gt_16) {
11114 TEST_REQUIRES_ARM_NEON;
11115 for (size_t k = 17; k < 32; k++) {
11116 GemmMicrokernelTester()
11117 .mr(1)
11118 .nr(8)
11119 .kr(2)
11120 .sr(4)
11121 .m(1)
11122 .n(8)
11123 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080011124 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011125 }
11126 }
11127
11128 TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_gt_16_strided_a) {
11129 TEST_REQUIRES_ARM_NEON;
11130 for (size_t k = 17; k < 32; k++) {
11131 GemmMicrokernelTester()
11132 .mr(1)
11133 .nr(8)
11134 .kr(2)
11135 .sr(4)
11136 .m(1)
11137 .n(8)
11138 .k(k)
11139 .a_stride(37)
Marat Dukhan50323b82022-01-11 00:12:01 -080011140 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011141 }
11142 }
11143
11144 TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_gt_16_subtile) {
11145 TEST_REQUIRES_ARM_NEON;
11146 for (size_t k = 17; k < 32; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080011147 for (uint32_t n = 1; n <= 8; n++) {
11148 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011149 GemmMicrokernelTester()
11150 .mr(1)
11151 .nr(8)
11152 .kr(2)
11153 .sr(4)
11154 .m(m)
11155 .n(n)
11156 .k(k)
11157 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080011158 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011159 }
11160 }
11161 }
11162 }
11163
11164 TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_div_16) {
11165 TEST_REQUIRES_ARM_NEON;
11166 for (size_t k = 32; k <= 160; k += 16) {
11167 GemmMicrokernelTester()
11168 .mr(1)
11169 .nr(8)
11170 .kr(2)
11171 .sr(4)
11172 .m(1)
11173 .n(8)
11174 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080011175 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011176 }
11177 }
11178
11179 TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_div_16_strided_a) {
11180 TEST_REQUIRES_ARM_NEON;
11181 for (size_t k = 32; k <= 160; k += 16) {
11182 GemmMicrokernelTester()
11183 .mr(1)
11184 .nr(8)
11185 .kr(2)
11186 .sr(4)
11187 .m(1)
11188 .n(8)
11189 .k(k)
11190 .a_stride(163)
Marat Dukhan50323b82022-01-11 00:12:01 -080011191 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011192 }
11193 }
11194
11195 TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, k_div_16_subtile) {
11196 TEST_REQUIRES_ARM_NEON;
11197 for (size_t k = 32; k <= 160; k += 16) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080011198 for (uint32_t n = 1; n <= 8; n++) {
11199 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011200 GemmMicrokernelTester()
11201 .mr(1)
11202 .nr(8)
11203 .kr(2)
11204 .sr(4)
11205 .m(m)
11206 .n(n)
11207 .k(k)
11208 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080011209 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011210 }
11211 }
11212 }
11213 }
11214
11215 TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, n_gt_8) {
11216 TEST_REQUIRES_ARM_NEON;
11217 for (uint32_t n = 9; n < 16; n++) {
11218 for (size_t k = 1; k <= 80; k += 17) {
11219 GemmMicrokernelTester()
11220 .mr(1)
11221 .nr(8)
11222 .kr(2)
11223 .sr(4)
11224 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080011225 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011226 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080011227 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011228 }
11229 }
11230 }
11231
11232 TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, n_gt_8_strided_cn) {
11233 TEST_REQUIRES_ARM_NEON;
11234 for (uint32_t n = 9; n < 16; n++) {
11235 for (size_t k = 1; k <= 80; k += 17) {
11236 GemmMicrokernelTester()
11237 .mr(1)
11238 .nr(8)
11239 .kr(2)
11240 .sr(4)
11241 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080011242 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011243 .k(k)
11244 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080011245 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011246 }
11247 }
11248 }
11249
11250 TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, n_gt_8_strided_a) {
11251 TEST_REQUIRES_ARM_NEON;
11252 for (uint32_t n = 9; n < 16; n++) {
11253 for (size_t k = 1; k <= 80; k += 17) {
11254 GemmMicrokernelTester()
11255 .mr(1)
11256 .nr(8)
11257 .kr(2)
11258 .sr(4)
11259 .m(1)
11260 .n(n)
11261 .k(k)
11262 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080011263 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011264 }
11265 }
11266 }
11267
11268 TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, n_gt_8_subtile) {
11269 TEST_REQUIRES_ARM_NEON;
11270 for (uint32_t n = 9; n < 16; n++) {
11271 for (size_t k = 1; k <= 80; k += 17) {
11272 for (uint32_t m = 1; m <= 1; m++) {
11273 GemmMicrokernelTester()
11274 .mr(1)
11275 .nr(8)
11276 .kr(2)
11277 .sr(4)
11278 .m(m)
11279 .n(n)
11280 .k(k)
11281 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080011282 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011283 }
11284 }
11285 }
11286 }
11287
11288 TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, n_div_8) {
11289 TEST_REQUIRES_ARM_NEON;
11290 for (uint32_t n = 16; n <= 24; n += 8) {
11291 for (size_t k = 1; k <= 80; k += 17) {
11292 GemmMicrokernelTester()
11293 .mr(1)
11294 .nr(8)
11295 .kr(2)
11296 .sr(4)
11297 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080011298 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011299 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080011300 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011301 }
11302 }
11303 }
11304
11305 TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, n_div_8_strided_cn) {
11306 TEST_REQUIRES_ARM_NEON;
11307 for (uint32_t n = 16; n <= 24; n += 8) {
11308 for (size_t k = 1; k <= 80; k += 17) {
11309 GemmMicrokernelTester()
11310 .mr(1)
11311 .nr(8)
11312 .kr(2)
11313 .sr(4)
11314 .m(1)
11315 .n(n)
11316 .k(k)
11317 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080011318 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011319 }
11320 }
11321 }
11322
11323 TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, n_div_8_strided_a) {
11324 TEST_REQUIRES_ARM_NEON;
11325 for (uint32_t n = 16; n <= 24; n += 8) {
11326 for (size_t k = 1; k <= 80; k += 17) {
11327 GemmMicrokernelTester()
11328 .mr(1)
11329 .nr(8)
11330 .kr(2)
11331 .sr(4)
11332 .m(1)
11333 .n(n)
11334 .k(k)
11335 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080011336 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011337 }
11338 }
11339 }
11340
11341 TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, n_div_8_subtile) {
11342 TEST_REQUIRES_ARM_NEON;
11343 for (uint32_t n = 16; n <= 24; n += 8) {
11344 for (size_t k = 1; k <= 80; k += 17) {
11345 for (uint32_t m = 1; m <= 1; m++) {
11346 GemmMicrokernelTester()
11347 .mr(1)
11348 .nr(8)
11349 .kr(2)
11350 .sr(4)
11351 .m(m)
11352 .n(n)
11353 .k(k)
11354 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080011355 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011356 }
11357 }
11358 }
11359 }
11360
11361 TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, strided_cm_subtile) {
11362 TEST_REQUIRES_ARM_NEON;
11363 for (size_t k = 1; k <= 80; k += 17) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080011364 for (uint32_t n = 1; n <= 8; n++) {
11365 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011366 GemmMicrokernelTester()
11367 .mr(1)
11368 .nr(8)
11369 .kr(2)
11370 .sr(4)
11371 .m(m)
11372 .n(n)
11373 .k(k)
11374 .cm_stride(11)
11375 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080011376 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011377 }
11378 }
11379 }
11380 }
11381
11382 TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, qmin) {
11383 TEST_REQUIRES_ARM_NEON;
11384 GemmMicrokernelTester()
11385 .mr(1)
11386 .nr(8)
11387 .kr(2)
11388 .sr(4)
11389 .m(1)
11390 .n(8)
11391 .k(16)
11392 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080011393 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011394 }
11395
11396 TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, qmax) {
11397 TEST_REQUIRES_ARM_NEON;
11398 GemmMicrokernelTester()
11399 .mr(1)
11400 .nr(8)
11401 .kr(2)
11402 .sr(4)
11403 .m(1)
11404 .n(8)
11405 .k(16)
11406 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080011407 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011408 }
11409
11410 TEST(QC8_GEMM_MINMAX_FP32_1X8C2S4__NEON_MLAL, strided_cm) {
11411 TEST_REQUIRES_ARM_NEON;
11412 GemmMicrokernelTester()
11413 .mr(1)
11414 .nr(8)
11415 .kr(2)
11416 .sr(4)
11417 .m(1)
11418 .n(8)
11419 .k(16)
11420 .cm_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080011421 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011422 }
11423#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
11424
11425
11426#if XNN_ARCH_ARM || XNN_ARCH_ARM64
11427 TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_eq_16) {
11428 TEST_REQUIRES_ARM_NEON_V8;
11429 GemmMicrokernelTester()
11430 .mr(2)
11431 .nr(8)
11432 .kr(2)
11433 .sr(4)
11434 .m(2)
11435 .n(8)
11436 .k(16)
Marat Dukhan50323b82022-01-11 00:12:01 -080011437 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011438 }
11439
11440 TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, strided_cn) {
11441 TEST_REQUIRES_ARM_NEON_V8;
11442 GemmMicrokernelTester()
11443 .mr(2)
11444 .nr(8)
11445 .kr(2)
11446 .sr(4)
11447 .m(2)
11448 .n(8)
11449 .k(16)
11450 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080011451 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011452 }
11453
11454 TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_eq_16_strided_a) {
11455 TEST_REQUIRES_ARM_NEON_V8;
11456 GemmMicrokernelTester()
11457 .mr(2)
11458 .nr(8)
11459 .kr(2)
11460 .sr(4)
11461 .m(2)
11462 .n(8)
11463 .k(16)
11464 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080011465 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011466 }
11467
11468 TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_eq_16_subtile) {
11469 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng83844ae2022-01-14 09:52:25 -080011470 for (uint32_t n = 1; n <= 8; n++) {
11471 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011472 GemmMicrokernelTester()
11473 .mr(2)
11474 .nr(8)
11475 .kr(2)
11476 .sr(4)
11477 .m(m)
11478 .n(n)
11479 .k(16)
11480 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080011481 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011482 }
11483 }
11484 }
11485
11486 TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_eq_16_subtile_m) {
11487 TEST_REQUIRES_ARM_NEON_V8;
11488 for (uint32_t m = 1; m <= 2; m++) {
11489 GemmMicrokernelTester()
11490 .mr(2)
11491 .nr(8)
11492 .kr(2)
11493 .sr(4)
11494 .m(m)
11495 .n(8)
11496 .k(16)
11497 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080011498 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011499 }
11500 }
11501
11502 TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_eq_16_subtile_n) {
11503 TEST_REQUIRES_ARM_NEON_V8;
11504 for (uint32_t n = 1; n <= 8; n++) {
11505 GemmMicrokernelTester()
11506 .mr(2)
11507 .nr(8)
11508 .kr(2)
11509 .sr(4)
11510 .m(2)
11511 .n(n)
11512 .k(16)
11513 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080011514 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011515 }
11516 }
11517
11518 TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_lt_16) {
11519 TEST_REQUIRES_ARM_NEON_V8;
11520 for (size_t k = 1; k < 16; k++) {
11521 GemmMicrokernelTester()
11522 .mr(2)
11523 .nr(8)
11524 .kr(2)
11525 .sr(4)
11526 .m(2)
11527 .n(8)
11528 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080011529 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011530 }
11531 }
11532
11533 TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_lt_16_strided_a) {
11534 TEST_REQUIRES_ARM_NEON_V8;
11535 for (size_t k = 1; k < 16; k++) {
11536 GemmMicrokernelTester()
11537 .mr(2)
11538 .nr(8)
11539 .kr(2)
11540 .sr(4)
11541 .m(2)
11542 .n(8)
11543 .k(k)
11544 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080011545 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011546 }
11547 }
11548
11549 TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_lt_16_subtile) {
11550 TEST_REQUIRES_ARM_NEON_V8;
11551 for (size_t k = 1; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080011552 for (uint32_t n = 1; n <= 8; n++) {
11553 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011554 GemmMicrokernelTester()
11555 .mr(2)
11556 .nr(8)
11557 .kr(2)
11558 .sr(4)
11559 .m(m)
11560 .n(n)
11561 .k(k)
11562 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080011563 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011564 }
11565 }
11566 }
11567 }
11568
11569 TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_gt_16) {
11570 TEST_REQUIRES_ARM_NEON_V8;
11571 for (size_t k = 17; k < 32; k++) {
11572 GemmMicrokernelTester()
11573 .mr(2)
11574 .nr(8)
11575 .kr(2)
11576 .sr(4)
11577 .m(2)
11578 .n(8)
11579 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080011580 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011581 }
11582 }
11583
11584 TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_gt_16_strided_a) {
11585 TEST_REQUIRES_ARM_NEON_V8;
11586 for (size_t k = 17; k < 32; k++) {
11587 GemmMicrokernelTester()
11588 .mr(2)
11589 .nr(8)
11590 .kr(2)
11591 .sr(4)
11592 .m(2)
11593 .n(8)
11594 .k(k)
11595 .a_stride(37)
Marat Dukhan50323b82022-01-11 00:12:01 -080011596 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011597 }
11598 }
11599
11600 TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_gt_16_subtile) {
11601 TEST_REQUIRES_ARM_NEON_V8;
11602 for (size_t k = 17; k < 32; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080011603 for (uint32_t n = 1; n <= 8; n++) {
11604 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011605 GemmMicrokernelTester()
11606 .mr(2)
11607 .nr(8)
11608 .kr(2)
11609 .sr(4)
11610 .m(m)
11611 .n(n)
11612 .k(k)
11613 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080011614 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011615 }
11616 }
11617 }
11618 }
11619
11620 TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_div_16) {
11621 TEST_REQUIRES_ARM_NEON_V8;
11622 for (size_t k = 32; k <= 160; k += 16) {
11623 GemmMicrokernelTester()
11624 .mr(2)
11625 .nr(8)
11626 .kr(2)
11627 .sr(4)
11628 .m(2)
11629 .n(8)
11630 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080011631 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011632 }
11633 }
11634
11635 TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_div_16_strided_a) {
11636 TEST_REQUIRES_ARM_NEON_V8;
11637 for (size_t k = 32; k <= 160; k += 16) {
11638 GemmMicrokernelTester()
11639 .mr(2)
11640 .nr(8)
11641 .kr(2)
11642 .sr(4)
11643 .m(2)
11644 .n(8)
11645 .k(k)
11646 .a_stride(163)
Marat Dukhan50323b82022-01-11 00:12:01 -080011647 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011648 }
11649 }
11650
11651 TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, k_div_16_subtile) {
11652 TEST_REQUIRES_ARM_NEON_V8;
11653 for (size_t k = 32; k <= 160; k += 16) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080011654 for (uint32_t n = 1; n <= 8; n++) {
11655 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011656 GemmMicrokernelTester()
11657 .mr(2)
11658 .nr(8)
11659 .kr(2)
11660 .sr(4)
11661 .m(m)
11662 .n(n)
11663 .k(k)
11664 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080011665 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011666 }
11667 }
11668 }
11669 }
11670
11671 TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, n_gt_8) {
11672 TEST_REQUIRES_ARM_NEON_V8;
11673 for (uint32_t n = 9; n < 16; n++) {
11674 for (size_t k = 1; k <= 80; k += 17) {
11675 GemmMicrokernelTester()
11676 .mr(2)
11677 .nr(8)
11678 .kr(2)
11679 .sr(4)
11680 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080011681 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011682 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080011683 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011684 }
11685 }
11686 }
11687
11688 TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, n_gt_8_strided_cn) {
11689 TEST_REQUIRES_ARM_NEON_V8;
11690 for (uint32_t n = 9; n < 16; n++) {
11691 for (size_t k = 1; k <= 80; k += 17) {
11692 GemmMicrokernelTester()
11693 .mr(2)
11694 .nr(8)
11695 .kr(2)
11696 .sr(4)
11697 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080011698 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011699 .k(k)
11700 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080011701 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011702 }
11703 }
11704 }
11705
11706 TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, n_gt_8_strided_a) {
11707 TEST_REQUIRES_ARM_NEON_V8;
11708 for (uint32_t n = 9; n < 16; n++) {
11709 for (size_t k = 1; k <= 80; k += 17) {
11710 GemmMicrokernelTester()
11711 .mr(2)
11712 .nr(8)
11713 .kr(2)
11714 .sr(4)
11715 .m(2)
11716 .n(n)
11717 .k(k)
11718 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080011719 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011720 }
11721 }
11722 }
11723
11724 TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, n_gt_8_subtile) {
11725 TEST_REQUIRES_ARM_NEON_V8;
11726 for (uint32_t n = 9; n < 16; n++) {
11727 for (size_t k = 1; k <= 80; k += 17) {
11728 for (uint32_t m = 1; m <= 2; m++) {
11729 GemmMicrokernelTester()
11730 .mr(2)
11731 .nr(8)
11732 .kr(2)
11733 .sr(4)
11734 .m(m)
11735 .n(n)
11736 .k(k)
11737 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080011738 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011739 }
11740 }
11741 }
11742 }
11743
11744 TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, n_div_8) {
11745 TEST_REQUIRES_ARM_NEON_V8;
11746 for (uint32_t n = 16; n <= 24; n += 8) {
11747 for (size_t k = 1; k <= 80; k += 17) {
11748 GemmMicrokernelTester()
11749 .mr(2)
11750 .nr(8)
11751 .kr(2)
11752 .sr(4)
11753 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080011754 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011755 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080011756 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011757 }
11758 }
11759 }
11760
11761 TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, n_div_8_strided_cn) {
11762 TEST_REQUIRES_ARM_NEON_V8;
11763 for (uint32_t n = 16; n <= 24; n += 8) {
11764 for (size_t k = 1; k <= 80; k += 17) {
11765 GemmMicrokernelTester()
11766 .mr(2)
11767 .nr(8)
11768 .kr(2)
11769 .sr(4)
11770 .m(2)
11771 .n(n)
11772 .k(k)
11773 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080011774 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011775 }
11776 }
11777 }
11778
11779 TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, n_div_8_strided_a) {
11780 TEST_REQUIRES_ARM_NEON_V8;
11781 for (uint32_t n = 16; n <= 24; n += 8) {
11782 for (size_t k = 1; k <= 80; k += 17) {
11783 GemmMicrokernelTester()
11784 .mr(2)
11785 .nr(8)
11786 .kr(2)
11787 .sr(4)
11788 .m(2)
11789 .n(n)
11790 .k(k)
11791 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080011792 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011793 }
11794 }
11795 }
11796
11797 TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, n_div_8_subtile) {
11798 TEST_REQUIRES_ARM_NEON_V8;
11799 for (uint32_t n = 16; n <= 24; n += 8) {
11800 for (size_t k = 1; k <= 80; k += 17) {
11801 for (uint32_t m = 1; m <= 2; m++) {
11802 GemmMicrokernelTester()
11803 .mr(2)
11804 .nr(8)
11805 .kr(2)
11806 .sr(4)
11807 .m(m)
11808 .n(n)
11809 .k(k)
11810 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080011811 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011812 }
11813 }
11814 }
11815 }
11816
11817 TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, strided_cm_subtile) {
11818 TEST_REQUIRES_ARM_NEON_V8;
11819 for (size_t k = 1; k <= 80; k += 17) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080011820 for (uint32_t n = 1; n <= 8; n++) {
11821 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011822 GemmMicrokernelTester()
11823 .mr(2)
11824 .nr(8)
11825 .kr(2)
11826 .sr(4)
11827 .m(m)
11828 .n(n)
11829 .k(k)
11830 .cm_stride(11)
11831 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080011832 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011833 }
11834 }
11835 }
11836 }
11837
11838 TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, qmin) {
11839 TEST_REQUIRES_ARM_NEON_V8;
11840 GemmMicrokernelTester()
11841 .mr(2)
11842 .nr(8)
11843 .kr(2)
11844 .sr(4)
11845 .m(2)
11846 .n(8)
11847 .k(16)
11848 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080011849 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011850 }
11851
11852 TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, qmax) {
11853 TEST_REQUIRES_ARM_NEON_V8;
11854 GemmMicrokernelTester()
11855 .mr(2)
11856 .nr(8)
11857 .kr(2)
11858 .sr(4)
11859 .m(2)
11860 .n(8)
11861 .k(16)
11862 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080011863 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011864 }
11865
11866 TEST(QC8_GEMM_MINMAX_FP32_2X8C2S4__NEONV8_MLAL, strided_cm) {
11867 TEST_REQUIRES_ARM_NEON_V8;
11868 GemmMicrokernelTester()
11869 .mr(2)
11870 .nr(8)
11871 .kr(2)
11872 .sr(4)
11873 .m(2)
11874 .n(8)
11875 .k(16)
11876 .cm_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080011877 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011878 }
11879#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
11880
11881
11882#if XNN_ARCH_ARM || XNN_ARCH_ARM64
11883 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_eq_16) {
11884 TEST_REQUIRES_ARM_NEON;
11885 GemmMicrokernelTester()
11886 .mr(1)
11887 .nr(8)
11888 .kr(4)
11889 .sr(1)
11890 .m(1)
11891 .n(8)
11892 .k(16)
Marat Dukhan50323b82022-01-11 00:12:01 -080011893 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011894 }
11895
11896 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, strided_cn) {
11897 TEST_REQUIRES_ARM_NEON;
11898 GemmMicrokernelTester()
11899 .mr(1)
11900 .nr(8)
11901 .kr(4)
11902 .sr(1)
11903 .m(1)
11904 .n(8)
11905 .k(16)
11906 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080011907 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011908 }
11909
11910 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_eq_16_strided_a) {
11911 TEST_REQUIRES_ARM_NEON;
11912 GemmMicrokernelTester()
11913 .mr(1)
11914 .nr(8)
11915 .kr(4)
11916 .sr(1)
11917 .m(1)
11918 .n(8)
11919 .k(16)
11920 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080011921 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011922 }
11923
11924 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_eq_16_subtile) {
11925 TEST_REQUIRES_ARM_NEON;
Zhi An Ng83844ae2022-01-14 09:52:25 -080011926 for (uint32_t n = 1; n <= 8; n++) {
11927 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011928 GemmMicrokernelTester()
11929 .mr(1)
11930 .nr(8)
11931 .kr(4)
11932 .sr(1)
11933 .m(m)
11934 .n(n)
11935 .k(16)
11936 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080011937 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011938 }
11939 }
11940 }
11941
11942 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_eq_16_subtile_m) {
11943 TEST_REQUIRES_ARM_NEON;
11944 for (uint32_t m = 1; m <= 1; m++) {
11945 GemmMicrokernelTester()
11946 .mr(1)
11947 .nr(8)
11948 .kr(4)
11949 .sr(1)
11950 .m(m)
11951 .n(8)
11952 .k(16)
11953 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080011954 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011955 }
11956 }
11957
11958 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_eq_16_subtile_n) {
11959 TEST_REQUIRES_ARM_NEON;
11960 for (uint32_t n = 1; n <= 8; n++) {
11961 GemmMicrokernelTester()
11962 .mr(1)
11963 .nr(8)
11964 .kr(4)
11965 .sr(1)
11966 .m(1)
11967 .n(n)
11968 .k(16)
11969 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080011970 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011971 }
11972 }
11973
11974 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_lt_16) {
11975 TEST_REQUIRES_ARM_NEON;
11976 for (size_t k = 1; k < 16; k++) {
11977 GemmMicrokernelTester()
11978 .mr(1)
11979 .nr(8)
11980 .kr(4)
11981 .sr(1)
11982 .m(1)
11983 .n(8)
11984 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080011985 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011986 }
11987 }
11988
11989 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_lt_16_strided_a) {
11990 TEST_REQUIRES_ARM_NEON;
11991 for (size_t k = 1; k < 16; k++) {
11992 GemmMicrokernelTester()
11993 .mr(1)
11994 .nr(8)
11995 .kr(4)
11996 .sr(1)
11997 .m(1)
11998 .n(8)
11999 .k(k)
12000 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080012001 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012002 }
12003 }
12004
12005 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_lt_16_subtile) {
12006 TEST_REQUIRES_ARM_NEON;
12007 for (size_t k = 1; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080012008 for (uint32_t n = 1; n <= 8; n++) {
12009 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012010 GemmMicrokernelTester()
12011 .mr(1)
12012 .nr(8)
12013 .kr(4)
12014 .sr(1)
12015 .m(m)
12016 .n(n)
12017 .k(k)
12018 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080012019 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012020 }
12021 }
12022 }
12023 }
12024
12025 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_gt_16) {
12026 TEST_REQUIRES_ARM_NEON;
12027 for (size_t k = 17; k < 32; k++) {
12028 GemmMicrokernelTester()
12029 .mr(1)
12030 .nr(8)
12031 .kr(4)
12032 .sr(1)
12033 .m(1)
12034 .n(8)
12035 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080012036 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012037 }
12038 }
12039
12040 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_gt_16_strided_a) {
12041 TEST_REQUIRES_ARM_NEON;
12042 for (size_t k = 17; k < 32; k++) {
12043 GemmMicrokernelTester()
12044 .mr(1)
12045 .nr(8)
12046 .kr(4)
12047 .sr(1)
12048 .m(1)
12049 .n(8)
12050 .k(k)
12051 .a_stride(37)
Marat Dukhan50323b82022-01-11 00:12:01 -080012052 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012053 }
12054 }
12055
12056 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_gt_16_subtile) {
12057 TEST_REQUIRES_ARM_NEON;
12058 for (size_t k = 17; k < 32; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080012059 for (uint32_t n = 1; n <= 8; n++) {
12060 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012061 GemmMicrokernelTester()
12062 .mr(1)
12063 .nr(8)
12064 .kr(4)
12065 .sr(1)
12066 .m(m)
12067 .n(n)
12068 .k(k)
12069 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080012070 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012071 }
12072 }
12073 }
12074 }
12075
12076 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_div_16) {
12077 TEST_REQUIRES_ARM_NEON;
12078 for (size_t k = 32; k <= 160; k += 16) {
12079 GemmMicrokernelTester()
12080 .mr(1)
12081 .nr(8)
12082 .kr(4)
12083 .sr(1)
12084 .m(1)
12085 .n(8)
12086 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080012087 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012088 }
12089 }
12090
12091 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_div_16_strided_a) {
12092 TEST_REQUIRES_ARM_NEON;
12093 for (size_t k = 32; k <= 160; k += 16) {
12094 GemmMicrokernelTester()
12095 .mr(1)
12096 .nr(8)
12097 .kr(4)
12098 .sr(1)
12099 .m(1)
12100 .n(8)
12101 .k(k)
12102 .a_stride(163)
Marat Dukhan50323b82022-01-11 00:12:01 -080012103 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012104 }
12105 }
12106
12107 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, k_div_16_subtile) {
12108 TEST_REQUIRES_ARM_NEON;
12109 for (size_t k = 32; k <= 160; k += 16) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080012110 for (uint32_t n = 1; n <= 8; n++) {
12111 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012112 GemmMicrokernelTester()
12113 .mr(1)
12114 .nr(8)
12115 .kr(4)
12116 .sr(1)
12117 .m(m)
12118 .n(n)
12119 .k(k)
12120 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080012121 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012122 }
12123 }
12124 }
12125 }
12126
12127 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, n_gt_8) {
12128 TEST_REQUIRES_ARM_NEON;
12129 for (uint32_t n = 9; n < 16; n++) {
12130 for (size_t k = 1; k <= 80; k += 17) {
12131 GemmMicrokernelTester()
12132 .mr(1)
12133 .nr(8)
12134 .kr(4)
12135 .sr(1)
12136 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080012137 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012138 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080012139 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012140 }
12141 }
12142 }
12143
12144 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, n_gt_8_strided_cn) {
12145 TEST_REQUIRES_ARM_NEON;
12146 for (uint32_t n = 9; n < 16; n++) {
12147 for (size_t k = 1; k <= 80; k += 17) {
12148 GemmMicrokernelTester()
12149 .mr(1)
12150 .nr(8)
12151 .kr(4)
12152 .sr(1)
12153 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080012154 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012155 .k(k)
12156 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080012157 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012158 }
12159 }
12160 }
12161
12162 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, n_gt_8_strided_a) {
12163 TEST_REQUIRES_ARM_NEON;
12164 for (uint32_t n = 9; n < 16; n++) {
12165 for (size_t k = 1; k <= 80; k += 17) {
12166 GemmMicrokernelTester()
12167 .mr(1)
12168 .nr(8)
12169 .kr(4)
12170 .sr(1)
12171 .m(1)
12172 .n(n)
12173 .k(k)
12174 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080012175 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012176 }
12177 }
12178 }
12179
12180 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, n_gt_8_subtile) {
12181 TEST_REQUIRES_ARM_NEON;
12182 for (uint32_t n = 9; n < 16; n++) {
12183 for (size_t k = 1; k <= 80; k += 17) {
12184 for (uint32_t m = 1; m <= 1; m++) {
12185 GemmMicrokernelTester()
12186 .mr(1)
12187 .nr(8)
12188 .kr(4)
12189 .sr(1)
12190 .m(m)
12191 .n(n)
12192 .k(k)
12193 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080012194 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012195 }
12196 }
12197 }
12198 }
12199
12200 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, n_div_8) {
12201 TEST_REQUIRES_ARM_NEON;
12202 for (uint32_t n = 16; n <= 24; n += 8) {
12203 for (size_t k = 1; k <= 80; k += 17) {
12204 GemmMicrokernelTester()
12205 .mr(1)
12206 .nr(8)
12207 .kr(4)
12208 .sr(1)
12209 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080012210 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012211 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080012212 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012213 }
12214 }
12215 }
12216
12217 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, n_div_8_strided_cn) {
12218 TEST_REQUIRES_ARM_NEON;
12219 for (uint32_t n = 16; n <= 24; n += 8) {
12220 for (size_t k = 1; k <= 80; k += 17) {
12221 GemmMicrokernelTester()
12222 .mr(1)
12223 .nr(8)
12224 .kr(4)
12225 .sr(1)
12226 .m(1)
12227 .n(n)
12228 .k(k)
12229 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080012230 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012231 }
12232 }
12233 }
12234
12235 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, n_div_8_strided_a) {
12236 TEST_REQUIRES_ARM_NEON;
12237 for (uint32_t n = 16; n <= 24; n += 8) {
12238 for (size_t k = 1; k <= 80; k += 17) {
12239 GemmMicrokernelTester()
12240 .mr(1)
12241 .nr(8)
12242 .kr(4)
12243 .sr(1)
12244 .m(1)
12245 .n(n)
12246 .k(k)
12247 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080012248 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012249 }
12250 }
12251 }
12252
12253 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, n_div_8_subtile) {
12254 TEST_REQUIRES_ARM_NEON;
12255 for (uint32_t n = 16; n <= 24; n += 8) {
12256 for (size_t k = 1; k <= 80; k += 17) {
12257 for (uint32_t m = 1; m <= 1; m++) {
12258 GemmMicrokernelTester()
12259 .mr(1)
12260 .nr(8)
12261 .kr(4)
12262 .sr(1)
12263 .m(m)
12264 .n(n)
12265 .k(k)
12266 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080012267 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012268 }
12269 }
12270 }
12271 }
12272
12273 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, strided_cm_subtile) {
12274 TEST_REQUIRES_ARM_NEON;
12275 for (size_t k = 1; k <= 80; k += 17) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080012276 for (uint32_t n = 1; n <= 8; n++) {
12277 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012278 GemmMicrokernelTester()
12279 .mr(1)
12280 .nr(8)
12281 .kr(4)
12282 .sr(1)
12283 .m(m)
12284 .n(n)
12285 .k(k)
12286 .cm_stride(11)
12287 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080012288 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012289 }
12290 }
12291 }
12292 }
12293
12294 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, qmin) {
12295 TEST_REQUIRES_ARM_NEON;
12296 GemmMicrokernelTester()
12297 .mr(1)
12298 .nr(8)
12299 .kr(4)
12300 .sr(1)
12301 .m(1)
12302 .n(8)
12303 .k(16)
12304 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080012305 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012306 }
12307
12308 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, qmax) {
12309 TEST_REQUIRES_ARM_NEON;
12310 GemmMicrokernelTester()
12311 .mr(1)
12312 .nr(8)
12313 .kr(4)
12314 .sr(1)
12315 .m(1)
12316 .n(8)
12317 .k(16)
12318 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080012319 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012320 }
12321
12322 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_DUP, strided_cm) {
12323 TEST_REQUIRES_ARM_NEON;
12324 GemmMicrokernelTester()
12325 .mr(1)
12326 .nr(8)
12327 .kr(4)
12328 .sr(1)
12329 .m(1)
12330 .n(8)
12331 .k(16)
12332 .cm_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080012333 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012334 }
12335#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
12336
12337
12338#if XNN_ARCH_ARM || XNN_ARCH_ARM64
12339 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_LD2R, k_eq_16) {
12340 TEST_REQUIRES_ARM_NEON;
12341 GemmMicrokernelTester()
12342 .mr(1)
12343 .nr(8)
12344 .kr(4)
12345 .sr(1)
12346 .m(1)
12347 .n(8)
12348 .k(16)
Marat Dukhan50323b82022-01-11 00:12:01 -080012349 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012350 }
12351
12352 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_LD2R, strided_cn) {
12353 TEST_REQUIRES_ARM_NEON;
12354 GemmMicrokernelTester()
12355 .mr(1)
12356 .nr(8)
12357 .kr(4)
12358 .sr(1)
12359 .m(1)
12360 .n(8)
12361 .k(16)
12362 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080012363 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012364 }
12365
12366 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_LD2R, k_eq_16_strided_a) {
12367 TEST_REQUIRES_ARM_NEON;
12368 GemmMicrokernelTester()
12369 .mr(1)
12370 .nr(8)
12371 .kr(4)
12372 .sr(1)
12373 .m(1)
12374 .n(8)
12375 .k(16)
12376 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080012377 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012378 }
12379
12380 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_LD2R, k_eq_16_subtile) {
12381 TEST_REQUIRES_ARM_NEON;
Zhi An Ng83844ae2022-01-14 09:52:25 -080012382 for (uint32_t n = 1; n <= 8; n++) {
12383 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012384 GemmMicrokernelTester()
12385 .mr(1)
12386 .nr(8)
12387 .kr(4)
12388 .sr(1)
12389 .m(m)
12390 .n(n)
12391 .k(16)
12392 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080012393 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012394 }
12395 }
12396 }
12397
12398 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_LD2R, k_eq_16_subtile_m) {
12399 TEST_REQUIRES_ARM_NEON;
12400 for (uint32_t m = 1; m <= 1; m++) {
12401 GemmMicrokernelTester()
12402 .mr(1)
12403 .nr(8)
12404 .kr(4)
12405 .sr(1)
12406 .m(m)
12407 .n(8)
12408 .k(16)
12409 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080012410 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012411 }
12412 }
12413
12414 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_LD2R, k_eq_16_subtile_n) {
12415 TEST_REQUIRES_ARM_NEON;
12416 for (uint32_t n = 1; n <= 8; n++) {
12417 GemmMicrokernelTester()
12418 .mr(1)
12419 .nr(8)
12420 .kr(4)
12421 .sr(1)
12422 .m(1)
12423 .n(n)
12424 .k(16)
12425 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080012426 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012427 }
12428 }
12429
12430 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_LD2R, k_lt_16) {
12431 TEST_REQUIRES_ARM_NEON;
12432 for (size_t k = 1; k < 16; k++) {
12433 GemmMicrokernelTester()
12434 .mr(1)
12435 .nr(8)
12436 .kr(4)
12437 .sr(1)
12438 .m(1)
12439 .n(8)
12440 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080012441 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012442 }
12443 }
12444
12445 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_LD2R, k_lt_16_strided_a) {
12446 TEST_REQUIRES_ARM_NEON;
12447 for (size_t k = 1; k < 16; k++) {
12448 GemmMicrokernelTester()
12449 .mr(1)
12450 .nr(8)
12451 .kr(4)
12452 .sr(1)
12453 .m(1)
12454 .n(8)
12455 .k(k)
12456 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080012457 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012458 }
12459 }
12460
12461 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_LD2R, k_lt_16_subtile) {
12462 TEST_REQUIRES_ARM_NEON;
12463 for (size_t k = 1; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080012464 for (uint32_t n = 1; n <= 8; n++) {
12465 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012466 GemmMicrokernelTester()
12467 .mr(1)
12468 .nr(8)
12469 .kr(4)
12470 .sr(1)
12471 .m(m)
12472 .n(n)
12473 .k(k)
12474 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080012475 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012476 }
12477 }
12478 }
12479 }
12480
12481 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_LD2R, k_gt_16) {
12482 TEST_REQUIRES_ARM_NEON;
12483 for (size_t k = 17; k < 32; k++) {
12484 GemmMicrokernelTester()
12485 .mr(1)
12486 .nr(8)
12487 .kr(4)
12488 .sr(1)
12489 .m(1)
12490 .n(8)
12491 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080012492 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012493 }
12494 }
12495
12496 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_LD2R, k_gt_16_strided_a) {
12497 TEST_REQUIRES_ARM_NEON;
12498 for (size_t k = 17; k < 32; k++) {
12499 GemmMicrokernelTester()
12500 .mr(1)
12501 .nr(8)
12502 .kr(4)
12503 .sr(1)
12504 .m(1)
12505 .n(8)
12506 .k(k)
12507 .a_stride(37)
Marat Dukhan50323b82022-01-11 00:12:01 -080012508 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012509 }
12510 }
12511
12512 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_LD2R, k_gt_16_subtile) {
12513 TEST_REQUIRES_ARM_NEON;
12514 for (size_t k = 17; k < 32; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080012515 for (uint32_t n = 1; n <= 8; n++) {
12516 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012517 GemmMicrokernelTester()
12518 .mr(1)
12519 .nr(8)
12520 .kr(4)
12521 .sr(1)
12522 .m(m)
12523 .n(n)
12524 .k(k)
12525 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080012526 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012527 }
12528 }
12529 }
12530 }
12531
12532 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_LD2R, k_div_16) {
12533 TEST_REQUIRES_ARM_NEON;
12534 for (size_t k = 32; k <= 160; k += 16) {
12535 GemmMicrokernelTester()
12536 .mr(1)
12537 .nr(8)
12538 .kr(4)
12539 .sr(1)
12540 .m(1)
12541 .n(8)
12542 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080012543 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012544 }
12545 }
12546
12547 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_LD2R, k_div_16_strided_a) {
12548 TEST_REQUIRES_ARM_NEON;
12549 for (size_t k = 32; k <= 160; k += 16) {
12550 GemmMicrokernelTester()
12551 .mr(1)
12552 .nr(8)
12553 .kr(4)
12554 .sr(1)
12555 .m(1)
12556 .n(8)
12557 .k(k)
12558 .a_stride(163)
Marat Dukhan50323b82022-01-11 00:12:01 -080012559 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012560 }
12561 }
12562
12563 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_LD2R, k_div_16_subtile) {
12564 TEST_REQUIRES_ARM_NEON;
12565 for (size_t k = 32; k <= 160; k += 16) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080012566 for (uint32_t n = 1; n <= 8; n++) {
12567 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012568 GemmMicrokernelTester()
12569 .mr(1)
12570 .nr(8)
12571 .kr(4)
12572 .sr(1)
12573 .m(m)
12574 .n(n)
12575 .k(k)
12576 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080012577 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012578 }
12579 }
12580 }
12581 }
12582
12583 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_LD2R, n_gt_8) {
12584 TEST_REQUIRES_ARM_NEON;
12585 for (uint32_t n = 9; n < 16; n++) {
12586 for (size_t k = 1; k <= 80; k += 17) {
12587 GemmMicrokernelTester()
12588 .mr(1)
12589 .nr(8)
12590 .kr(4)
12591 .sr(1)
12592 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080012593 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012594 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080012595 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012596 }
12597 }
12598 }
12599
12600 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_LD2R, n_gt_8_strided_cn) {
12601 TEST_REQUIRES_ARM_NEON;
12602 for (uint32_t n = 9; n < 16; n++) {
12603 for (size_t k = 1; k <= 80; k += 17) {
12604 GemmMicrokernelTester()
12605 .mr(1)
12606 .nr(8)
12607 .kr(4)
12608 .sr(1)
12609 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080012610 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012611 .k(k)
12612 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080012613 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012614 }
12615 }
12616 }
12617
12618 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_LD2R, n_gt_8_strided_a) {
12619 TEST_REQUIRES_ARM_NEON;
12620 for (uint32_t n = 9; n < 16; n++) {
12621 for (size_t k = 1; k <= 80; k += 17) {
12622 GemmMicrokernelTester()
12623 .mr(1)
12624 .nr(8)
12625 .kr(4)
12626 .sr(1)
12627 .m(1)
12628 .n(n)
12629 .k(k)
12630 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080012631 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012632 }
12633 }
12634 }
12635
12636 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_LD2R, n_gt_8_subtile) {
12637 TEST_REQUIRES_ARM_NEON;
12638 for (uint32_t n = 9; n < 16; n++) {
12639 for (size_t k = 1; k <= 80; k += 17) {
12640 for (uint32_t m = 1; m <= 1; m++) {
12641 GemmMicrokernelTester()
12642 .mr(1)
12643 .nr(8)
12644 .kr(4)
12645 .sr(1)
12646 .m(m)
12647 .n(n)
12648 .k(k)
12649 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080012650 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012651 }
12652 }
12653 }
12654 }
12655
12656 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_LD2R, n_div_8) {
12657 TEST_REQUIRES_ARM_NEON;
12658 for (uint32_t n = 16; n <= 24; n += 8) {
12659 for (size_t k = 1; k <= 80; k += 17) {
12660 GemmMicrokernelTester()
12661 .mr(1)
12662 .nr(8)
12663 .kr(4)
12664 .sr(1)
12665 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080012666 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012667 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080012668 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012669 }
12670 }
12671 }
12672
12673 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_LD2R, n_div_8_strided_cn) {
12674 TEST_REQUIRES_ARM_NEON;
12675 for (uint32_t n = 16; n <= 24; n += 8) {
12676 for (size_t k = 1; k <= 80; k += 17) {
12677 GemmMicrokernelTester()
12678 .mr(1)
12679 .nr(8)
12680 .kr(4)
12681 .sr(1)
12682 .m(1)
12683 .n(n)
12684 .k(k)
12685 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080012686 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012687 }
12688 }
12689 }
12690
12691 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_LD2R, n_div_8_strided_a) {
12692 TEST_REQUIRES_ARM_NEON;
12693 for (uint32_t n = 16; n <= 24; n += 8) {
12694 for (size_t k = 1; k <= 80; k += 17) {
12695 GemmMicrokernelTester()
12696 .mr(1)
12697 .nr(8)
12698 .kr(4)
12699 .sr(1)
12700 .m(1)
12701 .n(n)
12702 .k(k)
12703 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080012704 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012705 }
12706 }
12707 }
12708
12709 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_LD2R, n_div_8_subtile) {
12710 TEST_REQUIRES_ARM_NEON;
12711 for (uint32_t n = 16; n <= 24; n += 8) {
12712 for (size_t k = 1; k <= 80; k += 17) {
12713 for (uint32_t m = 1; m <= 1; m++) {
12714 GemmMicrokernelTester()
12715 .mr(1)
12716 .nr(8)
12717 .kr(4)
12718 .sr(1)
12719 .m(m)
12720 .n(n)
12721 .k(k)
12722 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080012723 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012724 }
12725 }
12726 }
12727 }
12728
12729 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_LD2R, strided_cm_subtile) {
12730 TEST_REQUIRES_ARM_NEON;
12731 for (size_t k = 1; k <= 80; k += 17) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080012732 for (uint32_t n = 1; n <= 8; n++) {
12733 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012734 GemmMicrokernelTester()
12735 .mr(1)
12736 .nr(8)
12737 .kr(4)
12738 .sr(1)
12739 .m(m)
12740 .n(n)
12741 .k(k)
12742 .cm_stride(11)
12743 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080012744 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012745 }
12746 }
12747 }
12748 }
12749
12750 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_LD2R, qmin) {
12751 TEST_REQUIRES_ARM_NEON;
12752 GemmMicrokernelTester()
12753 .mr(1)
12754 .nr(8)
12755 .kr(4)
12756 .sr(1)
12757 .m(1)
12758 .n(8)
12759 .k(16)
12760 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080012761 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012762 }
12763
12764 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_LD2R, qmax) {
12765 TEST_REQUIRES_ARM_NEON;
12766 GemmMicrokernelTester()
12767 .mr(1)
12768 .nr(8)
12769 .kr(4)
12770 .sr(1)
12771 .m(1)
12772 .n(8)
12773 .k(16)
12774 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080012775 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012776 }
12777
12778 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_LD2R, strided_cm) {
12779 TEST_REQUIRES_ARM_NEON;
12780 GemmMicrokernelTester()
12781 .mr(1)
12782 .nr(8)
12783 .kr(4)
12784 .sr(1)
12785 .m(1)
12786 .n(8)
12787 .k(16)
12788 .cm_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080012789 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012790 }
12791#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
12792
12793
12794#if XNN_ARCH_ARM || XNN_ARCH_ARM64
12795 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_eq_16) {
12796 TEST_REQUIRES_ARM_NEON_V8;
12797 GemmMicrokernelTester()
12798 .mr(2)
12799 .nr(8)
12800 .kr(2)
12801 .sr(1)
12802 .m(2)
12803 .n(8)
12804 .k(16)
Marat Dukhan50323b82022-01-11 00:12:01 -080012805 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012806 }
12807
12808 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, strided_cn) {
12809 TEST_REQUIRES_ARM_NEON_V8;
12810 GemmMicrokernelTester()
12811 .mr(2)
12812 .nr(8)
12813 .kr(2)
12814 .sr(1)
12815 .m(2)
12816 .n(8)
12817 .k(16)
12818 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080012819 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012820 }
12821
12822 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_eq_16_strided_a) {
12823 TEST_REQUIRES_ARM_NEON_V8;
12824 GemmMicrokernelTester()
12825 .mr(2)
12826 .nr(8)
12827 .kr(2)
12828 .sr(1)
12829 .m(2)
12830 .n(8)
12831 .k(16)
12832 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080012833 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012834 }
12835
12836 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_eq_16_subtile) {
12837 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng83844ae2022-01-14 09:52:25 -080012838 for (uint32_t n = 1; n <= 8; n++) {
12839 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012840 GemmMicrokernelTester()
12841 .mr(2)
12842 .nr(8)
12843 .kr(2)
12844 .sr(1)
12845 .m(m)
12846 .n(n)
12847 .k(16)
12848 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080012849 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012850 }
12851 }
12852 }
12853
12854 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_eq_16_subtile_m) {
12855 TEST_REQUIRES_ARM_NEON_V8;
12856 for (uint32_t m = 1; m <= 2; m++) {
12857 GemmMicrokernelTester()
12858 .mr(2)
12859 .nr(8)
12860 .kr(2)
12861 .sr(1)
12862 .m(m)
12863 .n(8)
12864 .k(16)
12865 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080012866 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012867 }
12868 }
12869
12870 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_eq_16_subtile_n) {
12871 TEST_REQUIRES_ARM_NEON_V8;
12872 for (uint32_t n = 1; n <= 8; n++) {
12873 GemmMicrokernelTester()
12874 .mr(2)
12875 .nr(8)
12876 .kr(2)
12877 .sr(1)
12878 .m(2)
12879 .n(n)
12880 .k(16)
12881 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080012882 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012883 }
12884 }
12885
12886 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_lt_16) {
12887 TEST_REQUIRES_ARM_NEON_V8;
12888 for (size_t k = 1; k < 16; k++) {
12889 GemmMicrokernelTester()
12890 .mr(2)
12891 .nr(8)
12892 .kr(2)
12893 .sr(1)
12894 .m(2)
12895 .n(8)
12896 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080012897 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012898 }
12899 }
12900
12901 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_lt_16_strided_a) {
12902 TEST_REQUIRES_ARM_NEON_V8;
12903 for (size_t k = 1; k < 16; k++) {
12904 GemmMicrokernelTester()
12905 .mr(2)
12906 .nr(8)
12907 .kr(2)
12908 .sr(1)
12909 .m(2)
12910 .n(8)
12911 .k(k)
12912 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080012913 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012914 }
12915 }
12916
12917 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_lt_16_subtile) {
12918 TEST_REQUIRES_ARM_NEON_V8;
12919 for (size_t k = 1; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080012920 for (uint32_t n = 1; n <= 8; n++) {
12921 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012922 GemmMicrokernelTester()
12923 .mr(2)
12924 .nr(8)
12925 .kr(2)
12926 .sr(1)
12927 .m(m)
12928 .n(n)
12929 .k(k)
12930 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080012931 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012932 }
12933 }
12934 }
12935 }
12936
12937 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_gt_16) {
12938 TEST_REQUIRES_ARM_NEON_V8;
12939 for (size_t k = 17; k < 32; k++) {
12940 GemmMicrokernelTester()
12941 .mr(2)
12942 .nr(8)
12943 .kr(2)
12944 .sr(1)
12945 .m(2)
12946 .n(8)
12947 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080012948 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012949 }
12950 }
12951
12952 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_gt_16_strided_a) {
12953 TEST_REQUIRES_ARM_NEON_V8;
12954 for (size_t k = 17; k < 32; k++) {
12955 GemmMicrokernelTester()
12956 .mr(2)
12957 .nr(8)
12958 .kr(2)
12959 .sr(1)
12960 .m(2)
12961 .n(8)
12962 .k(k)
12963 .a_stride(37)
Marat Dukhan50323b82022-01-11 00:12:01 -080012964 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012965 }
12966 }
12967
12968 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_gt_16_subtile) {
12969 TEST_REQUIRES_ARM_NEON_V8;
12970 for (size_t k = 17; k < 32; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080012971 for (uint32_t n = 1; n <= 8; n++) {
12972 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012973 GemmMicrokernelTester()
12974 .mr(2)
12975 .nr(8)
12976 .kr(2)
12977 .sr(1)
12978 .m(m)
12979 .n(n)
12980 .k(k)
12981 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080012982 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012983 }
12984 }
12985 }
12986 }
12987
12988 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_div_16) {
12989 TEST_REQUIRES_ARM_NEON_V8;
12990 for (size_t k = 32; k <= 160; k += 16) {
12991 GemmMicrokernelTester()
12992 .mr(2)
12993 .nr(8)
12994 .kr(2)
12995 .sr(1)
12996 .m(2)
12997 .n(8)
12998 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080012999 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013000 }
13001 }
13002
13003 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_div_16_strided_a) {
13004 TEST_REQUIRES_ARM_NEON_V8;
13005 for (size_t k = 32; k <= 160; k += 16) {
13006 GemmMicrokernelTester()
13007 .mr(2)
13008 .nr(8)
13009 .kr(2)
13010 .sr(1)
13011 .m(2)
13012 .n(8)
13013 .k(k)
13014 .a_stride(163)
Marat Dukhan50323b82022-01-11 00:12:01 -080013015 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013016 }
13017 }
13018
13019 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, k_div_16_subtile) {
13020 TEST_REQUIRES_ARM_NEON_V8;
13021 for (size_t k = 32; k <= 160; k += 16) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080013022 for (uint32_t n = 1; n <= 8; n++) {
13023 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013024 GemmMicrokernelTester()
13025 .mr(2)
13026 .nr(8)
13027 .kr(2)
13028 .sr(1)
13029 .m(m)
13030 .n(n)
13031 .k(k)
13032 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080013033 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013034 }
13035 }
13036 }
13037 }
13038
13039 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, n_gt_8) {
13040 TEST_REQUIRES_ARM_NEON_V8;
13041 for (uint32_t n = 9; n < 16; n++) {
13042 for (size_t k = 1; k <= 80; k += 17) {
13043 GemmMicrokernelTester()
13044 .mr(2)
13045 .nr(8)
13046 .kr(2)
13047 .sr(1)
13048 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080013049 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013050 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080013051 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013052 }
13053 }
13054 }
13055
13056 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, n_gt_8_strided_cn) {
13057 TEST_REQUIRES_ARM_NEON_V8;
13058 for (uint32_t n = 9; n < 16; n++) {
13059 for (size_t k = 1; k <= 80; k += 17) {
13060 GemmMicrokernelTester()
13061 .mr(2)
13062 .nr(8)
13063 .kr(2)
13064 .sr(1)
13065 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080013066 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013067 .k(k)
13068 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080013069 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013070 }
13071 }
13072 }
13073
13074 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, n_gt_8_strided_a) {
13075 TEST_REQUIRES_ARM_NEON_V8;
13076 for (uint32_t n = 9; n < 16; n++) {
13077 for (size_t k = 1; k <= 80; k += 17) {
13078 GemmMicrokernelTester()
13079 .mr(2)
13080 .nr(8)
13081 .kr(2)
13082 .sr(1)
13083 .m(2)
13084 .n(n)
13085 .k(k)
13086 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080013087 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013088 }
13089 }
13090 }
13091
13092 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, n_gt_8_subtile) {
13093 TEST_REQUIRES_ARM_NEON_V8;
13094 for (uint32_t n = 9; n < 16; n++) {
13095 for (size_t k = 1; k <= 80; k += 17) {
13096 for (uint32_t m = 1; m <= 2; m++) {
13097 GemmMicrokernelTester()
13098 .mr(2)
13099 .nr(8)
13100 .kr(2)
13101 .sr(1)
13102 .m(m)
13103 .n(n)
13104 .k(k)
13105 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080013106 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013107 }
13108 }
13109 }
13110 }
13111
13112 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, n_div_8) {
13113 TEST_REQUIRES_ARM_NEON_V8;
13114 for (uint32_t n = 16; n <= 24; n += 8) {
13115 for (size_t k = 1; k <= 80; k += 17) {
13116 GemmMicrokernelTester()
13117 .mr(2)
13118 .nr(8)
13119 .kr(2)
13120 .sr(1)
13121 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080013122 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013123 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080013124 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013125 }
13126 }
13127 }
13128
13129 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, n_div_8_strided_cn) {
13130 TEST_REQUIRES_ARM_NEON_V8;
13131 for (uint32_t n = 16; n <= 24; n += 8) {
13132 for (size_t k = 1; k <= 80; k += 17) {
13133 GemmMicrokernelTester()
13134 .mr(2)
13135 .nr(8)
13136 .kr(2)
13137 .sr(1)
13138 .m(2)
13139 .n(n)
13140 .k(k)
13141 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080013142 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013143 }
13144 }
13145 }
13146
13147 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, n_div_8_strided_a) {
13148 TEST_REQUIRES_ARM_NEON_V8;
13149 for (uint32_t n = 16; n <= 24; n += 8) {
13150 for (size_t k = 1; k <= 80; k += 17) {
13151 GemmMicrokernelTester()
13152 .mr(2)
13153 .nr(8)
13154 .kr(2)
13155 .sr(1)
13156 .m(2)
13157 .n(n)
13158 .k(k)
13159 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080013160 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013161 }
13162 }
13163 }
13164
13165 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, n_div_8_subtile) {
13166 TEST_REQUIRES_ARM_NEON_V8;
13167 for (uint32_t n = 16; n <= 24; n += 8) {
13168 for (size_t k = 1; k <= 80; k += 17) {
13169 for (uint32_t m = 1; m <= 2; m++) {
13170 GemmMicrokernelTester()
13171 .mr(2)
13172 .nr(8)
13173 .kr(2)
13174 .sr(1)
13175 .m(m)
13176 .n(n)
13177 .k(k)
13178 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080013179 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013180 }
13181 }
13182 }
13183 }
13184
13185 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, strided_cm_subtile) {
13186 TEST_REQUIRES_ARM_NEON_V8;
13187 for (size_t k = 1; k <= 80; k += 17) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080013188 for (uint32_t n = 1; n <= 8; n++) {
13189 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013190 GemmMicrokernelTester()
13191 .mr(2)
13192 .nr(8)
13193 .kr(2)
13194 .sr(1)
13195 .m(m)
13196 .n(n)
13197 .k(k)
13198 .cm_stride(11)
13199 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080013200 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013201 }
13202 }
13203 }
13204 }
13205
13206 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, qmin) {
13207 TEST_REQUIRES_ARM_NEON_V8;
13208 GemmMicrokernelTester()
13209 .mr(2)
13210 .nr(8)
13211 .kr(2)
13212 .sr(1)
13213 .m(2)
13214 .n(8)
13215 .k(16)
13216 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080013217 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013218 }
13219
13220 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, qmax) {
13221 TEST_REQUIRES_ARM_NEON_V8;
13222 GemmMicrokernelTester()
13223 .mr(2)
13224 .nr(8)
13225 .kr(2)
13226 .sr(1)
13227 .m(2)
13228 .n(8)
13229 .k(16)
13230 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080013231 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013232 }
13233
13234 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_DUP, strided_cm) {
13235 TEST_REQUIRES_ARM_NEON_V8;
13236 GemmMicrokernelTester()
13237 .mr(2)
13238 .nr(8)
13239 .kr(2)
13240 .sr(1)
13241 .m(2)
13242 .n(8)
13243 .k(16)
13244 .cm_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080013245 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_dup, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013246 }
13247#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
13248
13249
13250#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
13251 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16) {
13252 TEST_REQUIRES_ARM_NEON;
13253 GemmMicrokernelTester()
13254 .mr(2)
13255 .nr(8)
13256 .kr(8)
13257 .sr(1)
13258 .m(2)
13259 .n(8)
13260 .k(16)
Marat Dukhan50323b82022-01-11 00:12:01 -080013261 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013262 }
13263
13264 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, strided_cn) {
13265 TEST_REQUIRES_ARM_NEON;
13266 GemmMicrokernelTester()
13267 .mr(2)
13268 .nr(8)
13269 .kr(8)
13270 .sr(1)
13271 .m(2)
13272 .n(8)
13273 .k(16)
13274 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080013275 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013276 }
13277
13278 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16_strided_a) {
13279 TEST_REQUIRES_ARM_NEON;
13280 GemmMicrokernelTester()
13281 .mr(2)
13282 .nr(8)
13283 .kr(8)
13284 .sr(1)
13285 .m(2)
13286 .n(8)
13287 .k(16)
13288 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080013289 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013290 }
13291
13292 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16_subtile) {
13293 TEST_REQUIRES_ARM_NEON;
Zhi An Ng83844ae2022-01-14 09:52:25 -080013294 for (uint32_t n = 1; n <= 8; n++) {
13295 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013296 GemmMicrokernelTester()
13297 .mr(2)
13298 .nr(8)
13299 .kr(8)
13300 .sr(1)
13301 .m(m)
13302 .n(n)
13303 .k(16)
13304 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080013305 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013306 }
13307 }
13308 }
13309
13310 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16_subtile_m) {
13311 TEST_REQUIRES_ARM_NEON;
13312 for (uint32_t m = 1; m <= 2; m++) {
13313 GemmMicrokernelTester()
13314 .mr(2)
13315 .nr(8)
13316 .kr(8)
13317 .sr(1)
13318 .m(m)
13319 .n(8)
13320 .k(16)
13321 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080013322 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013323 }
13324 }
13325
13326 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_eq_16_subtile_n) {
13327 TEST_REQUIRES_ARM_NEON;
13328 for (uint32_t n = 1; n <= 8; n++) {
13329 GemmMicrokernelTester()
13330 .mr(2)
13331 .nr(8)
13332 .kr(8)
13333 .sr(1)
13334 .m(2)
13335 .n(n)
13336 .k(16)
13337 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080013338 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013339 }
13340 }
13341
13342 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_lt_16) {
13343 TEST_REQUIRES_ARM_NEON;
13344 for (size_t k = 1; k < 16; k++) {
13345 GemmMicrokernelTester()
13346 .mr(2)
13347 .nr(8)
13348 .kr(8)
13349 .sr(1)
13350 .m(2)
13351 .n(8)
13352 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080013353 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013354 }
13355 }
13356
13357 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_lt_16_strided_a) {
13358 TEST_REQUIRES_ARM_NEON;
13359 for (size_t k = 1; k < 16; k++) {
13360 GemmMicrokernelTester()
13361 .mr(2)
13362 .nr(8)
13363 .kr(8)
13364 .sr(1)
13365 .m(2)
13366 .n(8)
13367 .k(k)
13368 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080013369 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013370 }
13371 }
13372
13373 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_lt_16_subtile) {
13374 TEST_REQUIRES_ARM_NEON;
13375 for (size_t k = 1; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080013376 for (uint32_t n = 1; n <= 8; n++) {
13377 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013378 GemmMicrokernelTester()
13379 .mr(2)
13380 .nr(8)
13381 .kr(8)
13382 .sr(1)
13383 .m(m)
13384 .n(n)
13385 .k(k)
13386 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080013387 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013388 }
13389 }
13390 }
13391 }
13392
13393 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_gt_16) {
13394 TEST_REQUIRES_ARM_NEON;
13395 for (size_t k = 17; k < 32; k++) {
13396 GemmMicrokernelTester()
13397 .mr(2)
13398 .nr(8)
13399 .kr(8)
13400 .sr(1)
13401 .m(2)
13402 .n(8)
13403 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080013404 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013405 }
13406 }
13407
13408 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_gt_16_strided_a) {
13409 TEST_REQUIRES_ARM_NEON;
13410 for (size_t k = 17; k < 32; k++) {
13411 GemmMicrokernelTester()
13412 .mr(2)
13413 .nr(8)
13414 .kr(8)
13415 .sr(1)
13416 .m(2)
13417 .n(8)
13418 .k(k)
13419 .a_stride(37)
Marat Dukhan50323b82022-01-11 00:12:01 -080013420 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013421 }
13422 }
13423
13424 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_gt_16_subtile) {
13425 TEST_REQUIRES_ARM_NEON;
13426 for (size_t k = 17; k < 32; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080013427 for (uint32_t n = 1; n <= 8; n++) {
13428 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013429 GemmMicrokernelTester()
13430 .mr(2)
13431 .nr(8)
13432 .kr(8)
13433 .sr(1)
13434 .m(m)
13435 .n(n)
13436 .k(k)
13437 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080013438 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013439 }
13440 }
13441 }
13442 }
13443
13444 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_div_16) {
13445 TEST_REQUIRES_ARM_NEON;
13446 for (size_t k = 32; k <= 160; k += 16) {
13447 GemmMicrokernelTester()
13448 .mr(2)
13449 .nr(8)
13450 .kr(8)
13451 .sr(1)
13452 .m(2)
13453 .n(8)
13454 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080013455 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013456 }
13457 }
13458
13459 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_div_16_strided_a) {
13460 TEST_REQUIRES_ARM_NEON;
13461 for (size_t k = 32; k <= 160; k += 16) {
13462 GemmMicrokernelTester()
13463 .mr(2)
13464 .nr(8)
13465 .kr(8)
13466 .sr(1)
13467 .m(2)
13468 .n(8)
13469 .k(k)
13470 .a_stride(163)
Marat Dukhan50323b82022-01-11 00:12:01 -080013471 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013472 }
13473 }
13474
13475 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, k_div_16_subtile) {
13476 TEST_REQUIRES_ARM_NEON;
13477 for (size_t k = 32; k <= 160; k += 16) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080013478 for (uint32_t n = 1; n <= 8; n++) {
13479 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013480 GemmMicrokernelTester()
13481 .mr(2)
13482 .nr(8)
13483 .kr(8)
13484 .sr(1)
13485 .m(m)
13486 .n(n)
13487 .k(k)
13488 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080013489 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013490 }
13491 }
13492 }
13493 }
13494
13495 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8) {
13496 TEST_REQUIRES_ARM_NEON;
13497 for (uint32_t n = 9; n < 16; n++) {
13498 for (size_t k = 1; k <= 80; k += 17) {
13499 GemmMicrokernelTester()
13500 .mr(2)
13501 .nr(8)
13502 .kr(8)
13503 .sr(1)
13504 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080013505 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013506 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080013507 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013508 }
13509 }
13510 }
13511
13512 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8_strided_cn) {
13513 TEST_REQUIRES_ARM_NEON;
13514 for (uint32_t n = 9; n < 16; n++) {
13515 for (size_t k = 1; k <= 80; k += 17) {
13516 GemmMicrokernelTester()
13517 .mr(2)
13518 .nr(8)
13519 .kr(8)
13520 .sr(1)
13521 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080013522 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013523 .k(k)
13524 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080013525 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013526 }
13527 }
13528 }
13529
13530 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8_strided_a) {
13531 TEST_REQUIRES_ARM_NEON;
13532 for (uint32_t n = 9; n < 16; n++) {
13533 for (size_t k = 1; k <= 80; k += 17) {
13534 GemmMicrokernelTester()
13535 .mr(2)
13536 .nr(8)
13537 .kr(8)
13538 .sr(1)
13539 .m(2)
13540 .n(n)
13541 .k(k)
13542 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080013543 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013544 }
13545 }
13546 }
13547
13548 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, n_gt_8_subtile) {
13549 TEST_REQUIRES_ARM_NEON;
13550 for (uint32_t n = 9; n < 16; n++) {
13551 for (size_t k = 1; k <= 80; k += 17) {
13552 for (uint32_t m = 1; m <= 2; m++) {
13553 GemmMicrokernelTester()
13554 .mr(2)
13555 .nr(8)
13556 .kr(8)
13557 .sr(1)
13558 .m(m)
13559 .n(n)
13560 .k(k)
13561 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080013562 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013563 }
13564 }
13565 }
13566 }
13567
13568 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8) {
13569 TEST_REQUIRES_ARM_NEON;
13570 for (uint32_t n = 16; n <= 24; n += 8) {
13571 for (size_t k = 1; k <= 80; k += 17) {
13572 GemmMicrokernelTester()
13573 .mr(2)
13574 .nr(8)
13575 .kr(8)
13576 .sr(1)
13577 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080013578 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013579 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080013580 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013581 }
13582 }
13583 }
13584
13585 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8_strided_cn) {
13586 TEST_REQUIRES_ARM_NEON;
13587 for (uint32_t n = 16; n <= 24; n += 8) {
13588 for (size_t k = 1; k <= 80; k += 17) {
13589 GemmMicrokernelTester()
13590 .mr(2)
13591 .nr(8)
13592 .kr(8)
13593 .sr(1)
13594 .m(2)
13595 .n(n)
13596 .k(k)
13597 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080013598 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013599 }
13600 }
13601 }
13602
13603 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8_strided_a) {
13604 TEST_REQUIRES_ARM_NEON;
13605 for (uint32_t n = 16; n <= 24; n += 8) {
13606 for (size_t k = 1; k <= 80; k += 17) {
13607 GemmMicrokernelTester()
13608 .mr(2)
13609 .nr(8)
13610 .kr(8)
13611 .sr(1)
13612 .m(2)
13613 .n(n)
13614 .k(k)
13615 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080013616 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013617 }
13618 }
13619 }
13620
13621 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, n_div_8_subtile) {
13622 TEST_REQUIRES_ARM_NEON;
13623 for (uint32_t n = 16; n <= 24; n += 8) {
13624 for (size_t k = 1; k <= 80; k += 17) {
13625 for (uint32_t m = 1; m <= 2; m++) {
13626 GemmMicrokernelTester()
13627 .mr(2)
13628 .nr(8)
13629 .kr(8)
13630 .sr(1)
13631 .m(m)
13632 .n(n)
13633 .k(k)
13634 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080013635 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013636 }
13637 }
13638 }
13639 }
13640
13641 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, strided_cm_subtile) {
13642 TEST_REQUIRES_ARM_NEON;
13643 for (size_t k = 1; k <= 80; k += 17) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080013644 for (uint32_t n = 1; n <= 8; n++) {
13645 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013646 GemmMicrokernelTester()
13647 .mr(2)
13648 .nr(8)
13649 .kr(8)
13650 .sr(1)
13651 .m(m)
13652 .n(n)
13653 .k(k)
13654 .cm_stride(11)
13655 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080013656 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013657 }
13658 }
13659 }
13660 }
13661
13662 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, qmin) {
13663 TEST_REQUIRES_ARM_NEON;
13664 GemmMicrokernelTester()
13665 .mr(2)
13666 .nr(8)
13667 .kr(8)
13668 .sr(1)
13669 .m(2)
13670 .n(8)
13671 .k(16)
13672 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080013673 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013674 }
13675
13676 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, qmax) {
13677 TEST_REQUIRES_ARM_NEON;
13678 GemmMicrokernelTester()
13679 .mr(2)
13680 .nr(8)
13681 .kr(8)
13682 .sr(1)
13683 .m(2)
13684 .n(8)
13685 .k(16)
13686 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080013687 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013688 }
13689
13690 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM, strided_cm) {
13691 TEST_REQUIRES_ARM_NEON;
13692 GemmMicrokernelTester()
13693 .mr(2)
13694 .nr(8)
13695 .kr(8)
13696 .sr(1)
13697 .m(2)
13698 .n(8)
13699 .k(16)
13700 .cm_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080013701 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013702 }
13703#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
13704
13705
13706#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
13707 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16) {
13708 TEST_REQUIRES_ARM_NEON;
13709 GemmMicrokernelTester()
13710 .mr(2)
13711 .nr(8)
13712 .kr(8)
13713 .sr(1)
13714 .m(2)
13715 .n(8)
13716 .k(16)
Marat Dukhan50323b82022-01-11 00:12:01 -080013717 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013718 }
13719
13720 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, strided_cn) {
13721 TEST_REQUIRES_ARM_NEON;
13722 GemmMicrokernelTester()
13723 .mr(2)
13724 .nr(8)
13725 .kr(8)
13726 .sr(1)
13727 .m(2)
13728 .n(8)
13729 .k(16)
13730 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080013731 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013732 }
13733
13734 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_strided_a) {
13735 TEST_REQUIRES_ARM_NEON;
13736 GemmMicrokernelTester()
13737 .mr(2)
13738 .nr(8)
13739 .kr(8)
13740 .sr(1)
13741 .m(2)
13742 .n(8)
13743 .k(16)
13744 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080013745 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013746 }
13747
13748 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_subtile) {
13749 TEST_REQUIRES_ARM_NEON;
Zhi An Ng83844ae2022-01-14 09:52:25 -080013750 for (uint32_t n = 1; n <= 8; n++) {
13751 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013752 GemmMicrokernelTester()
13753 .mr(2)
13754 .nr(8)
13755 .kr(8)
13756 .sr(1)
13757 .m(m)
13758 .n(n)
13759 .k(16)
13760 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080013761 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013762 }
13763 }
13764 }
13765
13766 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_subtile_m) {
13767 TEST_REQUIRES_ARM_NEON;
13768 for (uint32_t m = 1; m <= 2; m++) {
13769 GemmMicrokernelTester()
13770 .mr(2)
13771 .nr(8)
13772 .kr(8)
13773 .sr(1)
13774 .m(m)
13775 .n(8)
13776 .k(16)
13777 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080013778 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013779 }
13780 }
13781
13782 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_subtile_n) {
13783 TEST_REQUIRES_ARM_NEON;
13784 for (uint32_t n = 1; n <= 8; n++) {
13785 GemmMicrokernelTester()
13786 .mr(2)
13787 .nr(8)
13788 .kr(8)
13789 .sr(1)
13790 .m(2)
13791 .n(n)
13792 .k(16)
13793 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080013794 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013795 }
13796 }
13797
13798 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_lt_16) {
13799 TEST_REQUIRES_ARM_NEON;
13800 for (size_t k = 1; k < 16; k++) {
13801 GemmMicrokernelTester()
13802 .mr(2)
13803 .nr(8)
13804 .kr(8)
13805 .sr(1)
13806 .m(2)
13807 .n(8)
13808 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080013809 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013810 }
13811 }
13812
13813 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_lt_16_strided_a) {
13814 TEST_REQUIRES_ARM_NEON;
13815 for (size_t k = 1; k < 16; k++) {
13816 GemmMicrokernelTester()
13817 .mr(2)
13818 .nr(8)
13819 .kr(8)
13820 .sr(1)
13821 .m(2)
13822 .n(8)
13823 .k(k)
13824 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080013825 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013826 }
13827 }
13828
13829 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_lt_16_subtile) {
13830 TEST_REQUIRES_ARM_NEON;
13831 for (size_t k = 1; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080013832 for (uint32_t n = 1; n <= 8; n++) {
13833 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013834 GemmMicrokernelTester()
13835 .mr(2)
13836 .nr(8)
13837 .kr(8)
13838 .sr(1)
13839 .m(m)
13840 .n(n)
13841 .k(k)
13842 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080013843 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013844 }
13845 }
13846 }
13847 }
13848
13849 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_gt_16) {
13850 TEST_REQUIRES_ARM_NEON;
13851 for (size_t k = 17; k < 32; k++) {
13852 GemmMicrokernelTester()
13853 .mr(2)
13854 .nr(8)
13855 .kr(8)
13856 .sr(1)
13857 .m(2)
13858 .n(8)
13859 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080013860 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013861 }
13862 }
13863
13864 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_gt_16_strided_a) {
13865 TEST_REQUIRES_ARM_NEON;
13866 for (size_t k = 17; k < 32; k++) {
13867 GemmMicrokernelTester()
13868 .mr(2)
13869 .nr(8)
13870 .kr(8)
13871 .sr(1)
13872 .m(2)
13873 .n(8)
13874 .k(k)
13875 .a_stride(37)
Marat Dukhan50323b82022-01-11 00:12:01 -080013876 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013877 }
13878 }
13879
13880 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_gt_16_subtile) {
13881 TEST_REQUIRES_ARM_NEON;
13882 for (size_t k = 17; k < 32; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080013883 for (uint32_t n = 1; n <= 8; n++) {
13884 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013885 GemmMicrokernelTester()
13886 .mr(2)
13887 .nr(8)
13888 .kr(8)
13889 .sr(1)
13890 .m(m)
13891 .n(n)
13892 .k(k)
13893 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080013894 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013895 }
13896 }
13897 }
13898 }
13899
13900 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_div_16) {
13901 TEST_REQUIRES_ARM_NEON;
13902 for (size_t k = 32; k <= 160; k += 16) {
13903 GemmMicrokernelTester()
13904 .mr(2)
13905 .nr(8)
13906 .kr(8)
13907 .sr(1)
13908 .m(2)
13909 .n(8)
13910 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080013911 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013912 }
13913 }
13914
13915 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_div_16_strided_a) {
13916 TEST_REQUIRES_ARM_NEON;
13917 for (size_t k = 32; k <= 160; k += 16) {
13918 GemmMicrokernelTester()
13919 .mr(2)
13920 .nr(8)
13921 .kr(8)
13922 .sr(1)
13923 .m(2)
13924 .n(8)
13925 .k(k)
13926 .a_stride(163)
Marat Dukhan50323b82022-01-11 00:12:01 -080013927 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013928 }
13929 }
13930
13931 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_div_16_subtile) {
13932 TEST_REQUIRES_ARM_NEON;
13933 for (size_t k = 32; k <= 160; k += 16) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080013934 for (uint32_t n = 1; n <= 8; n++) {
13935 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013936 GemmMicrokernelTester()
13937 .mr(2)
13938 .nr(8)
13939 .kr(8)
13940 .sr(1)
13941 .m(m)
13942 .n(n)
13943 .k(k)
13944 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080013945 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013946 }
13947 }
13948 }
13949 }
13950
13951 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8) {
13952 TEST_REQUIRES_ARM_NEON;
13953 for (uint32_t n = 9; n < 16; n++) {
13954 for (size_t k = 1; k <= 80; k += 17) {
13955 GemmMicrokernelTester()
13956 .mr(2)
13957 .nr(8)
13958 .kr(8)
13959 .sr(1)
13960 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080013961 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013962 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080013963 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013964 }
13965 }
13966 }
13967
13968 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8_strided_cn) {
13969 TEST_REQUIRES_ARM_NEON;
13970 for (uint32_t n = 9; n < 16; n++) {
13971 for (size_t k = 1; k <= 80; k += 17) {
13972 GemmMicrokernelTester()
13973 .mr(2)
13974 .nr(8)
13975 .kr(8)
13976 .sr(1)
13977 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080013978 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013979 .k(k)
13980 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080013981 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013982 }
13983 }
13984 }
13985
13986 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8_strided_a) {
13987 TEST_REQUIRES_ARM_NEON;
13988 for (uint32_t n = 9; n < 16; n++) {
13989 for (size_t k = 1; k <= 80; k += 17) {
13990 GemmMicrokernelTester()
13991 .mr(2)
13992 .nr(8)
13993 .kr(8)
13994 .sr(1)
13995 .m(2)
13996 .n(n)
13997 .k(k)
13998 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080013999 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014000 }
14001 }
14002 }
14003
14004 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8_subtile) {
14005 TEST_REQUIRES_ARM_NEON;
14006 for (uint32_t n = 9; n < 16; n++) {
14007 for (size_t k = 1; k <= 80; k += 17) {
14008 for (uint32_t m = 1; m <= 2; m++) {
14009 GemmMicrokernelTester()
14010 .mr(2)
14011 .nr(8)
14012 .kr(8)
14013 .sr(1)
14014 .m(m)
14015 .n(n)
14016 .k(k)
14017 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080014018 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014019 }
14020 }
14021 }
14022 }
14023
14024 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8) {
14025 TEST_REQUIRES_ARM_NEON;
14026 for (uint32_t n = 16; n <= 24; n += 8) {
14027 for (size_t k = 1; k <= 80; k += 17) {
14028 GemmMicrokernelTester()
14029 .mr(2)
14030 .nr(8)
14031 .kr(8)
14032 .sr(1)
14033 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080014034 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014035 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080014036 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014037 }
14038 }
14039 }
14040
14041 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8_strided_cn) {
14042 TEST_REQUIRES_ARM_NEON;
14043 for (uint32_t n = 16; n <= 24; n += 8) {
14044 for (size_t k = 1; k <= 80; k += 17) {
14045 GemmMicrokernelTester()
14046 .mr(2)
14047 .nr(8)
14048 .kr(8)
14049 .sr(1)
14050 .m(2)
14051 .n(n)
14052 .k(k)
14053 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080014054 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014055 }
14056 }
14057 }
14058
14059 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8_strided_a) {
14060 TEST_REQUIRES_ARM_NEON;
14061 for (uint32_t n = 16; n <= 24; n += 8) {
14062 for (size_t k = 1; k <= 80; k += 17) {
14063 GemmMicrokernelTester()
14064 .mr(2)
14065 .nr(8)
14066 .kr(8)
14067 .sr(1)
14068 .m(2)
14069 .n(n)
14070 .k(k)
14071 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080014072 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014073 }
14074 }
14075 }
14076
14077 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8_subtile) {
14078 TEST_REQUIRES_ARM_NEON;
14079 for (uint32_t n = 16; n <= 24; n += 8) {
14080 for (size_t k = 1; k <= 80; k += 17) {
14081 for (uint32_t m = 1; m <= 2; m++) {
14082 GemmMicrokernelTester()
14083 .mr(2)
14084 .nr(8)
14085 .kr(8)
14086 .sr(1)
14087 .m(m)
14088 .n(n)
14089 .k(k)
14090 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080014091 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014092 }
14093 }
14094 }
14095 }
14096
14097 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, strided_cm_subtile) {
14098 TEST_REQUIRES_ARM_NEON;
14099 for (size_t k = 1; k <= 80; k += 17) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080014100 for (uint32_t n = 1; n <= 8; n++) {
14101 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014102 GemmMicrokernelTester()
14103 .mr(2)
14104 .nr(8)
14105 .kr(8)
14106 .sr(1)
14107 .m(m)
14108 .n(n)
14109 .k(k)
14110 .cm_stride(11)
14111 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080014112 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014113 }
14114 }
14115 }
14116 }
14117
14118 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, qmin) {
14119 TEST_REQUIRES_ARM_NEON;
14120 GemmMicrokernelTester()
14121 .mr(2)
14122 .nr(8)
14123 .kr(8)
14124 .sr(1)
14125 .m(2)
14126 .n(8)
14127 .k(16)
14128 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080014129 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014130 }
14131
14132 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, qmax) {
14133 TEST_REQUIRES_ARM_NEON;
14134 GemmMicrokernelTester()
14135 .mr(2)
14136 .nr(8)
14137 .kr(8)
14138 .sr(1)
14139 .m(2)
14140 .n(8)
14141 .k(16)
14142 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080014143 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014144 }
14145
14146 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, strided_cm) {
14147 TEST_REQUIRES_ARM_NEON;
14148 GemmMicrokernelTester()
14149 .mr(2)
14150 .nr(8)
14151 .kr(8)
14152 .sr(1)
14153 .m(2)
14154 .n(8)
14155 .k(16)
14156 .cm_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080014157 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014158 }
14159#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
Frank Barchard87fe4102021-12-28 14:42:23 -080014160
14161
Frank Barcharde4d3f762021-12-23 15:31:43 -080014162#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
Frank Barchard98af05c2021-06-30 12:15:04 -070014163 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_eq_8) {
14164 TEST_REQUIRES_ARM_NEON;
14165 GemmMicrokernelTester()
14166 .mr(4)
14167 .nr(16)
14168 .kr(1)
14169 .sr(1)
14170 .m(4)
14171 .n(16)
14172 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080014173 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard98af05c2021-06-30 12:15:04 -070014174 }
14175
14176 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, strided_cn) {
14177 TEST_REQUIRES_ARM_NEON;
14178 GemmMicrokernelTester()
14179 .mr(4)
14180 .nr(16)
14181 .kr(1)
14182 .sr(1)
14183 .m(4)
14184 .n(16)
14185 .k(8)
14186 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080014187 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard98af05c2021-06-30 12:15:04 -070014188 }
14189
14190 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_eq_8_strided_a) {
14191 TEST_REQUIRES_ARM_NEON;
14192 GemmMicrokernelTester()
14193 .mr(4)
14194 .nr(16)
14195 .kr(1)
14196 .sr(1)
14197 .m(4)
14198 .n(16)
14199 .k(8)
14200 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080014201 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard98af05c2021-06-30 12:15:04 -070014202 }
14203
14204 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_eq_8_subtile) {
14205 TEST_REQUIRES_ARM_NEON;
Zhi An Ng83844ae2022-01-14 09:52:25 -080014206 for (uint32_t n = 1; n <= 16; n++) {
14207 for (uint32_t m = 1; m <= 4; m++) {
Frank Barchard98af05c2021-06-30 12:15:04 -070014208 GemmMicrokernelTester()
14209 .mr(4)
14210 .nr(16)
14211 .kr(1)
14212 .sr(1)
14213 .m(m)
14214 .n(n)
14215 .k(8)
14216 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080014217 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard98af05c2021-06-30 12:15:04 -070014218 }
14219 }
14220 }
14221
14222 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_eq_8_subtile_m) {
14223 TEST_REQUIRES_ARM_NEON;
14224 for (uint32_t m = 1; m <= 4; m++) {
14225 GemmMicrokernelTester()
14226 .mr(4)
14227 .nr(16)
14228 .kr(1)
14229 .sr(1)
14230 .m(m)
14231 .n(16)
14232 .k(8)
14233 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080014234 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard98af05c2021-06-30 12:15:04 -070014235 }
14236 }
14237
14238 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_eq_8_subtile_n) {
14239 TEST_REQUIRES_ARM_NEON;
14240 for (uint32_t n = 1; n <= 16; n++) {
14241 GemmMicrokernelTester()
14242 .mr(4)
14243 .nr(16)
14244 .kr(1)
14245 .sr(1)
14246 .m(4)
14247 .n(n)
14248 .k(8)
14249 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080014250 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard98af05c2021-06-30 12:15:04 -070014251 }
14252 }
14253
14254 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_lt_8) {
14255 TEST_REQUIRES_ARM_NEON;
14256 for (size_t k = 1; k < 8; k++) {
14257 GemmMicrokernelTester()
14258 .mr(4)
14259 .nr(16)
14260 .kr(1)
14261 .sr(1)
14262 .m(4)
14263 .n(16)
14264 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080014265 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard98af05c2021-06-30 12:15:04 -070014266 }
14267 }
14268
14269 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_lt_8_strided_a) {
14270 TEST_REQUIRES_ARM_NEON;
14271 for (size_t k = 1; k < 8; k++) {
14272 GemmMicrokernelTester()
14273 .mr(4)
14274 .nr(16)
14275 .kr(1)
14276 .sr(1)
14277 .m(4)
14278 .n(16)
14279 .k(k)
14280 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080014281 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard98af05c2021-06-30 12:15:04 -070014282 }
14283 }
14284
14285 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_lt_8_subtile) {
14286 TEST_REQUIRES_ARM_NEON;
14287 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080014288 for (uint32_t n = 1; n <= 16; n++) {
14289 for (uint32_t m = 1; m <= 4; m++) {
Frank Barchard98af05c2021-06-30 12:15:04 -070014290 GemmMicrokernelTester()
14291 .mr(4)
14292 .nr(16)
14293 .kr(1)
14294 .sr(1)
14295 .m(m)
14296 .n(n)
14297 .k(k)
14298 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080014299 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard98af05c2021-06-30 12:15:04 -070014300 }
14301 }
14302 }
14303 }
14304
14305 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_gt_8) {
14306 TEST_REQUIRES_ARM_NEON;
14307 for (size_t k = 9; k < 16; k++) {
14308 GemmMicrokernelTester()
14309 .mr(4)
14310 .nr(16)
14311 .kr(1)
14312 .sr(1)
14313 .m(4)
14314 .n(16)
14315 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080014316 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard98af05c2021-06-30 12:15:04 -070014317 }
14318 }
14319
14320 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_gt_8_strided_a) {
14321 TEST_REQUIRES_ARM_NEON;
14322 for (size_t k = 9; k < 16; k++) {
14323 GemmMicrokernelTester()
14324 .mr(4)
14325 .nr(16)
14326 .kr(1)
14327 .sr(1)
14328 .m(4)
14329 .n(16)
14330 .k(k)
14331 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080014332 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard98af05c2021-06-30 12:15:04 -070014333 }
14334 }
14335
14336 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_gt_8_subtile) {
14337 TEST_REQUIRES_ARM_NEON;
14338 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080014339 for (uint32_t n = 1; n <= 16; n++) {
14340 for (uint32_t m = 1; m <= 4; m++) {
Frank Barchard98af05c2021-06-30 12:15:04 -070014341 GemmMicrokernelTester()
14342 .mr(4)
14343 .nr(16)
14344 .kr(1)
14345 .sr(1)
14346 .m(m)
14347 .n(n)
14348 .k(k)
14349 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080014350 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard98af05c2021-06-30 12:15:04 -070014351 }
14352 }
14353 }
14354 }
14355
14356 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_div_8) {
14357 TEST_REQUIRES_ARM_NEON;
14358 for (size_t k = 16; k <= 80; k += 8) {
14359 GemmMicrokernelTester()
14360 .mr(4)
14361 .nr(16)
14362 .kr(1)
14363 .sr(1)
14364 .m(4)
14365 .n(16)
14366 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080014367 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard98af05c2021-06-30 12:15:04 -070014368 }
14369 }
14370
14371 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_div_8_strided_a) {
14372 TEST_REQUIRES_ARM_NEON;
14373 for (size_t k = 16; k <= 80; k += 8) {
14374 GemmMicrokernelTester()
14375 .mr(4)
14376 .nr(16)
14377 .kr(1)
14378 .sr(1)
14379 .m(4)
14380 .n(16)
14381 .k(k)
14382 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080014383 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard98af05c2021-06-30 12:15:04 -070014384 }
14385 }
14386
14387 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_div_8_subtile) {
14388 TEST_REQUIRES_ARM_NEON;
14389 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080014390 for (uint32_t n = 1; n <= 16; n++) {
14391 for (uint32_t m = 1; m <= 4; m++) {
Frank Barchard98af05c2021-06-30 12:15:04 -070014392 GemmMicrokernelTester()
14393 .mr(4)
14394 .nr(16)
14395 .kr(1)
14396 .sr(1)
14397 .m(m)
14398 .n(n)
14399 .k(k)
14400 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080014401 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard98af05c2021-06-30 12:15:04 -070014402 }
14403 }
14404 }
14405 }
14406
14407 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_gt_16) {
14408 TEST_REQUIRES_ARM_NEON;
14409 for (uint32_t n = 17; n < 32; n++) {
14410 for (size_t k = 1; k <= 40; k += 9) {
14411 GemmMicrokernelTester()
14412 .mr(4)
14413 .nr(16)
14414 .kr(1)
14415 .sr(1)
14416 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080014417 .n(n)
Frank Barchard98af05c2021-06-30 12:15:04 -070014418 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080014419 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard98af05c2021-06-30 12:15:04 -070014420 }
14421 }
14422 }
14423
14424 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_gt_16_strided_cn) {
14425 TEST_REQUIRES_ARM_NEON;
14426 for (uint32_t n = 17; n < 32; n++) {
14427 for (size_t k = 1; k <= 40; k += 9) {
14428 GemmMicrokernelTester()
14429 .mr(4)
14430 .nr(16)
14431 .kr(1)
14432 .sr(1)
14433 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080014434 .n(n)
Frank Barchard98af05c2021-06-30 12:15:04 -070014435 .k(k)
14436 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080014437 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard98af05c2021-06-30 12:15:04 -070014438 }
14439 }
14440 }
14441
14442 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_gt_16_strided_a) {
14443 TEST_REQUIRES_ARM_NEON;
14444 for (uint32_t n = 17; n < 32; n++) {
14445 for (size_t k = 1; k <= 40; k += 9) {
14446 GemmMicrokernelTester()
14447 .mr(4)
14448 .nr(16)
14449 .kr(1)
14450 .sr(1)
14451 .m(4)
14452 .n(n)
14453 .k(k)
14454 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080014455 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard98af05c2021-06-30 12:15:04 -070014456 }
14457 }
14458 }
14459
14460 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_gt_16_subtile) {
14461 TEST_REQUIRES_ARM_NEON;
14462 for (uint32_t n = 17; n < 32; n++) {
14463 for (size_t k = 1; k <= 40; k += 9) {
14464 for (uint32_t m = 1; m <= 4; m++) {
14465 GemmMicrokernelTester()
14466 .mr(4)
14467 .nr(16)
14468 .kr(1)
14469 .sr(1)
14470 .m(m)
14471 .n(n)
14472 .k(k)
14473 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080014474 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard98af05c2021-06-30 12:15:04 -070014475 }
14476 }
14477 }
14478 }
14479
14480 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_div_16) {
14481 TEST_REQUIRES_ARM_NEON;
14482 for (uint32_t n = 32; n <= 48; n += 16) {
14483 for (size_t k = 1; k <= 40; k += 9) {
14484 GemmMicrokernelTester()
14485 .mr(4)
14486 .nr(16)
14487 .kr(1)
14488 .sr(1)
14489 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080014490 .n(n)
Frank Barchard98af05c2021-06-30 12:15:04 -070014491 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080014492 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard98af05c2021-06-30 12:15:04 -070014493 }
14494 }
14495 }
14496
14497 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_div_16_strided_cn) {
14498 TEST_REQUIRES_ARM_NEON;
14499 for (uint32_t n = 32; n <= 48; n += 16) {
14500 for (size_t k = 1; k <= 40; k += 9) {
14501 GemmMicrokernelTester()
14502 .mr(4)
14503 .nr(16)
14504 .kr(1)
14505 .sr(1)
14506 .m(4)
14507 .n(n)
14508 .k(k)
14509 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080014510 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard98af05c2021-06-30 12:15:04 -070014511 }
14512 }
14513 }
14514
14515 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_div_16_strided_a) {
14516 TEST_REQUIRES_ARM_NEON;
14517 for (uint32_t n = 32; n <= 48; n += 16) {
14518 for (size_t k = 1; k <= 40; k += 9) {
14519 GemmMicrokernelTester()
14520 .mr(4)
14521 .nr(16)
14522 .kr(1)
14523 .sr(1)
14524 .m(4)
14525 .n(n)
14526 .k(k)
14527 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080014528 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard98af05c2021-06-30 12:15:04 -070014529 }
14530 }
14531 }
14532
14533 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_div_16_subtile) {
14534 TEST_REQUIRES_ARM_NEON;
14535 for (uint32_t n = 32; n <= 48; n += 16) {
14536 for (size_t k = 1; k <= 40; k += 9) {
14537 for (uint32_t m = 1; m <= 4; m++) {
14538 GemmMicrokernelTester()
14539 .mr(4)
14540 .nr(16)
14541 .kr(1)
14542 .sr(1)
14543 .m(m)
14544 .n(n)
14545 .k(k)
14546 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080014547 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard98af05c2021-06-30 12:15:04 -070014548 }
14549 }
14550 }
14551 }
14552
14553 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, strided_cm_subtile) {
14554 TEST_REQUIRES_ARM_NEON;
14555 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080014556 for (uint32_t n = 1; n <= 16; n++) {
14557 for (uint32_t m = 1; m <= 4; m++) {
Frank Barchard98af05c2021-06-30 12:15:04 -070014558 GemmMicrokernelTester()
14559 .mr(4)
14560 .nr(16)
14561 .kr(1)
14562 .sr(1)
14563 .m(m)
14564 .n(n)
14565 .k(k)
14566 .cm_stride(19)
14567 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080014568 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard98af05c2021-06-30 12:15:04 -070014569 }
14570 }
14571 }
14572 }
14573
14574 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, qmin) {
14575 TEST_REQUIRES_ARM_NEON;
14576 GemmMicrokernelTester()
14577 .mr(4)
14578 .nr(16)
14579 .kr(1)
14580 .sr(1)
14581 .m(4)
14582 .n(16)
14583 .k(8)
14584 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080014585 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard98af05c2021-06-30 12:15:04 -070014586 }
14587
14588 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, qmax) {
14589 TEST_REQUIRES_ARM_NEON;
14590 GemmMicrokernelTester()
14591 .mr(4)
14592 .nr(16)
14593 .kr(1)
14594 .sr(1)
14595 .m(4)
14596 .n(16)
14597 .k(8)
14598 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080014599 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard98af05c2021-06-30 12:15:04 -070014600 }
14601
14602 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, strided_cm) {
14603 TEST_REQUIRES_ARM_NEON;
14604 GemmMicrokernelTester()
14605 .mr(4)
14606 .nr(16)
14607 .kr(1)
14608 .sr(1)
14609 .m(4)
14610 .n(16)
14611 .k(8)
14612 .cm_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080014613 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard98af05c2021-06-30 12:15:04 -070014614 }
Frank Barcharde4d3f762021-12-23 15:31:43 -080014615#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
Frank Barchard98af05c2021-06-30 12:15:04 -070014616
14617
Frank Barcharde4d3f762021-12-23 15:31:43 -080014618#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
Frank Barchard5cffb642021-11-22 13:59:43 -080014619 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8) {
14620 TEST_REQUIRES_ARM_NEON;
14621 GemmMicrokernelTester()
14622 .mr(4)
14623 .nr(16)
14624 .kr(1)
14625 .sr(1)
14626 .m(4)
14627 .n(16)
14628 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080014629 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard5cffb642021-11-22 13:59:43 -080014630 }
14631
14632 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cn) {
14633 TEST_REQUIRES_ARM_NEON;
14634 GemmMicrokernelTester()
14635 .mr(4)
14636 .nr(16)
14637 .kr(1)
14638 .sr(1)
14639 .m(4)
14640 .n(16)
14641 .k(8)
14642 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080014643 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard5cffb642021-11-22 13:59:43 -080014644 }
14645
14646 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_strided_a) {
14647 TEST_REQUIRES_ARM_NEON;
14648 GemmMicrokernelTester()
14649 .mr(4)
14650 .nr(16)
14651 .kr(1)
14652 .sr(1)
14653 .m(4)
14654 .n(16)
14655 .k(8)
14656 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080014657 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard5cffb642021-11-22 13:59:43 -080014658 }
14659
14660 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile) {
14661 TEST_REQUIRES_ARM_NEON;
Zhi An Ng83844ae2022-01-14 09:52:25 -080014662 for (uint32_t n = 1; n <= 16; n++) {
14663 for (uint32_t m = 1; m <= 4; m++) {
Frank Barchard5cffb642021-11-22 13:59:43 -080014664 GemmMicrokernelTester()
14665 .mr(4)
14666 .nr(16)
14667 .kr(1)
14668 .sr(1)
14669 .m(m)
14670 .n(n)
14671 .k(8)
14672 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080014673 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard5cffb642021-11-22 13:59:43 -080014674 }
14675 }
14676 }
14677
14678 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile_m) {
14679 TEST_REQUIRES_ARM_NEON;
14680 for (uint32_t m = 1; m <= 4; m++) {
14681 GemmMicrokernelTester()
14682 .mr(4)
14683 .nr(16)
14684 .kr(1)
14685 .sr(1)
14686 .m(m)
14687 .n(16)
14688 .k(8)
14689 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080014690 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard5cffb642021-11-22 13:59:43 -080014691 }
14692 }
14693
14694 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile_n) {
14695 TEST_REQUIRES_ARM_NEON;
14696 for (uint32_t n = 1; n <= 16; n++) {
14697 GemmMicrokernelTester()
14698 .mr(4)
14699 .nr(16)
14700 .kr(1)
14701 .sr(1)
14702 .m(4)
14703 .n(n)
14704 .k(8)
14705 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080014706 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard5cffb642021-11-22 13:59:43 -080014707 }
14708 }
14709
14710 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_lt_8) {
14711 TEST_REQUIRES_ARM_NEON;
14712 for (size_t k = 1; k < 8; k++) {
14713 GemmMicrokernelTester()
14714 .mr(4)
14715 .nr(16)
14716 .kr(1)
14717 .sr(1)
14718 .m(4)
14719 .n(16)
14720 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080014721 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard5cffb642021-11-22 13:59:43 -080014722 }
14723 }
14724
14725 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_lt_8_strided_a) {
14726 TEST_REQUIRES_ARM_NEON;
14727 for (size_t k = 1; k < 8; k++) {
14728 GemmMicrokernelTester()
14729 .mr(4)
14730 .nr(16)
14731 .kr(1)
14732 .sr(1)
14733 .m(4)
14734 .n(16)
14735 .k(k)
14736 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080014737 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard5cffb642021-11-22 13:59:43 -080014738 }
14739 }
14740
14741 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_lt_8_subtile) {
14742 TEST_REQUIRES_ARM_NEON;
14743 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080014744 for (uint32_t n = 1; n <= 16; n++) {
14745 for (uint32_t m = 1; m <= 4; m++) {
Frank Barchard5cffb642021-11-22 13:59:43 -080014746 GemmMicrokernelTester()
14747 .mr(4)
14748 .nr(16)
14749 .kr(1)
14750 .sr(1)
14751 .m(m)
14752 .n(n)
14753 .k(k)
14754 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080014755 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard5cffb642021-11-22 13:59:43 -080014756 }
14757 }
14758 }
14759 }
14760
14761 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_gt_8) {
14762 TEST_REQUIRES_ARM_NEON;
14763 for (size_t k = 9; k < 16; k++) {
14764 GemmMicrokernelTester()
14765 .mr(4)
14766 .nr(16)
14767 .kr(1)
14768 .sr(1)
14769 .m(4)
14770 .n(16)
14771 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080014772 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard5cffb642021-11-22 13:59:43 -080014773 }
14774 }
14775
14776 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_gt_8_strided_a) {
14777 TEST_REQUIRES_ARM_NEON;
14778 for (size_t k = 9; k < 16; k++) {
14779 GemmMicrokernelTester()
14780 .mr(4)
14781 .nr(16)
14782 .kr(1)
14783 .sr(1)
14784 .m(4)
14785 .n(16)
14786 .k(k)
14787 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080014788 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard5cffb642021-11-22 13:59:43 -080014789 }
14790 }
14791
14792 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_gt_8_subtile) {
14793 TEST_REQUIRES_ARM_NEON;
14794 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080014795 for (uint32_t n = 1; n <= 16; n++) {
14796 for (uint32_t m = 1; m <= 4; m++) {
Frank Barchard5cffb642021-11-22 13:59:43 -080014797 GemmMicrokernelTester()
14798 .mr(4)
14799 .nr(16)
14800 .kr(1)
14801 .sr(1)
14802 .m(m)
14803 .n(n)
14804 .k(k)
14805 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080014806 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard5cffb642021-11-22 13:59:43 -080014807 }
14808 }
14809 }
14810 }
14811
14812 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_div_8) {
14813 TEST_REQUIRES_ARM_NEON;
14814 for (size_t k = 16; k <= 80; k += 8) {
14815 GemmMicrokernelTester()
14816 .mr(4)
14817 .nr(16)
14818 .kr(1)
14819 .sr(1)
14820 .m(4)
14821 .n(16)
14822 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080014823 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard5cffb642021-11-22 13:59:43 -080014824 }
14825 }
14826
14827 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_div_8_strided_a) {
14828 TEST_REQUIRES_ARM_NEON;
14829 for (size_t k = 16; k <= 80; k += 8) {
14830 GemmMicrokernelTester()
14831 .mr(4)
14832 .nr(16)
14833 .kr(1)
14834 .sr(1)
14835 .m(4)
14836 .n(16)
14837 .k(k)
14838 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080014839 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard5cffb642021-11-22 13:59:43 -080014840 }
14841 }
14842
14843 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_div_8_subtile) {
14844 TEST_REQUIRES_ARM_NEON;
14845 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080014846 for (uint32_t n = 1; n <= 16; n++) {
14847 for (uint32_t m = 1; m <= 4; m++) {
Frank Barchard5cffb642021-11-22 13:59:43 -080014848 GemmMicrokernelTester()
14849 .mr(4)
14850 .nr(16)
14851 .kr(1)
14852 .sr(1)
14853 .m(m)
14854 .n(n)
14855 .k(k)
14856 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080014857 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard5cffb642021-11-22 13:59:43 -080014858 }
14859 }
14860 }
14861 }
14862
14863 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_16) {
14864 TEST_REQUIRES_ARM_NEON;
14865 for (uint32_t n = 17; n < 32; n++) {
14866 for (size_t k = 1; k <= 40; k += 9) {
14867 GemmMicrokernelTester()
14868 .mr(4)
14869 .nr(16)
14870 .kr(1)
14871 .sr(1)
14872 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080014873 .n(n)
Frank Barchard5cffb642021-11-22 13:59:43 -080014874 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080014875 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard5cffb642021-11-22 13:59:43 -080014876 }
14877 }
14878 }
14879
14880 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_16_strided_cn) {
14881 TEST_REQUIRES_ARM_NEON;
14882 for (uint32_t n = 17; n < 32; n++) {
14883 for (size_t k = 1; k <= 40; k += 9) {
14884 GemmMicrokernelTester()
14885 .mr(4)
14886 .nr(16)
14887 .kr(1)
14888 .sr(1)
14889 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080014890 .n(n)
Frank Barchard5cffb642021-11-22 13:59:43 -080014891 .k(k)
14892 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080014893 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard5cffb642021-11-22 13:59:43 -080014894 }
14895 }
14896 }
14897
14898 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_16_strided_a) {
14899 TEST_REQUIRES_ARM_NEON;
14900 for (uint32_t n = 17; n < 32; n++) {
14901 for (size_t k = 1; k <= 40; k += 9) {
14902 GemmMicrokernelTester()
14903 .mr(4)
14904 .nr(16)
14905 .kr(1)
14906 .sr(1)
14907 .m(4)
14908 .n(n)
14909 .k(k)
14910 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080014911 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard5cffb642021-11-22 13:59:43 -080014912 }
14913 }
14914 }
14915
14916 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_16_subtile) {
14917 TEST_REQUIRES_ARM_NEON;
14918 for (uint32_t n = 17; n < 32; n++) {
14919 for (size_t k = 1; k <= 40; k += 9) {
14920 for (uint32_t m = 1; m <= 4; m++) {
14921 GemmMicrokernelTester()
14922 .mr(4)
14923 .nr(16)
14924 .kr(1)
14925 .sr(1)
14926 .m(m)
14927 .n(n)
14928 .k(k)
14929 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080014930 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard5cffb642021-11-22 13:59:43 -080014931 }
14932 }
14933 }
14934 }
14935
14936 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_16) {
14937 TEST_REQUIRES_ARM_NEON;
14938 for (uint32_t n = 32; n <= 48; n += 16) {
14939 for (size_t k = 1; k <= 40; k += 9) {
14940 GemmMicrokernelTester()
14941 .mr(4)
14942 .nr(16)
14943 .kr(1)
14944 .sr(1)
14945 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080014946 .n(n)
Frank Barchard5cffb642021-11-22 13:59:43 -080014947 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080014948 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard5cffb642021-11-22 13:59:43 -080014949 }
14950 }
14951 }
14952
14953 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_16_strided_cn) {
14954 TEST_REQUIRES_ARM_NEON;
14955 for (uint32_t n = 32; n <= 48; n += 16) {
14956 for (size_t k = 1; k <= 40; k += 9) {
14957 GemmMicrokernelTester()
14958 .mr(4)
14959 .nr(16)
14960 .kr(1)
14961 .sr(1)
14962 .m(4)
14963 .n(n)
14964 .k(k)
14965 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080014966 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard5cffb642021-11-22 13:59:43 -080014967 }
14968 }
14969 }
14970
14971 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_16_strided_a) {
14972 TEST_REQUIRES_ARM_NEON;
14973 for (uint32_t n = 32; n <= 48; n += 16) {
14974 for (size_t k = 1; k <= 40; k += 9) {
14975 GemmMicrokernelTester()
14976 .mr(4)
14977 .nr(16)
14978 .kr(1)
14979 .sr(1)
14980 .m(4)
14981 .n(n)
14982 .k(k)
14983 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080014984 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard5cffb642021-11-22 13:59:43 -080014985 }
14986 }
14987 }
14988
14989 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_16_subtile) {
14990 TEST_REQUIRES_ARM_NEON;
14991 for (uint32_t n = 32; n <= 48; n += 16) {
14992 for (size_t k = 1; k <= 40; k += 9) {
14993 for (uint32_t m = 1; m <= 4; m++) {
14994 GemmMicrokernelTester()
14995 .mr(4)
14996 .nr(16)
14997 .kr(1)
14998 .sr(1)
14999 .m(m)
15000 .n(n)
15001 .k(k)
15002 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080015003 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard5cffb642021-11-22 13:59:43 -080015004 }
15005 }
15006 }
15007 }
15008
15009 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cm_subtile) {
15010 TEST_REQUIRES_ARM_NEON;
15011 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080015012 for (uint32_t n = 1; n <= 16; n++) {
15013 for (uint32_t m = 1; m <= 4; m++) {
Frank Barchard5cffb642021-11-22 13:59:43 -080015014 GemmMicrokernelTester()
15015 .mr(4)
15016 .nr(16)
15017 .kr(1)
15018 .sr(1)
15019 .m(m)
15020 .n(n)
15021 .k(k)
15022 .cm_stride(19)
15023 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080015024 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard5cffb642021-11-22 13:59:43 -080015025 }
15026 }
15027 }
15028 }
15029
15030 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, qmin) {
15031 TEST_REQUIRES_ARM_NEON;
15032 GemmMicrokernelTester()
15033 .mr(4)
15034 .nr(16)
15035 .kr(1)
15036 .sr(1)
15037 .m(4)
15038 .n(16)
15039 .k(8)
15040 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080015041 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard5cffb642021-11-22 13:59:43 -080015042 }
15043
15044 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, qmax) {
15045 TEST_REQUIRES_ARM_NEON;
15046 GemmMicrokernelTester()
15047 .mr(4)
15048 .nr(16)
15049 .kr(1)
15050 .sr(1)
15051 .m(4)
15052 .n(16)
15053 .k(8)
15054 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080015055 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard5cffb642021-11-22 13:59:43 -080015056 }
15057
15058 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cm) {
15059 TEST_REQUIRES_ARM_NEON;
15060 GemmMicrokernelTester()
15061 .mr(4)
15062 .nr(16)
15063 .kr(1)
15064 .sr(1)
15065 .m(4)
15066 .n(16)
15067 .k(8)
15068 .cm_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080015069 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard5cffb642021-11-22 13:59:43 -080015070 }
Frank Barcharde4d3f762021-12-23 15:31:43 -080015071#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
Frank Barchard5cffb642021-11-22 13:59:43 -080015072
15073
Frank Barcharde4d3f762021-12-23 15:31:43 -080015074#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
Frank Barchard5cffb642021-11-22 13:59:43 -080015075 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8) {
15076 TEST_REQUIRES_ARM_NEON;
15077 GemmMicrokernelTester()
15078 .mr(4)
15079 .nr(16)
15080 .kr(1)
15081 .sr(1)
15082 .m(4)
15083 .n(16)
15084 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080015085 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard5cffb642021-11-22 13:59:43 -080015086 }
15087
15088 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, strided_cn) {
15089 TEST_REQUIRES_ARM_NEON;
15090 GemmMicrokernelTester()
15091 .mr(4)
15092 .nr(16)
15093 .kr(1)
15094 .sr(1)
15095 .m(4)
15096 .n(16)
15097 .k(8)
15098 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080015099 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard5cffb642021-11-22 13:59:43 -080015100 }
15101
15102 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_strided_a) {
15103 TEST_REQUIRES_ARM_NEON;
15104 GemmMicrokernelTester()
15105 .mr(4)
15106 .nr(16)
15107 .kr(1)
15108 .sr(1)
15109 .m(4)
15110 .n(16)
15111 .k(8)
15112 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080015113 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard5cffb642021-11-22 13:59:43 -080015114 }
15115
15116 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_subtile) {
15117 TEST_REQUIRES_ARM_NEON;
Zhi An Ng83844ae2022-01-14 09:52:25 -080015118 for (uint32_t n = 1; n <= 16; n++) {
15119 for (uint32_t m = 1; m <= 4; m++) {
Frank Barchard5cffb642021-11-22 13:59:43 -080015120 GemmMicrokernelTester()
15121 .mr(4)
15122 .nr(16)
15123 .kr(1)
15124 .sr(1)
15125 .m(m)
15126 .n(n)
15127 .k(8)
15128 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080015129 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard5cffb642021-11-22 13:59:43 -080015130 }
15131 }
15132 }
15133
15134 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_subtile_m) {
15135 TEST_REQUIRES_ARM_NEON;
15136 for (uint32_t m = 1; m <= 4; m++) {
15137 GemmMicrokernelTester()
15138 .mr(4)
15139 .nr(16)
15140 .kr(1)
15141 .sr(1)
15142 .m(m)
15143 .n(16)
15144 .k(8)
15145 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080015146 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard5cffb642021-11-22 13:59:43 -080015147 }
15148 }
15149
15150 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_subtile_n) {
15151 TEST_REQUIRES_ARM_NEON;
15152 for (uint32_t n = 1; n <= 16; n++) {
15153 GemmMicrokernelTester()
15154 .mr(4)
15155 .nr(16)
15156 .kr(1)
15157 .sr(1)
15158 .m(4)
15159 .n(n)
15160 .k(8)
15161 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080015162 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard5cffb642021-11-22 13:59:43 -080015163 }
15164 }
15165
15166 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_lt_8) {
15167 TEST_REQUIRES_ARM_NEON;
15168 for (size_t k = 1; k < 8; k++) {
15169 GemmMicrokernelTester()
15170 .mr(4)
15171 .nr(16)
15172 .kr(1)
15173 .sr(1)
15174 .m(4)
15175 .n(16)
15176 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080015177 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard5cffb642021-11-22 13:59:43 -080015178 }
15179 }
15180
15181 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_lt_8_strided_a) {
15182 TEST_REQUIRES_ARM_NEON;
15183 for (size_t k = 1; k < 8; k++) {
15184 GemmMicrokernelTester()
15185 .mr(4)
15186 .nr(16)
15187 .kr(1)
15188 .sr(1)
15189 .m(4)
15190 .n(16)
15191 .k(k)
15192 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080015193 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard5cffb642021-11-22 13:59:43 -080015194 }
15195 }
15196
15197 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_lt_8_subtile) {
15198 TEST_REQUIRES_ARM_NEON;
15199 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080015200 for (uint32_t n = 1; n <= 16; n++) {
15201 for (uint32_t m = 1; m <= 4; m++) {
Frank Barchard5cffb642021-11-22 13:59:43 -080015202 GemmMicrokernelTester()
15203 .mr(4)
15204 .nr(16)
15205 .kr(1)
15206 .sr(1)
15207 .m(m)
15208 .n(n)
15209 .k(k)
15210 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080015211 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard5cffb642021-11-22 13:59:43 -080015212 }
15213 }
15214 }
15215 }
15216
15217 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_gt_8) {
15218 TEST_REQUIRES_ARM_NEON;
15219 for (size_t k = 9; k < 16; k++) {
15220 GemmMicrokernelTester()
15221 .mr(4)
15222 .nr(16)
15223 .kr(1)
15224 .sr(1)
15225 .m(4)
15226 .n(16)
15227 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080015228 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard5cffb642021-11-22 13:59:43 -080015229 }
15230 }
15231
15232 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_gt_8_strided_a) {
15233 TEST_REQUIRES_ARM_NEON;
15234 for (size_t k = 9; k < 16; k++) {
15235 GemmMicrokernelTester()
15236 .mr(4)
15237 .nr(16)
15238 .kr(1)
15239 .sr(1)
15240 .m(4)
15241 .n(16)
15242 .k(k)
15243 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080015244 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard5cffb642021-11-22 13:59:43 -080015245 }
15246 }
15247
15248 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_gt_8_subtile) {
15249 TEST_REQUIRES_ARM_NEON;
15250 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080015251 for (uint32_t n = 1; n <= 16; n++) {
15252 for (uint32_t m = 1; m <= 4; m++) {
Frank Barchard5cffb642021-11-22 13:59:43 -080015253 GemmMicrokernelTester()
15254 .mr(4)
15255 .nr(16)
15256 .kr(1)
15257 .sr(1)
15258 .m(m)
15259 .n(n)
15260 .k(k)
15261 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080015262 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard5cffb642021-11-22 13:59:43 -080015263 }
15264 }
15265 }
15266 }
15267
15268 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_div_8) {
15269 TEST_REQUIRES_ARM_NEON;
15270 for (size_t k = 16; k <= 80; k += 8) {
15271 GemmMicrokernelTester()
15272 .mr(4)
15273 .nr(16)
15274 .kr(1)
15275 .sr(1)
15276 .m(4)
15277 .n(16)
15278 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080015279 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard5cffb642021-11-22 13:59:43 -080015280 }
15281 }
15282
15283 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_div_8_strided_a) {
15284 TEST_REQUIRES_ARM_NEON;
15285 for (size_t k = 16; k <= 80; k += 8) {
15286 GemmMicrokernelTester()
15287 .mr(4)
15288 .nr(16)
15289 .kr(1)
15290 .sr(1)
15291 .m(4)
15292 .n(16)
15293 .k(k)
15294 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080015295 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard5cffb642021-11-22 13:59:43 -080015296 }
15297 }
15298
15299 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_div_8_subtile) {
15300 TEST_REQUIRES_ARM_NEON;
15301 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080015302 for (uint32_t n = 1; n <= 16; n++) {
15303 for (uint32_t m = 1; m <= 4; m++) {
Frank Barchard5cffb642021-11-22 13:59:43 -080015304 GemmMicrokernelTester()
15305 .mr(4)
15306 .nr(16)
15307 .kr(1)
15308 .sr(1)
15309 .m(m)
15310 .n(n)
15311 .k(k)
15312 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080015313 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard5cffb642021-11-22 13:59:43 -080015314 }
15315 }
15316 }
15317 }
15318
15319 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16) {
15320 TEST_REQUIRES_ARM_NEON;
15321 for (uint32_t n = 17; n < 32; n++) {
15322 for (size_t k = 1; k <= 40; k += 9) {
15323 GemmMicrokernelTester()
15324 .mr(4)
15325 .nr(16)
15326 .kr(1)
15327 .sr(1)
15328 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080015329 .n(n)
Frank Barchard5cffb642021-11-22 13:59:43 -080015330 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080015331 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard5cffb642021-11-22 13:59:43 -080015332 }
15333 }
15334 }
15335
15336 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16_strided_cn) {
15337 TEST_REQUIRES_ARM_NEON;
15338 for (uint32_t n = 17; n < 32; n++) {
15339 for (size_t k = 1; k <= 40; k += 9) {
15340 GemmMicrokernelTester()
15341 .mr(4)
15342 .nr(16)
15343 .kr(1)
15344 .sr(1)
15345 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080015346 .n(n)
Frank Barchard5cffb642021-11-22 13:59:43 -080015347 .k(k)
15348 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080015349 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard5cffb642021-11-22 13:59:43 -080015350 }
15351 }
15352 }
15353
15354 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16_strided_a) {
15355 TEST_REQUIRES_ARM_NEON;
15356 for (uint32_t n = 17; n < 32; n++) {
15357 for (size_t k = 1; k <= 40; k += 9) {
15358 GemmMicrokernelTester()
15359 .mr(4)
15360 .nr(16)
15361 .kr(1)
15362 .sr(1)
15363 .m(4)
15364 .n(n)
15365 .k(k)
15366 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080015367 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard5cffb642021-11-22 13:59:43 -080015368 }
15369 }
15370 }
15371
15372 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16_subtile) {
15373 TEST_REQUIRES_ARM_NEON;
15374 for (uint32_t n = 17; n < 32; n++) {
15375 for (size_t k = 1; k <= 40; k += 9) {
15376 for (uint32_t m = 1; m <= 4; m++) {
15377 GemmMicrokernelTester()
15378 .mr(4)
15379 .nr(16)
15380 .kr(1)
15381 .sr(1)
15382 .m(m)
15383 .n(n)
15384 .k(k)
15385 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080015386 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard5cffb642021-11-22 13:59:43 -080015387 }
15388 }
15389 }
15390 }
15391
15392 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16) {
15393 TEST_REQUIRES_ARM_NEON;
15394 for (uint32_t n = 32; n <= 48; n += 16) {
15395 for (size_t k = 1; k <= 40; k += 9) {
15396 GemmMicrokernelTester()
15397 .mr(4)
15398 .nr(16)
15399 .kr(1)
15400 .sr(1)
15401 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080015402 .n(n)
Frank Barchard5cffb642021-11-22 13:59:43 -080015403 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080015404 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard5cffb642021-11-22 13:59:43 -080015405 }
15406 }
15407 }
15408
15409 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16_strided_cn) {
15410 TEST_REQUIRES_ARM_NEON;
15411 for (uint32_t n = 32; n <= 48; n += 16) {
15412 for (size_t k = 1; k <= 40; k += 9) {
15413 GemmMicrokernelTester()
15414 .mr(4)
15415 .nr(16)
15416 .kr(1)
15417 .sr(1)
15418 .m(4)
15419 .n(n)
15420 .k(k)
15421 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080015422 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard5cffb642021-11-22 13:59:43 -080015423 }
15424 }
15425 }
15426
15427 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16_strided_a) {
15428 TEST_REQUIRES_ARM_NEON;
15429 for (uint32_t n = 32; n <= 48; n += 16) {
15430 for (size_t k = 1; k <= 40; k += 9) {
15431 GemmMicrokernelTester()
15432 .mr(4)
15433 .nr(16)
15434 .kr(1)
15435 .sr(1)
15436 .m(4)
15437 .n(n)
15438 .k(k)
15439 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080015440 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard5cffb642021-11-22 13:59:43 -080015441 }
15442 }
15443 }
15444
15445 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16_subtile) {
15446 TEST_REQUIRES_ARM_NEON;
15447 for (uint32_t n = 32; n <= 48; n += 16) {
15448 for (size_t k = 1; k <= 40; k += 9) {
15449 for (uint32_t m = 1; m <= 4; m++) {
15450 GemmMicrokernelTester()
15451 .mr(4)
15452 .nr(16)
15453 .kr(1)
15454 .sr(1)
15455 .m(m)
15456 .n(n)
15457 .k(k)
15458 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080015459 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard5cffb642021-11-22 13:59:43 -080015460 }
15461 }
15462 }
15463 }
15464
15465 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, strided_cm_subtile) {
15466 TEST_REQUIRES_ARM_NEON;
15467 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080015468 for (uint32_t n = 1; n <= 16; n++) {
15469 for (uint32_t m = 1; m <= 4; m++) {
Frank Barchard5cffb642021-11-22 13:59:43 -080015470 GemmMicrokernelTester()
15471 .mr(4)
15472 .nr(16)
15473 .kr(1)
15474 .sr(1)
15475 .m(m)
15476 .n(n)
15477 .k(k)
15478 .cm_stride(19)
15479 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080015480 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard5cffb642021-11-22 13:59:43 -080015481 }
15482 }
15483 }
15484 }
15485
15486 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, qmin) {
15487 TEST_REQUIRES_ARM_NEON;
15488 GemmMicrokernelTester()
15489 .mr(4)
15490 .nr(16)
15491 .kr(1)
15492 .sr(1)
15493 .m(4)
15494 .n(16)
15495 .k(8)
15496 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080015497 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard5cffb642021-11-22 13:59:43 -080015498 }
15499
15500 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, qmax) {
15501 TEST_REQUIRES_ARM_NEON;
15502 GemmMicrokernelTester()
15503 .mr(4)
15504 .nr(16)
15505 .kr(1)
15506 .sr(1)
15507 .m(4)
15508 .n(16)
15509 .k(8)
15510 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080015511 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard5cffb642021-11-22 13:59:43 -080015512 }
15513
15514 TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, strided_cm) {
15515 TEST_REQUIRES_ARM_NEON;
15516 GemmMicrokernelTester()
15517 .mr(4)
15518 .nr(16)
15519 .kr(1)
15520 .sr(1)
15521 .m(4)
15522 .n(16)
15523 .k(8)
15524 .cm_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080015525 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Frank Barchard5cffb642021-11-22 13:59:43 -080015526 }
Frank Barcharde4d3f762021-12-23 15:31:43 -080015527#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
Frank Barchard5cffb642021-11-22 13:59:43 -080015528
15529
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015530#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
15531 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, k_eq_4) {
15532 TEST_REQUIRES_ARM_NEON_DOT;
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015533 GemmMicrokernelTester()
15534 .mr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015535 .nr(16)
15536 .kr(4)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015537 .sr(1)
15538 .m(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015539 .n(16)
15540 .k(4)
Marat Dukhan50323b82022-01-11 00:12:01 -080015541 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015542 }
15543
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015544 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, strided_cn) {
15545 TEST_REQUIRES_ARM_NEON_DOT;
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015546 GemmMicrokernelTester()
15547 .mr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015548 .nr(16)
15549 .kr(4)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015550 .sr(1)
15551 .m(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015552 .n(16)
15553 .k(4)
15554 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080015555 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015556 }
15557
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015558 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, k_eq_4_strided_a) {
15559 TEST_REQUIRES_ARM_NEON_DOT;
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015560 GemmMicrokernelTester()
15561 .mr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015562 .nr(16)
15563 .kr(4)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015564 .sr(1)
15565 .m(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015566 .n(16)
15567 .k(4)
15568 .a_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080015569 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015570 }
15571
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015572 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, k_eq_4_subtile) {
15573 TEST_REQUIRES_ARM_NEON_DOT;
Zhi An Ng83844ae2022-01-14 09:52:25 -080015574 for (uint32_t n = 1; n <= 16; n++) {
15575 for (uint32_t m = 1; m <= 1; m++) {
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015576 GemmMicrokernelTester()
15577 .mr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015578 .nr(16)
15579 .kr(4)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015580 .sr(1)
15581 .m(m)
15582 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015583 .k(4)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015584 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080015585 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015586 }
15587 }
15588 }
15589
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015590 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, k_eq_4_subtile_m) {
15591 TEST_REQUIRES_ARM_NEON_DOT;
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015592 for (uint32_t m = 1; m <= 1; m++) {
15593 GemmMicrokernelTester()
15594 .mr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015595 .nr(16)
15596 .kr(4)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015597 .sr(1)
15598 .m(m)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015599 .n(16)
15600 .k(4)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015601 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080015602 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015603 }
15604 }
15605
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015606 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, k_eq_4_subtile_n) {
15607 TEST_REQUIRES_ARM_NEON_DOT;
15608 for (uint32_t n = 1; n <= 16; n++) {
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015609 GemmMicrokernelTester()
15610 .mr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015611 .nr(16)
15612 .kr(4)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015613 .sr(1)
15614 .m(1)
15615 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015616 .k(4)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015617 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080015618 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015619 }
15620 }
15621
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015622 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, k_lt_4) {
15623 TEST_REQUIRES_ARM_NEON_DOT;
15624 for (size_t k = 1; k < 4; k++) {
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015625 GemmMicrokernelTester()
15626 .mr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015627 .nr(16)
15628 .kr(4)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015629 .sr(1)
15630 .m(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015631 .n(16)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015632 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080015633 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015634 }
15635 }
15636
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015637 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, k_lt_4_strided_a) {
15638 TEST_REQUIRES_ARM_NEON_DOT;
15639 for (size_t k = 1; k < 4; k++) {
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015640 GemmMicrokernelTester()
15641 .mr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015642 .nr(16)
15643 .kr(4)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015644 .sr(1)
15645 .m(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015646 .n(16)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015647 .k(k)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015648 .a_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080015649 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015650 }
15651 }
15652
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015653 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, k_lt_4_subtile) {
15654 TEST_REQUIRES_ARM_NEON_DOT;
15655 for (size_t k = 1; k < 4; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080015656 for (uint32_t n = 1; n <= 16; n++) {
15657 for (uint32_t m = 1; m <= 1; m++) {
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015658 GemmMicrokernelTester()
15659 .mr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015660 .nr(16)
15661 .kr(4)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015662 .sr(1)
15663 .m(m)
15664 .n(n)
15665 .k(k)
15666 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080015667 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015668 }
15669 }
15670 }
15671 }
15672
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015673 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, k_gt_4) {
15674 TEST_REQUIRES_ARM_NEON_DOT;
15675 for (size_t k = 5; k < 8; k++) {
15676 GemmMicrokernelTester()
15677 .mr(1)
15678 .nr(16)
15679 .kr(4)
15680 .sr(1)
15681 .m(1)
15682 .n(16)
15683 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080015684 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015685 }
15686 }
15687
15688 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, k_gt_4_strided_a) {
15689 TEST_REQUIRES_ARM_NEON_DOT;
15690 for (size_t k = 5; k < 8; k++) {
15691 GemmMicrokernelTester()
15692 .mr(1)
15693 .nr(16)
15694 .kr(4)
15695 .sr(1)
15696 .m(1)
15697 .n(16)
15698 .k(k)
15699 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080015700 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015701 }
15702 }
15703
15704 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, k_gt_4_subtile) {
15705 TEST_REQUIRES_ARM_NEON_DOT;
15706 for (size_t k = 5; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080015707 for (uint32_t n = 1; n <= 16; n++) {
15708 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015709 GemmMicrokernelTester()
15710 .mr(1)
15711 .nr(16)
15712 .kr(4)
15713 .sr(1)
15714 .m(m)
15715 .n(n)
15716 .k(k)
15717 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080015718 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015719 }
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015720 }
15721 }
15722 }
15723
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015724 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, k_div_4) {
15725 TEST_REQUIRES_ARM_NEON_DOT;
15726 for (size_t k = 8; k <= 40; k += 4) {
15727 GemmMicrokernelTester()
15728 .mr(1)
15729 .nr(16)
15730 .kr(4)
15731 .sr(1)
15732 .m(1)
15733 .n(16)
15734 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080015735 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015736 }
15737 }
15738
15739 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, k_div_4_strided_a) {
15740 TEST_REQUIRES_ARM_NEON_DOT;
15741 for (size_t k = 8; k <= 40; k += 4) {
15742 GemmMicrokernelTester()
15743 .mr(1)
15744 .nr(16)
15745 .kr(4)
15746 .sr(1)
15747 .m(1)
15748 .n(16)
15749 .k(k)
15750 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080015751 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015752 }
15753 }
15754
15755 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, k_div_4_subtile) {
15756 TEST_REQUIRES_ARM_NEON_DOT;
15757 for (size_t k = 8; k <= 40; k += 4) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080015758 for (uint32_t n = 1; n <= 16; n++) {
15759 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015760 GemmMicrokernelTester()
15761 .mr(1)
15762 .nr(16)
15763 .kr(4)
15764 .sr(1)
15765 .m(m)
15766 .n(n)
15767 .k(k)
15768 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080015769 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015770 }
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015771 }
15772 }
15773 }
15774
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015775 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, n_gt_16) {
15776 TEST_REQUIRES_ARM_NEON_DOT;
15777 for (uint32_t n = 17; n < 32; n++) {
15778 for (size_t k = 1; k <= 20; k += 5) {
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015779 GemmMicrokernelTester()
15780 .mr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015781 .nr(16)
15782 .kr(4)
15783 .sr(1)
15784 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080015785 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015786 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080015787 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015788 }
15789 }
15790 }
15791
15792 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, n_gt_16_strided_cn) {
15793 TEST_REQUIRES_ARM_NEON_DOT;
15794 for (uint32_t n = 17; n < 32; n++) {
15795 for (size_t k = 1; k <= 20; k += 5) {
15796 GemmMicrokernelTester()
15797 .mr(1)
15798 .nr(16)
15799 .kr(4)
15800 .sr(1)
15801 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080015802 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015803 .k(k)
15804 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080015805 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015806 }
15807 }
15808 }
15809
15810 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, n_gt_16_strided_a) {
15811 TEST_REQUIRES_ARM_NEON_DOT;
15812 for (uint32_t n = 17; n < 32; n++) {
15813 for (size_t k = 1; k <= 20; k += 5) {
15814 GemmMicrokernelTester()
15815 .mr(1)
15816 .nr(16)
15817 .kr(4)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015818 .sr(1)
15819 .m(1)
15820 .n(n)
15821 .k(k)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015822 .a_stride(23)
Marat Dukhan50323b82022-01-11 00:12:01 -080015823 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015824 }
15825 }
15826 }
15827
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015828 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, n_gt_16_subtile) {
15829 TEST_REQUIRES_ARM_NEON_DOT;
15830 for (uint32_t n = 17; n < 32; n++) {
15831 for (size_t k = 1; k <= 20; k += 5) {
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015832 for (uint32_t m = 1; m <= 1; m++) {
15833 GemmMicrokernelTester()
15834 .mr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015835 .nr(16)
15836 .kr(4)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015837 .sr(1)
15838 .m(m)
15839 .n(n)
15840 .k(k)
15841 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080015842 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015843 }
15844 }
15845 }
15846 }
15847
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015848 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, n_div_16) {
15849 TEST_REQUIRES_ARM_NEON_DOT;
15850 for (uint32_t n = 32; n <= 48; n += 16) {
15851 for (size_t k = 1; k <= 20; k += 5) {
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015852 GemmMicrokernelTester()
15853 .mr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015854 .nr(16)
15855 .kr(4)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015856 .sr(1)
15857 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080015858 .n(n)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015859 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080015860 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015861 }
15862 }
15863 }
15864
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015865 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, n_div_16_strided_cn) {
15866 TEST_REQUIRES_ARM_NEON_DOT;
15867 for (uint32_t n = 32; n <= 48; n += 16) {
15868 for (size_t k = 1; k <= 20; k += 5) {
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015869 GemmMicrokernelTester()
15870 .mr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015871 .nr(16)
15872 .kr(4)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015873 .sr(1)
15874 .m(1)
15875 .n(n)
15876 .k(k)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015877 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080015878 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015879 }
15880 }
15881 }
15882
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015883 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, n_div_16_strided_a) {
15884 TEST_REQUIRES_ARM_NEON_DOT;
15885 for (uint32_t n = 32; n <= 48; n += 16) {
15886 for (size_t k = 1; k <= 20; k += 5) {
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015887 GemmMicrokernelTester()
15888 .mr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015889 .nr(16)
15890 .kr(4)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015891 .sr(1)
15892 .m(1)
15893 .n(n)
15894 .k(k)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015895 .a_stride(23)
Marat Dukhan50323b82022-01-11 00:12:01 -080015896 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015897 }
15898 }
15899 }
15900
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015901 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, n_div_16_subtile) {
15902 TEST_REQUIRES_ARM_NEON_DOT;
15903 for (uint32_t n = 32; n <= 48; n += 16) {
15904 for (size_t k = 1; k <= 20; k += 5) {
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015905 for (uint32_t m = 1; m <= 1; m++) {
15906 GemmMicrokernelTester()
15907 .mr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015908 .nr(16)
15909 .kr(4)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015910 .sr(1)
15911 .m(m)
15912 .n(n)
15913 .k(k)
15914 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080015915 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015916 }
15917 }
15918 }
15919 }
15920
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015921 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, strided_cm_subtile) {
15922 TEST_REQUIRES_ARM_NEON_DOT;
15923 for (size_t k = 1; k <= 20; k += 5) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080015924 for (uint32_t n = 1; n <= 16; n++) {
15925 for (uint32_t m = 1; m <= 1; m++) {
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015926 GemmMicrokernelTester()
15927 .mr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015928 .nr(16)
15929 .kr(4)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015930 .sr(1)
15931 .m(m)
15932 .n(n)
15933 .k(k)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015934 .cm_stride(19)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015935 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080015936 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015937 }
15938 }
15939 }
15940 }
15941
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015942 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, qmin) {
15943 TEST_REQUIRES_ARM_NEON_DOT;
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015944 GemmMicrokernelTester()
15945 .mr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015946 .nr(16)
15947 .kr(4)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015948 .sr(1)
15949 .m(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015950 .n(16)
15951 .k(4)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015952 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080015953 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015954 }
15955
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015956 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, qmax) {
15957 TEST_REQUIRES_ARM_NEON_DOT;
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015958 GemmMicrokernelTester()
15959 .mr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015960 .nr(16)
15961 .kr(4)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015962 .sr(1)
15963 .m(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015964 .n(16)
15965 .k(4)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015966 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080015967 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015968 }
15969
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015970 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, strided_cm) {
15971 TEST_REQUIRES_ARM_NEON_DOT;
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015972 GemmMicrokernelTester()
15973 .mr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015974 .nr(16)
15975 .kr(4)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015976 .sr(1)
15977 .m(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015978 .n(16)
15979 .k(4)
15980 .cm_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080015981 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015982 }
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015983#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015984
15985
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015986#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
15987 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, k_eq_8) {
15988 TEST_REQUIRES_ARM_NEON_DOT;
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015989 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015990 .mr(1)
15991 .nr(16)
15992 .kr(4)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015993 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015994 .m(1)
15995 .n(16)
15996 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080015997 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080015998 }
15999
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016000 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, strided_cn) {
16001 TEST_REQUIRES_ARM_NEON_DOT;
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016002 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016003 .mr(1)
16004 .nr(16)
16005 .kr(4)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016006 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016007 .m(1)
16008 .n(16)
16009 .k(8)
16010 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080016011 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016012 }
16013
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016014 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, k_eq_8_strided_a) {
16015 TEST_REQUIRES_ARM_NEON_DOT;
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016016 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016017 .mr(1)
16018 .nr(16)
16019 .kr(4)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016020 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016021 .m(1)
16022 .n(16)
16023 .k(8)
16024 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080016025 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016026 }
16027
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016028 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile) {
16029 TEST_REQUIRES_ARM_NEON_DOT;
Zhi An Ng83844ae2022-01-14 09:52:25 -080016030 for (uint32_t n = 1; n <= 16; n++) {
16031 for (uint32_t m = 1; m <= 1; m++) {
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016032 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016033 .mr(1)
16034 .nr(16)
16035 .kr(4)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016036 .sr(1)
16037 .m(m)
16038 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016039 .k(8)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016040 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080016041 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016042 }
16043 }
16044 }
16045
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016046 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile_m) {
16047 TEST_REQUIRES_ARM_NEON_DOT;
16048 for (uint32_t m = 1; m <= 1; m++) {
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016049 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016050 .mr(1)
16051 .nr(16)
16052 .kr(4)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016053 .sr(1)
16054 .m(m)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016055 .n(16)
16056 .k(8)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016057 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080016058 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016059 }
16060 }
16061
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016062 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile_n) {
16063 TEST_REQUIRES_ARM_NEON_DOT;
16064 for (uint32_t n = 1; n <= 16; n++) {
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016065 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016066 .mr(1)
16067 .nr(16)
16068 .kr(4)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016069 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016070 .m(1)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016071 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016072 .k(8)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016073 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080016074 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016075 }
16076 }
16077
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016078 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, k_lt_8) {
16079 TEST_REQUIRES_ARM_NEON_DOT;
16080 for (size_t k = 1; k < 8; k++) {
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016081 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016082 .mr(1)
16083 .nr(16)
16084 .kr(4)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016085 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016086 .m(1)
16087 .n(16)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016088 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080016089 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016090 }
16091 }
16092
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016093 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, k_lt_8_strided_a) {
16094 TEST_REQUIRES_ARM_NEON_DOT;
16095 for (size_t k = 1; k < 8; k++) {
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016096 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016097 .mr(1)
16098 .nr(16)
16099 .kr(4)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016100 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016101 .m(1)
16102 .n(16)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016103 .k(k)
16104 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080016105 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016106 }
16107 }
16108
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016109 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, k_lt_8_subtile) {
16110 TEST_REQUIRES_ARM_NEON_DOT;
16111 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080016112 for (uint32_t n = 1; n <= 16; n++) {
16113 for (uint32_t m = 1; m <= 1; m++) {
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016114 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016115 .mr(1)
16116 .nr(16)
16117 .kr(4)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016118 .sr(1)
16119 .m(m)
16120 .n(n)
16121 .k(k)
16122 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080016123 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016124 }
16125 }
16126 }
16127 }
16128
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016129 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, k_gt_8) {
16130 TEST_REQUIRES_ARM_NEON_DOT;
16131 for (size_t k = 9; k < 16; k++) {
16132 GemmMicrokernelTester()
16133 .mr(1)
16134 .nr(16)
16135 .kr(4)
16136 .sr(1)
16137 .m(1)
16138 .n(16)
16139 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080016140 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016141 }
16142 }
16143
16144 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, k_gt_8_strided_a) {
16145 TEST_REQUIRES_ARM_NEON_DOT;
16146 for (size_t k = 9; k < 16; k++) {
16147 GemmMicrokernelTester()
16148 .mr(1)
16149 .nr(16)
16150 .kr(4)
16151 .sr(1)
16152 .m(1)
16153 .n(16)
16154 .k(k)
16155 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080016156 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016157 }
16158 }
16159
16160 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, k_gt_8_subtile) {
16161 TEST_REQUIRES_ARM_NEON_DOT;
16162 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080016163 for (uint32_t n = 1; n <= 16; n++) {
16164 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016165 GemmMicrokernelTester()
16166 .mr(1)
16167 .nr(16)
16168 .kr(4)
16169 .sr(1)
16170 .m(m)
16171 .n(n)
16172 .k(k)
16173 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080016174 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016175 }
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016176 }
16177 }
16178 }
16179
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016180 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, k_div_8) {
16181 TEST_REQUIRES_ARM_NEON_DOT;
16182 for (size_t k = 16; k <= 80; k += 8) {
16183 GemmMicrokernelTester()
16184 .mr(1)
16185 .nr(16)
16186 .kr(4)
16187 .sr(1)
16188 .m(1)
16189 .n(16)
16190 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080016191 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016192 }
16193 }
16194
16195 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, k_div_8_strided_a) {
16196 TEST_REQUIRES_ARM_NEON_DOT;
16197 for (size_t k = 16; k <= 80; k += 8) {
16198 GemmMicrokernelTester()
16199 .mr(1)
16200 .nr(16)
16201 .kr(4)
16202 .sr(1)
16203 .m(1)
16204 .n(16)
16205 .k(k)
16206 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080016207 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016208 }
16209 }
16210
16211 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, k_div_8_subtile) {
16212 TEST_REQUIRES_ARM_NEON_DOT;
16213 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080016214 for (uint32_t n = 1; n <= 16; n++) {
16215 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016216 GemmMicrokernelTester()
16217 .mr(1)
16218 .nr(16)
16219 .kr(4)
16220 .sr(1)
16221 .m(m)
16222 .n(n)
16223 .k(k)
16224 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080016225 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016226 }
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016227 }
16228 }
16229 }
16230
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016231 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, n_gt_16) {
16232 TEST_REQUIRES_ARM_NEON_DOT;
16233 for (uint32_t n = 17; n < 32; n++) {
16234 for (size_t k = 1; k <= 40; k += 9) {
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016235 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016236 .mr(1)
16237 .nr(16)
16238 .kr(4)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016239 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016240 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080016241 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016242 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080016243 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016244 }
16245 }
16246 }
16247
16248 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, n_gt_16_strided_cn) {
16249 TEST_REQUIRES_ARM_NEON_DOT;
16250 for (uint32_t n = 17; n < 32; n++) {
16251 for (size_t k = 1; k <= 40; k += 9) {
16252 GemmMicrokernelTester()
16253 .mr(1)
16254 .nr(16)
16255 .kr(4)
16256 .sr(1)
16257 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080016258 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016259 .k(k)
16260 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080016261 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016262 }
16263 }
16264 }
16265
16266 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, n_gt_16_strided_a) {
16267 TEST_REQUIRES_ARM_NEON_DOT;
16268 for (uint32_t n = 17; n < 32; n++) {
16269 for (size_t k = 1; k <= 40; k += 9) {
16270 GemmMicrokernelTester()
16271 .mr(1)
16272 .nr(16)
16273 .kr(4)
16274 .sr(1)
16275 .m(1)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016276 .n(n)
16277 .k(k)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016278 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080016279 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016280 }
16281 }
16282 }
16283
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016284 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, n_gt_16_subtile) {
16285 TEST_REQUIRES_ARM_NEON_DOT;
16286 for (uint32_t n = 17; n < 32; n++) {
16287 for (size_t k = 1; k <= 40; k += 9) {
16288 for (uint32_t m = 1; m <= 1; m++) {
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016289 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016290 .mr(1)
16291 .nr(16)
16292 .kr(4)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016293 .sr(1)
16294 .m(m)
16295 .n(n)
16296 .k(k)
16297 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080016298 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016299 }
16300 }
16301 }
16302 }
16303
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016304 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, n_div_16) {
16305 TEST_REQUIRES_ARM_NEON_DOT;
16306 for (uint32_t n = 32; n <= 48; n += 16) {
16307 for (size_t k = 1; k <= 40; k += 9) {
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016308 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016309 .mr(1)
16310 .nr(16)
16311 .kr(4)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016312 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016313 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080016314 .n(n)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016315 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080016316 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016317 }
16318 }
16319 }
16320
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016321 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, n_div_16_strided_cn) {
16322 TEST_REQUIRES_ARM_NEON_DOT;
16323 for (uint32_t n = 32; n <= 48; n += 16) {
16324 for (size_t k = 1; k <= 40; k += 9) {
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016325 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016326 .mr(1)
16327 .nr(16)
16328 .kr(4)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016329 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016330 .m(1)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016331 .n(n)
16332 .k(k)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016333 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080016334 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016335 }
16336 }
16337 }
16338
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016339 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, n_div_16_strided_a) {
16340 TEST_REQUIRES_ARM_NEON_DOT;
16341 for (uint32_t n = 32; n <= 48; n += 16) {
16342 for (size_t k = 1; k <= 40; k += 9) {
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016343 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016344 .mr(1)
16345 .nr(16)
16346 .kr(4)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016347 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016348 .m(1)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016349 .n(n)
16350 .k(k)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016351 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080016352 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016353 }
16354 }
16355 }
16356
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016357 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, n_div_16_subtile) {
16358 TEST_REQUIRES_ARM_NEON_DOT;
16359 for (uint32_t n = 32; n <= 48; n += 16) {
16360 for (size_t k = 1; k <= 40; k += 9) {
16361 for (uint32_t m = 1; m <= 1; m++) {
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016362 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016363 .mr(1)
16364 .nr(16)
16365 .kr(4)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016366 .sr(1)
16367 .m(m)
16368 .n(n)
16369 .k(k)
16370 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080016371 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016372 }
16373 }
16374 }
16375 }
16376
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016377 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, strided_cm_subtile) {
16378 TEST_REQUIRES_ARM_NEON_DOT;
16379 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080016380 for (uint32_t n = 1; n <= 16; n++) {
16381 for (uint32_t m = 1; m <= 1; m++) {
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016382 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016383 .mr(1)
16384 .nr(16)
16385 .kr(4)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016386 .sr(1)
16387 .m(m)
16388 .n(n)
16389 .k(k)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016390 .cm_stride(19)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016391 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080016392 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016393 }
16394 }
16395 }
16396 }
16397
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016398 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, qmin) {
16399 TEST_REQUIRES_ARM_NEON_DOT;
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016400 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016401 .mr(1)
16402 .nr(16)
16403 .kr(4)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016404 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016405 .m(1)
16406 .n(16)
16407 .k(8)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016408 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080016409 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016410 }
16411
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016412 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, qmax) {
16413 TEST_REQUIRES_ARM_NEON_DOT;
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016414 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016415 .mr(1)
16416 .nr(16)
16417 .kr(4)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016418 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016419 .m(1)
16420 .n(16)
16421 .k(8)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016422 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080016423 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016424 }
16425
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016426 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, strided_cm) {
16427 TEST_REQUIRES_ARM_NEON_DOT;
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016428 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016429 .mr(1)
16430 .nr(16)
16431 .kr(4)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016432 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016433 .m(1)
16434 .n(16)
16435 .k(8)
16436 .cm_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080016437 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016438 }
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016439#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016440
16441
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016442#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
16443 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8) {
16444 TEST_REQUIRES_ARM_NEON_DOT;
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016445 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016446 .mr(4)
16447 .nr(16)
16448 .kr(4)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016449 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016450 .m(4)
16451 .n(16)
16452 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080016453 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016454 }
16455
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016456 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, strided_cn) {
16457 TEST_REQUIRES_ARM_NEON_DOT;
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016458 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016459 .mr(4)
16460 .nr(16)
16461 .kr(4)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016462 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016463 .m(4)
16464 .n(16)
16465 .k(8)
16466 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080016467 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016468 }
16469
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016470 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8_strided_a) {
16471 TEST_REQUIRES_ARM_NEON_DOT;
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016472 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016473 .mr(4)
16474 .nr(16)
16475 .kr(4)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016476 .sr(1)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016477 .m(4)
16478 .n(16)
16479 .k(8)
16480 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080016481 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016482 }
16483
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016484 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile) {
16485 TEST_REQUIRES_ARM_NEON_DOT;
Zhi An Ng83844ae2022-01-14 09:52:25 -080016486 for (uint32_t n = 1; n <= 16; n++) {
16487 for (uint32_t m = 1; m <= 4; m++) {
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016488 GemmMicrokernelTester()
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016489 .mr(4)
16490 .nr(16)
16491 .kr(4)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016492 .sr(1)
16493 .m(m)
16494 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016495 .k(8)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016496 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080016497 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080016498 }
16499 }
16500 }
16501
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016502 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile_m) {
16503 TEST_REQUIRES_ARM_NEON_DOT;
16504 for (uint32_t m = 1; m <= 4; m++) {
16505 GemmMicrokernelTester()
16506 .mr(4)
16507 .nr(16)
16508 .kr(4)
16509 .sr(1)
16510 .m(m)
16511 .n(16)
16512 .k(8)
16513 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080016514 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016515 }
16516 }
16517
16518 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile_n) {
16519 TEST_REQUIRES_ARM_NEON_DOT;
16520 for (uint32_t n = 1; n <= 16; n++) {
16521 GemmMicrokernelTester()
16522 .mr(4)
16523 .nr(16)
16524 .kr(4)
16525 .sr(1)
16526 .m(4)
16527 .n(n)
16528 .k(8)
16529 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080016530 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016531 }
16532 }
16533
16534 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_lt_8) {
16535 TEST_REQUIRES_ARM_NEON_DOT;
16536 for (size_t k = 1; k < 8; k++) {
16537 GemmMicrokernelTester()
16538 .mr(4)
16539 .nr(16)
16540 .kr(4)
16541 .sr(1)
16542 .m(4)
16543 .n(16)
16544 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080016545 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016546 }
16547 }
16548
16549 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_lt_8_strided_a) {
16550 TEST_REQUIRES_ARM_NEON_DOT;
16551 for (size_t k = 1; k < 8; k++) {
16552 GemmMicrokernelTester()
16553 .mr(4)
16554 .nr(16)
16555 .kr(4)
16556 .sr(1)
16557 .m(4)
16558 .n(16)
16559 .k(k)
16560 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080016561 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016562 }
16563 }
16564
16565 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_lt_8_subtile) {
16566 TEST_REQUIRES_ARM_NEON_DOT;
16567 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080016568 for (uint32_t n = 1; n <= 16; n++) {
16569 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016570 GemmMicrokernelTester()
16571 .mr(4)
16572 .nr(16)
16573 .kr(4)
16574 .sr(1)
16575 .m(m)
16576 .n(n)
16577 .k(k)
16578 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080016579 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016580 }
16581 }
16582 }
16583 }
16584
16585 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_gt_8) {
16586 TEST_REQUIRES_ARM_NEON_DOT;
16587 for (size_t k = 9; k < 16; k++) {
16588 GemmMicrokernelTester()
16589 .mr(4)
16590 .nr(16)
16591 .kr(4)
16592 .sr(1)
16593 .m(4)
16594 .n(16)
16595 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080016596 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016597 }
16598 }
16599
16600 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_gt_8_strided_a) {
16601 TEST_REQUIRES_ARM_NEON_DOT;
16602 for (size_t k = 9; k < 16; k++) {
16603 GemmMicrokernelTester()
16604 .mr(4)
16605 .nr(16)
16606 .kr(4)
16607 .sr(1)
16608 .m(4)
16609 .n(16)
16610 .k(k)
16611 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080016612 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016613 }
16614 }
16615
16616 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_gt_8_subtile) {
16617 TEST_REQUIRES_ARM_NEON_DOT;
16618 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080016619 for (uint32_t n = 1; n <= 16; n++) {
16620 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016621 GemmMicrokernelTester()
16622 .mr(4)
16623 .nr(16)
16624 .kr(4)
16625 .sr(1)
16626 .m(m)
16627 .n(n)
16628 .k(k)
16629 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080016630 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016631 }
16632 }
16633 }
16634 }
16635
16636 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_div_8) {
16637 TEST_REQUIRES_ARM_NEON_DOT;
16638 for (size_t k = 16; k <= 80; k += 8) {
16639 GemmMicrokernelTester()
16640 .mr(4)
16641 .nr(16)
16642 .kr(4)
16643 .sr(1)
16644 .m(4)
16645 .n(16)
16646 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080016647 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016648 }
16649 }
16650
16651 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_div_8_strided_a) {
16652 TEST_REQUIRES_ARM_NEON_DOT;
16653 for (size_t k = 16; k <= 80; k += 8) {
16654 GemmMicrokernelTester()
16655 .mr(4)
16656 .nr(16)
16657 .kr(4)
16658 .sr(1)
16659 .m(4)
16660 .n(16)
16661 .k(k)
16662 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080016663 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016664 }
16665 }
16666
16667 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_div_8_subtile) {
16668 TEST_REQUIRES_ARM_NEON_DOT;
16669 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080016670 for (uint32_t n = 1; n <= 16; n++) {
16671 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016672 GemmMicrokernelTester()
16673 .mr(4)
16674 .nr(16)
16675 .kr(4)
16676 .sr(1)
16677 .m(m)
16678 .n(n)
16679 .k(k)
16680 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080016681 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016682 }
16683 }
16684 }
16685 }
16686
16687 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, n_gt_16) {
16688 TEST_REQUIRES_ARM_NEON_DOT;
16689 for (uint32_t n = 17; n < 32; n++) {
16690 for (size_t k = 1; k <= 40; k += 9) {
16691 GemmMicrokernelTester()
16692 .mr(4)
16693 .nr(16)
16694 .kr(4)
16695 .sr(1)
16696 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080016697 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016698 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080016699 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016700 }
16701 }
16702 }
16703
16704 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, n_gt_16_strided_cn) {
16705 TEST_REQUIRES_ARM_NEON_DOT;
16706 for (uint32_t n = 17; n < 32; n++) {
16707 for (size_t k = 1; k <= 40; k += 9) {
16708 GemmMicrokernelTester()
16709 .mr(4)
16710 .nr(16)
16711 .kr(4)
16712 .sr(1)
16713 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080016714 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016715 .k(k)
16716 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080016717 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016718 }
16719 }
16720 }
16721
16722 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, n_gt_16_strided_a) {
16723 TEST_REQUIRES_ARM_NEON_DOT;
16724 for (uint32_t n = 17; n < 32; n++) {
16725 for (size_t k = 1; k <= 40; k += 9) {
16726 GemmMicrokernelTester()
16727 .mr(4)
16728 .nr(16)
16729 .kr(4)
16730 .sr(1)
16731 .m(4)
16732 .n(n)
16733 .k(k)
16734 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080016735 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016736 }
16737 }
16738 }
16739
16740 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, n_gt_16_subtile) {
16741 TEST_REQUIRES_ARM_NEON_DOT;
16742 for (uint32_t n = 17; n < 32; n++) {
16743 for (size_t k = 1; k <= 40; k += 9) {
16744 for (uint32_t m = 1; m <= 4; m++) {
16745 GemmMicrokernelTester()
16746 .mr(4)
16747 .nr(16)
16748 .kr(4)
16749 .sr(1)
16750 .m(m)
16751 .n(n)
16752 .k(k)
16753 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080016754 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016755 }
16756 }
16757 }
16758 }
16759
16760 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, n_div_16) {
16761 TEST_REQUIRES_ARM_NEON_DOT;
16762 for (uint32_t n = 32; n <= 48; n += 16) {
16763 for (size_t k = 1; k <= 40; k += 9) {
16764 GemmMicrokernelTester()
16765 .mr(4)
16766 .nr(16)
16767 .kr(4)
16768 .sr(1)
16769 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080016770 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016771 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080016772 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016773 }
16774 }
16775 }
16776
16777 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, n_div_16_strided_cn) {
16778 TEST_REQUIRES_ARM_NEON_DOT;
16779 for (uint32_t n = 32; n <= 48; n += 16) {
16780 for (size_t k = 1; k <= 40; k += 9) {
16781 GemmMicrokernelTester()
16782 .mr(4)
16783 .nr(16)
16784 .kr(4)
16785 .sr(1)
16786 .m(4)
16787 .n(n)
16788 .k(k)
16789 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080016790 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016791 }
16792 }
16793 }
16794
16795 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, n_div_16_strided_a) {
16796 TEST_REQUIRES_ARM_NEON_DOT;
16797 for (uint32_t n = 32; n <= 48; n += 16) {
16798 for (size_t k = 1; k <= 40; k += 9) {
16799 GemmMicrokernelTester()
16800 .mr(4)
16801 .nr(16)
16802 .kr(4)
16803 .sr(1)
16804 .m(4)
16805 .n(n)
16806 .k(k)
16807 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080016808 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016809 }
16810 }
16811 }
16812
16813 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, n_div_16_subtile) {
16814 TEST_REQUIRES_ARM_NEON_DOT;
16815 for (uint32_t n = 32; n <= 48; n += 16) {
16816 for (size_t k = 1; k <= 40; k += 9) {
16817 for (uint32_t m = 1; m <= 4; m++) {
16818 GemmMicrokernelTester()
16819 .mr(4)
16820 .nr(16)
16821 .kr(4)
16822 .sr(1)
16823 .m(m)
16824 .n(n)
16825 .k(k)
16826 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080016827 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016828 }
16829 }
16830 }
16831 }
16832
16833 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, strided_cm_subtile) {
16834 TEST_REQUIRES_ARM_NEON_DOT;
16835 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080016836 for (uint32_t n = 1; n <= 16; n++) {
16837 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016838 GemmMicrokernelTester()
16839 .mr(4)
16840 .nr(16)
16841 .kr(4)
16842 .sr(1)
16843 .m(m)
16844 .n(n)
16845 .k(k)
16846 .cm_stride(19)
16847 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080016848 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016849 }
16850 }
16851 }
16852 }
16853
16854 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, qmin) {
16855 TEST_REQUIRES_ARM_NEON_DOT;
16856 GemmMicrokernelTester()
16857 .mr(4)
16858 .nr(16)
16859 .kr(4)
16860 .sr(1)
16861 .m(4)
16862 .n(16)
16863 .k(8)
16864 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080016865 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016866 }
16867
16868 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, qmax) {
16869 TEST_REQUIRES_ARM_NEON_DOT;
16870 GemmMicrokernelTester()
16871 .mr(4)
16872 .nr(16)
16873 .kr(4)
16874 .sr(1)
16875 .m(4)
16876 .n(16)
16877 .k(8)
16878 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080016879 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016880 }
16881
16882 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, strided_cm) {
16883 TEST_REQUIRES_ARM_NEON_DOT;
16884 GemmMicrokernelTester()
16885 .mr(4)
16886 .nr(16)
16887 .kr(4)
16888 .sr(1)
16889 .m(4)
16890 .n(16)
16891 .k(8)
16892 .cm_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080016893 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016894 }
16895#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
16896
16897
16898#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
16899 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16) {
16900 TEST_REQUIRES_ARM_NEON_DOT;
16901 GemmMicrokernelTester()
16902 .mr(4)
16903 .nr(16)
16904 .kr(4)
16905 .sr(1)
16906 .m(4)
16907 .n(16)
16908 .k(16)
Marat Dukhan50323b82022-01-11 00:12:01 -080016909 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016910 }
16911
16912 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, strided_cn) {
16913 TEST_REQUIRES_ARM_NEON_DOT;
16914 GemmMicrokernelTester()
16915 .mr(4)
16916 .nr(16)
16917 .kr(4)
16918 .sr(1)
16919 .m(4)
16920 .n(16)
16921 .k(16)
16922 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080016923 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016924 }
16925
16926 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16_strided_a) {
16927 TEST_REQUIRES_ARM_NEON_DOT;
16928 GemmMicrokernelTester()
16929 .mr(4)
16930 .nr(16)
16931 .kr(4)
16932 .sr(1)
16933 .m(4)
16934 .n(16)
16935 .k(16)
16936 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080016937 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016938 }
16939
16940 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16_subtile) {
16941 TEST_REQUIRES_ARM_NEON_DOT;
Zhi An Ng83844ae2022-01-14 09:52:25 -080016942 for (uint32_t n = 1; n <= 16; n++) {
16943 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016944 GemmMicrokernelTester()
16945 .mr(4)
16946 .nr(16)
16947 .kr(4)
16948 .sr(1)
16949 .m(m)
16950 .n(n)
16951 .k(16)
16952 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080016953 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016954 }
16955 }
16956 }
16957
16958 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16_subtile_m) {
16959 TEST_REQUIRES_ARM_NEON_DOT;
16960 for (uint32_t m = 1; m <= 4; m++) {
16961 GemmMicrokernelTester()
16962 .mr(4)
16963 .nr(16)
16964 .kr(4)
16965 .sr(1)
16966 .m(m)
16967 .n(16)
16968 .k(16)
16969 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080016970 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016971 }
16972 }
16973
16974 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16_subtile_n) {
16975 TEST_REQUIRES_ARM_NEON_DOT;
16976 for (uint32_t n = 1; n <= 16; n++) {
16977 GemmMicrokernelTester()
16978 .mr(4)
16979 .nr(16)
16980 .kr(4)
16981 .sr(1)
16982 .m(4)
16983 .n(n)
16984 .k(16)
16985 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080016986 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016987 }
16988 }
16989
16990 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_lt_16) {
16991 TEST_REQUIRES_ARM_NEON_DOT;
16992 for (size_t k = 1; k < 16; k++) {
16993 GemmMicrokernelTester()
16994 .mr(4)
16995 .nr(16)
16996 .kr(4)
16997 .sr(1)
16998 .m(4)
16999 .n(16)
17000 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080017001 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017002 }
17003 }
17004
17005 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_lt_16_strided_a) {
17006 TEST_REQUIRES_ARM_NEON_DOT;
17007 for (size_t k = 1; k < 16; k++) {
17008 GemmMicrokernelTester()
17009 .mr(4)
17010 .nr(16)
17011 .kr(4)
17012 .sr(1)
17013 .m(4)
17014 .n(16)
17015 .k(k)
17016 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080017017 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017018 }
17019 }
17020
17021 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_lt_16_subtile) {
17022 TEST_REQUIRES_ARM_NEON_DOT;
17023 for (size_t k = 1; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080017024 for (uint32_t n = 1; n <= 16; n++) {
17025 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017026 GemmMicrokernelTester()
17027 .mr(4)
17028 .nr(16)
17029 .kr(4)
17030 .sr(1)
17031 .m(m)
17032 .n(n)
17033 .k(k)
17034 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080017035 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017036 }
17037 }
17038 }
17039 }
17040
17041 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_gt_16) {
17042 TEST_REQUIRES_ARM_NEON_DOT;
17043 for (size_t k = 17; k < 32; k++) {
17044 GemmMicrokernelTester()
17045 .mr(4)
17046 .nr(16)
17047 .kr(4)
17048 .sr(1)
17049 .m(4)
17050 .n(16)
17051 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080017052 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017053 }
17054 }
17055
17056 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_gt_16_strided_a) {
17057 TEST_REQUIRES_ARM_NEON_DOT;
17058 for (size_t k = 17; k < 32; k++) {
17059 GemmMicrokernelTester()
17060 .mr(4)
17061 .nr(16)
17062 .kr(4)
17063 .sr(1)
17064 .m(4)
17065 .n(16)
17066 .k(k)
17067 .a_stride(37)
Marat Dukhan50323b82022-01-11 00:12:01 -080017068 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017069 }
17070 }
17071
17072 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_gt_16_subtile) {
17073 TEST_REQUIRES_ARM_NEON_DOT;
17074 for (size_t k = 17; k < 32; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080017075 for (uint32_t n = 1; n <= 16; n++) {
17076 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017077 GemmMicrokernelTester()
17078 .mr(4)
17079 .nr(16)
17080 .kr(4)
17081 .sr(1)
17082 .m(m)
17083 .n(n)
17084 .k(k)
17085 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080017086 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017087 }
17088 }
17089 }
17090 }
17091
17092 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_div_16) {
17093 TEST_REQUIRES_ARM_NEON_DOT;
17094 for (size_t k = 32; k <= 160; k += 16) {
17095 GemmMicrokernelTester()
17096 .mr(4)
17097 .nr(16)
17098 .kr(4)
17099 .sr(1)
17100 .m(4)
17101 .n(16)
17102 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080017103 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017104 }
17105 }
17106
17107 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_div_16_strided_a) {
17108 TEST_REQUIRES_ARM_NEON_DOT;
17109 for (size_t k = 32; k <= 160; k += 16) {
17110 GemmMicrokernelTester()
17111 .mr(4)
17112 .nr(16)
17113 .kr(4)
17114 .sr(1)
17115 .m(4)
17116 .n(16)
17117 .k(k)
17118 .a_stride(163)
Marat Dukhan50323b82022-01-11 00:12:01 -080017119 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017120 }
17121 }
17122
17123 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_div_16_subtile) {
17124 TEST_REQUIRES_ARM_NEON_DOT;
17125 for (size_t k = 32; k <= 160; k += 16) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080017126 for (uint32_t n = 1; n <= 16; n++) {
17127 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017128 GemmMicrokernelTester()
17129 .mr(4)
17130 .nr(16)
17131 .kr(4)
17132 .sr(1)
17133 .m(m)
17134 .n(n)
17135 .k(k)
17136 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080017137 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017138 }
17139 }
17140 }
17141 }
17142
17143 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_gt_16) {
17144 TEST_REQUIRES_ARM_NEON_DOT;
17145 for (uint32_t n = 17; n < 32; n++) {
17146 for (size_t k = 1; k <= 80; k += 17) {
17147 GemmMicrokernelTester()
17148 .mr(4)
17149 .nr(16)
17150 .kr(4)
17151 .sr(1)
17152 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080017153 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017154 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080017155 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017156 }
17157 }
17158 }
17159
17160 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_gt_16_strided_cn) {
17161 TEST_REQUIRES_ARM_NEON_DOT;
17162 for (uint32_t n = 17; n < 32; n++) {
17163 for (size_t k = 1; k <= 80; k += 17) {
17164 GemmMicrokernelTester()
17165 .mr(4)
17166 .nr(16)
17167 .kr(4)
17168 .sr(1)
17169 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080017170 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017171 .k(k)
17172 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080017173 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017174 }
17175 }
17176 }
17177
17178 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_gt_16_strided_a) {
17179 TEST_REQUIRES_ARM_NEON_DOT;
17180 for (uint32_t n = 17; n < 32; n++) {
17181 for (size_t k = 1; k <= 80; k += 17) {
17182 GemmMicrokernelTester()
17183 .mr(4)
17184 .nr(16)
17185 .kr(4)
17186 .sr(1)
17187 .m(4)
17188 .n(n)
17189 .k(k)
17190 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080017191 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017192 }
17193 }
17194 }
17195
17196 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_gt_16_subtile) {
17197 TEST_REQUIRES_ARM_NEON_DOT;
17198 for (uint32_t n = 17; n < 32; n++) {
17199 for (size_t k = 1; k <= 80; k += 17) {
17200 for (uint32_t m = 1; m <= 4; m++) {
17201 GemmMicrokernelTester()
17202 .mr(4)
17203 .nr(16)
17204 .kr(4)
17205 .sr(1)
17206 .m(m)
17207 .n(n)
17208 .k(k)
17209 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080017210 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017211 }
17212 }
17213 }
17214 }
17215
17216 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_div_16) {
17217 TEST_REQUIRES_ARM_NEON_DOT;
17218 for (uint32_t n = 32; n <= 48; n += 16) {
17219 for (size_t k = 1; k <= 80; k += 17) {
17220 GemmMicrokernelTester()
17221 .mr(4)
17222 .nr(16)
17223 .kr(4)
17224 .sr(1)
17225 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080017226 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017227 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080017228 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017229 }
17230 }
17231 }
17232
17233 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_div_16_strided_cn) {
17234 TEST_REQUIRES_ARM_NEON_DOT;
17235 for (uint32_t n = 32; n <= 48; n += 16) {
17236 for (size_t k = 1; k <= 80; k += 17) {
17237 GemmMicrokernelTester()
17238 .mr(4)
17239 .nr(16)
17240 .kr(4)
17241 .sr(1)
17242 .m(4)
17243 .n(n)
17244 .k(k)
17245 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080017246 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017247 }
17248 }
17249 }
17250
17251 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_div_16_strided_a) {
17252 TEST_REQUIRES_ARM_NEON_DOT;
17253 for (uint32_t n = 32; n <= 48; n += 16) {
17254 for (size_t k = 1; k <= 80; k += 17) {
17255 GemmMicrokernelTester()
17256 .mr(4)
17257 .nr(16)
17258 .kr(4)
17259 .sr(1)
17260 .m(4)
17261 .n(n)
17262 .k(k)
17263 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080017264 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017265 }
17266 }
17267 }
17268
17269 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_div_16_subtile) {
17270 TEST_REQUIRES_ARM_NEON_DOT;
17271 for (uint32_t n = 32; n <= 48; n += 16) {
17272 for (size_t k = 1; k <= 80; k += 17) {
17273 for (uint32_t m = 1; m <= 4; m++) {
17274 GemmMicrokernelTester()
17275 .mr(4)
17276 .nr(16)
17277 .kr(4)
17278 .sr(1)
17279 .m(m)
17280 .n(n)
17281 .k(k)
17282 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080017283 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017284 }
17285 }
17286 }
17287 }
17288
17289 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, strided_cm_subtile) {
17290 TEST_REQUIRES_ARM_NEON_DOT;
17291 for (size_t k = 1; k <= 80; k += 17) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080017292 for (uint32_t n = 1; n <= 16; n++) {
17293 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017294 GemmMicrokernelTester()
17295 .mr(4)
17296 .nr(16)
17297 .kr(4)
17298 .sr(1)
17299 .m(m)
17300 .n(n)
17301 .k(k)
17302 .cm_stride(19)
17303 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080017304 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017305 }
17306 }
17307 }
17308 }
17309
17310 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, qmin) {
17311 TEST_REQUIRES_ARM_NEON_DOT;
17312 GemmMicrokernelTester()
17313 .mr(4)
17314 .nr(16)
17315 .kr(4)
17316 .sr(1)
17317 .m(4)
17318 .n(16)
17319 .k(16)
17320 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080017321 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017322 }
17323
17324 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, qmax) {
17325 TEST_REQUIRES_ARM_NEON_DOT;
17326 GemmMicrokernelTester()
17327 .mr(4)
17328 .nr(16)
17329 .kr(4)
17330 .sr(1)
17331 .m(4)
17332 .n(16)
17333 .k(16)
17334 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080017335 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017336 }
17337
17338 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, strided_cm) {
17339 TEST_REQUIRES_ARM_NEON_DOT;
17340 GemmMicrokernelTester()
17341 .mr(4)
17342 .nr(16)
17343 .kr(4)
17344 .sr(1)
17345 .m(4)
17346 .n(16)
17347 .k(16)
17348 .cm_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080017349 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017350 }
17351#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
17352
17353
17354#if XNN_ARCH_ARM || XNN_ARCH_ARM64
17355 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_eq_16) {
17356 TEST_REQUIRES_ARM_NEON;
17357 GemmMicrokernelTester()
17358 .mr(1)
17359 .nr(8)
17360 .kr(8)
17361 .sr(1)
17362 .m(1)
17363 .n(8)
17364 .k(16)
Marat Dukhan50323b82022-01-11 00:12:01 -080017365 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017366 }
17367
17368 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, strided_cn) {
17369 TEST_REQUIRES_ARM_NEON;
17370 GemmMicrokernelTester()
17371 .mr(1)
17372 .nr(8)
17373 .kr(8)
17374 .sr(1)
17375 .m(1)
17376 .n(8)
17377 .k(16)
17378 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080017379 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017380 }
17381
17382 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_eq_16_strided_a) {
17383 TEST_REQUIRES_ARM_NEON;
17384 GemmMicrokernelTester()
17385 .mr(1)
17386 .nr(8)
17387 .kr(8)
17388 .sr(1)
17389 .m(1)
17390 .n(8)
17391 .k(16)
17392 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080017393 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017394 }
17395
17396 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_eq_16_subtile) {
17397 TEST_REQUIRES_ARM_NEON;
Zhi An Ng83844ae2022-01-14 09:52:25 -080017398 for (uint32_t n = 1; n <= 8; n++) {
17399 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017400 GemmMicrokernelTester()
17401 .mr(1)
17402 .nr(8)
17403 .kr(8)
17404 .sr(1)
17405 .m(m)
17406 .n(n)
17407 .k(16)
17408 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080017409 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017410 }
17411 }
17412 }
17413
17414 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_eq_16_subtile_m) {
17415 TEST_REQUIRES_ARM_NEON;
17416 for (uint32_t m = 1; m <= 1; m++) {
17417 GemmMicrokernelTester()
17418 .mr(1)
17419 .nr(8)
17420 .kr(8)
17421 .sr(1)
17422 .m(m)
17423 .n(8)
17424 .k(16)
17425 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080017426 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017427 }
17428 }
17429
17430 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_eq_16_subtile_n) {
17431 TEST_REQUIRES_ARM_NEON;
17432 for (uint32_t n = 1; n <= 8; n++) {
17433 GemmMicrokernelTester()
17434 .mr(1)
17435 .nr(8)
17436 .kr(8)
17437 .sr(1)
17438 .m(1)
17439 .n(n)
17440 .k(16)
17441 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080017442 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017443 }
17444 }
17445
17446 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_lt_16) {
17447 TEST_REQUIRES_ARM_NEON;
17448 for (size_t k = 1; k < 16; k++) {
17449 GemmMicrokernelTester()
17450 .mr(1)
17451 .nr(8)
17452 .kr(8)
17453 .sr(1)
17454 .m(1)
17455 .n(8)
17456 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080017457 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017458 }
17459 }
17460
17461 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_lt_16_strided_a) {
17462 TEST_REQUIRES_ARM_NEON;
17463 for (size_t k = 1; k < 16; k++) {
17464 GemmMicrokernelTester()
17465 .mr(1)
17466 .nr(8)
17467 .kr(8)
17468 .sr(1)
17469 .m(1)
17470 .n(8)
17471 .k(k)
17472 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080017473 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017474 }
17475 }
17476
17477 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_lt_16_subtile) {
17478 TEST_REQUIRES_ARM_NEON;
17479 for (size_t k = 1; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080017480 for (uint32_t n = 1; n <= 8; n++) {
17481 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017482 GemmMicrokernelTester()
17483 .mr(1)
17484 .nr(8)
17485 .kr(8)
17486 .sr(1)
17487 .m(m)
17488 .n(n)
17489 .k(k)
17490 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080017491 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017492 }
17493 }
17494 }
17495 }
17496
17497 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_gt_16) {
17498 TEST_REQUIRES_ARM_NEON;
17499 for (size_t k = 17; k < 32; k++) {
17500 GemmMicrokernelTester()
17501 .mr(1)
17502 .nr(8)
17503 .kr(8)
17504 .sr(1)
17505 .m(1)
17506 .n(8)
17507 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080017508 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017509 }
17510 }
17511
17512 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_gt_16_strided_a) {
17513 TEST_REQUIRES_ARM_NEON;
17514 for (size_t k = 17; k < 32; k++) {
17515 GemmMicrokernelTester()
17516 .mr(1)
17517 .nr(8)
17518 .kr(8)
17519 .sr(1)
17520 .m(1)
17521 .n(8)
17522 .k(k)
17523 .a_stride(37)
Marat Dukhan50323b82022-01-11 00:12:01 -080017524 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017525 }
17526 }
17527
17528 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_gt_16_subtile) {
17529 TEST_REQUIRES_ARM_NEON;
17530 for (size_t k = 17; k < 32; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080017531 for (uint32_t n = 1; n <= 8; n++) {
17532 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017533 GemmMicrokernelTester()
17534 .mr(1)
17535 .nr(8)
17536 .kr(8)
17537 .sr(1)
17538 .m(m)
17539 .n(n)
17540 .k(k)
17541 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080017542 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017543 }
17544 }
17545 }
17546 }
17547
17548 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_div_16) {
17549 TEST_REQUIRES_ARM_NEON;
17550 for (size_t k = 32; k <= 160; k += 16) {
17551 GemmMicrokernelTester()
17552 .mr(1)
17553 .nr(8)
17554 .kr(8)
17555 .sr(1)
17556 .m(1)
17557 .n(8)
17558 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080017559 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017560 }
17561 }
17562
17563 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_div_16_strided_a) {
17564 TEST_REQUIRES_ARM_NEON;
17565 for (size_t k = 32; k <= 160; k += 16) {
17566 GemmMicrokernelTester()
17567 .mr(1)
17568 .nr(8)
17569 .kr(8)
17570 .sr(1)
17571 .m(1)
17572 .n(8)
17573 .k(k)
17574 .a_stride(163)
Marat Dukhan50323b82022-01-11 00:12:01 -080017575 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017576 }
17577 }
17578
17579 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, k_div_16_subtile) {
17580 TEST_REQUIRES_ARM_NEON;
17581 for (size_t k = 32; k <= 160; k += 16) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080017582 for (uint32_t n = 1; n <= 8; n++) {
17583 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017584 GemmMicrokernelTester()
17585 .mr(1)
17586 .nr(8)
17587 .kr(8)
17588 .sr(1)
17589 .m(m)
17590 .n(n)
17591 .k(k)
17592 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080017593 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017594 }
17595 }
17596 }
17597 }
17598
17599 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, n_gt_8) {
17600 TEST_REQUIRES_ARM_NEON;
17601 for (uint32_t n = 9; n < 16; n++) {
17602 for (size_t k = 1; k <= 80; k += 17) {
17603 GemmMicrokernelTester()
17604 .mr(1)
17605 .nr(8)
17606 .kr(8)
17607 .sr(1)
17608 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080017609 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017610 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080017611 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017612 }
17613 }
17614 }
17615
17616 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, n_gt_8_strided_cn) {
17617 TEST_REQUIRES_ARM_NEON;
17618 for (uint32_t n = 9; n < 16; n++) {
17619 for (size_t k = 1; k <= 80; k += 17) {
17620 GemmMicrokernelTester()
17621 .mr(1)
17622 .nr(8)
17623 .kr(8)
17624 .sr(1)
17625 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080017626 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017627 .k(k)
17628 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080017629 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017630 }
17631 }
17632 }
17633
17634 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, n_gt_8_strided_a) {
17635 TEST_REQUIRES_ARM_NEON;
17636 for (uint32_t n = 9; n < 16; n++) {
17637 for (size_t k = 1; k <= 80; k += 17) {
17638 GemmMicrokernelTester()
17639 .mr(1)
17640 .nr(8)
17641 .kr(8)
17642 .sr(1)
17643 .m(1)
17644 .n(n)
17645 .k(k)
17646 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080017647 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017648 }
17649 }
17650 }
17651
17652 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, n_gt_8_subtile) {
17653 TEST_REQUIRES_ARM_NEON;
17654 for (uint32_t n = 9; n < 16; n++) {
17655 for (size_t k = 1; k <= 80; k += 17) {
17656 for (uint32_t m = 1; m <= 1; m++) {
17657 GemmMicrokernelTester()
17658 .mr(1)
17659 .nr(8)
17660 .kr(8)
17661 .sr(1)
17662 .m(m)
17663 .n(n)
17664 .k(k)
17665 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080017666 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017667 }
17668 }
17669 }
17670 }
17671
17672 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, n_div_8) {
17673 TEST_REQUIRES_ARM_NEON;
17674 for (uint32_t n = 16; n <= 24; n += 8) {
17675 for (size_t k = 1; k <= 80; k += 17) {
17676 GemmMicrokernelTester()
17677 .mr(1)
17678 .nr(8)
17679 .kr(8)
17680 .sr(1)
17681 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080017682 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017683 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080017684 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017685 }
17686 }
17687 }
17688
17689 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, n_div_8_strided_cn) {
17690 TEST_REQUIRES_ARM_NEON;
17691 for (uint32_t n = 16; n <= 24; n += 8) {
17692 for (size_t k = 1; k <= 80; k += 17) {
17693 GemmMicrokernelTester()
17694 .mr(1)
17695 .nr(8)
17696 .kr(8)
17697 .sr(1)
17698 .m(1)
17699 .n(n)
17700 .k(k)
17701 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080017702 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017703 }
17704 }
17705 }
17706
17707 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, n_div_8_strided_a) {
17708 TEST_REQUIRES_ARM_NEON;
17709 for (uint32_t n = 16; n <= 24; n += 8) {
17710 for (size_t k = 1; k <= 80; k += 17) {
17711 GemmMicrokernelTester()
17712 .mr(1)
17713 .nr(8)
17714 .kr(8)
17715 .sr(1)
17716 .m(1)
17717 .n(n)
17718 .k(k)
17719 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080017720 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017721 }
17722 }
17723 }
17724
17725 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, n_div_8_subtile) {
17726 TEST_REQUIRES_ARM_NEON;
17727 for (uint32_t n = 16; n <= 24; n += 8) {
17728 for (size_t k = 1; k <= 80; k += 17) {
17729 for (uint32_t m = 1; m <= 1; m++) {
17730 GemmMicrokernelTester()
17731 .mr(1)
17732 .nr(8)
17733 .kr(8)
17734 .sr(1)
17735 .m(m)
17736 .n(n)
17737 .k(k)
17738 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080017739 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017740 }
17741 }
17742 }
17743 }
17744
17745 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, strided_cm_subtile) {
17746 TEST_REQUIRES_ARM_NEON;
17747 for (size_t k = 1; k <= 80; k += 17) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080017748 for (uint32_t n = 1; n <= 8; n++) {
17749 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017750 GemmMicrokernelTester()
17751 .mr(1)
17752 .nr(8)
17753 .kr(8)
17754 .sr(1)
17755 .m(m)
17756 .n(n)
17757 .k(k)
17758 .cm_stride(11)
17759 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080017760 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017761 }
17762 }
17763 }
17764 }
17765
17766 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, qmin) {
17767 TEST_REQUIRES_ARM_NEON;
17768 GemmMicrokernelTester()
17769 .mr(1)
17770 .nr(8)
17771 .kr(8)
17772 .sr(1)
17773 .m(1)
17774 .n(8)
17775 .k(16)
17776 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080017777 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017778 }
17779
17780 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, qmax) {
17781 TEST_REQUIRES_ARM_NEON;
17782 GemmMicrokernelTester()
17783 .mr(1)
17784 .nr(8)
17785 .kr(8)
17786 .sr(1)
17787 .m(1)
17788 .n(8)
17789 .k(16)
17790 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080017791 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017792 }
17793
17794 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEON_MLAL, strided_cm) {
17795 TEST_REQUIRES_ARM_NEON;
17796 GemmMicrokernelTester()
17797 .mr(1)
17798 .nr(8)
17799 .kr(8)
17800 .sr(1)
17801 .m(1)
17802 .n(8)
17803 .k(16)
17804 .cm_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080017805 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017806 }
17807#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
17808
17809
17810#if XNN_ARCH_ARM || XNN_ARCH_ARM64
17811 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_eq_16) {
17812 TEST_REQUIRES_ARM_NEON;
17813 GemmMicrokernelTester()
17814 .mr(2)
17815 .nr(8)
17816 .kr(8)
17817 .sr(1)
17818 .m(2)
17819 .n(8)
17820 .k(16)
Marat Dukhan50323b82022-01-11 00:12:01 -080017821 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017822 }
17823
17824 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, strided_cn) {
17825 TEST_REQUIRES_ARM_NEON;
17826 GemmMicrokernelTester()
17827 .mr(2)
17828 .nr(8)
17829 .kr(8)
17830 .sr(1)
17831 .m(2)
17832 .n(8)
17833 .k(16)
17834 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080017835 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017836 }
17837
17838 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_eq_16_strided_a) {
17839 TEST_REQUIRES_ARM_NEON;
17840 GemmMicrokernelTester()
17841 .mr(2)
17842 .nr(8)
17843 .kr(8)
17844 .sr(1)
17845 .m(2)
17846 .n(8)
17847 .k(16)
17848 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080017849 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017850 }
17851
17852 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_eq_16_subtile) {
17853 TEST_REQUIRES_ARM_NEON;
Zhi An Ng83844ae2022-01-14 09:52:25 -080017854 for (uint32_t n = 1; n <= 8; n++) {
17855 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017856 GemmMicrokernelTester()
17857 .mr(2)
17858 .nr(8)
17859 .kr(8)
17860 .sr(1)
17861 .m(m)
17862 .n(n)
17863 .k(16)
17864 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080017865 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017866 }
17867 }
17868 }
17869
17870 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_eq_16_subtile_m) {
17871 TEST_REQUIRES_ARM_NEON;
17872 for (uint32_t m = 1; m <= 2; m++) {
17873 GemmMicrokernelTester()
17874 .mr(2)
17875 .nr(8)
17876 .kr(8)
17877 .sr(1)
17878 .m(m)
17879 .n(8)
17880 .k(16)
17881 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080017882 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017883 }
17884 }
17885
17886 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_eq_16_subtile_n) {
17887 TEST_REQUIRES_ARM_NEON;
17888 for (uint32_t n = 1; n <= 8; n++) {
17889 GemmMicrokernelTester()
17890 .mr(2)
17891 .nr(8)
17892 .kr(8)
17893 .sr(1)
17894 .m(2)
17895 .n(n)
17896 .k(16)
17897 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080017898 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017899 }
17900 }
17901
17902 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_lt_16) {
17903 TEST_REQUIRES_ARM_NEON;
17904 for (size_t k = 1; k < 16; k++) {
17905 GemmMicrokernelTester()
17906 .mr(2)
17907 .nr(8)
17908 .kr(8)
17909 .sr(1)
17910 .m(2)
17911 .n(8)
17912 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080017913 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017914 }
17915 }
17916
17917 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_lt_16_strided_a) {
17918 TEST_REQUIRES_ARM_NEON;
17919 for (size_t k = 1; k < 16; k++) {
17920 GemmMicrokernelTester()
17921 .mr(2)
17922 .nr(8)
17923 .kr(8)
17924 .sr(1)
17925 .m(2)
17926 .n(8)
17927 .k(k)
17928 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080017929 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017930 }
17931 }
17932
17933 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_lt_16_subtile) {
17934 TEST_REQUIRES_ARM_NEON;
17935 for (size_t k = 1; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080017936 for (uint32_t n = 1; n <= 8; n++) {
17937 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017938 GemmMicrokernelTester()
17939 .mr(2)
17940 .nr(8)
17941 .kr(8)
17942 .sr(1)
17943 .m(m)
17944 .n(n)
17945 .k(k)
17946 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080017947 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017948 }
17949 }
17950 }
17951 }
17952
17953 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_gt_16) {
17954 TEST_REQUIRES_ARM_NEON;
17955 for (size_t k = 17; k < 32; k++) {
17956 GemmMicrokernelTester()
17957 .mr(2)
17958 .nr(8)
17959 .kr(8)
17960 .sr(1)
17961 .m(2)
17962 .n(8)
17963 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080017964 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017965 }
17966 }
17967
17968 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_gt_16_strided_a) {
17969 TEST_REQUIRES_ARM_NEON;
17970 for (size_t k = 17; k < 32; k++) {
17971 GemmMicrokernelTester()
17972 .mr(2)
17973 .nr(8)
17974 .kr(8)
17975 .sr(1)
17976 .m(2)
17977 .n(8)
17978 .k(k)
17979 .a_stride(37)
Marat Dukhan50323b82022-01-11 00:12:01 -080017980 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017981 }
17982 }
17983
17984 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_gt_16_subtile) {
17985 TEST_REQUIRES_ARM_NEON;
17986 for (size_t k = 17; k < 32; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080017987 for (uint32_t n = 1; n <= 8; n++) {
17988 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017989 GemmMicrokernelTester()
17990 .mr(2)
17991 .nr(8)
17992 .kr(8)
17993 .sr(1)
17994 .m(m)
17995 .n(n)
17996 .k(k)
17997 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080017998 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017999 }
18000 }
18001 }
18002 }
18003
18004 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_div_16) {
18005 TEST_REQUIRES_ARM_NEON;
18006 for (size_t k = 32; k <= 160; k += 16) {
18007 GemmMicrokernelTester()
18008 .mr(2)
18009 .nr(8)
18010 .kr(8)
18011 .sr(1)
18012 .m(2)
18013 .n(8)
18014 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080018015 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018016 }
18017 }
18018
18019 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_div_16_strided_a) {
18020 TEST_REQUIRES_ARM_NEON;
18021 for (size_t k = 32; k <= 160; k += 16) {
18022 GemmMicrokernelTester()
18023 .mr(2)
18024 .nr(8)
18025 .kr(8)
18026 .sr(1)
18027 .m(2)
18028 .n(8)
18029 .k(k)
18030 .a_stride(163)
Marat Dukhan50323b82022-01-11 00:12:01 -080018031 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018032 }
18033 }
18034
18035 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, k_div_16_subtile) {
18036 TEST_REQUIRES_ARM_NEON;
18037 for (size_t k = 32; k <= 160; k += 16) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080018038 for (uint32_t n = 1; n <= 8; n++) {
18039 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018040 GemmMicrokernelTester()
18041 .mr(2)
18042 .nr(8)
18043 .kr(8)
18044 .sr(1)
18045 .m(m)
18046 .n(n)
18047 .k(k)
18048 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080018049 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018050 }
18051 }
18052 }
18053 }
18054
18055 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, n_gt_8) {
18056 TEST_REQUIRES_ARM_NEON;
18057 for (uint32_t n = 9; n < 16; n++) {
18058 for (size_t k = 1; k <= 80; k += 17) {
18059 GemmMicrokernelTester()
18060 .mr(2)
18061 .nr(8)
18062 .kr(8)
18063 .sr(1)
18064 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080018065 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018066 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080018067 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018068 }
18069 }
18070 }
18071
18072 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, n_gt_8_strided_cn) {
18073 TEST_REQUIRES_ARM_NEON;
18074 for (uint32_t n = 9; n < 16; n++) {
18075 for (size_t k = 1; k <= 80; k += 17) {
18076 GemmMicrokernelTester()
18077 .mr(2)
18078 .nr(8)
18079 .kr(8)
18080 .sr(1)
18081 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080018082 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018083 .k(k)
18084 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080018085 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018086 }
18087 }
18088 }
18089
18090 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, n_gt_8_strided_a) {
18091 TEST_REQUIRES_ARM_NEON;
18092 for (uint32_t n = 9; n < 16; n++) {
18093 for (size_t k = 1; k <= 80; k += 17) {
18094 GemmMicrokernelTester()
18095 .mr(2)
18096 .nr(8)
18097 .kr(8)
18098 .sr(1)
18099 .m(2)
18100 .n(n)
18101 .k(k)
18102 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080018103 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018104 }
18105 }
18106 }
18107
18108 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, n_gt_8_subtile) {
18109 TEST_REQUIRES_ARM_NEON;
18110 for (uint32_t n = 9; n < 16; n++) {
18111 for (size_t k = 1; k <= 80; k += 17) {
18112 for (uint32_t m = 1; m <= 2; m++) {
18113 GemmMicrokernelTester()
18114 .mr(2)
18115 .nr(8)
18116 .kr(8)
18117 .sr(1)
18118 .m(m)
18119 .n(n)
18120 .k(k)
18121 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080018122 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018123 }
18124 }
18125 }
18126 }
18127
18128 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, n_div_8) {
18129 TEST_REQUIRES_ARM_NEON;
18130 for (uint32_t n = 16; n <= 24; n += 8) {
18131 for (size_t k = 1; k <= 80; k += 17) {
18132 GemmMicrokernelTester()
18133 .mr(2)
18134 .nr(8)
18135 .kr(8)
18136 .sr(1)
18137 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080018138 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018139 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080018140 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018141 }
18142 }
18143 }
18144
18145 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, n_div_8_strided_cn) {
18146 TEST_REQUIRES_ARM_NEON;
18147 for (uint32_t n = 16; n <= 24; n += 8) {
18148 for (size_t k = 1; k <= 80; k += 17) {
18149 GemmMicrokernelTester()
18150 .mr(2)
18151 .nr(8)
18152 .kr(8)
18153 .sr(1)
18154 .m(2)
18155 .n(n)
18156 .k(k)
18157 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080018158 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018159 }
18160 }
18161 }
18162
18163 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, n_div_8_strided_a) {
18164 TEST_REQUIRES_ARM_NEON;
18165 for (uint32_t n = 16; n <= 24; n += 8) {
18166 for (size_t k = 1; k <= 80; k += 17) {
18167 GemmMicrokernelTester()
18168 .mr(2)
18169 .nr(8)
18170 .kr(8)
18171 .sr(1)
18172 .m(2)
18173 .n(n)
18174 .k(k)
18175 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080018176 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018177 }
18178 }
18179 }
18180
18181 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, n_div_8_subtile) {
18182 TEST_REQUIRES_ARM_NEON;
18183 for (uint32_t n = 16; n <= 24; n += 8) {
18184 for (size_t k = 1; k <= 80; k += 17) {
18185 for (uint32_t m = 1; m <= 2; m++) {
18186 GemmMicrokernelTester()
18187 .mr(2)
18188 .nr(8)
18189 .kr(8)
18190 .sr(1)
18191 .m(m)
18192 .n(n)
18193 .k(k)
18194 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080018195 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018196 }
18197 }
18198 }
18199 }
18200
18201 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, strided_cm_subtile) {
18202 TEST_REQUIRES_ARM_NEON;
18203 for (size_t k = 1; k <= 80; k += 17) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080018204 for (uint32_t n = 1; n <= 8; n++) {
18205 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018206 GemmMicrokernelTester()
18207 .mr(2)
18208 .nr(8)
18209 .kr(8)
18210 .sr(1)
18211 .m(m)
18212 .n(n)
18213 .k(k)
18214 .cm_stride(11)
18215 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080018216 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018217 }
18218 }
18219 }
18220 }
18221
18222 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, qmin) {
18223 TEST_REQUIRES_ARM_NEON;
18224 GemmMicrokernelTester()
18225 .mr(2)
18226 .nr(8)
18227 .kr(8)
18228 .sr(1)
18229 .m(2)
18230 .n(8)
18231 .k(16)
18232 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080018233 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018234 }
18235
18236 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, qmax) {
18237 TEST_REQUIRES_ARM_NEON;
18238 GemmMicrokernelTester()
18239 .mr(2)
18240 .nr(8)
18241 .kr(8)
18242 .sr(1)
18243 .m(2)
18244 .n(8)
18245 .k(16)
18246 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080018247 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018248 }
18249
18250 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEON_MLAL, strided_cm) {
18251 TEST_REQUIRES_ARM_NEON;
18252 GemmMicrokernelTester()
18253 .mr(2)
18254 .nr(8)
18255 .kr(8)
18256 .sr(1)
18257 .m(2)
18258 .n(8)
18259 .k(16)
18260 .cm_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080018261 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neon_mlal, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018262 }
18263#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
18264
18265
18266#if XNN_ARCH_ARM || XNN_ARCH_ARM64
18267 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_eq_16) {
18268 TEST_REQUIRES_ARM_NEON_V8;
18269 GemmMicrokernelTester()
18270 .mr(2)
18271 .nr(8)
18272 .kr(8)
18273 .sr(1)
18274 .m(2)
18275 .n(8)
18276 .k(16)
Marat Dukhan50323b82022-01-11 00:12:01 -080018277 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018278 }
18279
18280 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, strided_cn) {
18281 TEST_REQUIRES_ARM_NEON_V8;
18282 GemmMicrokernelTester()
18283 .mr(2)
18284 .nr(8)
18285 .kr(8)
18286 .sr(1)
18287 .m(2)
18288 .n(8)
18289 .k(16)
18290 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080018291 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018292 }
18293
18294 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_eq_16_strided_a) {
18295 TEST_REQUIRES_ARM_NEON_V8;
18296 GemmMicrokernelTester()
18297 .mr(2)
18298 .nr(8)
18299 .kr(8)
18300 .sr(1)
18301 .m(2)
18302 .n(8)
18303 .k(16)
18304 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080018305 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018306 }
18307
18308 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_eq_16_subtile) {
18309 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng83844ae2022-01-14 09:52:25 -080018310 for (uint32_t n = 1; n <= 8; n++) {
18311 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018312 GemmMicrokernelTester()
18313 .mr(2)
18314 .nr(8)
18315 .kr(8)
18316 .sr(1)
18317 .m(m)
18318 .n(n)
18319 .k(16)
18320 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080018321 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018322 }
18323 }
18324 }
18325
18326 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_eq_16_subtile_m) {
18327 TEST_REQUIRES_ARM_NEON_V8;
18328 for (uint32_t m = 1; m <= 2; m++) {
18329 GemmMicrokernelTester()
18330 .mr(2)
18331 .nr(8)
18332 .kr(8)
18333 .sr(1)
18334 .m(m)
18335 .n(8)
18336 .k(16)
18337 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080018338 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018339 }
18340 }
18341
18342 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_eq_16_subtile_n) {
18343 TEST_REQUIRES_ARM_NEON_V8;
18344 for (uint32_t n = 1; n <= 8; n++) {
18345 GemmMicrokernelTester()
18346 .mr(2)
18347 .nr(8)
18348 .kr(8)
18349 .sr(1)
18350 .m(2)
18351 .n(n)
18352 .k(16)
18353 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080018354 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018355 }
18356 }
18357
18358 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_lt_16) {
18359 TEST_REQUIRES_ARM_NEON_V8;
18360 for (size_t k = 1; k < 16; k++) {
18361 GemmMicrokernelTester()
18362 .mr(2)
18363 .nr(8)
18364 .kr(8)
18365 .sr(1)
18366 .m(2)
18367 .n(8)
18368 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080018369 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018370 }
18371 }
18372
18373 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_lt_16_strided_a) {
18374 TEST_REQUIRES_ARM_NEON_V8;
18375 for (size_t k = 1; k < 16; k++) {
18376 GemmMicrokernelTester()
18377 .mr(2)
18378 .nr(8)
18379 .kr(8)
18380 .sr(1)
18381 .m(2)
18382 .n(8)
18383 .k(k)
18384 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080018385 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018386 }
18387 }
18388
18389 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_lt_16_subtile) {
18390 TEST_REQUIRES_ARM_NEON_V8;
18391 for (size_t k = 1; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080018392 for (uint32_t n = 1; n <= 8; n++) {
18393 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018394 GemmMicrokernelTester()
18395 .mr(2)
18396 .nr(8)
18397 .kr(8)
18398 .sr(1)
18399 .m(m)
18400 .n(n)
18401 .k(k)
18402 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080018403 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018404 }
18405 }
18406 }
18407 }
18408
18409 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_gt_16) {
18410 TEST_REQUIRES_ARM_NEON_V8;
18411 for (size_t k = 17; k < 32; k++) {
18412 GemmMicrokernelTester()
18413 .mr(2)
18414 .nr(8)
18415 .kr(8)
18416 .sr(1)
18417 .m(2)
18418 .n(8)
18419 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080018420 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018421 }
18422 }
18423
18424 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_gt_16_strided_a) {
18425 TEST_REQUIRES_ARM_NEON_V8;
18426 for (size_t k = 17; k < 32; k++) {
18427 GemmMicrokernelTester()
18428 .mr(2)
18429 .nr(8)
18430 .kr(8)
18431 .sr(1)
18432 .m(2)
18433 .n(8)
18434 .k(k)
18435 .a_stride(37)
Marat Dukhan50323b82022-01-11 00:12:01 -080018436 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018437 }
18438 }
18439
18440 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_gt_16_subtile) {
18441 TEST_REQUIRES_ARM_NEON_V8;
18442 for (size_t k = 17; k < 32; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080018443 for (uint32_t n = 1; n <= 8; n++) {
18444 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018445 GemmMicrokernelTester()
18446 .mr(2)
18447 .nr(8)
18448 .kr(8)
18449 .sr(1)
18450 .m(m)
18451 .n(n)
18452 .k(k)
18453 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080018454 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018455 }
18456 }
18457 }
18458 }
18459
18460 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_div_16) {
18461 TEST_REQUIRES_ARM_NEON_V8;
18462 for (size_t k = 32; k <= 160; k += 16) {
18463 GemmMicrokernelTester()
18464 .mr(2)
18465 .nr(8)
18466 .kr(8)
18467 .sr(1)
18468 .m(2)
18469 .n(8)
18470 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080018471 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018472 }
18473 }
18474
18475 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_div_16_strided_a) {
18476 TEST_REQUIRES_ARM_NEON_V8;
18477 for (size_t k = 32; k <= 160; k += 16) {
18478 GemmMicrokernelTester()
18479 .mr(2)
18480 .nr(8)
18481 .kr(8)
18482 .sr(1)
18483 .m(2)
18484 .n(8)
18485 .k(k)
18486 .a_stride(163)
Marat Dukhan50323b82022-01-11 00:12:01 -080018487 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018488 }
18489 }
18490
18491 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, k_div_16_subtile) {
18492 TEST_REQUIRES_ARM_NEON_V8;
18493 for (size_t k = 32; k <= 160; k += 16) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080018494 for (uint32_t n = 1; n <= 8; n++) {
18495 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018496 GemmMicrokernelTester()
18497 .mr(2)
18498 .nr(8)
18499 .kr(8)
18500 .sr(1)
18501 .m(m)
18502 .n(n)
18503 .k(k)
18504 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080018505 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018506 }
18507 }
18508 }
18509 }
18510
18511 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, n_gt_8) {
18512 TEST_REQUIRES_ARM_NEON_V8;
18513 for (uint32_t n = 9; n < 16; n++) {
18514 for (size_t k = 1; k <= 80; k += 17) {
18515 GemmMicrokernelTester()
18516 .mr(2)
18517 .nr(8)
18518 .kr(8)
18519 .sr(1)
18520 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080018521 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018522 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080018523 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018524 }
18525 }
18526 }
18527
18528 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, n_gt_8_strided_cn) {
18529 TEST_REQUIRES_ARM_NEON_V8;
18530 for (uint32_t n = 9; n < 16; n++) {
18531 for (size_t k = 1; k <= 80; k += 17) {
18532 GemmMicrokernelTester()
18533 .mr(2)
18534 .nr(8)
18535 .kr(8)
18536 .sr(1)
18537 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080018538 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018539 .k(k)
18540 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080018541 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018542 }
18543 }
18544 }
18545
18546 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, n_gt_8_strided_a) {
18547 TEST_REQUIRES_ARM_NEON_V8;
18548 for (uint32_t n = 9; n < 16; n++) {
18549 for (size_t k = 1; k <= 80; k += 17) {
18550 GemmMicrokernelTester()
18551 .mr(2)
18552 .nr(8)
18553 .kr(8)
18554 .sr(1)
18555 .m(2)
18556 .n(n)
18557 .k(k)
18558 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080018559 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018560 }
18561 }
18562 }
18563
18564 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, n_gt_8_subtile) {
18565 TEST_REQUIRES_ARM_NEON_V8;
18566 for (uint32_t n = 9; n < 16; n++) {
18567 for (size_t k = 1; k <= 80; k += 17) {
18568 for (uint32_t m = 1; m <= 2; m++) {
18569 GemmMicrokernelTester()
18570 .mr(2)
18571 .nr(8)
18572 .kr(8)
18573 .sr(1)
18574 .m(m)
18575 .n(n)
18576 .k(k)
18577 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080018578 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018579 }
18580 }
18581 }
18582 }
18583
18584 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, n_div_8) {
18585 TEST_REQUIRES_ARM_NEON_V8;
18586 for (uint32_t n = 16; n <= 24; n += 8) {
18587 for (size_t k = 1; k <= 80; k += 17) {
18588 GemmMicrokernelTester()
18589 .mr(2)
18590 .nr(8)
18591 .kr(8)
18592 .sr(1)
18593 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080018594 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018595 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080018596 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018597 }
18598 }
18599 }
18600
18601 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, n_div_8_strided_cn) {
18602 TEST_REQUIRES_ARM_NEON_V8;
18603 for (uint32_t n = 16; n <= 24; n += 8) {
18604 for (size_t k = 1; k <= 80; k += 17) {
18605 GemmMicrokernelTester()
18606 .mr(2)
18607 .nr(8)
18608 .kr(8)
18609 .sr(1)
18610 .m(2)
18611 .n(n)
18612 .k(k)
18613 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080018614 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018615 }
18616 }
18617 }
18618
18619 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, n_div_8_strided_a) {
18620 TEST_REQUIRES_ARM_NEON_V8;
18621 for (uint32_t n = 16; n <= 24; n += 8) {
18622 for (size_t k = 1; k <= 80; k += 17) {
18623 GemmMicrokernelTester()
18624 .mr(2)
18625 .nr(8)
18626 .kr(8)
18627 .sr(1)
18628 .m(2)
18629 .n(n)
18630 .k(k)
18631 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080018632 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018633 }
18634 }
18635 }
18636
18637 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, n_div_8_subtile) {
18638 TEST_REQUIRES_ARM_NEON_V8;
18639 for (uint32_t n = 16; n <= 24; n += 8) {
18640 for (size_t k = 1; k <= 80; k += 17) {
18641 for (uint32_t m = 1; m <= 2; m++) {
18642 GemmMicrokernelTester()
18643 .mr(2)
18644 .nr(8)
18645 .kr(8)
18646 .sr(1)
18647 .m(m)
18648 .n(n)
18649 .k(k)
18650 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080018651 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018652 }
18653 }
18654 }
18655 }
18656
18657 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, strided_cm_subtile) {
18658 TEST_REQUIRES_ARM_NEON_V8;
18659 for (size_t k = 1; k <= 80; k += 17) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080018660 for (uint32_t n = 1; n <= 8; n++) {
18661 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018662 GemmMicrokernelTester()
18663 .mr(2)
18664 .nr(8)
18665 .kr(8)
18666 .sr(1)
18667 .m(m)
18668 .n(n)
18669 .k(k)
18670 .cm_stride(11)
18671 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080018672 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018673 }
18674 }
18675 }
18676 }
18677
18678 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, qmin) {
18679 TEST_REQUIRES_ARM_NEON_V8;
18680 GemmMicrokernelTester()
18681 .mr(2)
18682 .nr(8)
18683 .kr(8)
18684 .sr(1)
18685 .m(2)
18686 .n(8)
18687 .k(16)
18688 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080018689 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018690 }
18691
18692 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, qmax) {
18693 TEST_REQUIRES_ARM_NEON_V8;
18694 GemmMicrokernelTester()
18695 .mr(2)
18696 .nr(8)
18697 .kr(8)
18698 .sr(1)
18699 .m(2)
18700 .n(8)
18701 .k(16)
18702 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080018703 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018704 }
18705
18706 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL, strided_cm) {
18707 TEST_REQUIRES_ARM_NEON_V8;
18708 GemmMicrokernelTester()
18709 .mr(2)
18710 .nr(8)
18711 .kr(8)
18712 .sr(1)
18713 .m(2)
18714 .n(8)
18715 .k(16)
18716 .cm_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080018717 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018718 }
18719#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
18720
18721
18722#if XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64
18723 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__NEONDOT, k_eq_8) {
18724 TEST_REQUIRES_ARM_NEON_DOT;
18725 GemmMicrokernelTester()
18726 .mr(4)
18727 .nr(8)
18728 .kr(4)
18729 .sr(1)
18730 .m(4)
18731 .n(8)
18732 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080018733 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018734 }
18735
18736 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__NEONDOT, strided_cn) {
18737 TEST_REQUIRES_ARM_NEON_DOT;
18738 GemmMicrokernelTester()
18739 .mr(4)
18740 .nr(8)
18741 .kr(4)
18742 .sr(1)
18743 .m(4)
18744 .n(8)
18745 .k(8)
18746 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080018747 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018748 }
18749
18750 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__NEONDOT, k_eq_8_strided_a) {
18751 TEST_REQUIRES_ARM_NEON_DOT;
18752 GemmMicrokernelTester()
18753 .mr(4)
18754 .nr(8)
18755 .kr(4)
18756 .sr(1)
18757 .m(4)
18758 .n(8)
18759 .k(8)
18760 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080018761 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018762 }
18763
18764 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__NEONDOT, k_eq_8_subtile) {
18765 TEST_REQUIRES_ARM_NEON_DOT;
Zhi An Ng83844ae2022-01-14 09:52:25 -080018766 for (uint32_t n = 1; n <= 8; n++) {
18767 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018768 GemmMicrokernelTester()
18769 .mr(4)
18770 .nr(8)
18771 .kr(4)
18772 .sr(1)
18773 .m(m)
18774 .n(n)
18775 .k(8)
18776 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080018777 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018778 }
18779 }
18780 }
18781
18782 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__NEONDOT, k_eq_8_subtile_m) {
18783 TEST_REQUIRES_ARM_NEON_DOT;
18784 for (uint32_t m = 1; m <= 4; m++) {
18785 GemmMicrokernelTester()
18786 .mr(4)
18787 .nr(8)
18788 .kr(4)
18789 .sr(1)
18790 .m(m)
18791 .n(8)
18792 .k(8)
18793 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080018794 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018795 }
18796 }
18797
18798 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__NEONDOT, k_eq_8_subtile_n) {
18799 TEST_REQUIRES_ARM_NEON_DOT;
18800 for (uint32_t n = 1; n <= 8; n++) {
18801 GemmMicrokernelTester()
18802 .mr(4)
18803 .nr(8)
18804 .kr(4)
18805 .sr(1)
18806 .m(4)
18807 .n(n)
18808 .k(8)
18809 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080018810 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018811 }
18812 }
18813
18814 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__NEONDOT, k_lt_8) {
18815 TEST_REQUIRES_ARM_NEON_DOT;
18816 for (size_t k = 1; k < 8; k++) {
18817 GemmMicrokernelTester()
18818 .mr(4)
18819 .nr(8)
18820 .kr(4)
18821 .sr(1)
18822 .m(4)
18823 .n(8)
18824 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080018825 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018826 }
18827 }
18828
18829 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__NEONDOT, k_lt_8_strided_a) {
18830 TEST_REQUIRES_ARM_NEON_DOT;
18831 for (size_t k = 1; k < 8; k++) {
18832 GemmMicrokernelTester()
18833 .mr(4)
18834 .nr(8)
18835 .kr(4)
18836 .sr(1)
18837 .m(4)
18838 .n(8)
18839 .k(k)
18840 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080018841 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018842 }
18843 }
18844
18845 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__NEONDOT, k_lt_8_subtile) {
18846 TEST_REQUIRES_ARM_NEON_DOT;
18847 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080018848 for (uint32_t n = 1; n <= 8; n++) {
18849 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018850 GemmMicrokernelTester()
18851 .mr(4)
18852 .nr(8)
18853 .kr(4)
18854 .sr(1)
18855 .m(m)
18856 .n(n)
18857 .k(k)
18858 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080018859 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018860 }
18861 }
18862 }
18863 }
18864
18865 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__NEONDOT, k_gt_8) {
18866 TEST_REQUIRES_ARM_NEON_DOT;
18867 for (size_t k = 9; k < 16; k++) {
18868 GemmMicrokernelTester()
18869 .mr(4)
18870 .nr(8)
18871 .kr(4)
18872 .sr(1)
18873 .m(4)
18874 .n(8)
18875 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080018876 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018877 }
18878 }
18879
18880 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__NEONDOT, k_gt_8_strided_a) {
18881 TEST_REQUIRES_ARM_NEON_DOT;
18882 for (size_t k = 9; k < 16; k++) {
18883 GemmMicrokernelTester()
18884 .mr(4)
18885 .nr(8)
18886 .kr(4)
18887 .sr(1)
18888 .m(4)
18889 .n(8)
18890 .k(k)
18891 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080018892 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018893 }
18894 }
18895
18896 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__NEONDOT, k_gt_8_subtile) {
18897 TEST_REQUIRES_ARM_NEON_DOT;
18898 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080018899 for (uint32_t n = 1; n <= 8; n++) {
18900 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018901 GemmMicrokernelTester()
18902 .mr(4)
18903 .nr(8)
18904 .kr(4)
18905 .sr(1)
18906 .m(m)
18907 .n(n)
18908 .k(k)
18909 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080018910 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018911 }
18912 }
18913 }
18914 }
18915
18916 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__NEONDOT, k_div_8) {
18917 TEST_REQUIRES_ARM_NEON_DOT;
18918 for (size_t k = 16; k <= 80; k += 8) {
18919 GemmMicrokernelTester()
18920 .mr(4)
18921 .nr(8)
18922 .kr(4)
18923 .sr(1)
18924 .m(4)
18925 .n(8)
18926 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080018927 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018928 }
18929 }
18930
18931 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__NEONDOT, k_div_8_strided_a) {
18932 TEST_REQUIRES_ARM_NEON_DOT;
18933 for (size_t k = 16; k <= 80; k += 8) {
18934 GemmMicrokernelTester()
18935 .mr(4)
18936 .nr(8)
18937 .kr(4)
18938 .sr(1)
18939 .m(4)
18940 .n(8)
18941 .k(k)
18942 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080018943 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018944 }
18945 }
18946
18947 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__NEONDOT, k_div_8_subtile) {
18948 TEST_REQUIRES_ARM_NEON_DOT;
18949 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080018950 for (uint32_t n = 1; n <= 8; n++) {
18951 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018952 GemmMicrokernelTester()
18953 .mr(4)
18954 .nr(8)
18955 .kr(4)
18956 .sr(1)
18957 .m(m)
18958 .n(n)
18959 .k(k)
18960 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080018961 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018962 }
18963 }
18964 }
18965 }
18966
18967 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__NEONDOT, n_gt_8) {
18968 TEST_REQUIRES_ARM_NEON_DOT;
18969 for (uint32_t n = 9; n < 16; n++) {
18970 for (size_t k = 1; k <= 40; k += 9) {
18971 GemmMicrokernelTester()
18972 .mr(4)
18973 .nr(8)
18974 .kr(4)
18975 .sr(1)
18976 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080018977 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018978 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080018979 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018980 }
18981 }
18982 }
18983
18984 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__NEONDOT, n_gt_8_strided_cn) {
18985 TEST_REQUIRES_ARM_NEON_DOT;
18986 for (uint32_t n = 9; n < 16; n++) {
18987 for (size_t k = 1; k <= 40; k += 9) {
18988 GemmMicrokernelTester()
18989 .mr(4)
18990 .nr(8)
18991 .kr(4)
18992 .sr(1)
18993 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080018994 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018995 .k(k)
18996 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080018997 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018998 }
18999 }
19000 }
19001
19002 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__NEONDOT, n_gt_8_strided_a) {
19003 TEST_REQUIRES_ARM_NEON_DOT;
19004 for (uint32_t n = 9; n < 16; n++) {
19005 for (size_t k = 1; k <= 40; k += 9) {
19006 GemmMicrokernelTester()
19007 .mr(4)
19008 .nr(8)
19009 .kr(4)
19010 .sr(1)
19011 .m(4)
19012 .n(n)
19013 .k(k)
19014 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080019015 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019016 }
19017 }
19018 }
19019
19020 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__NEONDOT, n_gt_8_subtile) {
19021 TEST_REQUIRES_ARM_NEON_DOT;
19022 for (uint32_t n = 9; n < 16; n++) {
19023 for (size_t k = 1; k <= 40; k += 9) {
19024 for (uint32_t m = 1; m <= 4; m++) {
19025 GemmMicrokernelTester()
19026 .mr(4)
19027 .nr(8)
19028 .kr(4)
19029 .sr(1)
19030 .m(m)
19031 .n(n)
19032 .k(k)
19033 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080019034 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019035 }
19036 }
19037 }
19038 }
19039
19040 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__NEONDOT, n_div_8) {
19041 TEST_REQUIRES_ARM_NEON_DOT;
19042 for (uint32_t n = 16; n <= 24; n += 8) {
19043 for (size_t k = 1; k <= 40; k += 9) {
19044 GemmMicrokernelTester()
19045 .mr(4)
19046 .nr(8)
19047 .kr(4)
19048 .sr(1)
19049 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080019050 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019051 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080019052 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019053 }
19054 }
19055 }
19056
19057 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__NEONDOT, n_div_8_strided_cn) {
19058 TEST_REQUIRES_ARM_NEON_DOT;
19059 for (uint32_t n = 16; n <= 24; n += 8) {
19060 for (size_t k = 1; k <= 40; k += 9) {
19061 GemmMicrokernelTester()
19062 .mr(4)
19063 .nr(8)
19064 .kr(4)
19065 .sr(1)
19066 .m(4)
19067 .n(n)
19068 .k(k)
19069 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080019070 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019071 }
19072 }
19073 }
19074
19075 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__NEONDOT, n_div_8_strided_a) {
19076 TEST_REQUIRES_ARM_NEON_DOT;
19077 for (uint32_t n = 16; n <= 24; n += 8) {
19078 for (size_t k = 1; k <= 40; k += 9) {
19079 GemmMicrokernelTester()
19080 .mr(4)
19081 .nr(8)
19082 .kr(4)
19083 .sr(1)
19084 .m(4)
19085 .n(n)
19086 .k(k)
19087 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080019088 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019089 }
19090 }
19091 }
19092
19093 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__NEONDOT, n_div_8_subtile) {
19094 TEST_REQUIRES_ARM_NEON_DOT;
19095 for (uint32_t n = 16; n <= 24; n += 8) {
19096 for (size_t k = 1; k <= 40; k += 9) {
19097 for (uint32_t m = 1; m <= 4; m++) {
19098 GemmMicrokernelTester()
19099 .mr(4)
19100 .nr(8)
19101 .kr(4)
19102 .sr(1)
19103 .m(m)
19104 .n(n)
19105 .k(k)
19106 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080019107 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019108 }
19109 }
19110 }
19111 }
19112
19113 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__NEONDOT, strided_cm_subtile) {
19114 TEST_REQUIRES_ARM_NEON_DOT;
19115 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080019116 for (uint32_t n = 1; n <= 8; n++) {
19117 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019118 GemmMicrokernelTester()
19119 .mr(4)
19120 .nr(8)
19121 .kr(4)
19122 .sr(1)
19123 .m(m)
19124 .n(n)
19125 .k(k)
19126 .cm_stride(11)
19127 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080019128 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019129 }
19130 }
19131 }
19132 }
19133
19134 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__NEONDOT, qmin) {
19135 TEST_REQUIRES_ARM_NEON_DOT;
19136 GemmMicrokernelTester()
19137 .mr(4)
19138 .nr(8)
19139 .kr(4)
19140 .sr(1)
19141 .m(4)
19142 .n(8)
19143 .k(8)
19144 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080019145 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019146 }
19147
19148 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__NEONDOT, qmax) {
19149 TEST_REQUIRES_ARM_NEON_DOT;
19150 GemmMicrokernelTester()
19151 .mr(4)
19152 .nr(8)
19153 .kr(4)
19154 .sr(1)
19155 .m(4)
19156 .n(8)
19157 .k(8)
19158 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080019159 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019160 }
19161
19162 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__NEONDOT, strided_cm) {
19163 TEST_REQUIRES_ARM_NEON_DOT;
19164 GemmMicrokernelTester()
19165 .mr(4)
19166 .nr(8)
19167 .kr(4)
19168 .sr(1)
19169 .m(4)
19170 .n(8)
19171 .k(8)
19172 .cm_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080019173 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019174 }
19175#endif // XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64
19176
19177
19178#if XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64
19179 TEST(QC8_GEMM_MINMAX_FP32_8X8C4__NEONDOT, k_eq_8) {
19180 TEST_REQUIRES_ARM_NEON_DOT;
19181 GemmMicrokernelTester()
19182 .mr(8)
19183 .nr(8)
19184 .kr(4)
19185 .sr(1)
19186 .m(8)
19187 .n(8)
19188 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080019189 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_8x8c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019190 }
19191
19192 TEST(QC8_GEMM_MINMAX_FP32_8X8C4__NEONDOT, strided_cn) {
19193 TEST_REQUIRES_ARM_NEON_DOT;
19194 GemmMicrokernelTester()
19195 .mr(8)
19196 .nr(8)
19197 .kr(4)
19198 .sr(1)
19199 .m(8)
19200 .n(8)
19201 .k(8)
19202 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080019203 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_8x8c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019204 }
19205
19206 TEST(QC8_GEMM_MINMAX_FP32_8X8C4__NEONDOT, k_eq_8_strided_a) {
19207 TEST_REQUIRES_ARM_NEON_DOT;
19208 GemmMicrokernelTester()
19209 .mr(8)
19210 .nr(8)
19211 .kr(4)
19212 .sr(1)
19213 .m(8)
19214 .n(8)
19215 .k(8)
19216 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080019217 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_8x8c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019218 }
19219
19220 TEST(QC8_GEMM_MINMAX_FP32_8X8C4__NEONDOT, k_eq_8_subtile) {
19221 TEST_REQUIRES_ARM_NEON_DOT;
Zhi An Ng83844ae2022-01-14 09:52:25 -080019222 for (uint32_t n = 1; n <= 8; n++) {
19223 for (uint32_t m = 1; m <= 8; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019224 GemmMicrokernelTester()
19225 .mr(8)
19226 .nr(8)
19227 .kr(4)
19228 .sr(1)
19229 .m(m)
19230 .n(n)
19231 .k(8)
19232 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080019233 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_8x8c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019234 }
19235 }
19236 }
19237
19238 TEST(QC8_GEMM_MINMAX_FP32_8X8C4__NEONDOT, k_eq_8_subtile_m) {
19239 TEST_REQUIRES_ARM_NEON_DOT;
19240 for (uint32_t m = 1; m <= 8; m++) {
19241 GemmMicrokernelTester()
19242 .mr(8)
19243 .nr(8)
19244 .kr(4)
19245 .sr(1)
19246 .m(m)
19247 .n(8)
19248 .k(8)
19249 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080019250 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_8x8c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019251 }
19252 }
19253
19254 TEST(QC8_GEMM_MINMAX_FP32_8X8C4__NEONDOT, k_eq_8_subtile_n) {
19255 TEST_REQUIRES_ARM_NEON_DOT;
19256 for (uint32_t n = 1; n <= 8; n++) {
19257 GemmMicrokernelTester()
19258 .mr(8)
19259 .nr(8)
19260 .kr(4)
19261 .sr(1)
19262 .m(8)
19263 .n(n)
19264 .k(8)
19265 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080019266 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_8x8c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019267 }
19268 }
19269
19270 TEST(QC8_GEMM_MINMAX_FP32_8X8C4__NEONDOT, k_lt_8) {
19271 TEST_REQUIRES_ARM_NEON_DOT;
19272 for (size_t k = 1; k < 8; k++) {
19273 GemmMicrokernelTester()
19274 .mr(8)
19275 .nr(8)
19276 .kr(4)
19277 .sr(1)
19278 .m(8)
19279 .n(8)
19280 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080019281 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_8x8c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019282 }
19283 }
19284
19285 TEST(QC8_GEMM_MINMAX_FP32_8X8C4__NEONDOT, k_lt_8_strided_a) {
19286 TEST_REQUIRES_ARM_NEON_DOT;
19287 for (size_t k = 1; k < 8; k++) {
19288 GemmMicrokernelTester()
19289 .mr(8)
19290 .nr(8)
19291 .kr(4)
19292 .sr(1)
19293 .m(8)
19294 .n(8)
19295 .k(k)
19296 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080019297 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_8x8c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019298 }
19299 }
19300
19301 TEST(QC8_GEMM_MINMAX_FP32_8X8C4__NEONDOT, k_lt_8_subtile) {
19302 TEST_REQUIRES_ARM_NEON_DOT;
19303 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080019304 for (uint32_t n = 1; n <= 8; n++) {
19305 for (uint32_t m = 1; m <= 8; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019306 GemmMicrokernelTester()
19307 .mr(8)
19308 .nr(8)
19309 .kr(4)
19310 .sr(1)
19311 .m(m)
19312 .n(n)
19313 .k(k)
19314 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080019315 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_8x8c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019316 }
19317 }
19318 }
19319 }
19320
19321 TEST(QC8_GEMM_MINMAX_FP32_8X8C4__NEONDOT, k_gt_8) {
19322 TEST_REQUIRES_ARM_NEON_DOT;
19323 for (size_t k = 9; k < 16; k++) {
19324 GemmMicrokernelTester()
19325 .mr(8)
19326 .nr(8)
19327 .kr(4)
19328 .sr(1)
19329 .m(8)
19330 .n(8)
19331 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080019332 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_8x8c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019333 }
19334 }
19335
19336 TEST(QC8_GEMM_MINMAX_FP32_8X8C4__NEONDOT, k_gt_8_strided_a) {
19337 TEST_REQUIRES_ARM_NEON_DOT;
19338 for (size_t k = 9; k < 16; k++) {
19339 GemmMicrokernelTester()
19340 .mr(8)
19341 .nr(8)
19342 .kr(4)
19343 .sr(1)
19344 .m(8)
19345 .n(8)
19346 .k(k)
19347 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080019348 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_8x8c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019349 }
19350 }
19351
19352 TEST(QC8_GEMM_MINMAX_FP32_8X8C4__NEONDOT, k_gt_8_subtile) {
19353 TEST_REQUIRES_ARM_NEON_DOT;
19354 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080019355 for (uint32_t n = 1; n <= 8; n++) {
19356 for (uint32_t m = 1; m <= 8; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019357 GemmMicrokernelTester()
19358 .mr(8)
19359 .nr(8)
19360 .kr(4)
19361 .sr(1)
19362 .m(m)
19363 .n(n)
19364 .k(k)
19365 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080019366 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_8x8c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019367 }
19368 }
19369 }
19370 }
19371
19372 TEST(QC8_GEMM_MINMAX_FP32_8X8C4__NEONDOT, k_div_8) {
19373 TEST_REQUIRES_ARM_NEON_DOT;
19374 for (size_t k = 16; k <= 80; k += 8) {
19375 GemmMicrokernelTester()
19376 .mr(8)
19377 .nr(8)
19378 .kr(4)
19379 .sr(1)
19380 .m(8)
19381 .n(8)
19382 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080019383 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_8x8c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019384 }
19385 }
19386
19387 TEST(QC8_GEMM_MINMAX_FP32_8X8C4__NEONDOT, k_div_8_strided_a) {
19388 TEST_REQUIRES_ARM_NEON_DOT;
19389 for (size_t k = 16; k <= 80; k += 8) {
19390 GemmMicrokernelTester()
19391 .mr(8)
19392 .nr(8)
19393 .kr(4)
19394 .sr(1)
19395 .m(8)
19396 .n(8)
19397 .k(k)
19398 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080019399 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_8x8c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019400 }
19401 }
19402
19403 TEST(QC8_GEMM_MINMAX_FP32_8X8C4__NEONDOT, k_div_8_subtile) {
19404 TEST_REQUIRES_ARM_NEON_DOT;
19405 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080019406 for (uint32_t n = 1; n <= 8; n++) {
19407 for (uint32_t m = 1; m <= 8; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019408 GemmMicrokernelTester()
19409 .mr(8)
19410 .nr(8)
19411 .kr(4)
19412 .sr(1)
19413 .m(m)
19414 .n(n)
19415 .k(k)
19416 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080019417 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_8x8c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019418 }
19419 }
19420 }
19421 }
19422
19423 TEST(QC8_GEMM_MINMAX_FP32_8X8C4__NEONDOT, n_gt_8) {
19424 TEST_REQUIRES_ARM_NEON_DOT;
19425 for (uint32_t n = 9; n < 16; n++) {
19426 for (size_t k = 1; k <= 40; k += 9) {
19427 GemmMicrokernelTester()
19428 .mr(8)
19429 .nr(8)
19430 .kr(4)
19431 .sr(1)
19432 .m(8)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080019433 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019434 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080019435 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_8x8c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019436 }
19437 }
19438 }
19439
19440 TEST(QC8_GEMM_MINMAX_FP32_8X8C4__NEONDOT, n_gt_8_strided_cn) {
19441 TEST_REQUIRES_ARM_NEON_DOT;
19442 for (uint32_t n = 9; n < 16; n++) {
19443 for (size_t k = 1; k <= 40; k += 9) {
19444 GemmMicrokernelTester()
19445 .mr(8)
19446 .nr(8)
19447 .kr(4)
19448 .sr(1)
19449 .m(8)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080019450 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019451 .k(k)
19452 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080019453 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_8x8c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019454 }
19455 }
19456 }
19457
19458 TEST(QC8_GEMM_MINMAX_FP32_8X8C4__NEONDOT, n_gt_8_strided_a) {
19459 TEST_REQUIRES_ARM_NEON_DOT;
19460 for (uint32_t n = 9; n < 16; n++) {
19461 for (size_t k = 1; k <= 40; k += 9) {
19462 GemmMicrokernelTester()
19463 .mr(8)
19464 .nr(8)
19465 .kr(4)
19466 .sr(1)
19467 .m(8)
19468 .n(n)
19469 .k(k)
19470 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080019471 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_8x8c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019472 }
19473 }
19474 }
19475
19476 TEST(QC8_GEMM_MINMAX_FP32_8X8C4__NEONDOT, n_gt_8_subtile) {
19477 TEST_REQUIRES_ARM_NEON_DOT;
19478 for (uint32_t n = 9; n < 16; n++) {
19479 for (size_t k = 1; k <= 40; k += 9) {
19480 for (uint32_t m = 1; m <= 8; m++) {
19481 GemmMicrokernelTester()
19482 .mr(8)
19483 .nr(8)
19484 .kr(4)
19485 .sr(1)
19486 .m(m)
19487 .n(n)
19488 .k(k)
19489 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080019490 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_8x8c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019491 }
19492 }
19493 }
19494 }
19495
19496 TEST(QC8_GEMM_MINMAX_FP32_8X8C4__NEONDOT, n_div_8) {
19497 TEST_REQUIRES_ARM_NEON_DOT;
19498 for (uint32_t n = 16; n <= 24; n += 8) {
19499 for (size_t k = 1; k <= 40; k += 9) {
19500 GemmMicrokernelTester()
19501 .mr(8)
19502 .nr(8)
19503 .kr(4)
19504 .sr(1)
19505 .m(8)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080019506 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019507 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080019508 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_8x8c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019509 }
19510 }
19511 }
19512
19513 TEST(QC8_GEMM_MINMAX_FP32_8X8C4__NEONDOT, n_div_8_strided_cn) {
19514 TEST_REQUIRES_ARM_NEON_DOT;
19515 for (uint32_t n = 16; n <= 24; n += 8) {
19516 for (size_t k = 1; k <= 40; k += 9) {
19517 GemmMicrokernelTester()
19518 .mr(8)
19519 .nr(8)
19520 .kr(4)
19521 .sr(1)
19522 .m(8)
19523 .n(n)
19524 .k(k)
19525 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080019526 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_8x8c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019527 }
19528 }
19529 }
19530
19531 TEST(QC8_GEMM_MINMAX_FP32_8X8C4__NEONDOT, n_div_8_strided_a) {
19532 TEST_REQUIRES_ARM_NEON_DOT;
19533 for (uint32_t n = 16; n <= 24; n += 8) {
19534 for (size_t k = 1; k <= 40; k += 9) {
19535 GemmMicrokernelTester()
19536 .mr(8)
19537 .nr(8)
19538 .kr(4)
19539 .sr(1)
19540 .m(8)
19541 .n(n)
19542 .k(k)
19543 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080019544 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_8x8c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019545 }
19546 }
19547 }
19548
19549 TEST(QC8_GEMM_MINMAX_FP32_8X8C4__NEONDOT, n_div_8_subtile) {
19550 TEST_REQUIRES_ARM_NEON_DOT;
19551 for (uint32_t n = 16; n <= 24; n += 8) {
19552 for (size_t k = 1; k <= 40; k += 9) {
19553 for (uint32_t m = 1; m <= 8; m++) {
19554 GemmMicrokernelTester()
19555 .mr(8)
19556 .nr(8)
19557 .kr(4)
19558 .sr(1)
19559 .m(m)
19560 .n(n)
19561 .k(k)
19562 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080019563 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_8x8c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019564 }
19565 }
19566 }
19567 }
19568
19569 TEST(QC8_GEMM_MINMAX_FP32_8X8C4__NEONDOT, strided_cm_subtile) {
19570 TEST_REQUIRES_ARM_NEON_DOT;
19571 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080019572 for (uint32_t n = 1; n <= 8; n++) {
19573 for (uint32_t m = 1; m <= 8; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019574 GemmMicrokernelTester()
19575 .mr(8)
19576 .nr(8)
19577 .kr(4)
19578 .sr(1)
19579 .m(m)
19580 .n(n)
19581 .k(k)
19582 .cm_stride(11)
19583 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080019584 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_8x8c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019585 }
19586 }
19587 }
19588 }
19589
19590 TEST(QC8_GEMM_MINMAX_FP32_8X8C4__NEONDOT, qmin) {
19591 TEST_REQUIRES_ARM_NEON_DOT;
19592 GemmMicrokernelTester()
19593 .mr(8)
19594 .nr(8)
19595 .kr(4)
19596 .sr(1)
19597 .m(8)
19598 .n(8)
19599 .k(8)
19600 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080019601 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_8x8c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019602 }
19603
19604 TEST(QC8_GEMM_MINMAX_FP32_8X8C4__NEONDOT, qmax) {
19605 TEST_REQUIRES_ARM_NEON_DOT;
19606 GemmMicrokernelTester()
19607 .mr(8)
19608 .nr(8)
19609 .kr(4)
19610 .sr(1)
19611 .m(8)
19612 .n(8)
19613 .k(8)
19614 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080019615 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_8x8c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019616 }
19617
19618 TEST(QC8_GEMM_MINMAX_FP32_8X8C4__NEONDOT, strided_cm) {
19619 TEST_REQUIRES_ARM_NEON_DOT;
19620 GemmMicrokernelTester()
19621 .mr(8)
19622 .nr(8)
19623 .kr(4)
19624 .sr(1)
19625 .m(8)
19626 .n(8)
19627 .k(8)
19628 .cm_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080019629 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_8x8c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019630 }
19631#endif // XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64
19632
19633
19634#if XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64
19635 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__NEONDOT, k_eq_8) {
19636 TEST_REQUIRES_ARM_NEON_DOT;
19637 GemmMicrokernelTester()
19638 .mr(1)
19639 .nr(16)
19640 .kr(4)
19641 .sr(1)
19642 .m(1)
19643 .n(16)
19644 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080019645 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019646 }
19647
19648 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__NEONDOT, strided_cn) {
19649 TEST_REQUIRES_ARM_NEON_DOT;
19650 GemmMicrokernelTester()
19651 .mr(1)
19652 .nr(16)
19653 .kr(4)
19654 .sr(1)
19655 .m(1)
19656 .n(16)
19657 .k(8)
19658 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080019659 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019660 }
19661
19662 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__NEONDOT, k_eq_8_strided_a) {
19663 TEST_REQUIRES_ARM_NEON_DOT;
19664 GemmMicrokernelTester()
19665 .mr(1)
19666 .nr(16)
19667 .kr(4)
19668 .sr(1)
19669 .m(1)
19670 .n(16)
19671 .k(8)
19672 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080019673 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019674 }
19675
19676 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__NEONDOT, k_eq_8_subtile) {
19677 TEST_REQUIRES_ARM_NEON_DOT;
Zhi An Ng83844ae2022-01-14 09:52:25 -080019678 for (uint32_t n = 1; n <= 16; n++) {
19679 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019680 GemmMicrokernelTester()
19681 .mr(1)
19682 .nr(16)
19683 .kr(4)
19684 .sr(1)
19685 .m(m)
19686 .n(n)
19687 .k(8)
19688 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080019689 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019690 }
19691 }
19692 }
19693
19694 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__NEONDOT, k_eq_8_subtile_m) {
19695 TEST_REQUIRES_ARM_NEON_DOT;
19696 for (uint32_t m = 1; m <= 1; m++) {
19697 GemmMicrokernelTester()
19698 .mr(1)
19699 .nr(16)
19700 .kr(4)
19701 .sr(1)
19702 .m(m)
19703 .n(16)
19704 .k(8)
19705 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080019706 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019707 }
19708 }
19709
19710 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__NEONDOT, k_eq_8_subtile_n) {
19711 TEST_REQUIRES_ARM_NEON_DOT;
19712 for (uint32_t n = 1; n <= 16; n++) {
19713 GemmMicrokernelTester()
19714 .mr(1)
19715 .nr(16)
19716 .kr(4)
19717 .sr(1)
19718 .m(1)
19719 .n(n)
19720 .k(8)
19721 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080019722 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019723 }
19724 }
19725
19726 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__NEONDOT, k_lt_8) {
19727 TEST_REQUIRES_ARM_NEON_DOT;
19728 for (size_t k = 1; k < 8; k++) {
19729 GemmMicrokernelTester()
19730 .mr(1)
19731 .nr(16)
19732 .kr(4)
19733 .sr(1)
19734 .m(1)
19735 .n(16)
19736 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080019737 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019738 }
19739 }
19740
19741 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__NEONDOT, k_lt_8_strided_a) {
19742 TEST_REQUIRES_ARM_NEON_DOT;
19743 for (size_t k = 1; k < 8; k++) {
19744 GemmMicrokernelTester()
19745 .mr(1)
19746 .nr(16)
19747 .kr(4)
19748 .sr(1)
19749 .m(1)
19750 .n(16)
19751 .k(k)
19752 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080019753 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019754 }
19755 }
19756
19757 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__NEONDOT, k_lt_8_subtile) {
19758 TEST_REQUIRES_ARM_NEON_DOT;
19759 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080019760 for (uint32_t n = 1; n <= 16; n++) {
19761 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019762 GemmMicrokernelTester()
19763 .mr(1)
19764 .nr(16)
19765 .kr(4)
19766 .sr(1)
19767 .m(m)
19768 .n(n)
19769 .k(k)
19770 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080019771 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019772 }
19773 }
19774 }
19775 }
19776
19777 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__NEONDOT, k_gt_8) {
19778 TEST_REQUIRES_ARM_NEON_DOT;
19779 for (size_t k = 9; k < 16; k++) {
19780 GemmMicrokernelTester()
19781 .mr(1)
19782 .nr(16)
19783 .kr(4)
19784 .sr(1)
19785 .m(1)
19786 .n(16)
19787 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080019788 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019789 }
19790 }
19791
19792 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__NEONDOT, k_gt_8_strided_a) {
19793 TEST_REQUIRES_ARM_NEON_DOT;
19794 for (size_t k = 9; k < 16; k++) {
19795 GemmMicrokernelTester()
19796 .mr(1)
19797 .nr(16)
19798 .kr(4)
19799 .sr(1)
19800 .m(1)
19801 .n(16)
19802 .k(k)
19803 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080019804 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019805 }
19806 }
19807
19808 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__NEONDOT, k_gt_8_subtile) {
19809 TEST_REQUIRES_ARM_NEON_DOT;
19810 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080019811 for (uint32_t n = 1; n <= 16; n++) {
19812 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019813 GemmMicrokernelTester()
19814 .mr(1)
19815 .nr(16)
19816 .kr(4)
19817 .sr(1)
19818 .m(m)
19819 .n(n)
19820 .k(k)
19821 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080019822 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019823 }
19824 }
19825 }
19826 }
19827
19828 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__NEONDOT, k_div_8) {
19829 TEST_REQUIRES_ARM_NEON_DOT;
19830 for (size_t k = 16; k <= 80; k += 8) {
19831 GemmMicrokernelTester()
19832 .mr(1)
19833 .nr(16)
19834 .kr(4)
19835 .sr(1)
19836 .m(1)
19837 .n(16)
19838 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080019839 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019840 }
19841 }
19842
19843 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__NEONDOT, k_div_8_strided_a) {
19844 TEST_REQUIRES_ARM_NEON_DOT;
19845 for (size_t k = 16; k <= 80; k += 8) {
19846 GemmMicrokernelTester()
19847 .mr(1)
19848 .nr(16)
19849 .kr(4)
19850 .sr(1)
19851 .m(1)
19852 .n(16)
19853 .k(k)
19854 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080019855 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019856 }
19857 }
19858
19859 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__NEONDOT, k_div_8_subtile) {
19860 TEST_REQUIRES_ARM_NEON_DOT;
19861 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080019862 for (uint32_t n = 1; n <= 16; n++) {
19863 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019864 GemmMicrokernelTester()
19865 .mr(1)
19866 .nr(16)
19867 .kr(4)
19868 .sr(1)
19869 .m(m)
19870 .n(n)
19871 .k(k)
19872 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080019873 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019874 }
19875 }
19876 }
19877 }
19878
19879 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__NEONDOT, n_gt_16) {
19880 TEST_REQUIRES_ARM_NEON_DOT;
19881 for (uint32_t n = 17; n < 32; n++) {
19882 for (size_t k = 1; k <= 40; k += 9) {
19883 GemmMicrokernelTester()
19884 .mr(1)
19885 .nr(16)
19886 .kr(4)
19887 .sr(1)
19888 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080019889 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019890 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080019891 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019892 }
19893 }
19894 }
19895
19896 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__NEONDOT, n_gt_16_strided_cn) {
19897 TEST_REQUIRES_ARM_NEON_DOT;
19898 for (uint32_t n = 17; n < 32; n++) {
19899 for (size_t k = 1; k <= 40; k += 9) {
19900 GemmMicrokernelTester()
19901 .mr(1)
19902 .nr(16)
19903 .kr(4)
19904 .sr(1)
19905 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080019906 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019907 .k(k)
19908 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080019909 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019910 }
19911 }
19912 }
19913
19914 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__NEONDOT, n_gt_16_strided_a) {
19915 TEST_REQUIRES_ARM_NEON_DOT;
19916 for (uint32_t n = 17; n < 32; n++) {
19917 for (size_t k = 1; k <= 40; k += 9) {
19918 GemmMicrokernelTester()
19919 .mr(1)
19920 .nr(16)
19921 .kr(4)
19922 .sr(1)
19923 .m(1)
19924 .n(n)
19925 .k(k)
19926 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080019927 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019928 }
19929 }
19930 }
19931
19932 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__NEONDOT, n_gt_16_subtile) {
19933 TEST_REQUIRES_ARM_NEON_DOT;
19934 for (uint32_t n = 17; n < 32; n++) {
19935 for (size_t k = 1; k <= 40; k += 9) {
19936 for (uint32_t m = 1; m <= 1; m++) {
19937 GemmMicrokernelTester()
19938 .mr(1)
19939 .nr(16)
19940 .kr(4)
19941 .sr(1)
19942 .m(m)
19943 .n(n)
19944 .k(k)
19945 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080019946 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019947 }
19948 }
19949 }
19950 }
19951
19952 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__NEONDOT, n_div_16) {
19953 TEST_REQUIRES_ARM_NEON_DOT;
19954 for (uint32_t n = 32; n <= 48; n += 16) {
19955 for (size_t k = 1; k <= 40; k += 9) {
19956 GemmMicrokernelTester()
19957 .mr(1)
19958 .nr(16)
19959 .kr(4)
19960 .sr(1)
19961 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080019962 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019963 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080019964 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019965 }
19966 }
19967 }
19968
19969 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__NEONDOT, n_div_16_strided_cn) {
19970 TEST_REQUIRES_ARM_NEON_DOT;
19971 for (uint32_t n = 32; n <= 48; n += 16) {
19972 for (size_t k = 1; k <= 40; k += 9) {
19973 GemmMicrokernelTester()
19974 .mr(1)
19975 .nr(16)
19976 .kr(4)
19977 .sr(1)
19978 .m(1)
19979 .n(n)
19980 .k(k)
19981 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080019982 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019983 }
19984 }
19985 }
19986
19987 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__NEONDOT, n_div_16_strided_a) {
19988 TEST_REQUIRES_ARM_NEON_DOT;
19989 for (uint32_t n = 32; n <= 48; n += 16) {
19990 for (size_t k = 1; k <= 40; k += 9) {
19991 GemmMicrokernelTester()
19992 .mr(1)
19993 .nr(16)
19994 .kr(4)
19995 .sr(1)
19996 .m(1)
19997 .n(n)
19998 .k(k)
19999 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080020000 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020001 }
20002 }
20003 }
20004
20005 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__NEONDOT, n_div_16_subtile) {
20006 TEST_REQUIRES_ARM_NEON_DOT;
20007 for (uint32_t n = 32; n <= 48; n += 16) {
20008 for (size_t k = 1; k <= 40; k += 9) {
20009 for (uint32_t m = 1; m <= 1; m++) {
20010 GemmMicrokernelTester()
20011 .mr(1)
20012 .nr(16)
20013 .kr(4)
20014 .sr(1)
20015 .m(m)
20016 .n(n)
20017 .k(k)
20018 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080020019 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020020 }
20021 }
20022 }
20023 }
20024
20025 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__NEONDOT, strided_cm_subtile) {
20026 TEST_REQUIRES_ARM_NEON_DOT;
20027 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080020028 for (uint32_t n = 1; n <= 16; n++) {
20029 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020030 GemmMicrokernelTester()
20031 .mr(1)
20032 .nr(16)
20033 .kr(4)
20034 .sr(1)
20035 .m(m)
20036 .n(n)
20037 .k(k)
20038 .cm_stride(19)
20039 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080020040 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020041 }
20042 }
20043 }
20044 }
20045
20046 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__NEONDOT, qmin) {
20047 TEST_REQUIRES_ARM_NEON_DOT;
20048 GemmMicrokernelTester()
20049 .mr(1)
20050 .nr(16)
20051 .kr(4)
20052 .sr(1)
20053 .m(1)
20054 .n(16)
20055 .k(8)
20056 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080020057 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020058 }
20059
20060 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__NEONDOT, qmax) {
20061 TEST_REQUIRES_ARM_NEON_DOT;
20062 GemmMicrokernelTester()
20063 .mr(1)
20064 .nr(16)
20065 .kr(4)
20066 .sr(1)
20067 .m(1)
20068 .n(16)
20069 .k(8)
20070 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080020071 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020072 }
20073
20074 TEST(QC8_GEMM_MINMAX_FP32_1X16C4__NEONDOT, strided_cm) {
20075 TEST_REQUIRES_ARM_NEON_DOT;
20076 GemmMicrokernelTester()
20077 .mr(1)
20078 .nr(16)
20079 .kr(4)
20080 .sr(1)
20081 .m(1)
20082 .n(16)
20083 .k(8)
20084 .cm_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080020085 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020086 }
20087#endif // XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64
20088
20089
20090#if XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64
20091 TEST(QC8_GEMM_MINMAX_FP32_8X16C4__NEONDOT, k_eq_8) {
20092 TEST_REQUIRES_ARM_NEON_DOT;
20093 GemmMicrokernelTester()
20094 .mr(8)
20095 .nr(16)
20096 .kr(4)
20097 .sr(1)
20098 .m(8)
20099 .n(16)
20100 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080020101 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_8x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020102 }
20103
20104 TEST(QC8_GEMM_MINMAX_FP32_8X16C4__NEONDOT, strided_cn) {
20105 TEST_REQUIRES_ARM_NEON_DOT;
20106 GemmMicrokernelTester()
20107 .mr(8)
20108 .nr(16)
20109 .kr(4)
20110 .sr(1)
20111 .m(8)
20112 .n(16)
20113 .k(8)
20114 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080020115 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_8x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020116 }
20117
20118 TEST(QC8_GEMM_MINMAX_FP32_8X16C4__NEONDOT, k_eq_8_strided_a) {
20119 TEST_REQUIRES_ARM_NEON_DOT;
20120 GemmMicrokernelTester()
20121 .mr(8)
20122 .nr(16)
20123 .kr(4)
20124 .sr(1)
20125 .m(8)
20126 .n(16)
20127 .k(8)
20128 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080020129 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_8x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020130 }
20131
20132 TEST(QC8_GEMM_MINMAX_FP32_8X16C4__NEONDOT, k_eq_8_subtile) {
20133 TEST_REQUIRES_ARM_NEON_DOT;
Zhi An Ng83844ae2022-01-14 09:52:25 -080020134 for (uint32_t n = 1; n <= 16; n++) {
20135 for (uint32_t m = 1; m <= 8; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020136 GemmMicrokernelTester()
20137 .mr(8)
20138 .nr(16)
20139 .kr(4)
20140 .sr(1)
20141 .m(m)
20142 .n(n)
20143 .k(8)
20144 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080020145 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_8x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020146 }
20147 }
20148 }
20149
20150 TEST(QC8_GEMM_MINMAX_FP32_8X16C4__NEONDOT, k_eq_8_subtile_m) {
20151 TEST_REQUIRES_ARM_NEON_DOT;
20152 for (uint32_t m = 1; m <= 8; m++) {
20153 GemmMicrokernelTester()
20154 .mr(8)
20155 .nr(16)
20156 .kr(4)
20157 .sr(1)
20158 .m(m)
20159 .n(16)
20160 .k(8)
20161 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080020162 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_8x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020163 }
20164 }
20165
20166 TEST(QC8_GEMM_MINMAX_FP32_8X16C4__NEONDOT, k_eq_8_subtile_n) {
20167 TEST_REQUIRES_ARM_NEON_DOT;
20168 for (uint32_t n = 1; n <= 16; n++) {
20169 GemmMicrokernelTester()
20170 .mr(8)
20171 .nr(16)
20172 .kr(4)
20173 .sr(1)
20174 .m(8)
20175 .n(n)
20176 .k(8)
20177 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080020178 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_8x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020179 }
20180 }
20181
20182 TEST(QC8_GEMM_MINMAX_FP32_8X16C4__NEONDOT, k_lt_8) {
20183 TEST_REQUIRES_ARM_NEON_DOT;
20184 for (size_t k = 1; k < 8; k++) {
20185 GemmMicrokernelTester()
20186 .mr(8)
20187 .nr(16)
20188 .kr(4)
20189 .sr(1)
20190 .m(8)
20191 .n(16)
20192 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080020193 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_8x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020194 }
20195 }
20196
20197 TEST(QC8_GEMM_MINMAX_FP32_8X16C4__NEONDOT, k_lt_8_strided_a) {
20198 TEST_REQUIRES_ARM_NEON_DOT;
20199 for (size_t k = 1; k < 8; k++) {
20200 GemmMicrokernelTester()
20201 .mr(8)
20202 .nr(16)
20203 .kr(4)
20204 .sr(1)
20205 .m(8)
20206 .n(16)
20207 .k(k)
20208 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080020209 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_8x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020210 }
20211 }
20212
20213 TEST(QC8_GEMM_MINMAX_FP32_8X16C4__NEONDOT, k_lt_8_subtile) {
20214 TEST_REQUIRES_ARM_NEON_DOT;
20215 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080020216 for (uint32_t n = 1; n <= 16; n++) {
20217 for (uint32_t m = 1; m <= 8; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020218 GemmMicrokernelTester()
20219 .mr(8)
20220 .nr(16)
20221 .kr(4)
20222 .sr(1)
20223 .m(m)
20224 .n(n)
20225 .k(k)
20226 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080020227 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_8x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020228 }
20229 }
20230 }
20231 }
20232
20233 TEST(QC8_GEMM_MINMAX_FP32_8X16C4__NEONDOT, k_gt_8) {
20234 TEST_REQUIRES_ARM_NEON_DOT;
20235 for (size_t k = 9; k < 16; k++) {
20236 GemmMicrokernelTester()
20237 .mr(8)
20238 .nr(16)
20239 .kr(4)
20240 .sr(1)
20241 .m(8)
20242 .n(16)
20243 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080020244 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_8x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020245 }
20246 }
20247
20248 TEST(QC8_GEMM_MINMAX_FP32_8X16C4__NEONDOT, k_gt_8_strided_a) {
20249 TEST_REQUIRES_ARM_NEON_DOT;
20250 for (size_t k = 9; k < 16; k++) {
20251 GemmMicrokernelTester()
20252 .mr(8)
20253 .nr(16)
20254 .kr(4)
20255 .sr(1)
20256 .m(8)
20257 .n(16)
20258 .k(k)
20259 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080020260 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_8x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020261 }
20262 }
20263
20264 TEST(QC8_GEMM_MINMAX_FP32_8X16C4__NEONDOT, k_gt_8_subtile) {
20265 TEST_REQUIRES_ARM_NEON_DOT;
20266 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080020267 for (uint32_t n = 1; n <= 16; n++) {
20268 for (uint32_t m = 1; m <= 8; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020269 GemmMicrokernelTester()
20270 .mr(8)
20271 .nr(16)
20272 .kr(4)
20273 .sr(1)
20274 .m(m)
20275 .n(n)
20276 .k(k)
20277 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080020278 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_8x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020279 }
20280 }
20281 }
20282 }
20283
20284 TEST(QC8_GEMM_MINMAX_FP32_8X16C4__NEONDOT, k_div_8) {
20285 TEST_REQUIRES_ARM_NEON_DOT;
20286 for (size_t k = 16; k <= 80; k += 8) {
20287 GemmMicrokernelTester()
20288 .mr(8)
20289 .nr(16)
20290 .kr(4)
20291 .sr(1)
20292 .m(8)
20293 .n(16)
20294 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080020295 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_8x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020296 }
20297 }
20298
20299 TEST(QC8_GEMM_MINMAX_FP32_8X16C4__NEONDOT, k_div_8_strided_a) {
20300 TEST_REQUIRES_ARM_NEON_DOT;
20301 for (size_t k = 16; k <= 80; k += 8) {
20302 GemmMicrokernelTester()
20303 .mr(8)
20304 .nr(16)
20305 .kr(4)
20306 .sr(1)
20307 .m(8)
20308 .n(16)
20309 .k(k)
20310 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080020311 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_8x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020312 }
20313 }
20314
20315 TEST(QC8_GEMM_MINMAX_FP32_8X16C4__NEONDOT, k_div_8_subtile) {
20316 TEST_REQUIRES_ARM_NEON_DOT;
20317 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080020318 for (uint32_t n = 1; n <= 16; n++) {
20319 for (uint32_t m = 1; m <= 8; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020320 GemmMicrokernelTester()
20321 .mr(8)
20322 .nr(16)
20323 .kr(4)
20324 .sr(1)
20325 .m(m)
20326 .n(n)
20327 .k(k)
20328 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080020329 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_8x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020330 }
20331 }
20332 }
20333 }
20334
20335 TEST(QC8_GEMM_MINMAX_FP32_8X16C4__NEONDOT, n_gt_16) {
20336 TEST_REQUIRES_ARM_NEON_DOT;
20337 for (uint32_t n = 17; n < 32; n++) {
20338 for (size_t k = 1; k <= 40; k += 9) {
20339 GemmMicrokernelTester()
20340 .mr(8)
20341 .nr(16)
20342 .kr(4)
20343 .sr(1)
20344 .m(8)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080020345 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020346 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080020347 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_8x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020348 }
20349 }
20350 }
20351
20352 TEST(QC8_GEMM_MINMAX_FP32_8X16C4__NEONDOT, n_gt_16_strided_cn) {
20353 TEST_REQUIRES_ARM_NEON_DOT;
20354 for (uint32_t n = 17; n < 32; n++) {
20355 for (size_t k = 1; k <= 40; k += 9) {
20356 GemmMicrokernelTester()
20357 .mr(8)
20358 .nr(16)
20359 .kr(4)
20360 .sr(1)
20361 .m(8)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080020362 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020363 .k(k)
20364 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080020365 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_8x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020366 }
20367 }
20368 }
20369
20370 TEST(QC8_GEMM_MINMAX_FP32_8X16C4__NEONDOT, n_gt_16_strided_a) {
20371 TEST_REQUIRES_ARM_NEON_DOT;
20372 for (uint32_t n = 17; n < 32; n++) {
20373 for (size_t k = 1; k <= 40; k += 9) {
20374 GemmMicrokernelTester()
20375 .mr(8)
20376 .nr(16)
20377 .kr(4)
20378 .sr(1)
20379 .m(8)
20380 .n(n)
20381 .k(k)
20382 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080020383 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_8x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020384 }
20385 }
20386 }
20387
20388 TEST(QC8_GEMM_MINMAX_FP32_8X16C4__NEONDOT, n_gt_16_subtile) {
20389 TEST_REQUIRES_ARM_NEON_DOT;
20390 for (uint32_t n = 17; n < 32; n++) {
20391 for (size_t k = 1; k <= 40; k += 9) {
20392 for (uint32_t m = 1; m <= 8; m++) {
20393 GemmMicrokernelTester()
20394 .mr(8)
20395 .nr(16)
20396 .kr(4)
20397 .sr(1)
20398 .m(m)
20399 .n(n)
20400 .k(k)
20401 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080020402 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_8x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020403 }
20404 }
20405 }
20406 }
20407
20408 TEST(QC8_GEMM_MINMAX_FP32_8X16C4__NEONDOT, n_div_16) {
20409 TEST_REQUIRES_ARM_NEON_DOT;
20410 for (uint32_t n = 32; n <= 48; n += 16) {
20411 for (size_t k = 1; k <= 40; k += 9) {
20412 GemmMicrokernelTester()
20413 .mr(8)
20414 .nr(16)
20415 .kr(4)
20416 .sr(1)
20417 .m(8)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080020418 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020419 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080020420 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_8x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020421 }
20422 }
20423 }
20424
20425 TEST(QC8_GEMM_MINMAX_FP32_8X16C4__NEONDOT, n_div_16_strided_cn) {
20426 TEST_REQUIRES_ARM_NEON_DOT;
20427 for (uint32_t n = 32; n <= 48; n += 16) {
20428 for (size_t k = 1; k <= 40; k += 9) {
20429 GemmMicrokernelTester()
20430 .mr(8)
20431 .nr(16)
20432 .kr(4)
20433 .sr(1)
20434 .m(8)
20435 .n(n)
20436 .k(k)
20437 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080020438 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_8x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020439 }
20440 }
20441 }
20442
20443 TEST(QC8_GEMM_MINMAX_FP32_8X16C4__NEONDOT, n_div_16_strided_a) {
20444 TEST_REQUIRES_ARM_NEON_DOT;
20445 for (uint32_t n = 32; n <= 48; n += 16) {
20446 for (size_t k = 1; k <= 40; k += 9) {
20447 GemmMicrokernelTester()
20448 .mr(8)
20449 .nr(16)
20450 .kr(4)
20451 .sr(1)
20452 .m(8)
20453 .n(n)
20454 .k(k)
20455 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080020456 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_8x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020457 }
20458 }
20459 }
20460
20461 TEST(QC8_GEMM_MINMAX_FP32_8X16C4__NEONDOT, n_div_16_subtile) {
20462 TEST_REQUIRES_ARM_NEON_DOT;
20463 for (uint32_t n = 32; n <= 48; n += 16) {
20464 for (size_t k = 1; k <= 40; k += 9) {
20465 for (uint32_t m = 1; m <= 8; m++) {
20466 GemmMicrokernelTester()
20467 .mr(8)
20468 .nr(16)
20469 .kr(4)
20470 .sr(1)
20471 .m(m)
20472 .n(n)
20473 .k(k)
20474 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080020475 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_8x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020476 }
20477 }
20478 }
20479 }
20480
20481 TEST(QC8_GEMM_MINMAX_FP32_8X16C4__NEONDOT, strided_cm_subtile) {
20482 TEST_REQUIRES_ARM_NEON_DOT;
20483 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080020484 for (uint32_t n = 1; n <= 16; n++) {
20485 for (uint32_t m = 1; m <= 8; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020486 GemmMicrokernelTester()
20487 .mr(8)
20488 .nr(16)
20489 .kr(4)
20490 .sr(1)
20491 .m(m)
20492 .n(n)
20493 .k(k)
20494 .cm_stride(19)
20495 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080020496 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_8x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020497 }
20498 }
20499 }
20500 }
20501
20502 TEST(QC8_GEMM_MINMAX_FP32_8X16C4__NEONDOT, qmin) {
20503 TEST_REQUIRES_ARM_NEON_DOT;
20504 GemmMicrokernelTester()
20505 .mr(8)
20506 .nr(16)
20507 .kr(4)
20508 .sr(1)
20509 .m(8)
20510 .n(16)
20511 .k(8)
20512 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080020513 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_8x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020514 }
20515
20516 TEST(QC8_GEMM_MINMAX_FP32_8X16C4__NEONDOT, qmax) {
20517 TEST_REQUIRES_ARM_NEON_DOT;
20518 GemmMicrokernelTester()
20519 .mr(8)
20520 .nr(16)
20521 .kr(4)
20522 .sr(1)
20523 .m(8)
20524 .n(16)
20525 .k(8)
20526 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080020527 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_8x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020528 }
20529
20530 TEST(QC8_GEMM_MINMAX_FP32_8X16C4__NEONDOT, strided_cm) {
20531 TEST_REQUIRES_ARM_NEON_DOT;
20532 GemmMicrokernelTester()
20533 .mr(8)
20534 .nr(16)
20535 .kr(4)
20536 .sr(1)
20537 .m(8)
20538 .n(16)
20539 .k(8)
20540 .cm_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080020541 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_8x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020542 }
20543#endif // XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64
20544
20545
20546#if XNN_ARCH_X86 || XNN_ARCH_X86_64
20547 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_eq_8) {
20548 TEST_REQUIRES_X86_SSE2;
20549 GemmMicrokernelTester()
20550 .mr(1)
20551 .nr(4)
20552 .kr(2)
20553 .sr(1)
20554 .m(1)
20555 .n(4)
20556 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080020557 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020558 }
20559
20560 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, strided_cn) {
20561 TEST_REQUIRES_X86_SSE2;
20562 GemmMicrokernelTester()
20563 .mr(1)
20564 .nr(4)
20565 .kr(2)
20566 .sr(1)
20567 .m(1)
20568 .n(4)
20569 .k(8)
20570 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080020571 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020572 }
20573
20574 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_eq_8_strided_a) {
20575 TEST_REQUIRES_X86_SSE2;
20576 GemmMicrokernelTester()
20577 .mr(1)
20578 .nr(4)
20579 .kr(2)
20580 .sr(1)
20581 .m(1)
20582 .n(4)
20583 .k(8)
20584 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080020585 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020586 }
20587
20588 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_eq_8_subtile) {
20589 TEST_REQUIRES_X86_SSE2;
Zhi An Ng83844ae2022-01-14 09:52:25 -080020590 for (uint32_t n = 1; n <= 4; n++) {
20591 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020592 GemmMicrokernelTester()
20593 .mr(1)
20594 .nr(4)
20595 .kr(2)
20596 .sr(1)
20597 .m(m)
20598 .n(n)
20599 .k(8)
20600 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080020601 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020602 }
20603 }
20604 }
20605
20606 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_eq_8_subtile_m) {
20607 TEST_REQUIRES_X86_SSE2;
20608 for (uint32_t m = 1; m <= 1; m++) {
20609 GemmMicrokernelTester()
20610 .mr(1)
20611 .nr(4)
20612 .kr(2)
20613 .sr(1)
20614 .m(m)
20615 .n(4)
20616 .k(8)
20617 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080020618 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020619 }
20620 }
20621
20622 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_eq_8_subtile_n) {
20623 TEST_REQUIRES_X86_SSE2;
20624 for (uint32_t n = 1; n <= 4; n++) {
20625 GemmMicrokernelTester()
20626 .mr(1)
20627 .nr(4)
20628 .kr(2)
20629 .sr(1)
20630 .m(1)
20631 .n(n)
20632 .k(8)
20633 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080020634 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020635 }
20636 }
20637
20638 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_lt_8) {
20639 TEST_REQUIRES_X86_SSE2;
20640 for (size_t k = 1; k < 8; k++) {
20641 GemmMicrokernelTester()
20642 .mr(1)
20643 .nr(4)
20644 .kr(2)
20645 .sr(1)
20646 .m(1)
20647 .n(4)
20648 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080020649 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020650 }
20651 }
20652
20653 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_lt_8_strided_a) {
20654 TEST_REQUIRES_X86_SSE2;
20655 for (size_t k = 1; k < 8; k++) {
20656 GemmMicrokernelTester()
20657 .mr(1)
20658 .nr(4)
20659 .kr(2)
20660 .sr(1)
20661 .m(1)
20662 .n(4)
20663 .k(k)
20664 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080020665 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020666 }
20667 }
20668
20669 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_lt_8_subtile) {
20670 TEST_REQUIRES_X86_SSE2;
20671 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080020672 for (uint32_t n = 1; n <= 4; n++) {
20673 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020674 GemmMicrokernelTester()
20675 .mr(1)
20676 .nr(4)
20677 .kr(2)
20678 .sr(1)
20679 .m(m)
20680 .n(n)
20681 .k(k)
20682 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080020683 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020684 }
20685 }
20686 }
20687 }
20688
20689 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_gt_8) {
20690 TEST_REQUIRES_X86_SSE2;
20691 for (size_t k = 9; k < 16; k++) {
20692 GemmMicrokernelTester()
20693 .mr(1)
20694 .nr(4)
20695 .kr(2)
20696 .sr(1)
20697 .m(1)
20698 .n(4)
20699 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080020700 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020701 }
20702 }
20703
20704 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_gt_8_strided_a) {
20705 TEST_REQUIRES_X86_SSE2;
20706 for (size_t k = 9; k < 16; k++) {
20707 GemmMicrokernelTester()
20708 .mr(1)
20709 .nr(4)
20710 .kr(2)
20711 .sr(1)
20712 .m(1)
20713 .n(4)
20714 .k(k)
20715 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080020716 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020717 }
20718 }
20719
20720 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_gt_8_subtile) {
20721 TEST_REQUIRES_X86_SSE2;
20722 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080020723 for (uint32_t n = 1; n <= 4; n++) {
20724 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020725 GemmMicrokernelTester()
20726 .mr(1)
20727 .nr(4)
20728 .kr(2)
20729 .sr(1)
20730 .m(m)
20731 .n(n)
20732 .k(k)
20733 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080020734 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020735 }
20736 }
20737 }
20738 }
20739
20740 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_div_8) {
20741 TEST_REQUIRES_X86_SSE2;
20742 for (size_t k = 16; k <= 80; k += 8) {
20743 GemmMicrokernelTester()
20744 .mr(1)
20745 .nr(4)
20746 .kr(2)
20747 .sr(1)
20748 .m(1)
20749 .n(4)
20750 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080020751 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020752 }
20753 }
20754
20755 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_div_8_strided_a) {
20756 TEST_REQUIRES_X86_SSE2;
20757 for (size_t k = 16; k <= 80; k += 8) {
20758 GemmMicrokernelTester()
20759 .mr(1)
20760 .nr(4)
20761 .kr(2)
20762 .sr(1)
20763 .m(1)
20764 .n(4)
20765 .k(k)
20766 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080020767 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020768 }
20769 }
20770
20771 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_div_8_subtile) {
20772 TEST_REQUIRES_X86_SSE2;
20773 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080020774 for (uint32_t n = 1; n <= 4; n++) {
20775 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020776 GemmMicrokernelTester()
20777 .mr(1)
20778 .nr(4)
20779 .kr(2)
20780 .sr(1)
20781 .m(m)
20782 .n(n)
20783 .k(k)
20784 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080020785 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020786 }
20787 }
20788 }
20789 }
20790
20791 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, n_gt_4) {
20792 TEST_REQUIRES_X86_SSE2;
20793 for (uint32_t n = 5; n < 8; n++) {
20794 for (size_t k = 1; k <= 40; k += 9) {
20795 GemmMicrokernelTester()
20796 .mr(1)
20797 .nr(4)
20798 .kr(2)
20799 .sr(1)
20800 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080020801 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020802 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080020803 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020804 }
20805 }
20806 }
20807
20808 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, n_gt_4_strided_cn) {
20809 TEST_REQUIRES_X86_SSE2;
20810 for (uint32_t n = 5; n < 8; n++) {
20811 for (size_t k = 1; k <= 40; k += 9) {
20812 GemmMicrokernelTester()
20813 .mr(1)
20814 .nr(4)
20815 .kr(2)
20816 .sr(1)
20817 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080020818 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020819 .k(k)
20820 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080020821 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020822 }
20823 }
20824 }
20825
20826 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, n_gt_4_strided_a) {
20827 TEST_REQUIRES_X86_SSE2;
20828 for (uint32_t n = 5; n < 8; n++) {
20829 for (size_t k = 1; k <= 40; k += 9) {
20830 GemmMicrokernelTester()
20831 .mr(1)
20832 .nr(4)
20833 .kr(2)
20834 .sr(1)
20835 .m(1)
20836 .n(n)
20837 .k(k)
20838 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080020839 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020840 }
20841 }
20842 }
20843
20844 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, n_gt_4_subtile) {
20845 TEST_REQUIRES_X86_SSE2;
20846 for (uint32_t n = 5; n < 8; n++) {
20847 for (size_t k = 1; k <= 40; k += 9) {
20848 for (uint32_t m = 1; m <= 1; m++) {
20849 GemmMicrokernelTester()
20850 .mr(1)
20851 .nr(4)
20852 .kr(2)
20853 .sr(1)
20854 .m(m)
20855 .n(n)
20856 .k(k)
20857 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080020858 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020859 }
20860 }
20861 }
20862 }
20863
20864 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, n_div_4) {
20865 TEST_REQUIRES_X86_SSE2;
20866 for (uint32_t n = 8; n <= 12; n += 4) {
20867 for (size_t k = 1; k <= 40; k += 9) {
20868 GemmMicrokernelTester()
20869 .mr(1)
20870 .nr(4)
20871 .kr(2)
20872 .sr(1)
20873 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080020874 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020875 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080020876 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020877 }
20878 }
20879 }
20880
20881 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, n_div_4_strided_cn) {
20882 TEST_REQUIRES_X86_SSE2;
20883 for (uint32_t n = 8; n <= 12; n += 4) {
20884 for (size_t k = 1; k <= 40; k += 9) {
20885 GemmMicrokernelTester()
20886 .mr(1)
20887 .nr(4)
20888 .kr(2)
20889 .sr(1)
20890 .m(1)
20891 .n(n)
20892 .k(k)
20893 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080020894 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020895 }
20896 }
20897 }
20898
20899 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, n_div_4_strided_a) {
20900 TEST_REQUIRES_X86_SSE2;
20901 for (uint32_t n = 8; n <= 12; n += 4) {
20902 for (size_t k = 1; k <= 40; k += 9) {
20903 GemmMicrokernelTester()
20904 .mr(1)
20905 .nr(4)
20906 .kr(2)
20907 .sr(1)
20908 .m(1)
20909 .n(n)
20910 .k(k)
20911 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080020912 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020913 }
20914 }
20915 }
20916
20917 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, n_div_4_subtile) {
20918 TEST_REQUIRES_X86_SSE2;
20919 for (uint32_t n = 8; n <= 12; n += 4) {
20920 for (size_t k = 1; k <= 40; k += 9) {
20921 for (uint32_t m = 1; m <= 1; m++) {
20922 GemmMicrokernelTester()
20923 .mr(1)
20924 .nr(4)
20925 .kr(2)
20926 .sr(1)
20927 .m(m)
20928 .n(n)
20929 .k(k)
20930 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080020931 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020932 }
20933 }
20934 }
20935 }
20936
20937 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, strided_cm_subtile) {
20938 TEST_REQUIRES_X86_SSE2;
20939 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080020940 for (uint32_t n = 1; n <= 4; n++) {
20941 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020942 GemmMicrokernelTester()
20943 .mr(1)
20944 .nr(4)
20945 .kr(2)
20946 .sr(1)
20947 .m(m)
20948 .n(n)
20949 .k(k)
20950 .cm_stride(7)
20951 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080020952 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020953 }
20954 }
20955 }
20956 }
20957
20958 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, qmin) {
20959 TEST_REQUIRES_X86_SSE2;
20960 GemmMicrokernelTester()
20961 .mr(1)
20962 .nr(4)
20963 .kr(2)
20964 .sr(1)
20965 .m(1)
20966 .n(4)
20967 .k(8)
20968 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080020969 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020970 }
20971
20972 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, qmax) {
20973 TEST_REQUIRES_X86_SSE2;
20974 GemmMicrokernelTester()
20975 .mr(1)
20976 .nr(4)
20977 .kr(2)
20978 .sr(1)
20979 .m(1)
20980 .n(4)
20981 .k(8)
20982 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080020983 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020984 }
20985
20986 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, strided_cm) {
20987 TEST_REQUIRES_X86_SSE2;
20988 GemmMicrokernelTester()
20989 .mr(1)
20990 .nr(4)
20991 .kr(2)
20992 .sr(1)
20993 .m(1)
20994 .n(4)
20995 .k(8)
20996 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080020997 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020998 }
20999#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
21000
21001
21002#if XNN_ARCH_X86 || XNN_ARCH_X86_64
21003 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_eq_8) {
21004 TEST_REQUIRES_X86_SSE2;
21005 GemmMicrokernelTester()
21006 .mr(3)
21007 .nr(4)
21008 .kr(2)
21009 .sr(1)
21010 .m(3)
21011 .n(4)
21012 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080021013 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021014 }
21015
21016 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, strided_cn) {
21017 TEST_REQUIRES_X86_SSE2;
21018 GemmMicrokernelTester()
21019 .mr(3)
21020 .nr(4)
21021 .kr(2)
21022 .sr(1)
21023 .m(3)
21024 .n(4)
21025 .k(8)
21026 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080021027 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021028 }
21029
21030 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_eq_8_strided_a) {
21031 TEST_REQUIRES_X86_SSE2;
21032 GemmMicrokernelTester()
21033 .mr(3)
21034 .nr(4)
21035 .kr(2)
21036 .sr(1)
21037 .m(3)
21038 .n(4)
21039 .k(8)
21040 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080021041 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021042 }
21043
21044 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_eq_8_subtile) {
21045 TEST_REQUIRES_X86_SSE2;
Zhi An Ng83844ae2022-01-14 09:52:25 -080021046 for (uint32_t n = 1; n <= 4; n++) {
21047 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021048 GemmMicrokernelTester()
21049 .mr(3)
21050 .nr(4)
21051 .kr(2)
21052 .sr(1)
21053 .m(m)
21054 .n(n)
21055 .k(8)
21056 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080021057 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021058 }
21059 }
21060 }
21061
21062 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_eq_8_subtile_m) {
21063 TEST_REQUIRES_X86_SSE2;
Marat Dukhan7c1115f2022-01-04 17:18:41 -080021064 for (uint32_t m = 1; m <= 3; m++) {
21065 GemmMicrokernelTester()
21066 .mr(3)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021067 .nr(4)
21068 .kr(2)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080021069 .sr(1)
21070 .m(m)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021071 .n(4)
21072 .k(8)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080021073 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080021074 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080021075 }
21076 }
21077
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021078 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_eq_8_subtile_n) {
21079 TEST_REQUIRES_X86_SSE2;
21080 for (uint32_t n = 1; n <= 4; n++) {
Marat Dukhan7c1115f2022-01-04 17:18:41 -080021081 GemmMicrokernelTester()
21082 .mr(3)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021083 .nr(4)
21084 .kr(2)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080021085 .sr(1)
21086 .m(3)
21087 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021088 .k(8)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080021089 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080021090 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080021091 }
21092 }
21093
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021094 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_lt_8) {
21095 TEST_REQUIRES_X86_SSE2;
21096 for (size_t k = 1; k < 8; k++) {
Marat Dukhan7c1115f2022-01-04 17:18:41 -080021097 GemmMicrokernelTester()
21098 .mr(3)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021099 .nr(4)
21100 .kr(2)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080021101 .sr(1)
21102 .m(3)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021103 .n(4)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080021104 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080021105 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080021106 }
21107 }
21108
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021109 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_lt_8_strided_a) {
21110 TEST_REQUIRES_X86_SSE2;
21111 for (size_t k = 1; k < 8; k++) {
Marat Dukhan7c1115f2022-01-04 17:18:41 -080021112 GemmMicrokernelTester()
21113 .mr(3)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021114 .nr(4)
21115 .kr(2)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080021116 .sr(1)
21117 .m(3)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021118 .n(4)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080021119 .k(k)
21120 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080021121 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080021122 }
21123 }
21124
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021125 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_lt_8_subtile) {
21126 TEST_REQUIRES_X86_SSE2;
21127 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080021128 for (uint32_t n = 1; n <= 4; n++) {
21129 for (uint32_t m = 1; m <= 3; m++) {
Marat Dukhan7c1115f2022-01-04 17:18:41 -080021130 GemmMicrokernelTester()
21131 .mr(3)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021132 .nr(4)
21133 .kr(2)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080021134 .sr(1)
21135 .m(m)
21136 .n(n)
21137 .k(k)
21138 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080021139 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080021140 }
21141 }
21142 }
21143 }
21144
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021145 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_gt_8) {
21146 TEST_REQUIRES_X86_SSE2;
21147 for (size_t k = 9; k < 16; k++) {
21148 GemmMicrokernelTester()
21149 .mr(3)
21150 .nr(4)
21151 .kr(2)
21152 .sr(1)
21153 .m(3)
21154 .n(4)
21155 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080021156 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021157 }
21158 }
21159
21160 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_gt_8_strided_a) {
21161 TEST_REQUIRES_X86_SSE2;
21162 for (size_t k = 9; k < 16; k++) {
21163 GemmMicrokernelTester()
21164 .mr(3)
21165 .nr(4)
21166 .kr(2)
21167 .sr(1)
21168 .m(3)
21169 .n(4)
21170 .k(k)
21171 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080021172 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021173 }
21174 }
21175
21176 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_gt_8_subtile) {
21177 TEST_REQUIRES_X86_SSE2;
21178 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080021179 for (uint32_t n = 1; n <= 4; n++) {
21180 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021181 GemmMicrokernelTester()
21182 .mr(3)
21183 .nr(4)
21184 .kr(2)
21185 .sr(1)
21186 .m(m)
21187 .n(n)
21188 .k(k)
21189 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080021190 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021191 }
Marat Dukhan7c1115f2022-01-04 17:18:41 -080021192 }
21193 }
21194 }
21195
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021196 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_div_8) {
21197 TEST_REQUIRES_X86_SSE2;
21198 for (size_t k = 16; k <= 80; k += 8) {
21199 GemmMicrokernelTester()
21200 .mr(3)
21201 .nr(4)
21202 .kr(2)
21203 .sr(1)
21204 .m(3)
21205 .n(4)
21206 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080021207 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021208 }
21209 }
21210
21211 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_div_8_strided_a) {
21212 TEST_REQUIRES_X86_SSE2;
21213 for (size_t k = 16; k <= 80; k += 8) {
21214 GemmMicrokernelTester()
21215 .mr(3)
21216 .nr(4)
21217 .kr(2)
21218 .sr(1)
21219 .m(3)
21220 .n(4)
21221 .k(k)
21222 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080021223 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021224 }
21225 }
21226
21227 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_div_8_subtile) {
21228 TEST_REQUIRES_X86_SSE2;
21229 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080021230 for (uint32_t n = 1; n <= 4; n++) {
21231 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021232 GemmMicrokernelTester()
21233 .mr(3)
21234 .nr(4)
21235 .kr(2)
21236 .sr(1)
21237 .m(m)
21238 .n(n)
21239 .k(k)
21240 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080021241 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021242 }
Marat Dukhan7c1115f2022-01-04 17:18:41 -080021243 }
21244 }
21245 }
21246
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021247 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, n_gt_4) {
21248 TEST_REQUIRES_X86_SSE2;
21249 for (uint32_t n = 5; n < 8; n++) {
21250 for (size_t k = 1; k <= 40; k += 9) {
Marat Dukhan7c1115f2022-01-04 17:18:41 -080021251 GemmMicrokernelTester()
21252 .mr(3)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021253 .nr(4)
21254 .kr(2)
21255 .sr(1)
21256 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080021257 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021258 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080021259 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021260 }
21261 }
21262 }
21263
21264 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, n_gt_4_strided_cn) {
21265 TEST_REQUIRES_X86_SSE2;
21266 for (uint32_t n = 5; n < 8; n++) {
21267 for (size_t k = 1; k <= 40; k += 9) {
21268 GemmMicrokernelTester()
21269 .mr(3)
21270 .nr(4)
21271 .kr(2)
21272 .sr(1)
21273 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080021274 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021275 .k(k)
21276 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080021277 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021278 }
21279 }
21280 }
21281
21282 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, n_gt_4_strided_a) {
21283 TEST_REQUIRES_X86_SSE2;
21284 for (uint32_t n = 5; n < 8; n++) {
21285 for (size_t k = 1; k <= 40; k += 9) {
21286 GemmMicrokernelTester()
21287 .mr(3)
21288 .nr(4)
21289 .kr(2)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080021290 .sr(1)
21291 .m(3)
21292 .n(n)
21293 .k(k)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021294 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080021295 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080021296 }
21297 }
21298 }
21299
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021300 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, n_gt_4_subtile) {
21301 TEST_REQUIRES_X86_SSE2;
21302 for (uint32_t n = 5; n < 8; n++) {
21303 for (size_t k = 1; k <= 40; k += 9) {
Marat Dukhan7c1115f2022-01-04 17:18:41 -080021304 for (uint32_t m = 1; m <= 3; m++) {
21305 GemmMicrokernelTester()
21306 .mr(3)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021307 .nr(4)
21308 .kr(2)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080021309 .sr(1)
21310 .m(m)
21311 .n(n)
21312 .k(k)
21313 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080021314 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080021315 }
21316 }
21317 }
21318 }
21319
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021320 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, n_div_4) {
21321 TEST_REQUIRES_X86_SSE2;
21322 for (uint32_t n = 8; n <= 12; n += 4) {
21323 for (size_t k = 1; k <= 40; k += 9) {
Marat Dukhan7c1115f2022-01-04 17:18:41 -080021324 GemmMicrokernelTester()
21325 .mr(3)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021326 .nr(4)
21327 .kr(2)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080021328 .sr(1)
21329 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080021330 .n(n)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080021331 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080021332 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080021333 }
21334 }
21335 }
21336
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021337 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, n_div_4_strided_cn) {
21338 TEST_REQUIRES_X86_SSE2;
21339 for (uint32_t n = 8; n <= 12; n += 4) {
21340 for (size_t k = 1; k <= 40; k += 9) {
Marat Dukhan7c1115f2022-01-04 17:18:41 -080021341 GemmMicrokernelTester()
21342 .mr(3)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021343 .nr(4)
21344 .kr(2)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080021345 .sr(1)
21346 .m(3)
21347 .n(n)
21348 .k(k)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021349 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080021350 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080021351 }
21352 }
21353 }
21354
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021355 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, n_div_4_strided_a) {
21356 TEST_REQUIRES_X86_SSE2;
21357 for (uint32_t n = 8; n <= 12; n += 4) {
21358 for (size_t k = 1; k <= 40; k += 9) {
Marat Dukhan7c1115f2022-01-04 17:18:41 -080021359 GemmMicrokernelTester()
21360 .mr(3)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021361 .nr(4)
21362 .kr(2)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080021363 .sr(1)
21364 .m(3)
21365 .n(n)
21366 .k(k)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021367 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080021368 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080021369 }
21370 }
21371 }
21372
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021373 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, n_div_4_subtile) {
21374 TEST_REQUIRES_X86_SSE2;
21375 for (uint32_t n = 8; n <= 12; n += 4) {
21376 for (size_t k = 1; k <= 40; k += 9) {
Marat Dukhan7c1115f2022-01-04 17:18:41 -080021377 for (uint32_t m = 1; m <= 3; m++) {
21378 GemmMicrokernelTester()
21379 .mr(3)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021380 .nr(4)
21381 .kr(2)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080021382 .sr(1)
21383 .m(m)
21384 .n(n)
21385 .k(k)
21386 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080021387 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080021388 }
21389 }
21390 }
21391 }
21392
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021393 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, strided_cm_subtile) {
21394 TEST_REQUIRES_X86_SSE2;
21395 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080021396 for (uint32_t n = 1; n <= 4; n++) {
21397 for (uint32_t m = 1; m <= 3; m++) {
Marat Dukhan7c1115f2022-01-04 17:18:41 -080021398 GemmMicrokernelTester()
21399 .mr(3)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021400 .nr(4)
21401 .kr(2)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080021402 .sr(1)
21403 .m(m)
21404 .n(n)
21405 .k(k)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021406 .cm_stride(7)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080021407 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080021408 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080021409 }
21410 }
21411 }
21412 }
21413
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021414 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, qmin) {
21415 TEST_REQUIRES_X86_SSE2;
Marat Dukhan7c1115f2022-01-04 17:18:41 -080021416 GemmMicrokernelTester()
21417 .mr(3)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021418 .nr(4)
21419 .kr(2)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080021420 .sr(1)
21421 .m(3)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021422 .n(4)
21423 .k(8)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080021424 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080021425 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080021426 }
21427
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021428 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, qmax) {
21429 TEST_REQUIRES_X86_SSE2;
Marat Dukhan7c1115f2022-01-04 17:18:41 -080021430 GemmMicrokernelTester()
21431 .mr(3)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021432 .nr(4)
21433 .kr(2)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080021434 .sr(1)
21435 .m(3)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021436 .n(4)
21437 .k(8)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080021438 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080021439 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080021440 }
21441
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021442 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, strided_cm) {
21443 TEST_REQUIRES_X86_SSE2;
Marat Dukhan7c1115f2022-01-04 17:18:41 -080021444 GemmMicrokernelTester()
21445 .mr(3)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021446 .nr(4)
21447 .kr(2)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080021448 .sr(1)
21449 .m(3)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021450 .n(4)
21451 .k(8)
21452 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080021453 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080021454 }
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021455#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
21456
21457
21458#if XNN_ARCH_X86 || XNN_ARCH_X86_64
21459 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_eq_8) {
21460 TEST_REQUIRES_X86_SSE41;
21461 GemmMicrokernelTester()
21462 .mr(2)
21463 .nr(4)
21464 .kr(2)
21465 .sr(1)
21466 .m(2)
21467 .n(4)
21468 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080021469 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021470 }
21471
21472 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, strided_cn) {
21473 TEST_REQUIRES_X86_SSE41;
21474 GemmMicrokernelTester()
21475 .mr(2)
21476 .nr(4)
21477 .kr(2)
21478 .sr(1)
21479 .m(2)
21480 .n(4)
21481 .k(8)
21482 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080021483 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021484 }
21485
21486 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_eq_8_strided_a) {
21487 TEST_REQUIRES_X86_SSE41;
21488 GemmMicrokernelTester()
21489 .mr(2)
21490 .nr(4)
21491 .kr(2)
21492 .sr(1)
21493 .m(2)
21494 .n(4)
21495 .k(8)
21496 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080021497 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021498 }
21499
21500 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_eq_8_subtile) {
21501 TEST_REQUIRES_X86_SSE41;
Zhi An Ng83844ae2022-01-14 09:52:25 -080021502 for (uint32_t n = 1; n <= 4; n++) {
21503 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021504 GemmMicrokernelTester()
21505 .mr(2)
21506 .nr(4)
21507 .kr(2)
21508 .sr(1)
21509 .m(m)
21510 .n(n)
21511 .k(8)
21512 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080021513 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021514 }
21515 }
21516 }
21517
21518 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_eq_8_subtile_m) {
21519 TEST_REQUIRES_X86_SSE41;
21520 for (uint32_t m = 1; m <= 2; m++) {
21521 GemmMicrokernelTester()
21522 .mr(2)
21523 .nr(4)
21524 .kr(2)
21525 .sr(1)
21526 .m(m)
21527 .n(4)
21528 .k(8)
21529 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080021530 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021531 }
21532 }
21533
21534 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_eq_8_subtile_n) {
21535 TEST_REQUIRES_X86_SSE41;
21536 for (uint32_t n = 1; n <= 4; n++) {
21537 GemmMicrokernelTester()
21538 .mr(2)
21539 .nr(4)
21540 .kr(2)
21541 .sr(1)
21542 .m(2)
21543 .n(n)
21544 .k(8)
21545 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080021546 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021547 }
21548 }
21549
21550 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_lt_8) {
21551 TEST_REQUIRES_X86_SSE41;
21552 for (size_t k = 1; k < 8; k++) {
21553 GemmMicrokernelTester()
21554 .mr(2)
21555 .nr(4)
21556 .kr(2)
21557 .sr(1)
21558 .m(2)
21559 .n(4)
21560 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080021561 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021562 }
21563 }
21564
21565 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_lt_8_strided_a) {
21566 TEST_REQUIRES_X86_SSE41;
21567 for (size_t k = 1; k < 8; k++) {
21568 GemmMicrokernelTester()
21569 .mr(2)
21570 .nr(4)
21571 .kr(2)
21572 .sr(1)
21573 .m(2)
21574 .n(4)
21575 .k(k)
21576 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080021577 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021578 }
21579 }
21580
21581 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_lt_8_subtile) {
21582 TEST_REQUIRES_X86_SSE41;
21583 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080021584 for (uint32_t n = 1; n <= 4; n++) {
21585 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021586 GemmMicrokernelTester()
21587 .mr(2)
21588 .nr(4)
21589 .kr(2)
21590 .sr(1)
21591 .m(m)
21592 .n(n)
21593 .k(k)
21594 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080021595 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021596 }
21597 }
21598 }
21599 }
21600
21601 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_gt_8) {
21602 TEST_REQUIRES_X86_SSE41;
21603 for (size_t k = 9; k < 16; k++) {
21604 GemmMicrokernelTester()
21605 .mr(2)
21606 .nr(4)
21607 .kr(2)
21608 .sr(1)
21609 .m(2)
21610 .n(4)
21611 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080021612 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021613 }
21614 }
21615
21616 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_gt_8_strided_a) {
21617 TEST_REQUIRES_X86_SSE41;
21618 for (size_t k = 9; k < 16; k++) {
21619 GemmMicrokernelTester()
21620 .mr(2)
21621 .nr(4)
21622 .kr(2)
21623 .sr(1)
21624 .m(2)
21625 .n(4)
21626 .k(k)
21627 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080021628 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021629 }
21630 }
21631
21632 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_gt_8_subtile) {
21633 TEST_REQUIRES_X86_SSE41;
21634 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080021635 for (uint32_t n = 1; n <= 4; n++) {
21636 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021637 GemmMicrokernelTester()
21638 .mr(2)
21639 .nr(4)
21640 .kr(2)
21641 .sr(1)
21642 .m(m)
21643 .n(n)
21644 .k(k)
21645 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080021646 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021647 }
21648 }
21649 }
21650 }
21651
21652 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_div_8) {
21653 TEST_REQUIRES_X86_SSE41;
21654 for (size_t k = 16; k <= 80; k += 8) {
21655 GemmMicrokernelTester()
21656 .mr(2)
21657 .nr(4)
21658 .kr(2)
21659 .sr(1)
21660 .m(2)
21661 .n(4)
21662 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080021663 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021664 }
21665 }
21666
21667 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_div_8_strided_a) {
21668 TEST_REQUIRES_X86_SSE41;
21669 for (size_t k = 16; k <= 80; k += 8) {
21670 GemmMicrokernelTester()
21671 .mr(2)
21672 .nr(4)
21673 .kr(2)
21674 .sr(1)
21675 .m(2)
21676 .n(4)
21677 .k(k)
21678 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080021679 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021680 }
21681 }
21682
21683 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_div_8_subtile) {
21684 TEST_REQUIRES_X86_SSE41;
21685 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080021686 for (uint32_t n = 1; n <= 4; n++) {
21687 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021688 GemmMicrokernelTester()
21689 .mr(2)
21690 .nr(4)
21691 .kr(2)
21692 .sr(1)
21693 .m(m)
21694 .n(n)
21695 .k(k)
21696 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080021697 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021698 }
21699 }
21700 }
21701 }
21702
21703 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, n_gt_4) {
21704 TEST_REQUIRES_X86_SSE41;
21705 for (uint32_t n = 5; n < 8; n++) {
21706 for (size_t k = 1; k <= 40; k += 9) {
21707 GemmMicrokernelTester()
21708 .mr(2)
21709 .nr(4)
21710 .kr(2)
21711 .sr(1)
21712 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080021713 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021714 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080021715 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021716 }
21717 }
21718 }
21719
21720 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, n_gt_4_strided_cn) {
21721 TEST_REQUIRES_X86_SSE41;
21722 for (uint32_t n = 5; n < 8; n++) {
21723 for (size_t k = 1; k <= 40; k += 9) {
21724 GemmMicrokernelTester()
21725 .mr(2)
21726 .nr(4)
21727 .kr(2)
21728 .sr(1)
21729 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080021730 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021731 .k(k)
21732 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080021733 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021734 }
21735 }
21736 }
21737
21738 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, n_gt_4_strided_a) {
21739 TEST_REQUIRES_X86_SSE41;
21740 for (uint32_t n = 5; n < 8; n++) {
21741 for (size_t k = 1; k <= 40; k += 9) {
21742 GemmMicrokernelTester()
21743 .mr(2)
21744 .nr(4)
21745 .kr(2)
21746 .sr(1)
21747 .m(2)
21748 .n(n)
21749 .k(k)
21750 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080021751 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021752 }
21753 }
21754 }
21755
21756 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, n_gt_4_subtile) {
21757 TEST_REQUIRES_X86_SSE41;
21758 for (uint32_t n = 5; n < 8; n++) {
21759 for (size_t k = 1; k <= 40; k += 9) {
21760 for (uint32_t m = 1; m <= 2; m++) {
21761 GemmMicrokernelTester()
21762 .mr(2)
21763 .nr(4)
21764 .kr(2)
21765 .sr(1)
21766 .m(m)
21767 .n(n)
21768 .k(k)
21769 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080021770 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021771 }
21772 }
21773 }
21774 }
21775
21776 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, n_div_4) {
21777 TEST_REQUIRES_X86_SSE41;
21778 for (uint32_t n = 8; n <= 12; n += 4) {
21779 for (size_t k = 1; k <= 40; k += 9) {
21780 GemmMicrokernelTester()
21781 .mr(2)
21782 .nr(4)
21783 .kr(2)
21784 .sr(1)
21785 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080021786 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021787 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080021788 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021789 }
21790 }
21791 }
21792
21793 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, n_div_4_strided_cn) {
21794 TEST_REQUIRES_X86_SSE41;
21795 for (uint32_t n = 8; n <= 12; n += 4) {
21796 for (size_t k = 1; k <= 40; k += 9) {
21797 GemmMicrokernelTester()
21798 .mr(2)
21799 .nr(4)
21800 .kr(2)
21801 .sr(1)
21802 .m(2)
21803 .n(n)
21804 .k(k)
21805 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080021806 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021807 }
21808 }
21809 }
21810
21811 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, n_div_4_strided_a) {
21812 TEST_REQUIRES_X86_SSE41;
21813 for (uint32_t n = 8; n <= 12; n += 4) {
21814 for (size_t k = 1; k <= 40; k += 9) {
21815 GemmMicrokernelTester()
21816 .mr(2)
21817 .nr(4)
21818 .kr(2)
21819 .sr(1)
21820 .m(2)
21821 .n(n)
21822 .k(k)
21823 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080021824 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021825 }
21826 }
21827 }
21828
21829 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, n_div_4_subtile) {
21830 TEST_REQUIRES_X86_SSE41;
21831 for (uint32_t n = 8; n <= 12; n += 4) {
21832 for (size_t k = 1; k <= 40; k += 9) {
21833 for (uint32_t m = 1; m <= 2; m++) {
21834 GemmMicrokernelTester()
21835 .mr(2)
21836 .nr(4)
21837 .kr(2)
21838 .sr(1)
21839 .m(m)
21840 .n(n)
21841 .k(k)
21842 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080021843 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021844 }
21845 }
21846 }
21847 }
21848
21849 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, strided_cm_subtile) {
21850 TEST_REQUIRES_X86_SSE41;
21851 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080021852 for (uint32_t n = 1; n <= 4; n++) {
21853 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021854 GemmMicrokernelTester()
21855 .mr(2)
21856 .nr(4)
21857 .kr(2)
21858 .sr(1)
21859 .m(m)
21860 .n(n)
21861 .k(k)
21862 .cm_stride(7)
21863 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080021864 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021865 }
21866 }
21867 }
21868 }
21869
21870 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, qmin) {
21871 TEST_REQUIRES_X86_SSE41;
21872 GemmMicrokernelTester()
21873 .mr(2)
21874 .nr(4)
21875 .kr(2)
21876 .sr(1)
21877 .m(2)
21878 .n(4)
21879 .k(8)
21880 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080021881 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021882 }
21883
21884 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, qmax) {
21885 TEST_REQUIRES_X86_SSE41;
21886 GemmMicrokernelTester()
21887 .mr(2)
21888 .nr(4)
21889 .kr(2)
21890 .sr(1)
21891 .m(2)
21892 .n(4)
21893 .k(8)
21894 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080021895 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021896 }
21897
21898 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, strided_cm) {
21899 TEST_REQUIRES_X86_SSE41;
21900 GemmMicrokernelTester()
21901 .mr(2)
21902 .nr(4)
21903 .kr(2)
21904 .sr(1)
21905 .m(2)
21906 .n(4)
21907 .k(8)
21908 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080021909 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021910 }
21911#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
21912
21913
21914#if XNN_ARCH_X86 || XNN_ARCH_X86_64
21915 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_eq_8) {
21916 TEST_REQUIRES_X86_SSE41;
21917 GemmMicrokernelTester()
21918 .mr(3)
21919 .nr(4)
21920 .kr(2)
21921 .sr(1)
21922 .m(3)
21923 .n(4)
21924 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080021925 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021926 }
21927
21928 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, strided_cn) {
21929 TEST_REQUIRES_X86_SSE41;
21930 GemmMicrokernelTester()
21931 .mr(3)
21932 .nr(4)
21933 .kr(2)
21934 .sr(1)
21935 .m(3)
21936 .n(4)
21937 .k(8)
21938 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080021939 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021940 }
21941
21942 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_eq_8_strided_a) {
21943 TEST_REQUIRES_X86_SSE41;
21944 GemmMicrokernelTester()
21945 .mr(3)
21946 .nr(4)
21947 .kr(2)
21948 .sr(1)
21949 .m(3)
21950 .n(4)
21951 .k(8)
21952 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080021953 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021954 }
21955
21956 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_eq_8_subtile) {
21957 TEST_REQUIRES_X86_SSE41;
Zhi An Ng83844ae2022-01-14 09:52:25 -080021958 for (uint32_t n = 1; n <= 4; n++) {
21959 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021960 GemmMicrokernelTester()
21961 .mr(3)
21962 .nr(4)
21963 .kr(2)
21964 .sr(1)
21965 .m(m)
21966 .n(n)
21967 .k(8)
21968 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080021969 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021970 }
21971 }
21972 }
21973
21974 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_eq_8_subtile_m) {
21975 TEST_REQUIRES_X86_SSE41;
21976 for (uint32_t m = 1; m <= 3; m++) {
21977 GemmMicrokernelTester()
21978 .mr(3)
21979 .nr(4)
21980 .kr(2)
21981 .sr(1)
21982 .m(m)
21983 .n(4)
21984 .k(8)
21985 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080021986 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021987 }
21988 }
21989
21990 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_eq_8_subtile_n) {
21991 TEST_REQUIRES_X86_SSE41;
21992 for (uint32_t n = 1; n <= 4; n++) {
21993 GemmMicrokernelTester()
21994 .mr(3)
21995 .nr(4)
21996 .kr(2)
21997 .sr(1)
21998 .m(3)
21999 .n(n)
22000 .k(8)
22001 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080022002 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022003 }
22004 }
22005
22006 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_lt_8) {
22007 TEST_REQUIRES_X86_SSE41;
22008 for (size_t k = 1; k < 8; k++) {
22009 GemmMicrokernelTester()
22010 .mr(3)
22011 .nr(4)
22012 .kr(2)
22013 .sr(1)
22014 .m(3)
22015 .n(4)
22016 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080022017 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022018 }
22019 }
22020
22021 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_lt_8_strided_a) {
22022 TEST_REQUIRES_X86_SSE41;
22023 for (size_t k = 1; k < 8; k++) {
22024 GemmMicrokernelTester()
22025 .mr(3)
22026 .nr(4)
22027 .kr(2)
22028 .sr(1)
22029 .m(3)
22030 .n(4)
22031 .k(k)
22032 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080022033 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022034 }
22035 }
22036
22037 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_lt_8_subtile) {
22038 TEST_REQUIRES_X86_SSE41;
22039 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080022040 for (uint32_t n = 1; n <= 4; n++) {
22041 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022042 GemmMicrokernelTester()
22043 .mr(3)
22044 .nr(4)
22045 .kr(2)
22046 .sr(1)
22047 .m(m)
22048 .n(n)
22049 .k(k)
22050 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080022051 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022052 }
22053 }
22054 }
22055 }
22056
22057 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_gt_8) {
22058 TEST_REQUIRES_X86_SSE41;
22059 for (size_t k = 9; k < 16; k++) {
22060 GemmMicrokernelTester()
22061 .mr(3)
22062 .nr(4)
22063 .kr(2)
22064 .sr(1)
22065 .m(3)
22066 .n(4)
22067 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080022068 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022069 }
22070 }
22071
22072 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_gt_8_strided_a) {
22073 TEST_REQUIRES_X86_SSE41;
22074 for (size_t k = 9; k < 16; k++) {
22075 GemmMicrokernelTester()
22076 .mr(3)
22077 .nr(4)
22078 .kr(2)
22079 .sr(1)
22080 .m(3)
22081 .n(4)
22082 .k(k)
22083 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080022084 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022085 }
22086 }
22087
22088 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_gt_8_subtile) {
22089 TEST_REQUIRES_X86_SSE41;
22090 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080022091 for (uint32_t n = 1; n <= 4; n++) {
22092 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022093 GemmMicrokernelTester()
22094 .mr(3)
22095 .nr(4)
22096 .kr(2)
22097 .sr(1)
22098 .m(m)
22099 .n(n)
22100 .k(k)
22101 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080022102 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022103 }
22104 }
22105 }
22106 }
22107
22108 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_div_8) {
22109 TEST_REQUIRES_X86_SSE41;
22110 for (size_t k = 16; k <= 80; k += 8) {
22111 GemmMicrokernelTester()
22112 .mr(3)
22113 .nr(4)
22114 .kr(2)
22115 .sr(1)
22116 .m(3)
22117 .n(4)
22118 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080022119 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022120 }
22121 }
22122
22123 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_div_8_strided_a) {
22124 TEST_REQUIRES_X86_SSE41;
22125 for (size_t k = 16; k <= 80; k += 8) {
22126 GemmMicrokernelTester()
22127 .mr(3)
22128 .nr(4)
22129 .kr(2)
22130 .sr(1)
22131 .m(3)
22132 .n(4)
22133 .k(k)
22134 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080022135 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022136 }
22137 }
22138
22139 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_div_8_subtile) {
22140 TEST_REQUIRES_X86_SSE41;
22141 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080022142 for (uint32_t n = 1; n <= 4; n++) {
22143 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022144 GemmMicrokernelTester()
22145 .mr(3)
22146 .nr(4)
22147 .kr(2)
22148 .sr(1)
22149 .m(m)
22150 .n(n)
22151 .k(k)
22152 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080022153 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022154 }
22155 }
22156 }
22157 }
22158
22159 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, n_gt_4) {
22160 TEST_REQUIRES_X86_SSE41;
22161 for (uint32_t n = 5; n < 8; n++) {
22162 for (size_t k = 1; k <= 40; k += 9) {
22163 GemmMicrokernelTester()
22164 .mr(3)
22165 .nr(4)
22166 .kr(2)
22167 .sr(1)
22168 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080022169 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022170 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080022171 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022172 }
22173 }
22174 }
22175
22176 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, n_gt_4_strided_cn) {
22177 TEST_REQUIRES_X86_SSE41;
22178 for (uint32_t n = 5; n < 8; n++) {
22179 for (size_t k = 1; k <= 40; k += 9) {
22180 GemmMicrokernelTester()
22181 .mr(3)
22182 .nr(4)
22183 .kr(2)
22184 .sr(1)
22185 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080022186 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022187 .k(k)
22188 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080022189 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022190 }
22191 }
22192 }
22193
22194 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, n_gt_4_strided_a) {
22195 TEST_REQUIRES_X86_SSE41;
22196 for (uint32_t n = 5; n < 8; n++) {
22197 for (size_t k = 1; k <= 40; k += 9) {
22198 GemmMicrokernelTester()
22199 .mr(3)
22200 .nr(4)
22201 .kr(2)
22202 .sr(1)
22203 .m(3)
22204 .n(n)
22205 .k(k)
22206 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080022207 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022208 }
22209 }
22210 }
22211
22212 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, n_gt_4_subtile) {
22213 TEST_REQUIRES_X86_SSE41;
22214 for (uint32_t n = 5; n < 8; n++) {
22215 for (size_t k = 1; k <= 40; k += 9) {
22216 for (uint32_t m = 1; m <= 3; m++) {
22217 GemmMicrokernelTester()
22218 .mr(3)
22219 .nr(4)
22220 .kr(2)
22221 .sr(1)
22222 .m(m)
22223 .n(n)
22224 .k(k)
22225 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080022226 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022227 }
22228 }
22229 }
22230 }
22231
22232 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, n_div_4) {
22233 TEST_REQUIRES_X86_SSE41;
22234 for (uint32_t n = 8; n <= 12; n += 4) {
22235 for (size_t k = 1; k <= 40; k += 9) {
22236 GemmMicrokernelTester()
22237 .mr(3)
22238 .nr(4)
22239 .kr(2)
22240 .sr(1)
22241 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080022242 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022243 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080022244 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022245 }
22246 }
22247 }
22248
22249 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, n_div_4_strided_cn) {
22250 TEST_REQUIRES_X86_SSE41;
22251 for (uint32_t n = 8; n <= 12; n += 4) {
22252 for (size_t k = 1; k <= 40; k += 9) {
22253 GemmMicrokernelTester()
22254 .mr(3)
22255 .nr(4)
22256 .kr(2)
22257 .sr(1)
22258 .m(3)
22259 .n(n)
22260 .k(k)
22261 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080022262 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022263 }
22264 }
22265 }
22266
22267 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, n_div_4_strided_a) {
22268 TEST_REQUIRES_X86_SSE41;
22269 for (uint32_t n = 8; n <= 12; n += 4) {
22270 for (size_t k = 1; k <= 40; k += 9) {
22271 GemmMicrokernelTester()
22272 .mr(3)
22273 .nr(4)
22274 .kr(2)
22275 .sr(1)
22276 .m(3)
22277 .n(n)
22278 .k(k)
22279 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080022280 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022281 }
22282 }
22283 }
22284
22285 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, n_div_4_subtile) {
22286 TEST_REQUIRES_X86_SSE41;
22287 for (uint32_t n = 8; n <= 12; n += 4) {
22288 for (size_t k = 1; k <= 40; k += 9) {
22289 for (uint32_t m = 1; m <= 3; m++) {
22290 GemmMicrokernelTester()
22291 .mr(3)
22292 .nr(4)
22293 .kr(2)
22294 .sr(1)
22295 .m(m)
22296 .n(n)
22297 .k(k)
22298 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080022299 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022300 }
22301 }
22302 }
22303 }
22304
22305 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, strided_cm_subtile) {
22306 TEST_REQUIRES_X86_SSE41;
22307 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080022308 for (uint32_t n = 1; n <= 4; n++) {
22309 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022310 GemmMicrokernelTester()
22311 .mr(3)
22312 .nr(4)
22313 .kr(2)
22314 .sr(1)
22315 .m(m)
22316 .n(n)
22317 .k(k)
22318 .cm_stride(7)
22319 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080022320 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022321 }
22322 }
22323 }
22324 }
22325
22326 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, qmin) {
22327 TEST_REQUIRES_X86_SSE41;
22328 GemmMicrokernelTester()
22329 .mr(3)
22330 .nr(4)
22331 .kr(2)
22332 .sr(1)
22333 .m(3)
22334 .n(4)
22335 .k(8)
22336 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080022337 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022338 }
22339
22340 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, qmax) {
22341 TEST_REQUIRES_X86_SSE41;
22342 GemmMicrokernelTester()
22343 .mr(3)
22344 .nr(4)
22345 .kr(2)
22346 .sr(1)
22347 .m(3)
22348 .n(4)
22349 .k(8)
22350 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080022351 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022352 }
22353
22354 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, strided_cm) {
22355 TEST_REQUIRES_X86_SSE41;
22356 GemmMicrokernelTester()
22357 .mr(3)
22358 .nr(4)
22359 .kr(2)
22360 .sr(1)
22361 .m(3)
22362 .n(4)
22363 .k(8)
22364 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080022365 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022366 }
22367#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
22368
22369
22370#if XNN_ARCH_X86 || XNN_ARCH_X86_64
22371 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_eq_8) {
22372 TEST_REQUIRES_X86_SSE41;
22373 GemmMicrokernelTester()
22374 .mr(4)
22375 .nr(4)
22376 .kr(2)
22377 .sr(1)
22378 .m(4)
22379 .n(4)
22380 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080022381 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022382 }
22383
22384 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, strided_cn) {
22385 TEST_REQUIRES_X86_SSE41;
22386 GemmMicrokernelTester()
22387 .mr(4)
22388 .nr(4)
22389 .kr(2)
22390 .sr(1)
22391 .m(4)
22392 .n(4)
22393 .k(8)
22394 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080022395 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022396 }
22397
22398 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_eq_8_strided_a) {
22399 TEST_REQUIRES_X86_SSE41;
22400 GemmMicrokernelTester()
22401 .mr(4)
22402 .nr(4)
22403 .kr(2)
22404 .sr(1)
22405 .m(4)
22406 .n(4)
22407 .k(8)
22408 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080022409 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022410 }
22411
22412 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_eq_8_subtile) {
22413 TEST_REQUIRES_X86_SSE41;
Zhi An Ng83844ae2022-01-14 09:52:25 -080022414 for (uint32_t n = 1; n <= 4; n++) {
22415 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022416 GemmMicrokernelTester()
22417 .mr(4)
22418 .nr(4)
22419 .kr(2)
22420 .sr(1)
22421 .m(m)
22422 .n(n)
22423 .k(8)
22424 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080022425 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022426 }
22427 }
22428 }
22429
22430 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_eq_8_subtile_m) {
22431 TEST_REQUIRES_X86_SSE41;
22432 for (uint32_t m = 1; m <= 4; m++) {
22433 GemmMicrokernelTester()
22434 .mr(4)
22435 .nr(4)
22436 .kr(2)
22437 .sr(1)
22438 .m(m)
22439 .n(4)
22440 .k(8)
22441 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080022442 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022443 }
22444 }
22445
22446 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_eq_8_subtile_n) {
22447 TEST_REQUIRES_X86_SSE41;
22448 for (uint32_t n = 1; n <= 4; n++) {
22449 GemmMicrokernelTester()
22450 .mr(4)
22451 .nr(4)
22452 .kr(2)
22453 .sr(1)
22454 .m(4)
22455 .n(n)
22456 .k(8)
22457 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080022458 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022459 }
22460 }
22461
22462 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_lt_8) {
22463 TEST_REQUIRES_X86_SSE41;
22464 for (size_t k = 1; k < 8; k++) {
22465 GemmMicrokernelTester()
22466 .mr(4)
22467 .nr(4)
22468 .kr(2)
22469 .sr(1)
22470 .m(4)
22471 .n(4)
22472 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080022473 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022474 }
22475 }
22476
22477 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_lt_8_strided_a) {
22478 TEST_REQUIRES_X86_SSE41;
22479 for (size_t k = 1; k < 8; k++) {
22480 GemmMicrokernelTester()
22481 .mr(4)
22482 .nr(4)
22483 .kr(2)
22484 .sr(1)
22485 .m(4)
22486 .n(4)
22487 .k(k)
22488 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080022489 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022490 }
22491 }
22492
22493 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_lt_8_subtile) {
22494 TEST_REQUIRES_X86_SSE41;
22495 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080022496 for (uint32_t n = 1; n <= 4; n++) {
22497 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022498 GemmMicrokernelTester()
22499 .mr(4)
22500 .nr(4)
22501 .kr(2)
22502 .sr(1)
22503 .m(m)
22504 .n(n)
22505 .k(k)
22506 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080022507 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022508 }
22509 }
22510 }
22511 }
22512
22513 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_gt_8) {
22514 TEST_REQUIRES_X86_SSE41;
22515 for (size_t k = 9; k < 16; k++) {
22516 GemmMicrokernelTester()
22517 .mr(4)
22518 .nr(4)
22519 .kr(2)
22520 .sr(1)
22521 .m(4)
22522 .n(4)
22523 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080022524 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022525 }
22526 }
22527
22528 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_gt_8_strided_a) {
22529 TEST_REQUIRES_X86_SSE41;
22530 for (size_t k = 9; k < 16; k++) {
22531 GemmMicrokernelTester()
22532 .mr(4)
22533 .nr(4)
22534 .kr(2)
22535 .sr(1)
22536 .m(4)
22537 .n(4)
22538 .k(k)
22539 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080022540 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022541 }
22542 }
22543
22544 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_gt_8_subtile) {
22545 TEST_REQUIRES_X86_SSE41;
22546 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080022547 for (uint32_t n = 1; n <= 4; n++) {
22548 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022549 GemmMicrokernelTester()
22550 .mr(4)
22551 .nr(4)
22552 .kr(2)
22553 .sr(1)
22554 .m(m)
22555 .n(n)
22556 .k(k)
22557 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080022558 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022559 }
22560 }
22561 }
22562 }
22563
22564 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_div_8) {
22565 TEST_REQUIRES_X86_SSE41;
22566 for (size_t k = 16; k <= 80; k += 8) {
22567 GemmMicrokernelTester()
22568 .mr(4)
22569 .nr(4)
22570 .kr(2)
22571 .sr(1)
22572 .m(4)
22573 .n(4)
22574 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080022575 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022576 }
22577 }
22578
22579 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_div_8_strided_a) {
22580 TEST_REQUIRES_X86_SSE41;
22581 for (size_t k = 16; k <= 80; k += 8) {
22582 GemmMicrokernelTester()
22583 .mr(4)
22584 .nr(4)
22585 .kr(2)
22586 .sr(1)
22587 .m(4)
22588 .n(4)
22589 .k(k)
22590 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080022591 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022592 }
22593 }
22594
22595 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_div_8_subtile) {
22596 TEST_REQUIRES_X86_SSE41;
22597 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080022598 for (uint32_t n = 1; n <= 4; n++) {
22599 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022600 GemmMicrokernelTester()
22601 .mr(4)
22602 .nr(4)
22603 .kr(2)
22604 .sr(1)
22605 .m(m)
22606 .n(n)
22607 .k(k)
22608 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080022609 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022610 }
22611 }
22612 }
22613 }
22614
22615 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, n_gt_4) {
22616 TEST_REQUIRES_X86_SSE41;
22617 for (uint32_t n = 5; n < 8; n++) {
22618 for (size_t k = 1; k <= 40; k += 9) {
22619 GemmMicrokernelTester()
22620 .mr(4)
22621 .nr(4)
22622 .kr(2)
22623 .sr(1)
22624 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080022625 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022626 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080022627 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022628 }
22629 }
22630 }
22631
22632 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, n_gt_4_strided_cn) {
22633 TEST_REQUIRES_X86_SSE41;
22634 for (uint32_t n = 5; n < 8; n++) {
22635 for (size_t k = 1; k <= 40; k += 9) {
22636 GemmMicrokernelTester()
22637 .mr(4)
22638 .nr(4)
22639 .kr(2)
22640 .sr(1)
22641 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080022642 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022643 .k(k)
22644 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080022645 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022646 }
22647 }
22648 }
22649
22650 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, n_gt_4_strided_a) {
22651 TEST_REQUIRES_X86_SSE41;
22652 for (uint32_t n = 5; n < 8; n++) {
22653 for (size_t k = 1; k <= 40; k += 9) {
22654 GemmMicrokernelTester()
22655 .mr(4)
22656 .nr(4)
22657 .kr(2)
22658 .sr(1)
22659 .m(4)
22660 .n(n)
22661 .k(k)
22662 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080022663 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022664 }
22665 }
22666 }
22667
22668 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, n_gt_4_subtile) {
22669 TEST_REQUIRES_X86_SSE41;
22670 for (uint32_t n = 5; n < 8; n++) {
22671 for (size_t k = 1; k <= 40; k += 9) {
22672 for (uint32_t m = 1; m <= 4; m++) {
22673 GemmMicrokernelTester()
22674 .mr(4)
22675 .nr(4)
22676 .kr(2)
22677 .sr(1)
22678 .m(m)
22679 .n(n)
22680 .k(k)
22681 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080022682 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022683 }
22684 }
22685 }
22686 }
22687
22688 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, n_div_4) {
22689 TEST_REQUIRES_X86_SSE41;
22690 for (uint32_t n = 8; n <= 12; n += 4) {
22691 for (size_t k = 1; k <= 40; k += 9) {
22692 GemmMicrokernelTester()
22693 .mr(4)
22694 .nr(4)
22695 .kr(2)
22696 .sr(1)
22697 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080022698 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022699 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080022700 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022701 }
22702 }
22703 }
22704
22705 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, n_div_4_strided_cn) {
22706 TEST_REQUIRES_X86_SSE41;
22707 for (uint32_t n = 8; n <= 12; n += 4) {
22708 for (size_t k = 1; k <= 40; k += 9) {
22709 GemmMicrokernelTester()
22710 .mr(4)
22711 .nr(4)
22712 .kr(2)
22713 .sr(1)
22714 .m(4)
22715 .n(n)
22716 .k(k)
22717 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080022718 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022719 }
22720 }
22721 }
22722
22723 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, n_div_4_strided_a) {
22724 TEST_REQUIRES_X86_SSE41;
22725 for (uint32_t n = 8; n <= 12; n += 4) {
22726 for (size_t k = 1; k <= 40; k += 9) {
22727 GemmMicrokernelTester()
22728 .mr(4)
22729 .nr(4)
22730 .kr(2)
22731 .sr(1)
22732 .m(4)
22733 .n(n)
22734 .k(k)
22735 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080022736 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022737 }
22738 }
22739 }
22740
22741 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, n_div_4_subtile) {
22742 TEST_REQUIRES_X86_SSE41;
22743 for (uint32_t n = 8; n <= 12; n += 4) {
22744 for (size_t k = 1; k <= 40; k += 9) {
22745 for (uint32_t m = 1; m <= 4; m++) {
22746 GemmMicrokernelTester()
22747 .mr(4)
22748 .nr(4)
22749 .kr(2)
22750 .sr(1)
22751 .m(m)
22752 .n(n)
22753 .k(k)
22754 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080022755 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022756 }
22757 }
22758 }
22759 }
22760
22761 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, strided_cm_subtile) {
22762 TEST_REQUIRES_X86_SSE41;
22763 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080022764 for (uint32_t n = 1; n <= 4; n++) {
22765 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022766 GemmMicrokernelTester()
22767 .mr(4)
22768 .nr(4)
22769 .kr(2)
22770 .sr(1)
22771 .m(m)
22772 .n(n)
22773 .k(k)
22774 .cm_stride(7)
22775 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080022776 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022777 }
22778 }
22779 }
22780 }
22781
22782 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, qmin) {
22783 TEST_REQUIRES_X86_SSE41;
22784 GemmMicrokernelTester()
22785 .mr(4)
22786 .nr(4)
22787 .kr(2)
22788 .sr(1)
22789 .m(4)
22790 .n(4)
22791 .k(8)
22792 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080022793 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022794 }
22795
22796 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, qmax) {
22797 TEST_REQUIRES_X86_SSE41;
22798 GemmMicrokernelTester()
22799 .mr(4)
22800 .nr(4)
22801 .kr(2)
22802 .sr(1)
22803 .m(4)
22804 .n(4)
22805 .k(8)
22806 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080022807 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022808 }
22809
22810 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, strided_cm) {
22811 TEST_REQUIRES_X86_SSE41;
22812 GemmMicrokernelTester()
22813 .mr(4)
22814 .nr(4)
22815 .kr(2)
22816 .sr(1)
22817 .m(4)
22818 .n(4)
22819 .k(8)
22820 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080022821 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022822 }
22823#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
22824
22825
22826#if XNN_ARCH_X86 || XNN_ARCH_X86_64
22827 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_eq_8) {
22828 TEST_REQUIRES_X86_AVX;
22829 GemmMicrokernelTester()
22830 .mr(1)
22831 .nr(4)
22832 .kr(2)
22833 .sr(1)
22834 .m(1)
22835 .n(4)
22836 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080022837 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022838 }
22839
22840 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, strided_cn) {
22841 TEST_REQUIRES_X86_AVX;
22842 GemmMicrokernelTester()
22843 .mr(1)
22844 .nr(4)
22845 .kr(2)
22846 .sr(1)
22847 .m(1)
22848 .n(4)
22849 .k(8)
22850 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080022851 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022852 }
22853
22854 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_eq_8_strided_a) {
22855 TEST_REQUIRES_X86_AVX;
22856 GemmMicrokernelTester()
22857 .mr(1)
22858 .nr(4)
22859 .kr(2)
22860 .sr(1)
22861 .m(1)
22862 .n(4)
22863 .k(8)
22864 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080022865 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022866 }
22867
22868 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_eq_8_subtile) {
22869 TEST_REQUIRES_X86_AVX;
Zhi An Ng83844ae2022-01-14 09:52:25 -080022870 for (uint32_t n = 1; n <= 4; n++) {
22871 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022872 GemmMicrokernelTester()
22873 .mr(1)
22874 .nr(4)
22875 .kr(2)
22876 .sr(1)
22877 .m(m)
22878 .n(n)
22879 .k(8)
22880 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080022881 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022882 }
22883 }
22884 }
22885
22886 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_eq_8_subtile_m) {
22887 TEST_REQUIRES_X86_AVX;
22888 for (uint32_t m = 1; m <= 1; m++) {
22889 GemmMicrokernelTester()
22890 .mr(1)
22891 .nr(4)
22892 .kr(2)
22893 .sr(1)
22894 .m(m)
22895 .n(4)
22896 .k(8)
22897 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080022898 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022899 }
22900 }
22901
22902 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_eq_8_subtile_n) {
22903 TEST_REQUIRES_X86_AVX;
22904 for (uint32_t n = 1; n <= 4; n++) {
22905 GemmMicrokernelTester()
22906 .mr(1)
22907 .nr(4)
22908 .kr(2)
22909 .sr(1)
22910 .m(1)
22911 .n(n)
22912 .k(8)
22913 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080022914 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022915 }
22916 }
22917
22918 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_lt_8) {
22919 TEST_REQUIRES_X86_AVX;
22920 for (size_t k = 1; k < 8; k++) {
22921 GemmMicrokernelTester()
22922 .mr(1)
22923 .nr(4)
22924 .kr(2)
22925 .sr(1)
22926 .m(1)
22927 .n(4)
22928 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080022929 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022930 }
22931 }
22932
22933 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_lt_8_strided_a) {
22934 TEST_REQUIRES_X86_AVX;
22935 for (size_t k = 1; k < 8; k++) {
22936 GemmMicrokernelTester()
22937 .mr(1)
22938 .nr(4)
22939 .kr(2)
22940 .sr(1)
22941 .m(1)
22942 .n(4)
22943 .k(k)
22944 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080022945 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022946 }
22947 }
22948
22949 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_lt_8_subtile) {
22950 TEST_REQUIRES_X86_AVX;
22951 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080022952 for (uint32_t n = 1; n <= 4; n++) {
22953 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022954 GemmMicrokernelTester()
22955 .mr(1)
22956 .nr(4)
22957 .kr(2)
22958 .sr(1)
22959 .m(m)
22960 .n(n)
22961 .k(k)
22962 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080022963 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022964 }
22965 }
22966 }
22967 }
22968
22969 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_gt_8) {
22970 TEST_REQUIRES_X86_AVX;
22971 for (size_t k = 9; k < 16; k++) {
22972 GemmMicrokernelTester()
22973 .mr(1)
22974 .nr(4)
22975 .kr(2)
22976 .sr(1)
22977 .m(1)
22978 .n(4)
22979 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080022980 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022981 }
22982 }
22983
22984 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_gt_8_strided_a) {
22985 TEST_REQUIRES_X86_AVX;
22986 for (size_t k = 9; k < 16; k++) {
22987 GemmMicrokernelTester()
22988 .mr(1)
22989 .nr(4)
22990 .kr(2)
22991 .sr(1)
22992 .m(1)
22993 .n(4)
22994 .k(k)
22995 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080022996 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022997 }
22998 }
22999
23000 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_gt_8_subtile) {
23001 TEST_REQUIRES_X86_AVX;
23002 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080023003 for (uint32_t n = 1; n <= 4; n++) {
23004 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023005 GemmMicrokernelTester()
23006 .mr(1)
23007 .nr(4)
23008 .kr(2)
23009 .sr(1)
23010 .m(m)
23011 .n(n)
23012 .k(k)
23013 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080023014 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023015 }
23016 }
23017 }
23018 }
23019
23020 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_div_8) {
23021 TEST_REQUIRES_X86_AVX;
23022 for (size_t k = 16; k <= 80; k += 8) {
23023 GemmMicrokernelTester()
23024 .mr(1)
23025 .nr(4)
23026 .kr(2)
23027 .sr(1)
23028 .m(1)
23029 .n(4)
23030 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080023031 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023032 }
23033 }
23034
23035 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_div_8_strided_a) {
23036 TEST_REQUIRES_X86_AVX;
23037 for (size_t k = 16; k <= 80; k += 8) {
23038 GemmMicrokernelTester()
23039 .mr(1)
23040 .nr(4)
23041 .kr(2)
23042 .sr(1)
23043 .m(1)
23044 .n(4)
23045 .k(k)
23046 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080023047 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023048 }
23049 }
23050
23051 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_div_8_subtile) {
23052 TEST_REQUIRES_X86_AVX;
23053 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080023054 for (uint32_t n = 1; n <= 4; n++) {
23055 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023056 GemmMicrokernelTester()
23057 .mr(1)
23058 .nr(4)
23059 .kr(2)
23060 .sr(1)
23061 .m(m)
23062 .n(n)
23063 .k(k)
23064 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080023065 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023066 }
23067 }
23068 }
23069 }
23070
23071 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, n_gt_4) {
23072 TEST_REQUIRES_X86_AVX;
23073 for (uint32_t n = 5; n < 8; n++) {
23074 for (size_t k = 1; k <= 40; k += 9) {
23075 GemmMicrokernelTester()
23076 .mr(1)
23077 .nr(4)
23078 .kr(2)
23079 .sr(1)
23080 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080023081 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023082 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080023083 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023084 }
23085 }
23086 }
23087
23088 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, n_gt_4_strided_cn) {
23089 TEST_REQUIRES_X86_AVX;
23090 for (uint32_t n = 5; n < 8; n++) {
23091 for (size_t k = 1; k <= 40; k += 9) {
23092 GemmMicrokernelTester()
23093 .mr(1)
23094 .nr(4)
23095 .kr(2)
23096 .sr(1)
23097 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080023098 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023099 .k(k)
23100 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080023101 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023102 }
23103 }
23104 }
23105
23106 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, n_gt_4_strided_a) {
23107 TEST_REQUIRES_X86_AVX;
23108 for (uint32_t n = 5; n < 8; n++) {
23109 for (size_t k = 1; k <= 40; k += 9) {
23110 GemmMicrokernelTester()
23111 .mr(1)
23112 .nr(4)
23113 .kr(2)
23114 .sr(1)
23115 .m(1)
23116 .n(n)
23117 .k(k)
23118 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080023119 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023120 }
23121 }
23122 }
23123
23124 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, n_gt_4_subtile) {
23125 TEST_REQUIRES_X86_AVX;
23126 for (uint32_t n = 5; n < 8; n++) {
23127 for (size_t k = 1; k <= 40; k += 9) {
23128 for (uint32_t m = 1; m <= 1; m++) {
23129 GemmMicrokernelTester()
23130 .mr(1)
23131 .nr(4)
23132 .kr(2)
23133 .sr(1)
23134 .m(m)
23135 .n(n)
23136 .k(k)
23137 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080023138 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023139 }
23140 }
23141 }
23142 }
23143
23144 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, n_div_4) {
23145 TEST_REQUIRES_X86_AVX;
23146 for (uint32_t n = 8; n <= 12; n += 4) {
23147 for (size_t k = 1; k <= 40; k += 9) {
23148 GemmMicrokernelTester()
23149 .mr(1)
23150 .nr(4)
23151 .kr(2)
23152 .sr(1)
23153 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080023154 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023155 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080023156 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023157 }
23158 }
23159 }
23160
23161 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, n_div_4_strided_cn) {
23162 TEST_REQUIRES_X86_AVX;
23163 for (uint32_t n = 8; n <= 12; n += 4) {
23164 for (size_t k = 1; k <= 40; k += 9) {
23165 GemmMicrokernelTester()
23166 .mr(1)
23167 .nr(4)
23168 .kr(2)
23169 .sr(1)
23170 .m(1)
23171 .n(n)
23172 .k(k)
23173 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080023174 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023175 }
23176 }
23177 }
23178
23179 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, n_div_4_strided_a) {
23180 TEST_REQUIRES_X86_AVX;
23181 for (uint32_t n = 8; n <= 12; n += 4) {
23182 for (size_t k = 1; k <= 40; k += 9) {
23183 GemmMicrokernelTester()
23184 .mr(1)
23185 .nr(4)
23186 .kr(2)
23187 .sr(1)
23188 .m(1)
23189 .n(n)
23190 .k(k)
23191 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080023192 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023193 }
23194 }
23195 }
23196
23197 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, n_div_4_subtile) {
23198 TEST_REQUIRES_X86_AVX;
23199 for (uint32_t n = 8; n <= 12; n += 4) {
23200 for (size_t k = 1; k <= 40; k += 9) {
23201 for (uint32_t m = 1; m <= 1; m++) {
23202 GemmMicrokernelTester()
23203 .mr(1)
23204 .nr(4)
23205 .kr(2)
23206 .sr(1)
23207 .m(m)
23208 .n(n)
23209 .k(k)
23210 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080023211 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023212 }
23213 }
23214 }
23215 }
23216
23217 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, strided_cm_subtile) {
23218 TEST_REQUIRES_X86_AVX;
23219 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080023220 for (uint32_t n = 1; n <= 4; n++) {
23221 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023222 GemmMicrokernelTester()
23223 .mr(1)
23224 .nr(4)
23225 .kr(2)
23226 .sr(1)
23227 .m(m)
23228 .n(n)
23229 .k(k)
23230 .cm_stride(7)
23231 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080023232 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023233 }
23234 }
23235 }
23236 }
23237
23238 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, qmin) {
23239 TEST_REQUIRES_X86_AVX;
23240 GemmMicrokernelTester()
23241 .mr(1)
23242 .nr(4)
23243 .kr(2)
23244 .sr(1)
23245 .m(1)
23246 .n(4)
23247 .k(8)
23248 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080023249 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023250 }
23251
23252 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, qmax) {
23253 TEST_REQUIRES_X86_AVX;
23254 GemmMicrokernelTester()
23255 .mr(1)
23256 .nr(4)
23257 .kr(2)
23258 .sr(1)
23259 .m(1)
23260 .n(4)
23261 .k(8)
23262 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080023263 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023264 }
23265
23266 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, strided_cm) {
23267 TEST_REQUIRES_X86_AVX;
23268 GemmMicrokernelTester()
23269 .mr(1)
23270 .nr(4)
23271 .kr(2)
23272 .sr(1)
23273 .m(1)
23274 .n(4)
23275 .k(8)
23276 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080023277 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023278 }
23279#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
23280
23281
23282#if XNN_ARCH_X86 || XNN_ARCH_X86_64
23283 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_eq_8) {
23284 TEST_REQUIRES_X86_AVX;
23285 GemmMicrokernelTester()
23286 .mr(3)
23287 .nr(4)
23288 .kr(2)
23289 .sr(1)
23290 .m(3)
23291 .n(4)
23292 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080023293 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023294 }
23295
23296 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, strided_cn) {
23297 TEST_REQUIRES_X86_AVX;
23298 GemmMicrokernelTester()
23299 .mr(3)
23300 .nr(4)
23301 .kr(2)
23302 .sr(1)
23303 .m(3)
23304 .n(4)
23305 .k(8)
23306 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080023307 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023308 }
23309
23310 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_eq_8_strided_a) {
23311 TEST_REQUIRES_X86_AVX;
23312 GemmMicrokernelTester()
23313 .mr(3)
23314 .nr(4)
23315 .kr(2)
23316 .sr(1)
23317 .m(3)
23318 .n(4)
23319 .k(8)
23320 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080023321 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023322 }
23323
23324 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_eq_8_subtile) {
23325 TEST_REQUIRES_X86_AVX;
Zhi An Ng83844ae2022-01-14 09:52:25 -080023326 for (uint32_t n = 1; n <= 4; n++) {
23327 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023328 GemmMicrokernelTester()
23329 .mr(3)
23330 .nr(4)
23331 .kr(2)
23332 .sr(1)
23333 .m(m)
23334 .n(n)
23335 .k(8)
23336 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080023337 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023338 }
23339 }
23340 }
23341
23342 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_eq_8_subtile_m) {
23343 TEST_REQUIRES_X86_AVX;
23344 for (uint32_t m = 1; m <= 3; m++) {
23345 GemmMicrokernelTester()
23346 .mr(3)
23347 .nr(4)
23348 .kr(2)
23349 .sr(1)
23350 .m(m)
23351 .n(4)
23352 .k(8)
23353 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080023354 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023355 }
23356 }
23357
23358 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_eq_8_subtile_n) {
23359 TEST_REQUIRES_X86_AVX;
23360 for (uint32_t n = 1; n <= 4; n++) {
23361 GemmMicrokernelTester()
23362 .mr(3)
23363 .nr(4)
23364 .kr(2)
23365 .sr(1)
23366 .m(3)
23367 .n(n)
23368 .k(8)
23369 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080023370 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023371 }
23372 }
23373
23374 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_lt_8) {
23375 TEST_REQUIRES_X86_AVX;
23376 for (size_t k = 1; k < 8; k++) {
23377 GemmMicrokernelTester()
23378 .mr(3)
23379 .nr(4)
23380 .kr(2)
23381 .sr(1)
23382 .m(3)
23383 .n(4)
23384 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080023385 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023386 }
23387 }
23388
23389 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_lt_8_strided_a) {
23390 TEST_REQUIRES_X86_AVX;
23391 for (size_t k = 1; k < 8; k++) {
23392 GemmMicrokernelTester()
23393 .mr(3)
23394 .nr(4)
23395 .kr(2)
23396 .sr(1)
23397 .m(3)
23398 .n(4)
23399 .k(k)
23400 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080023401 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023402 }
23403 }
23404
23405 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_lt_8_subtile) {
23406 TEST_REQUIRES_X86_AVX;
23407 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080023408 for (uint32_t n = 1; n <= 4; n++) {
23409 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023410 GemmMicrokernelTester()
23411 .mr(3)
23412 .nr(4)
23413 .kr(2)
23414 .sr(1)
23415 .m(m)
23416 .n(n)
23417 .k(k)
23418 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080023419 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023420 }
23421 }
23422 }
23423 }
23424
23425 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_gt_8) {
23426 TEST_REQUIRES_X86_AVX;
23427 for (size_t k = 9; k < 16; k++) {
23428 GemmMicrokernelTester()
23429 .mr(3)
23430 .nr(4)
23431 .kr(2)
23432 .sr(1)
23433 .m(3)
23434 .n(4)
23435 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080023436 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023437 }
23438 }
23439
23440 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_gt_8_strided_a) {
23441 TEST_REQUIRES_X86_AVX;
23442 for (size_t k = 9; k < 16; k++) {
23443 GemmMicrokernelTester()
23444 .mr(3)
23445 .nr(4)
23446 .kr(2)
23447 .sr(1)
23448 .m(3)
23449 .n(4)
23450 .k(k)
23451 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080023452 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023453 }
23454 }
23455
23456 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_gt_8_subtile) {
23457 TEST_REQUIRES_X86_AVX;
23458 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080023459 for (uint32_t n = 1; n <= 4; n++) {
23460 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023461 GemmMicrokernelTester()
23462 .mr(3)
23463 .nr(4)
23464 .kr(2)
23465 .sr(1)
23466 .m(m)
23467 .n(n)
23468 .k(k)
23469 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080023470 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023471 }
23472 }
23473 }
23474 }
23475
23476 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_div_8) {
23477 TEST_REQUIRES_X86_AVX;
23478 for (size_t k = 16; k <= 80; k += 8) {
23479 GemmMicrokernelTester()
23480 .mr(3)
23481 .nr(4)
23482 .kr(2)
23483 .sr(1)
23484 .m(3)
23485 .n(4)
23486 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080023487 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023488 }
23489 }
23490
23491 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_div_8_strided_a) {
23492 TEST_REQUIRES_X86_AVX;
23493 for (size_t k = 16; k <= 80; k += 8) {
23494 GemmMicrokernelTester()
23495 .mr(3)
23496 .nr(4)
23497 .kr(2)
23498 .sr(1)
23499 .m(3)
23500 .n(4)
23501 .k(k)
23502 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080023503 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023504 }
23505 }
23506
23507 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_div_8_subtile) {
23508 TEST_REQUIRES_X86_AVX;
23509 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080023510 for (uint32_t n = 1; n <= 4; n++) {
23511 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023512 GemmMicrokernelTester()
23513 .mr(3)
23514 .nr(4)
23515 .kr(2)
23516 .sr(1)
23517 .m(m)
23518 .n(n)
23519 .k(k)
23520 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080023521 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023522 }
23523 }
23524 }
23525 }
23526
23527 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, n_gt_4) {
23528 TEST_REQUIRES_X86_AVX;
23529 for (uint32_t n = 5; n < 8; n++) {
23530 for (size_t k = 1; k <= 40; k += 9) {
23531 GemmMicrokernelTester()
23532 .mr(3)
23533 .nr(4)
23534 .kr(2)
23535 .sr(1)
23536 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080023537 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023538 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080023539 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023540 }
23541 }
23542 }
23543
23544 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, n_gt_4_strided_cn) {
23545 TEST_REQUIRES_X86_AVX;
23546 for (uint32_t n = 5; n < 8; n++) {
23547 for (size_t k = 1; k <= 40; k += 9) {
23548 GemmMicrokernelTester()
23549 .mr(3)
23550 .nr(4)
23551 .kr(2)
23552 .sr(1)
23553 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080023554 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023555 .k(k)
23556 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080023557 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023558 }
23559 }
23560 }
23561
23562 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, n_gt_4_strided_a) {
23563 TEST_REQUIRES_X86_AVX;
23564 for (uint32_t n = 5; n < 8; n++) {
23565 for (size_t k = 1; k <= 40; k += 9) {
23566 GemmMicrokernelTester()
23567 .mr(3)
23568 .nr(4)
23569 .kr(2)
23570 .sr(1)
23571 .m(3)
23572 .n(n)
23573 .k(k)
23574 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080023575 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023576 }
23577 }
23578 }
23579
23580 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, n_gt_4_subtile) {
23581 TEST_REQUIRES_X86_AVX;
23582 for (uint32_t n = 5; n < 8; n++) {
23583 for (size_t k = 1; k <= 40; k += 9) {
23584 for (uint32_t m = 1; m <= 3; m++) {
23585 GemmMicrokernelTester()
23586 .mr(3)
23587 .nr(4)
23588 .kr(2)
23589 .sr(1)
23590 .m(m)
23591 .n(n)
23592 .k(k)
23593 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080023594 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023595 }
23596 }
23597 }
23598 }
23599
23600 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, n_div_4) {
23601 TEST_REQUIRES_X86_AVX;
23602 for (uint32_t n = 8; n <= 12; n += 4) {
23603 for (size_t k = 1; k <= 40; k += 9) {
23604 GemmMicrokernelTester()
23605 .mr(3)
23606 .nr(4)
23607 .kr(2)
23608 .sr(1)
23609 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080023610 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023611 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080023612 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023613 }
23614 }
23615 }
23616
23617 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, n_div_4_strided_cn) {
23618 TEST_REQUIRES_X86_AVX;
23619 for (uint32_t n = 8; n <= 12; n += 4) {
23620 for (size_t k = 1; k <= 40; k += 9) {
23621 GemmMicrokernelTester()
23622 .mr(3)
23623 .nr(4)
23624 .kr(2)
23625 .sr(1)
23626 .m(3)
23627 .n(n)
23628 .k(k)
23629 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080023630 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023631 }
23632 }
23633 }
23634
23635 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, n_div_4_strided_a) {
23636 TEST_REQUIRES_X86_AVX;
23637 for (uint32_t n = 8; n <= 12; n += 4) {
23638 for (size_t k = 1; k <= 40; k += 9) {
23639 GemmMicrokernelTester()
23640 .mr(3)
23641 .nr(4)
23642 .kr(2)
23643 .sr(1)
23644 .m(3)
23645 .n(n)
23646 .k(k)
23647 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080023648 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023649 }
23650 }
23651 }
23652
23653 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, n_div_4_subtile) {
23654 TEST_REQUIRES_X86_AVX;
23655 for (uint32_t n = 8; n <= 12; n += 4) {
23656 for (size_t k = 1; k <= 40; k += 9) {
23657 for (uint32_t m = 1; m <= 3; m++) {
23658 GemmMicrokernelTester()
23659 .mr(3)
23660 .nr(4)
23661 .kr(2)
23662 .sr(1)
23663 .m(m)
23664 .n(n)
23665 .k(k)
23666 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080023667 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023668 }
23669 }
23670 }
23671 }
23672
23673 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, strided_cm_subtile) {
23674 TEST_REQUIRES_X86_AVX;
23675 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080023676 for (uint32_t n = 1; n <= 4; n++) {
23677 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023678 GemmMicrokernelTester()
23679 .mr(3)
23680 .nr(4)
23681 .kr(2)
23682 .sr(1)
23683 .m(m)
23684 .n(n)
23685 .k(k)
23686 .cm_stride(7)
23687 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080023688 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023689 }
23690 }
23691 }
23692 }
23693
23694 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, qmin) {
23695 TEST_REQUIRES_X86_AVX;
23696 GemmMicrokernelTester()
23697 .mr(3)
23698 .nr(4)
23699 .kr(2)
23700 .sr(1)
23701 .m(3)
23702 .n(4)
23703 .k(8)
23704 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080023705 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023706 }
23707
23708 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, qmax) {
23709 TEST_REQUIRES_X86_AVX;
23710 GemmMicrokernelTester()
23711 .mr(3)
23712 .nr(4)
23713 .kr(2)
23714 .sr(1)
23715 .m(3)
23716 .n(4)
23717 .k(8)
23718 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080023719 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023720 }
23721
23722 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, strided_cm) {
23723 TEST_REQUIRES_X86_AVX;
23724 GemmMicrokernelTester()
23725 .mr(3)
23726 .nr(4)
23727 .kr(2)
23728 .sr(1)
23729 .m(3)
23730 .n(4)
23731 .k(8)
23732 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080023733 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023734 }
23735#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
23736
23737
23738#if XNN_ARCH_X86 || XNN_ARCH_X86_64
23739 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_eq_8) {
23740 TEST_REQUIRES_X86_XOP;
23741 GemmMicrokernelTester()
23742 .mr(2)
23743 .nr(4)
23744 .kr(2)
23745 .sr(1)
23746 .m(2)
23747 .n(4)
23748 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080023749 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023750 }
23751
23752 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, strided_cn) {
23753 TEST_REQUIRES_X86_XOP;
23754 GemmMicrokernelTester()
23755 .mr(2)
23756 .nr(4)
23757 .kr(2)
23758 .sr(1)
23759 .m(2)
23760 .n(4)
23761 .k(8)
23762 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080023763 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023764 }
23765
23766 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_eq_8_strided_a) {
23767 TEST_REQUIRES_X86_XOP;
23768 GemmMicrokernelTester()
23769 .mr(2)
23770 .nr(4)
23771 .kr(2)
23772 .sr(1)
23773 .m(2)
23774 .n(4)
23775 .k(8)
23776 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080023777 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023778 }
23779
23780 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_eq_8_subtile) {
23781 TEST_REQUIRES_X86_XOP;
Zhi An Ng83844ae2022-01-14 09:52:25 -080023782 for (uint32_t n = 1; n <= 4; n++) {
23783 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023784 GemmMicrokernelTester()
23785 .mr(2)
23786 .nr(4)
23787 .kr(2)
23788 .sr(1)
23789 .m(m)
23790 .n(n)
23791 .k(8)
23792 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080023793 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023794 }
23795 }
23796 }
23797
23798 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_eq_8_subtile_m) {
23799 TEST_REQUIRES_X86_XOP;
23800 for (uint32_t m = 1; m <= 2; m++) {
23801 GemmMicrokernelTester()
23802 .mr(2)
23803 .nr(4)
23804 .kr(2)
23805 .sr(1)
23806 .m(m)
23807 .n(4)
23808 .k(8)
23809 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080023810 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023811 }
23812 }
23813
23814 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_eq_8_subtile_n) {
23815 TEST_REQUIRES_X86_XOP;
23816 for (uint32_t n = 1; n <= 4; n++) {
23817 GemmMicrokernelTester()
23818 .mr(2)
23819 .nr(4)
23820 .kr(2)
23821 .sr(1)
23822 .m(2)
23823 .n(n)
23824 .k(8)
23825 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080023826 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023827 }
23828 }
23829
23830 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_lt_8) {
23831 TEST_REQUIRES_X86_XOP;
23832 for (size_t k = 1; k < 8; k++) {
23833 GemmMicrokernelTester()
23834 .mr(2)
23835 .nr(4)
23836 .kr(2)
23837 .sr(1)
23838 .m(2)
23839 .n(4)
23840 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080023841 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023842 }
23843 }
23844
23845 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_lt_8_strided_a) {
23846 TEST_REQUIRES_X86_XOP;
23847 for (size_t k = 1; k < 8; k++) {
23848 GemmMicrokernelTester()
23849 .mr(2)
23850 .nr(4)
23851 .kr(2)
23852 .sr(1)
23853 .m(2)
23854 .n(4)
23855 .k(k)
23856 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080023857 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023858 }
23859 }
23860
23861 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_lt_8_subtile) {
23862 TEST_REQUIRES_X86_XOP;
23863 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080023864 for (uint32_t n = 1; n <= 4; n++) {
23865 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023866 GemmMicrokernelTester()
23867 .mr(2)
23868 .nr(4)
23869 .kr(2)
23870 .sr(1)
23871 .m(m)
23872 .n(n)
23873 .k(k)
23874 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080023875 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023876 }
23877 }
23878 }
23879 }
23880
23881 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_gt_8) {
23882 TEST_REQUIRES_X86_XOP;
23883 for (size_t k = 9; k < 16; k++) {
23884 GemmMicrokernelTester()
23885 .mr(2)
23886 .nr(4)
23887 .kr(2)
23888 .sr(1)
23889 .m(2)
23890 .n(4)
23891 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080023892 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023893 }
23894 }
23895
23896 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_gt_8_strided_a) {
23897 TEST_REQUIRES_X86_XOP;
23898 for (size_t k = 9; k < 16; k++) {
23899 GemmMicrokernelTester()
23900 .mr(2)
23901 .nr(4)
23902 .kr(2)
23903 .sr(1)
23904 .m(2)
23905 .n(4)
23906 .k(k)
23907 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080023908 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023909 }
23910 }
23911
23912 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_gt_8_subtile) {
23913 TEST_REQUIRES_X86_XOP;
23914 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080023915 for (uint32_t n = 1; n <= 4; n++) {
23916 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023917 GemmMicrokernelTester()
23918 .mr(2)
23919 .nr(4)
23920 .kr(2)
23921 .sr(1)
23922 .m(m)
23923 .n(n)
23924 .k(k)
23925 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080023926 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023927 }
23928 }
23929 }
23930 }
23931
23932 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_div_8) {
23933 TEST_REQUIRES_X86_XOP;
23934 for (size_t k = 16; k <= 80; k += 8) {
23935 GemmMicrokernelTester()
23936 .mr(2)
23937 .nr(4)
23938 .kr(2)
23939 .sr(1)
23940 .m(2)
23941 .n(4)
23942 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080023943 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023944 }
23945 }
23946
23947 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_div_8_strided_a) {
23948 TEST_REQUIRES_X86_XOP;
23949 for (size_t k = 16; k <= 80; k += 8) {
23950 GemmMicrokernelTester()
23951 .mr(2)
23952 .nr(4)
23953 .kr(2)
23954 .sr(1)
23955 .m(2)
23956 .n(4)
23957 .k(k)
23958 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080023959 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023960 }
23961 }
23962
23963 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_div_8_subtile) {
23964 TEST_REQUIRES_X86_XOP;
23965 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080023966 for (uint32_t n = 1; n <= 4; n++) {
23967 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023968 GemmMicrokernelTester()
23969 .mr(2)
23970 .nr(4)
23971 .kr(2)
23972 .sr(1)
23973 .m(m)
23974 .n(n)
23975 .k(k)
23976 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080023977 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023978 }
23979 }
23980 }
23981 }
23982
23983 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, n_gt_4) {
23984 TEST_REQUIRES_X86_XOP;
23985 for (uint32_t n = 5; n < 8; n++) {
23986 for (size_t k = 1; k <= 40; k += 9) {
23987 GemmMicrokernelTester()
23988 .mr(2)
23989 .nr(4)
23990 .kr(2)
23991 .sr(1)
23992 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080023993 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023994 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080023995 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023996 }
23997 }
23998 }
23999
24000 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, n_gt_4_strided_cn) {
24001 TEST_REQUIRES_X86_XOP;
24002 for (uint32_t n = 5; n < 8; n++) {
24003 for (size_t k = 1; k <= 40; k += 9) {
24004 GemmMicrokernelTester()
24005 .mr(2)
24006 .nr(4)
24007 .kr(2)
24008 .sr(1)
24009 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080024010 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024011 .k(k)
24012 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080024013 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024014 }
24015 }
24016 }
24017
24018 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, n_gt_4_strided_a) {
24019 TEST_REQUIRES_X86_XOP;
24020 for (uint32_t n = 5; n < 8; n++) {
24021 for (size_t k = 1; k <= 40; k += 9) {
24022 GemmMicrokernelTester()
24023 .mr(2)
24024 .nr(4)
24025 .kr(2)
24026 .sr(1)
24027 .m(2)
24028 .n(n)
24029 .k(k)
24030 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080024031 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024032 }
24033 }
24034 }
24035
24036 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, n_gt_4_subtile) {
24037 TEST_REQUIRES_X86_XOP;
24038 for (uint32_t n = 5; n < 8; n++) {
24039 for (size_t k = 1; k <= 40; k += 9) {
24040 for (uint32_t m = 1; m <= 2; m++) {
24041 GemmMicrokernelTester()
24042 .mr(2)
24043 .nr(4)
24044 .kr(2)
24045 .sr(1)
24046 .m(m)
24047 .n(n)
24048 .k(k)
24049 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080024050 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024051 }
24052 }
24053 }
24054 }
24055
24056 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, n_div_4) {
24057 TEST_REQUIRES_X86_XOP;
24058 for (uint32_t n = 8; n <= 12; n += 4) {
24059 for (size_t k = 1; k <= 40; k += 9) {
24060 GemmMicrokernelTester()
24061 .mr(2)
24062 .nr(4)
24063 .kr(2)
24064 .sr(1)
24065 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080024066 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024067 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080024068 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024069 }
24070 }
24071 }
24072
24073 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, n_div_4_strided_cn) {
24074 TEST_REQUIRES_X86_XOP;
24075 for (uint32_t n = 8; n <= 12; n += 4) {
24076 for (size_t k = 1; k <= 40; k += 9) {
24077 GemmMicrokernelTester()
24078 .mr(2)
24079 .nr(4)
24080 .kr(2)
24081 .sr(1)
24082 .m(2)
24083 .n(n)
24084 .k(k)
24085 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080024086 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024087 }
24088 }
24089 }
24090
24091 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, n_div_4_strided_a) {
24092 TEST_REQUIRES_X86_XOP;
24093 for (uint32_t n = 8; n <= 12; n += 4) {
24094 for (size_t k = 1; k <= 40; k += 9) {
24095 GemmMicrokernelTester()
24096 .mr(2)
24097 .nr(4)
24098 .kr(2)
24099 .sr(1)
24100 .m(2)
24101 .n(n)
24102 .k(k)
24103 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080024104 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024105 }
24106 }
24107 }
24108
24109 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, n_div_4_subtile) {
24110 TEST_REQUIRES_X86_XOP;
24111 for (uint32_t n = 8; n <= 12; n += 4) {
24112 for (size_t k = 1; k <= 40; k += 9) {
24113 for (uint32_t m = 1; m <= 2; m++) {
24114 GemmMicrokernelTester()
24115 .mr(2)
24116 .nr(4)
24117 .kr(2)
24118 .sr(1)
24119 .m(m)
24120 .n(n)
24121 .k(k)
24122 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080024123 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024124 }
24125 }
24126 }
24127 }
24128
24129 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, strided_cm_subtile) {
24130 TEST_REQUIRES_X86_XOP;
24131 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080024132 for (uint32_t n = 1; n <= 4; n++) {
24133 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024134 GemmMicrokernelTester()
24135 .mr(2)
24136 .nr(4)
24137 .kr(2)
24138 .sr(1)
24139 .m(m)
24140 .n(n)
24141 .k(k)
24142 .cm_stride(7)
24143 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080024144 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024145 }
24146 }
24147 }
24148 }
24149
24150 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, qmin) {
24151 TEST_REQUIRES_X86_XOP;
24152 GemmMicrokernelTester()
24153 .mr(2)
24154 .nr(4)
24155 .kr(2)
24156 .sr(1)
24157 .m(2)
24158 .n(4)
24159 .k(8)
24160 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080024161 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024162 }
24163
24164 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, qmax) {
24165 TEST_REQUIRES_X86_XOP;
24166 GemmMicrokernelTester()
24167 .mr(2)
24168 .nr(4)
24169 .kr(2)
24170 .sr(1)
24171 .m(2)
24172 .n(4)
24173 .k(8)
24174 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080024175 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024176 }
24177
24178 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, strided_cm) {
24179 TEST_REQUIRES_X86_XOP;
24180 GemmMicrokernelTester()
24181 .mr(2)
24182 .nr(4)
24183 .kr(2)
24184 .sr(1)
24185 .m(2)
24186 .n(4)
24187 .k(8)
24188 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080024189 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024190 }
24191#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
24192
24193
24194#if XNN_ARCH_X86 || XNN_ARCH_X86_64
24195 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_eq_8) {
24196 TEST_REQUIRES_X86_XOP;
24197 GemmMicrokernelTester()
24198 .mr(4)
24199 .nr(4)
24200 .kr(2)
24201 .sr(1)
24202 .m(4)
24203 .n(4)
24204 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080024205 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024206 }
24207
24208 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, strided_cn) {
24209 TEST_REQUIRES_X86_XOP;
24210 GemmMicrokernelTester()
24211 .mr(4)
24212 .nr(4)
24213 .kr(2)
24214 .sr(1)
24215 .m(4)
24216 .n(4)
24217 .k(8)
24218 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080024219 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024220 }
24221
24222 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_eq_8_strided_a) {
24223 TEST_REQUIRES_X86_XOP;
24224 GemmMicrokernelTester()
24225 .mr(4)
24226 .nr(4)
24227 .kr(2)
24228 .sr(1)
24229 .m(4)
24230 .n(4)
24231 .k(8)
24232 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080024233 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024234 }
24235
24236 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_eq_8_subtile) {
24237 TEST_REQUIRES_X86_XOP;
Zhi An Ng83844ae2022-01-14 09:52:25 -080024238 for (uint32_t n = 1; n <= 4; n++) {
24239 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024240 GemmMicrokernelTester()
24241 .mr(4)
24242 .nr(4)
24243 .kr(2)
24244 .sr(1)
24245 .m(m)
24246 .n(n)
24247 .k(8)
24248 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080024249 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024250 }
24251 }
24252 }
24253
24254 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_eq_8_subtile_m) {
24255 TEST_REQUIRES_X86_XOP;
24256 for (uint32_t m = 1; m <= 4; m++) {
24257 GemmMicrokernelTester()
24258 .mr(4)
24259 .nr(4)
24260 .kr(2)
24261 .sr(1)
24262 .m(m)
24263 .n(4)
24264 .k(8)
24265 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080024266 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024267 }
24268 }
24269
24270 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_eq_8_subtile_n) {
24271 TEST_REQUIRES_X86_XOP;
24272 for (uint32_t n = 1; n <= 4; n++) {
24273 GemmMicrokernelTester()
24274 .mr(4)
24275 .nr(4)
24276 .kr(2)
24277 .sr(1)
24278 .m(4)
24279 .n(n)
24280 .k(8)
24281 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080024282 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024283 }
24284 }
24285
24286 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_lt_8) {
24287 TEST_REQUIRES_X86_XOP;
24288 for (size_t k = 1; k < 8; k++) {
24289 GemmMicrokernelTester()
24290 .mr(4)
24291 .nr(4)
24292 .kr(2)
24293 .sr(1)
24294 .m(4)
24295 .n(4)
24296 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080024297 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024298 }
24299 }
24300
24301 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_lt_8_strided_a) {
24302 TEST_REQUIRES_X86_XOP;
24303 for (size_t k = 1; k < 8; k++) {
24304 GemmMicrokernelTester()
24305 .mr(4)
24306 .nr(4)
24307 .kr(2)
24308 .sr(1)
24309 .m(4)
24310 .n(4)
24311 .k(k)
24312 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080024313 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024314 }
24315 }
24316
24317 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_lt_8_subtile) {
24318 TEST_REQUIRES_X86_XOP;
24319 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080024320 for (uint32_t n = 1; n <= 4; n++) {
24321 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024322 GemmMicrokernelTester()
24323 .mr(4)
24324 .nr(4)
24325 .kr(2)
24326 .sr(1)
24327 .m(m)
24328 .n(n)
24329 .k(k)
24330 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080024331 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024332 }
24333 }
24334 }
24335 }
24336
24337 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_gt_8) {
24338 TEST_REQUIRES_X86_XOP;
24339 for (size_t k = 9; k < 16; k++) {
24340 GemmMicrokernelTester()
24341 .mr(4)
24342 .nr(4)
24343 .kr(2)
24344 .sr(1)
24345 .m(4)
24346 .n(4)
24347 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080024348 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024349 }
24350 }
24351
24352 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_gt_8_strided_a) {
24353 TEST_REQUIRES_X86_XOP;
24354 for (size_t k = 9; k < 16; k++) {
24355 GemmMicrokernelTester()
24356 .mr(4)
24357 .nr(4)
24358 .kr(2)
24359 .sr(1)
24360 .m(4)
24361 .n(4)
24362 .k(k)
24363 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080024364 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024365 }
24366 }
24367
24368 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_gt_8_subtile) {
24369 TEST_REQUIRES_X86_XOP;
24370 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080024371 for (uint32_t n = 1; n <= 4; n++) {
24372 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024373 GemmMicrokernelTester()
24374 .mr(4)
24375 .nr(4)
24376 .kr(2)
24377 .sr(1)
24378 .m(m)
24379 .n(n)
24380 .k(k)
24381 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080024382 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024383 }
24384 }
24385 }
24386 }
24387
24388 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_div_8) {
24389 TEST_REQUIRES_X86_XOP;
24390 for (size_t k = 16; k <= 80; k += 8) {
24391 GemmMicrokernelTester()
24392 .mr(4)
24393 .nr(4)
24394 .kr(2)
24395 .sr(1)
24396 .m(4)
24397 .n(4)
24398 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080024399 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024400 }
24401 }
24402
24403 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_div_8_strided_a) {
24404 TEST_REQUIRES_X86_XOP;
24405 for (size_t k = 16; k <= 80; k += 8) {
24406 GemmMicrokernelTester()
24407 .mr(4)
24408 .nr(4)
24409 .kr(2)
24410 .sr(1)
24411 .m(4)
24412 .n(4)
24413 .k(k)
24414 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080024415 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024416 }
24417 }
24418
24419 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_div_8_subtile) {
24420 TEST_REQUIRES_X86_XOP;
24421 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080024422 for (uint32_t n = 1; n <= 4; n++) {
24423 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024424 GemmMicrokernelTester()
24425 .mr(4)
24426 .nr(4)
24427 .kr(2)
24428 .sr(1)
24429 .m(m)
24430 .n(n)
24431 .k(k)
24432 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080024433 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024434 }
24435 }
24436 }
24437 }
24438
24439 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, n_gt_4) {
24440 TEST_REQUIRES_X86_XOP;
24441 for (uint32_t n = 5; n < 8; n++) {
24442 for (size_t k = 1; k <= 40; k += 9) {
24443 GemmMicrokernelTester()
24444 .mr(4)
24445 .nr(4)
24446 .kr(2)
24447 .sr(1)
24448 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080024449 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024450 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080024451 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024452 }
24453 }
24454 }
24455
24456 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, n_gt_4_strided_cn) {
24457 TEST_REQUIRES_X86_XOP;
24458 for (uint32_t n = 5; n < 8; n++) {
24459 for (size_t k = 1; k <= 40; k += 9) {
24460 GemmMicrokernelTester()
24461 .mr(4)
24462 .nr(4)
24463 .kr(2)
24464 .sr(1)
24465 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080024466 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024467 .k(k)
24468 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080024469 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024470 }
24471 }
24472 }
24473
24474 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, n_gt_4_strided_a) {
24475 TEST_REQUIRES_X86_XOP;
24476 for (uint32_t n = 5; n < 8; n++) {
24477 for (size_t k = 1; k <= 40; k += 9) {
24478 GemmMicrokernelTester()
24479 .mr(4)
24480 .nr(4)
24481 .kr(2)
24482 .sr(1)
24483 .m(4)
24484 .n(n)
24485 .k(k)
24486 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080024487 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024488 }
24489 }
24490 }
24491
24492 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, n_gt_4_subtile) {
24493 TEST_REQUIRES_X86_XOP;
24494 for (uint32_t n = 5; n < 8; n++) {
24495 for (size_t k = 1; k <= 40; k += 9) {
24496 for (uint32_t m = 1; m <= 4; m++) {
24497 GemmMicrokernelTester()
24498 .mr(4)
24499 .nr(4)
24500 .kr(2)
24501 .sr(1)
24502 .m(m)
24503 .n(n)
24504 .k(k)
24505 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080024506 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024507 }
24508 }
24509 }
24510 }
24511
24512 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, n_div_4) {
24513 TEST_REQUIRES_X86_XOP;
24514 for (uint32_t n = 8; n <= 12; n += 4) {
24515 for (size_t k = 1; k <= 40; k += 9) {
24516 GemmMicrokernelTester()
24517 .mr(4)
24518 .nr(4)
24519 .kr(2)
24520 .sr(1)
24521 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080024522 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024523 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080024524 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024525 }
24526 }
24527 }
24528
24529 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, n_div_4_strided_cn) {
24530 TEST_REQUIRES_X86_XOP;
24531 for (uint32_t n = 8; n <= 12; n += 4) {
24532 for (size_t k = 1; k <= 40; k += 9) {
24533 GemmMicrokernelTester()
24534 .mr(4)
24535 .nr(4)
24536 .kr(2)
24537 .sr(1)
24538 .m(4)
24539 .n(n)
24540 .k(k)
24541 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080024542 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024543 }
24544 }
24545 }
24546
24547 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, n_div_4_strided_a) {
24548 TEST_REQUIRES_X86_XOP;
24549 for (uint32_t n = 8; n <= 12; n += 4) {
24550 for (size_t k = 1; k <= 40; k += 9) {
24551 GemmMicrokernelTester()
24552 .mr(4)
24553 .nr(4)
24554 .kr(2)
24555 .sr(1)
24556 .m(4)
24557 .n(n)
24558 .k(k)
24559 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080024560 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024561 }
24562 }
24563 }
24564
24565 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, n_div_4_subtile) {
24566 TEST_REQUIRES_X86_XOP;
24567 for (uint32_t n = 8; n <= 12; n += 4) {
24568 for (size_t k = 1; k <= 40; k += 9) {
24569 for (uint32_t m = 1; m <= 4; m++) {
24570 GemmMicrokernelTester()
24571 .mr(4)
24572 .nr(4)
24573 .kr(2)
24574 .sr(1)
24575 .m(m)
24576 .n(n)
24577 .k(k)
24578 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080024579 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024580 }
24581 }
24582 }
24583 }
24584
24585 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, strided_cm_subtile) {
24586 TEST_REQUIRES_X86_XOP;
24587 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080024588 for (uint32_t n = 1; n <= 4; n++) {
24589 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024590 GemmMicrokernelTester()
24591 .mr(4)
24592 .nr(4)
24593 .kr(2)
24594 .sr(1)
24595 .m(m)
24596 .n(n)
24597 .k(k)
24598 .cm_stride(7)
24599 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080024600 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024601 }
24602 }
24603 }
24604 }
24605
24606 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, qmin) {
24607 TEST_REQUIRES_X86_XOP;
24608 GemmMicrokernelTester()
24609 .mr(4)
24610 .nr(4)
24611 .kr(2)
24612 .sr(1)
24613 .m(4)
24614 .n(4)
24615 .k(8)
24616 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080024617 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024618 }
24619
24620 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, qmax) {
24621 TEST_REQUIRES_X86_XOP;
24622 GemmMicrokernelTester()
24623 .mr(4)
24624 .nr(4)
24625 .kr(2)
24626 .sr(1)
24627 .m(4)
24628 .n(4)
24629 .k(8)
24630 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080024631 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024632 }
24633
24634 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, strided_cm) {
24635 TEST_REQUIRES_X86_XOP;
24636 GemmMicrokernelTester()
24637 .mr(4)
24638 .nr(4)
24639 .kr(2)
24640 .sr(1)
24641 .m(4)
24642 .n(4)
24643 .k(8)
24644 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080024645 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024646 }
24647#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
24648
24649
24650#if XNN_ARCH_X86 || XNN_ARCH_X86_64
24651 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_eq_8) {
24652 TEST_REQUIRES_X86_SSE2;
24653 GemmMicrokernelTester()
24654 .mr(2)
24655 .nr(4)
24656 .kr(2)
24657 .sr(1)
24658 .m(2)
24659 .n(4)
24660 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080024661 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024662 }
24663
24664 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, strided_cn) {
24665 TEST_REQUIRES_X86_SSE2;
24666 GemmMicrokernelTester()
24667 .mr(2)
24668 .nr(4)
24669 .kr(2)
24670 .sr(1)
24671 .m(2)
24672 .n(4)
24673 .k(8)
24674 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080024675 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024676 }
24677
24678 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_eq_8_strided_a) {
24679 TEST_REQUIRES_X86_SSE2;
24680 GemmMicrokernelTester()
24681 .mr(2)
24682 .nr(4)
24683 .kr(2)
24684 .sr(1)
24685 .m(2)
24686 .n(4)
24687 .k(8)
24688 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080024689 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024690 }
24691
24692 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_eq_8_subtile) {
24693 TEST_REQUIRES_X86_SSE2;
Zhi An Ng83844ae2022-01-14 09:52:25 -080024694 for (uint32_t n = 1; n <= 4; n++) {
24695 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024696 GemmMicrokernelTester()
24697 .mr(2)
24698 .nr(4)
24699 .kr(2)
24700 .sr(1)
24701 .m(m)
24702 .n(n)
24703 .k(8)
24704 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080024705 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024706 }
24707 }
24708 }
24709
24710 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_eq_8_subtile_m) {
24711 TEST_REQUIRES_X86_SSE2;
24712 for (uint32_t m = 1; m <= 2; m++) {
24713 GemmMicrokernelTester()
24714 .mr(2)
24715 .nr(4)
24716 .kr(2)
24717 .sr(1)
24718 .m(m)
24719 .n(4)
24720 .k(8)
24721 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080024722 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024723 }
24724 }
24725
24726 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_eq_8_subtile_n) {
24727 TEST_REQUIRES_X86_SSE2;
24728 for (uint32_t n = 1; n <= 4; n++) {
24729 GemmMicrokernelTester()
24730 .mr(2)
24731 .nr(4)
24732 .kr(2)
24733 .sr(1)
24734 .m(2)
24735 .n(n)
24736 .k(8)
24737 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080024738 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024739 }
24740 }
24741
24742 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_lt_8) {
24743 TEST_REQUIRES_X86_SSE2;
24744 for (size_t k = 1; k < 8; k++) {
24745 GemmMicrokernelTester()
24746 .mr(2)
24747 .nr(4)
24748 .kr(2)
24749 .sr(1)
24750 .m(2)
24751 .n(4)
24752 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080024753 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024754 }
24755 }
24756
24757 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_lt_8_strided_a) {
24758 TEST_REQUIRES_X86_SSE2;
24759 for (size_t k = 1; k < 8; k++) {
24760 GemmMicrokernelTester()
24761 .mr(2)
24762 .nr(4)
24763 .kr(2)
24764 .sr(1)
24765 .m(2)
24766 .n(4)
24767 .k(k)
24768 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080024769 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024770 }
24771 }
24772
24773 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_lt_8_subtile) {
24774 TEST_REQUIRES_X86_SSE2;
24775 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080024776 for (uint32_t n = 1; n <= 4; n++) {
24777 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024778 GemmMicrokernelTester()
24779 .mr(2)
24780 .nr(4)
24781 .kr(2)
24782 .sr(1)
24783 .m(m)
24784 .n(n)
24785 .k(k)
24786 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080024787 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024788 }
24789 }
24790 }
24791 }
24792
24793 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_gt_8) {
24794 TEST_REQUIRES_X86_SSE2;
24795 for (size_t k = 9; k < 16; k++) {
24796 GemmMicrokernelTester()
24797 .mr(2)
24798 .nr(4)
24799 .kr(2)
24800 .sr(1)
24801 .m(2)
24802 .n(4)
24803 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080024804 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024805 }
24806 }
24807
24808 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_gt_8_strided_a) {
24809 TEST_REQUIRES_X86_SSE2;
24810 for (size_t k = 9; k < 16; k++) {
24811 GemmMicrokernelTester()
24812 .mr(2)
24813 .nr(4)
24814 .kr(2)
24815 .sr(1)
24816 .m(2)
24817 .n(4)
24818 .k(k)
24819 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080024820 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024821 }
24822 }
24823
24824 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_gt_8_subtile) {
24825 TEST_REQUIRES_X86_SSE2;
24826 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080024827 for (uint32_t n = 1; n <= 4; n++) {
24828 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024829 GemmMicrokernelTester()
24830 .mr(2)
24831 .nr(4)
24832 .kr(2)
24833 .sr(1)
24834 .m(m)
24835 .n(n)
24836 .k(k)
24837 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080024838 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024839 }
24840 }
24841 }
24842 }
24843
24844 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_div_8) {
24845 TEST_REQUIRES_X86_SSE2;
24846 for (size_t k = 16; k <= 80; k += 8) {
24847 GemmMicrokernelTester()
24848 .mr(2)
24849 .nr(4)
24850 .kr(2)
24851 .sr(1)
24852 .m(2)
24853 .n(4)
24854 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080024855 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024856 }
24857 }
24858
24859 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_div_8_strided_a) {
24860 TEST_REQUIRES_X86_SSE2;
24861 for (size_t k = 16; k <= 80; k += 8) {
24862 GemmMicrokernelTester()
24863 .mr(2)
24864 .nr(4)
24865 .kr(2)
24866 .sr(1)
24867 .m(2)
24868 .n(4)
24869 .k(k)
24870 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080024871 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024872 }
24873 }
24874
24875 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_div_8_subtile) {
24876 TEST_REQUIRES_X86_SSE2;
24877 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080024878 for (uint32_t n = 1; n <= 4; n++) {
24879 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024880 GemmMicrokernelTester()
24881 .mr(2)
24882 .nr(4)
24883 .kr(2)
24884 .sr(1)
24885 .m(m)
24886 .n(n)
24887 .k(k)
24888 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080024889 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024890 }
24891 }
24892 }
24893 }
24894
24895 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, n_gt_4) {
24896 TEST_REQUIRES_X86_SSE2;
24897 for (uint32_t n = 5; n < 8; n++) {
24898 for (size_t k = 1; k <= 40; k += 9) {
24899 GemmMicrokernelTester()
24900 .mr(2)
24901 .nr(4)
24902 .kr(2)
24903 .sr(1)
24904 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080024905 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024906 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080024907 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024908 }
24909 }
24910 }
24911
24912 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, n_gt_4_strided_cn) {
24913 TEST_REQUIRES_X86_SSE2;
24914 for (uint32_t n = 5; n < 8; n++) {
24915 for (size_t k = 1; k <= 40; k += 9) {
24916 GemmMicrokernelTester()
24917 .mr(2)
24918 .nr(4)
24919 .kr(2)
24920 .sr(1)
24921 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080024922 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024923 .k(k)
24924 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080024925 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024926 }
24927 }
24928 }
24929
24930 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, n_gt_4_strided_a) {
24931 TEST_REQUIRES_X86_SSE2;
24932 for (uint32_t n = 5; n < 8; n++) {
24933 for (size_t k = 1; k <= 40; k += 9) {
24934 GemmMicrokernelTester()
24935 .mr(2)
24936 .nr(4)
24937 .kr(2)
24938 .sr(1)
24939 .m(2)
24940 .n(n)
24941 .k(k)
24942 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080024943 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024944 }
24945 }
24946 }
24947
24948 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, n_gt_4_subtile) {
24949 TEST_REQUIRES_X86_SSE2;
24950 for (uint32_t n = 5; n < 8; n++) {
24951 for (size_t k = 1; k <= 40; k += 9) {
24952 for (uint32_t m = 1; m <= 2; m++) {
24953 GemmMicrokernelTester()
24954 .mr(2)
24955 .nr(4)
24956 .kr(2)
24957 .sr(1)
24958 .m(m)
24959 .n(n)
24960 .k(k)
24961 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080024962 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024963 }
24964 }
24965 }
24966 }
24967
24968 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, n_div_4) {
24969 TEST_REQUIRES_X86_SSE2;
24970 for (uint32_t n = 8; n <= 12; n += 4) {
24971 for (size_t k = 1; k <= 40; k += 9) {
24972 GemmMicrokernelTester()
24973 .mr(2)
24974 .nr(4)
24975 .kr(2)
24976 .sr(1)
24977 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080024978 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024979 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080024980 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024981 }
24982 }
24983 }
24984
24985 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, n_div_4_strided_cn) {
24986 TEST_REQUIRES_X86_SSE2;
24987 for (uint32_t n = 8; n <= 12; n += 4) {
24988 for (size_t k = 1; k <= 40; k += 9) {
24989 GemmMicrokernelTester()
24990 .mr(2)
24991 .nr(4)
24992 .kr(2)
24993 .sr(1)
24994 .m(2)
24995 .n(n)
24996 .k(k)
24997 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080024998 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024999 }
25000 }
25001 }
25002
25003 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, n_div_4_strided_a) {
25004 TEST_REQUIRES_X86_SSE2;
25005 for (uint32_t n = 8; n <= 12; n += 4) {
25006 for (size_t k = 1; k <= 40; k += 9) {
25007 GemmMicrokernelTester()
25008 .mr(2)
25009 .nr(4)
25010 .kr(2)
25011 .sr(1)
25012 .m(2)
25013 .n(n)
25014 .k(k)
25015 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080025016 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025017 }
25018 }
25019 }
25020
25021 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, n_div_4_subtile) {
25022 TEST_REQUIRES_X86_SSE2;
25023 for (uint32_t n = 8; n <= 12; n += 4) {
25024 for (size_t k = 1; k <= 40; k += 9) {
25025 for (uint32_t m = 1; m <= 2; m++) {
25026 GemmMicrokernelTester()
25027 .mr(2)
25028 .nr(4)
25029 .kr(2)
25030 .sr(1)
25031 .m(m)
25032 .n(n)
25033 .k(k)
25034 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080025035 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025036 }
25037 }
25038 }
25039 }
25040
25041 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, strided_cm_subtile) {
25042 TEST_REQUIRES_X86_SSE2;
25043 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080025044 for (uint32_t n = 1; n <= 4; n++) {
25045 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025046 GemmMicrokernelTester()
25047 .mr(2)
25048 .nr(4)
25049 .kr(2)
25050 .sr(1)
25051 .m(m)
25052 .n(n)
25053 .k(k)
25054 .cm_stride(7)
25055 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080025056 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025057 }
25058 }
25059 }
25060 }
25061
25062 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, qmin) {
25063 TEST_REQUIRES_X86_SSE2;
25064 GemmMicrokernelTester()
25065 .mr(2)
25066 .nr(4)
25067 .kr(2)
25068 .sr(1)
25069 .m(2)
25070 .n(4)
25071 .k(8)
25072 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080025073 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025074 }
25075
25076 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, qmax) {
25077 TEST_REQUIRES_X86_SSE2;
25078 GemmMicrokernelTester()
25079 .mr(2)
25080 .nr(4)
25081 .kr(2)
25082 .sr(1)
25083 .m(2)
25084 .n(4)
25085 .k(8)
25086 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080025087 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025088 }
25089
25090 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, strided_cm) {
25091 TEST_REQUIRES_X86_SSE2;
25092 GemmMicrokernelTester()
25093 .mr(2)
25094 .nr(4)
25095 .kr(2)
25096 .sr(1)
25097 .m(2)
25098 .n(4)
25099 .k(8)
25100 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080025101 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025102 }
25103#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
25104
25105
25106#if XNN_ARCH_X86 || XNN_ARCH_X86_64
25107 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_eq_8) {
25108 TEST_REQUIRES_X86_SSE2;
25109 GemmMicrokernelTester()
25110 .mr(4)
25111 .nr(4)
25112 .kr(2)
25113 .sr(1)
25114 .m(4)
25115 .n(4)
25116 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080025117 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025118 }
25119
25120 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, strided_cn) {
25121 TEST_REQUIRES_X86_SSE2;
25122 GemmMicrokernelTester()
25123 .mr(4)
25124 .nr(4)
25125 .kr(2)
25126 .sr(1)
25127 .m(4)
25128 .n(4)
25129 .k(8)
25130 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080025131 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025132 }
25133
25134 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_eq_8_strided_a) {
25135 TEST_REQUIRES_X86_SSE2;
25136 GemmMicrokernelTester()
25137 .mr(4)
25138 .nr(4)
25139 .kr(2)
25140 .sr(1)
25141 .m(4)
25142 .n(4)
25143 .k(8)
25144 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080025145 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025146 }
25147
25148 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_eq_8_subtile) {
25149 TEST_REQUIRES_X86_SSE2;
Zhi An Ng83844ae2022-01-14 09:52:25 -080025150 for (uint32_t n = 1; n <= 4; n++) {
25151 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025152 GemmMicrokernelTester()
25153 .mr(4)
25154 .nr(4)
25155 .kr(2)
25156 .sr(1)
25157 .m(m)
25158 .n(n)
25159 .k(8)
25160 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080025161 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025162 }
25163 }
25164 }
25165
25166 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_eq_8_subtile_m) {
25167 TEST_REQUIRES_X86_SSE2;
25168 for (uint32_t m = 1; m <= 4; m++) {
25169 GemmMicrokernelTester()
25170 .mr(4)
25171 .nr(4)
25172 .kr(2)
25173 .sr(1)
25174 .m(m)
25175 .n(4)
25176 .k(8)
25177 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080025178 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025179 }
25180 }
25181
25182 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_eq_8_subtile_n) {
25183 TEST_REQUIRES_X86_SSE2;
25184 for (uint32_t n = 1; n <= 4; n++) {
25185 GemmMicrokernelTester()
25186 .mr(4)
25187 .nr(4)
25188 .kr(2)
25189 .sr(1)
25190 .m(4)
25191 .n(n)
25192 .k(8)
25193 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080025194 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025195 }
25196 }
25197
25198 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_lt_8) {
25199 TEST_REQUIRES_X86_SSE2;
25200 for (size_t k = 1; k < 8; k++) {
25201 GemmMicrokernelTester()
25202 .mr(4)
25203 .nr(4)
25204 .kr(2)
25205 .sr(1)
25206 .m(4)
25207 .n(4)
25208 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080025209 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025210 }
25211 }
25212
25213 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_lt_8_strided_a) {
25214 TEST_REQUIRES_X86_SSE2;
25215 for (size_t k = 1; k < 8; k++) {
25216 GemmMicrokernelTester()
25217 .mr(4)
25218 .nr(4)
25219 .kr(2)
25220 .sr(1)
25221 .m(4)
25222 .n(4)
25223 .k(k)
25224 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080025225 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025226 }
25227 }
25228
25229 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_lt_8_subtile) {
25230 TEST_REQUIRES_X86_SSE2;
25231 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080025232 for (uint32_t n = 1; n <= 4; n++) {
25233 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025234 GemmMicrokernelTester()
25235 .mr(4)
25236 .nr(4)
25237 .kr(2)
25238 .sr(1)
25239 .m(m)
25240 .n(n)
25241 .k(k)
25242 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080025243 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025244 }
25245 }
25246 }
25247 }
25248
25249 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_gt_8) {
25250 TEST_REQUIRES_X86_SSE2;
25251 for (size_t k = 9; k < 16; k++) {
25252 GemmMicrokernelTester()
25253 .mr(4)
25254 .nr(4)
25255 .kr(2)
25256 .sr(1)
25257 .m(4)
25258 .n(4)
25259 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080025260 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025261 }
25262 }
25263
25264 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_gt_8_strided_a) {
25265 TEST_REQUIRES_X86_SSE2;
25266 for (size_t k = 9; k < 16; k++) {
25267 GemmMicrokernelTester()
25268 .mr(4)
25269 .nr(4)
25270 .kr(2)
25271 .sr(1)
25272 .m(4)
25273 .n(4)
25274 .k(k)
25275 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080025276 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025277 }
25278 }
25279
25280 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_gt_8_subtile) {
25281 TEST_REQUIRES_X86_SSE2;
25282 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080025283 for (uint32_t n = 1; n <= 4; n++) {
25284 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025285 GemmMicrokernelTester()
25286 .mr(4)
25287 .nr(4)
25288 .kr(2)
25289 .sr(1)
25290 .m(m)
25291 .n(n)
25292 .k(k)
25293 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080025294 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025295 }
25296 }
25297 }
25298 }
25299
25300 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_div_8) {
25301 TEST_REQUIRES_X86_SSE2;
25302 for (size_t k = 16; k <= 80; k += 8) {
25303 GemmMicrokernelTester()
25304 .mr(4)
25305 .nr(4)
25306 .kr(2)
25307 .sr(1)
25308 .m(4)
25309 .n(4)
25310 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080025311 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025312 }
25313 }
25314
25315 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_div_8_strided_a) {
25316 TEST_REQUIRES_X86_SSE2;
25317 for (size_t k = 16; k <= 80; k += 8) {
25318 GemmMicrokernelTester()
25319 .mr(4)
25320 .nr(4)
25321 .kr(2)
25322 .sr(1)
25323 .m(4)
25324 .n(4)
25325 .k(k)
25326 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080025327 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025328 }
25329 }
25330
25331 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_div_8_subtile) {
25332 TEST_REQUIRES_X86_SSE2;
25333 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080025334 for (uint32_t n = 1; n <= 4; n++) {
25335 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025336 GemmMicrokernelTester()
25337 .mr(4)
25338 .nr(4)
25339 .kr(2)
25340 .sr(1)
25341 .m(m)
25342 .n(n)
25343 .k(k)
25344 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080025345 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025346 }
25347 }
25348 }
25349 }
25350
25351 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, n_gt_4) {
25352 TEST_REQUIRES_X86_SSE2;
25353 for (uint32_t n = 5; n < 8; n++) {
25354 for (size_t k = 1; k <= 40; k += 9) {
25355 GemmMicrokernelTester()
25356 .mr(4)
25357 .nr(4)
25358 .kr(2)
25359 .sr(1)
25360 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080025361 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025362 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080025363 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025364 }
25365 }
25366 }
25367
25368 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, n_gt_4_strided_cn) {
25369 TEST_REQUIRES_X86_SSE2;
25370 for (uint32_t n = 5; n < 8; n++) {
25371 for (size_t k = 1; k <= 40; k += 9) {
25372 GemmMicrokernelTester()
25373 .mr(4)
25374 .nr(4)
25375 .kr(2)
25376 .sr(1)
25377 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080025378 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025379 .k(k)
25380 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080025381 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025382 }
25383 }
25384 }
25385
25386 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, n_gt_4_strided_a) {
25387 TEST_REQUIRES_X86_SSE2;
25388 for (uint32_t n = 5; n < 8; n++) {
25389 for (size_t k = 1; k <= 40; k += 9) {
25390 GemmMicrokernelTester()
25391 .mr(4)
25392 .nr(4)
25393 .kr(2)
25394 .sr(1)
25395 .m(4)
25396 .n(n)
25397 .k(k)
25398 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080025399 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025400 }
25401 }
25402 }
25403
25404 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, n_gt_4_subtile) {
25405 TEST_REQUIRES_X86_SSE2;
25406 for (uint32_t n = 5; n < 8; n++) {
25407 for (size_t k = 1; k <= 40; k += 9) {
25408 for (uint32_t m = 1; m <= 4; m++) {
25409 GemmMicrokernelTester()
25410 .mr(4)
25411 .nr(4)
25412 .kr(2)
25413 .sr(1)
25414 .m(m)
25415 .n(n)
25416 .k(k)
25417 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080025418 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025419 }
25420 }
25421 }
25422 }
25423
25424 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, n_div_4) {
25425 TEST_REQUIRES_X86_SSE2;
25426 for (uint32_t n = 8; n <= 12; n += 4) {
25427 for (size_t k = 1; k <= 40; k += 9) {
25428 GemmMicrokernelTester()
25429 .mr(4)
25430 .nr(4)
25431 .kr(2)
25432 .sr(1)
25433 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080025434 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025435 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080025436 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025437 }
25438 }
25439 }
25440
25441 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, n_div_4_strided_cn) {
25442 TEST_REQUIRES_X86_SSE2;
25443 for (uint32_t n = 8; n <= 12; n += 4) {
25444 for (size_t k = 1; k <= 40; k += 9) {
25445 GemmMicrokernelTester()
25446 .mr(4)
25447 .nr(4)
25448 .kr(2)
25449 .sr(1)
25450 .m(4)
25451 .n(n)
25452 .k(k)
25453 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080025454 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025455 }
25456 }
25457 }
25458
25459 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, n_div_4_strided_a) {
25460 TEST_REQUIRES_X86_SSE2;
25461 for (uint32_t n = 8; n <= 12; n += 4) {
25462 for (size_t k = 1; k <= 40; k += 9) {
25463 GemmMicrokernelTester()
25464 .mr(4)
25465 .nr(4)
25466 .kr(2)
25467 .sr(1)
25468 .m(4)
25469 .n(n)
25470 .k(k)
25471 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080025472 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025473 }
25474 }
25475 }
25476
25477 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, n_div_4_subtile) {
25478 TEST_REQUIRES_X86_SSE2;
25479 for (uint32_t n = 8; n <= 12; n += 4) {
25480 for (size_t k = 1; k <= 40; k += 9) {
25481 for (uint32_t m = 1; m <= 4; m++) {
25482 GemmMicrokernelTester()
25483 .mr(4)
25484 .nr(4)
25485 .kr(2)
25486 .sr(1)
25487 .m(m)
25488 .n(n)
25489 .k(k)
25490 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080025491 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025492 }
25493 }
25494 }
25495 }
25496
25497 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, strided_cm_subtile) {
25498 TEST_REQUIRES_X86_SSE2;
25499 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080025500 for (uint32_t n = 1; n <= 4; n++) {
25501 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025502 GemmMicrokernelTester()
25503 .mr(4)
25504 .nr(4)
25505 .kr(2)
25506 .sr(1)
25507 .m(m)
25508 .n(n)
25509 .k(k)
25510 .cm_stride(7)
25511 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080025512 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025513 }
25514 }
25515 }
25516 }
25517
25518 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, qmin) {
25519 TEST_REQUIRES_X86_SSE2;
25520 GemmMicrokernelTester()
25521 .mr(4)
25522 .nr(4)
25523 .kr(2)
25524 .sr(1)
25525 .m(4)
25526 .n(4)
25527 .k(8)
25528 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080025529 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025530 }
25531
25532 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, qmax) {
25533 TEST_REQUIRES_X86_SSE2;
25534 GemmMicrokernelTester()
25535 .mr(4)
25536 .nr(4)
25537 .kr(2)
25538 .sr(1)
25539 .m(4)
25540 .n(4)
25541 .k(8)
25542 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080025543 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025544 }
25545
25546 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, strided_cm) {
25547 TEST_REQUIRES_X86_SSE2;
25548 GemmMicrokernelTester()
25549 .mr(4)
25550 .nr(4)
25551 .kr(2)
25552 .sr(1)
25553 .m(4)
25554 .n(4)
25555 .k(8)
25556 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080025557 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025558 }
25559#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
25560
25561
25562#if XNN_ARCH_X86 || XNN_ARCH_X86_64
25563 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_eq_8) {
25564 TEST_REQUIRES_X86_SSE41;
25565 GemmMicrokernelTester()
25566 .mr(3)
25567 .nr(4)
25568 .kr(2)
25569 .sr(1)
25570 .m(3)
25571 .n(4)
25572 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080025573 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025574 }
25575
25576 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, strided_cn) {
25577 TEST_REQUIRES_X86_SSE41;
25578 GemmMicrokernelTester()
25579 .mr(3)
25580 .nr(4)
25581 .kr(2)
25582 .sr(1)
25583 .m(3)
25584 .n(4)
25585 .k(8)
25586 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080025587 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025588 }
25589
25590 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_eq_8_strided_a) {
25591 TEST_REQUIRES_X86_SSE41;
25592 GemmMicrokernelTester()
25593 .mr(3)
25594 .nr(4)
25595 .kr(2)
25596 .sr(1)
25597 .m(3)
25598 .n(4)
25599 .k(8)
25600 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080025601 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025602 }
25603
25604 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_eq_8_subtile) {
25605 TEST_REQUIRES_X86_SSE41;
Zhi An Ng83844ae2022-01-14 09:52:25 -080025606 for (uint32_t n = 1; n <= 4; n++) {
25607 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025608 GemmMicrokernelTester()
25609 .mr(3)
25610 .nr(4)
25611 .kr(2)
25612 .sr(1)
25613 .m(m)
25614 .n(n)
25615 .k(8)
25616 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080025617 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025618 }
25619 }
25620 }
25621
25622 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_eq_8_subtile_m) {
25623 TEST_REQUIRES_X86_SSE41;
25624 for (uint32_t m = 1; m <= 3; m++) {
25625 GemmMicrokernelTester()
25626 .mr(3)
25627 .nr(4)
25628 .kr(2)
25629 .sr(1)
25630 .m(m)
25631 .n(4)
25632 .k(8)
25633 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080025634 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025635 }
25636 }
25637
25638 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_eq_8_subtile_n) {
25639 TEST_REQUIRES_X86_SSE41;
25640 for (uint32_t n = 1; n <= 4; n++) {
25641 GemmMicrokernelTester()
25642 .mr(3)
25643 .nr(4)
25644 .kr(2)
25645 .sr(1)
25646 .m(3)
25647 .n(n)
25648 .k(8)
25649 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080025650 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025651 }
25652 }
25653
25654 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_lt_8) {
25655 TEST_REQUIRES_X86_SSE41;
25656 for (size_t k = 1; k < 8; k++) {
25657 GemmMicrokernelTester()
25658 .mr(3)
25659 .nr(4)
25660 .kr(2)
25661 .sr(1)
25662 .m(3)
25663 .n(4)
25664 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080025665 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025666 }
25667 }
25668
25669 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_lt_8_strided_a) {
25670 TEST_REQUIRES_X86_SSE41;
25671 for (size_t k = 1; k < 8; k++) {
25672 GemmMicrokernelTester()
25673 .mr(3)
25674 .nr(4)
25675 .kr(2)
25676 .sr(1)
25677 .m(3)
25678 .n(4)
25679 .k(k)
25680 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080025681 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025682 }
25683 }
25684
25685 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_lt_8_subtile) {
25686 TEST_REQUIRES_X86_SSE41;
25687 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080025688 for (uint32_t n = 1; n <= 4; n++) {
25689 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025690 GemmMicrokernelTester()
25691 .mr(3)
25692 .nr(4)
25693 .kr(2)
25694 .sr(1)
25695 .m(m)
25696 .n(n)
25697 .k(k)
25698 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080025699 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025700 }
25701 }
25702 }
25703 }
25704
25705 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_gt_8) {
25706 TEST_REQUIRES_X86_SSE41;
25707 for (size_t k = 9; k < 16; k++) {
25708 GemmMicrokernelTester()
25709 .mr(3)
25710 .nr(4)
25711 .kr(2)
25712 .sr(1)
25713 .m(3)
25714 .n(4)
25715 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080025716 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025717 }
25718 }
25719
25720 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_gt_8_strided_a) {
25721 TEST_REQUIRES_X86_SSE41;
25722 for (size_t k = 9; k < 16; k++) {
25723 GemmMicrokernelTester()
25724 .mr(3)
25725 .nr(4)
25726 .kr(2)
25727 .sr(1)
25728 .m(3)
25729 .n(4)
25730 .k(k)
25731 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080025732 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025733 }
25734 }
25735
25736 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_gt_8_subtile) {
25737 TEST_REQUIRES_X86_SSE41;
25738 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080025739 for (uint32_t n = 1; n <= 4; n++) {
25740 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025741 GemmMicrokernelTester()
25742 .mr(3)
25743 .nr(4)
25744 .kr(2)
25745 .sr(1)
25746 .m(m)
25747 .n(n)
25748 .k(k)
25749 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080025750 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025751 }
25752 }
25753 }
25754 }
25755
25756 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_div_8) {
25757 TEST_REQUIRES_X86_SSE41;
25758 for (size_t k = 16; k <= 80; k += 8) {
25759 GemmMicrokernelTester()
25760 .mr(3)
25761 .nr(4)
25762 .kr(2)
25763 .sr(1)
25764 .m(3)
25765 .n(4)
25766 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080025767 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025768 }
25769 }
25770
25771 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_div_8_strided_a) {
25772 TEST_REQUIRES_X86_SSE41;
25773 for (size_t k = 16; k <= 80; k += 8) {
25774 GemmMicrokernelTester()
25775 .mr(3)
25776 .nr(4)
25777 .kr(2)
25778 .sr(1)
25779 .m(3)
25780 .n(4)
25781 .k(k)
25782 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080025783 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025784 }
25785 }
25786
25787 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_div_8_subtile) {
25788 TEST_REQUIRES_X86_SSE41;
25789 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080025790 for (uint32_t n = 1; n <= 4; n++) {
25791 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025792 GemmMicrokernelTester()
25793 .mr(3)
25794 .nr(4)
25795 .kr(2)
25796 .sr(1)
25797 .m(m)
25798 .n(n)
25799 .k(k)
25800 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080025801 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025802 }
25803 }
25804 }
25805 }
25806
25807 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, n_gt_4) {
25808 TEST_REQUIRES_X86_SSE41;
25809 for (uint32_t n = 5; n < 8; n++) {
25810 for (size_t k = 1; k <= 40; k += 9) {
25811 GemmMicrokernelTester()
25812 .mr(3)
25813 .nr(4)
25814 .kr(2)
25815 .sr(1)
25816 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080025817 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025818 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080025819 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025820 }
25821 }
25822 }
25823
25824 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, n_gt_4_strided_cn) {
25825 TEST_REQUIRES_X86_SSE41;
25826 for (uint32_t n = 5; n < 8; n++) {
25827 for (size_t k = 1; k <= 40; k += 9) {
25828 GemmMicrokernelTester()
25829 .mr(3)
25830 .nr(4)
25831 .kr(2)
25832 .sr(1)
25833 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080025834 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025835 .k(k)
25836 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080025837 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025838 }
25839 }
25840 }
25841
25842 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, n_gt_4_strided_a) {
25843 TEST_REQUIRES_X86_SSE41;
25844 for (uint32_t n = 5; n < 8; n++) {
25845 for (size_t k = 1; k <= 40; k += 9) {
25846 GemmMicrokernelTester()
25847 .mr(3)
25848 .nr(4)
25849 .kr(2)
25850 .sr(1)
25851 .m(3)
25852 .n(n)
25853 .k(k)
25854 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080025855 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025856 }
25857 }
25858 }
25859
25860 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, n_gt_4_subtile) {
25861 TEST_REQUIRES_X86_SSE41;
25862 for (uint32_t n = 5; n < 8; n++) {
25863 for (size_t k = 1; k <= 40; k += 9) {
25864 for (uint32_t m = 1; m <= 3; m++) {
25865 GemmMicrokernelTester()
25866 .mr(3)
25867 .nr(4)
25868 .kr(2)
25869 .sr(1)
25870 .m(m)
25871 .n(n)
25872 .k(k)
25873 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080025874 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025875 }
25876 }
25877 }
25878 }
25879
25880 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, n_div_4) {
25881 TEST_REQUIRES_X86_SSE41;
25882 for (uint32_t n = 8; n <= 12; n += 4) {
25883 for (size_t k = 1; k <= 40; k += 9) {
25884 GemmMicrokernelTester()
25885 .mr(3)
25886 .nr(4)
25887 .kr(2)
25888 .sr(1)
25889 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080025890 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025891 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080025892 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025893 }
25894 }
25895 }
25896
25897 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, n_div_4_strided_cn) {
25898 TEST_REQUIRES_X86_SSE41;
25899 for (uint32_t n = 8; n <= 12; n += 4) {
25900 for (size_t k = 1; k <= 40; k += 9) {
25901 GemmMicrokernelTester()
25902 .mr(3)
25903 .nr(4)
25904 .kr(2)
25905 .sr(1)
25906 .m(3)
25907 .n(n)
25908 .k(k)
25909 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080025910 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025911 }
25912 }
25913 }
25914
25915 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, n_div_4_strided_a) {
25916 TEST_REQUIRES_X86_SSE41;
25917 for (uint32_t n = 8; n <= 12; n += 4) {
25918 for (size_t k = 1; k <= 40; k += 9) {
25919 GemmMicrokernelTester()
25920 .mr(3)
25921 .nr(4)
25922 .kr(2)
25923 .sr(1)
25924 .m(3)
25925 .n(n)
25926 .k(k)
25927 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080025928 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025929 }
25930 }
25931 }
25932
25933 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, n_div_4_subtile) {
25934 TEST_REQUIRES_X86_SSE41;
25935 for (uint32_t n = 8; n <= 12; n += 4) {
25936 for (size_t k = 1; k <= 40; k += 9) {
25937 for (uint32_t m = 1; m <= 3; m++) {
25938 GemmMicrokernelTester()
25939 .mr(3)
25940 .nr(4)
25941 .kr(2)
25942 .sr(1)
25943 .m(m)
25944 .n(n)
25945 .k(k)
25946 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080025947 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025948 }
25949 }
25950 }
25951 }
25952
25953 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, strided_cm_subtile) {
25954 TEST_REQUIRES_X86_SSE41;
25955 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080025956 for (uint32_t n = 1; n <= 4; n++) {
25957 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025958 GemmMicrokernelTester()
25959 .mr(3)
25960 .nr(4)
25961 .kr(2)
25962 .sr(1)
25963 .m(m)
25964 .n(n)
25965 .k(k)
25966 .cm_stride(7)
25967 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080025968 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025969 }
25970 }
25971 }
25972 }
25973
25974 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, qmin) {
25975 TEST_REQUIRES_X86_SSE41;
25976 GemmMicrokernelTester()
25977 .mr(3)
25978 .nr(4)
25979 .kr(2)
25980 .sr(1)
25981 .m(3)
25982 .n(4)
25983 .k(8)
25984 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080025985 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025986 }
25987
25988 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, qmax) {
25989 TEST_REQUIRES_X86_SSE41;
25990 GemmMicrokernelTester()
25991 .mr(3)
25992 .nr(4)
25993 .kr(2)
25994 .sr(1)
25995 .m(3)
25996 .n(4)
25997 .k(8)
25998 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080025999 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026000 }
26001
26002 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, strided_cm) {
26003 TEST_REQUIRES_X86_SSE41;
26004 GemmMicrokernelTester()
26005 .mr(3)
26006 .nr(4)
26007 .kr(2)
26008 .sr(1)
26009 .m(3)
26010 .n(4)
26011 .k(8)
26012 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080026013 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026014 }
26015#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
26016
26017
26018#if XNN_ARCH_X86 || XNN_ARCH_X86_64
26019 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_eq_8) {
26020 TEST_REQUIRES_X86_SSE41;
26021 GemmMicrokernelTester()
26022 .mr(4)
26023 .nr(4)
26024 .kr(2)
26025 .sr(1)
26026 .m(4)
26027 .n(4)
26028 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080026029 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026030 }
26031
26032 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, strided_cn) {
26033 TEST_REQUIRES_X86_SSE41;
26034 GemmMicrokernelTester()
26035 .mr(4)
26036 .nr(4)
26037 .kr(2)
26038 .sr(1)
26039 .m(4)
26040 .n(4)
26041 .k(8)
26042 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080026043 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026044 }
26045
26046 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_eq_8_strided_a) {
26047 TEST_REQUIRES_X86_SSE41;
26048 GemmMicrokernelTester()
26049 .mr(4)
26050 .nr(4)
26051 .kr(2)
26052 .sr(1)
26053 .m(4)
26054 .n(4)
26055 .k(8)
26056 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080026057 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026058 }
26059
26060 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_eq_8_subtile) {
26061 TEST_REQUIRES_X86_SSE41;
Zhi An Ng83844ae2022-01-14 09:52:25 -080026062 for (uint32_t n = 1; n <= 4; n++) {
26063 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026064 GemmMicrokernelTester()
26065 .mr(4)
26066 .nr(4)
26067 .kr(2)
26068 .sr(1)
26069 .m(m)
26070 .n(n)
26071 .k(8)
26072 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080026073 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026074 }
26075 }
26076 }
26077
26078 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_eq_8_subtile_m) {
26079 TEST_REQUIRES_X86_SSE41;
26080 for (uint32_t m = 1; m <= 4; m++) {
26081 GemmMicrokernelTester()
26082 .mr(4)
26083 .nr(4)
26084 .kr(2)
26085 .sr(1)
26086 .m(m)
26087 .n(4)
26088 .k(8)
26089 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080026090 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026091 }
26092 }
26093
26094 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_eq_8_subtile_n) {
26095 TEST_REQUIRES_X86_SSE41;
26096 for (uint32_t n = 1; n <= 4; n++) {
26097 GemmMicrokernelTester()
26098 .mr(4)
26099 .nr(4)
26100 .kr(2)
26101 .sr(1)
26102 .m(4)
26103 .n(n)
26104 .k(8)
26105 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080026106 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026107 }
26108 }
26109
26110 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_lt_8) {
26111 TEST_REQUIRES_X86_SSE41;
26112 for (size_t k = 1; k < 8; k++) {
26113 GemmMicrokernelTester()
26114 .mr(4)
26115 .nr(4)
26116 .kr(2)
26117 .sr(1)
26118 .m(4)
26119 .n(4)
26120 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080026121 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026122 }
26123 }
26124
26125 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_lt_8_strided_a) {
26126 TEST_REQUIRES_X86_SSE41;
26127 for (size_t k = 1; k < 8; k++) {
26128 GemmMicrokernelTester()
26129 .mr(4)
26130 .nr(4)
26131 .kr(2)
26132 .sr(1)
26133 .m(4)
26134 .n(4)
26135 .k(k)
26136 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080026137 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026138 }
26139 }
26140
26141 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_lt_8_subtile) {
26142 TEST_REQUIRES_X86_SSE41;
26143 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080026144 for (uint32_t n = 1; n <= 4; n++) {
26145 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026146 GemmMicrokernelTester()
26147 .mr(4)
26148 .nr(4)
26149 .kr(2)
26150 .sr(1)
26151 .m(m)
26152 .n(n)
26153 .k(k)
26154 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080026155 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026156 }
26157 }
26158 }
26159 }
26160
26161 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_gt_8) {
26162 TEST_REQUIRES_X86_SSE41;
26163 for (size_t k = 9; k < 16; k++) {
26164 GemmMicrokernelTester()
26165 .mr(4)
26166 .nr(4)
26167 .kr(2)
26168 .sr(1)
26169 .m(4)
26170 .n(4)
26171 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080026172 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026173 }
26174 }
26175
26176 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_gt_8_strided_a) {
26177 TEST_REQUIRES_X86_SSE41;
26178 for (size_t k = 9; k < 16; k++) {
26179 GemmMicrokernelTester()
26180 .mr(4)
26181 .nr(4)
26182 .kr(2)
26183 .sr(1)
26184 .m(4)
26185 .n(4)
26186 .k(k)
26187 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080026188 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026189 }
26190 }
26191
26192 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_gt_8_subtile) {
26193 TEST_REQUIRES_X86_SSE41;
26194 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080026195 for (uint32_t n = 1; n <= 4; n++) {
26196 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026197 GemmMicrokernelTester()
26198 .mr(4)
26199 .nr(4)
26200 .kr(2)
26201 .sr(1)
26202 .m(m)
26203 .n(n)
26204 .k(k)
26205 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080026206 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026207 }
26208 }
26209 }
26210 }
26211
26212 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_div_8) {
26213 TEST_REQUIRES_X86_SSE41;
26214 for (size_t k = 16; k <= 80; k += 8) {
26215 GemmMicrokernelTester()
26216 .mr(4)
26217 .nr(4)
26218 .kr(2)
26219 .sr(1)
26220 .m(4)
26221 .n(4)
26222 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080026223 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026224 }
26225 }
26226
26227 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_div_8_strided_a) {
26228 TEST_REQUIRES_X86_SSE41;
26229 for (size_t k = 16; k <= 80; k += 8) {
26230 GemmMicrokernelTester()
26231 .mr(4)
26232 .nr(4)
26233 .kr(2)
26234 .sr(1)
26235 .m(4)
26236 .n(4)
26237 .k(k)
26238 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080026239 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026240 }
26241 }
26242
26243 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_div_8_subtile) {
26244 TEST_REQUIRES_X86_SSE41;
26245 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080026246 for (uint32_t n = 1; n <= 4; n++) {
26247 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026248 GemmMicrokernelTester()
26249 .mr(4)
26250 .nr(4)
26251 .kr(2)
26252 .sr(1)
26253 .m(m)
26254 .n(n)
26255 .k(k)
26256 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080026257 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026258 }
26259 }
26260 }
26261 }
26262
26263 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, n_gt_4) {
26264 TEST_REQUIRES_X86_SSE41;
26265 for (uint32_t n = 5; n < 8; n++) {
26266 for (size_t k = 1; k <= 40; k += 9) {
26267 GemmMicrokernelTester()
26268 .mr(4)
26269 .nr(4)
26270 .kr(2)
26271 .sr(1)
26272 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080026273 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026274 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080026275 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026276 }
26277 }
26278 }
26279
26280 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, n_gt_4_strided_cn) {
26281 TEST_REQUIRES_X86_SSE41;
26282 for (uint32_t n = 5; n < 8; n++) {
26283 for (size_t k = 1; k <= 40; k += 9) {
26284 GemmMicrokernelTester()
26285 .mr(4)
26286 .nr(4)
26287 .kr(2)
26288 .sr(1)
26289 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080026290 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026291 .k(k)
26292 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080026293 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026294 }
26295 }
26296 }
26297
26298 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, n_gt_4_strided_a) {
26299 TEST_REQUIRES_X86_SSE41;
26300 for (uint32_t n = 5; n < 8; n++) {
26301 for (size_t k = 1; k <= 40; k += 9) {
26302 GemmMicrokernelTester()
26303 .mr(4)
26304 .nr(4)
26305 .kr(2)
26306 .sr(1)
26307 .m(4)
26308 .n(n)
26309 .k(k)
26310 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080026311 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026312 }
26313 }
26314 }
26315
26316 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, n_gt_4_subtile) {
26317 TEST_REQUIRES_X86_SSE41;
26318 for (uint32_t n = 5; n < 8; n++) {
26319 for (size_t k = 1; k <= 40; k += 9) {
26320 for (uint32_t m = 1; m <= 4; m++) {
26321 GemmMicrokernelTester()
26322 .mr(4)
26323 .nr(4)
26324 .kr(2)
26325 .sr(1)
26326 .m(m)
26327 .n(n)
26328 .k(k)
26329 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080026330 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026331 }
26332 }
26333 }
26334 }
26335
26336 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, n_div_4) {
26337 TEST_REQUIRES_X86_SSE41;
26338 for (uint32_t n = 8; n <= 12; n += 4) {
26339 for (size_t k = 1; k <= 40; k += 9) {
26340 GemmMicrokernelTester()
26341 .mr(4)
26342 .nr(4)
26343 .kr(2)
26344 .sr(1)
26345 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080026346 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026347 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080026348 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026349 }
26350 }
26351 }
26352
26353 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, n_div_4_strided_cn) {
26354 TEST_REQUIRES_X86_SSE41;
26355 for (uint32_t n = 8; n <= 12; n += 4) {
26356 for (size_t k = 1; k <= 40; k += 9) {
26357 GemmMicrokernelTester()
26358 .mr(4)
26359 .nr(4)
26360 .kr(2)
26361 .sr(1)
26362 .m(4)
26363 .n(n)
26364 .k(k)
26365 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080026366 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026367 }
26368 }
26369 }
26370
26371 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, n_div_4_strided_a) {
26372 TEST_REQUIRES_X86_SSE41;
26373 for (uint32_t n = 8; n <= 12; n += 4) {
26374 for (size_t k = 1; k <= 40; k += 9) {
26375 GemmMicrokernelTester()
26376 .mr(4)
26377 .nr(4)
26378 .kr(2)
26379 .sr(1)
26380 .m(4)
26381 .n(n)
26382 .k(k)
26383 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080026384 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026385 }
26386 }
26387 }
26388
26389 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, n_div_4_subtile) {
26390 TEST_REQUIRES_X86_SSE41;
26391 for (uint32_t n = 8; n <= 12; n += 4) {
26392 for (size_t k = 1; k <= 40; k += 9) {
26393 for (uint32_t m = 1; m <= 4; m++) {
26394 GemmMicrokernelTester()
26395 .mr(4)
26396 .nr(4)
26397 .kr(2)
26398 .sr(1)
26399 .m(m)
26400 .n(n)
26401 .k(k)
26402 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080026403 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026404 }
26405 }
26406 }
26407 }
26408
26409 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, strided_cm_subtile) {
26410 TEST_REQUIRES_X86_SSE41;
26411 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080026412 for (uint32_t n = 1; n <= 4; n++) {
26413 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026414 GemmMicrokernelTester()
26415 .mr(4)
26416 .nr(4)
26417 .kr(2)
26418 .sr(1)
26419 .m(m)
26420 .n(n)
26421 .k(k)
26422 .cm_stride(7)
26423 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080026424 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026425 }
26426 }
26427 }
26428 }
26429
26430 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, qmin) {
26431 TEST_REQUIRES_X86_SSE41;
26432 GemmMicrokernelTester()
26433 .mr(4)
26434 .nr(4)
26435 .kr(2)
26436 .sr(1)
26437 .m(4)
26438 .n(4)
26439 .k(8)
26440 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080026441 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026442 }
26443
26444 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, qmax) {
26445 TEST_REQUIRES_X86_SSE41;
26446 GemmMicrokernelTester()
26447 .mr(4)
26448 .nr(4)
26449 .kr(2)
26450 .sr(1)
26451 .m(4)
26452 .n(4)
26453 .k(8)
26454 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080026455 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026456 }
26457
26458 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, strided_cm) {
26459 TEST_REQUIRES_X86_SSE41;
26460 GemmMicrokernelTester()
26461 .mr(4)
26462 .nr(4)
26463 .kr(2)
26464 .sr(1)
26465 .m(4)
26466 .n(4)
26467 .k(8)
26468 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080026469 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026470 }
26471#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
26472
26473
26474#if XNN_ARCH_X86 || XNN_ARCH_X86_64
26475 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_eq_8) {
26476 TEST_REQUIRES_X86_AVX;
26477 GemmMicrokernelTester()
26478 .mr(1)
26479 .nr(4)
26480 .kr(2)
26481 .sr(1)
26482 .m(1)
26483 .n(4)
26484 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080026485 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026486 }
26487
26488 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, strided_cn) {
26489 TEST_REQUIRES_X86_AVX;
26490 GemmMicrokernelTester()
26491 .mr(1)
26492 .nr(4)
26493 .kr(2)
26494 .sr(1)
26495 .m(1)
26496 .n(4)
26497 .k(8)
26498 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080026499 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026500 }
26501
26502 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_eq_8_strided_a) {
26503 TEST_REQUIRES_X86_AVX;
26504 GemmMicrokernelTester()
26505 .mr(1)
26506 .nr(4)
26507 .kr(2)
26508 .sr(1)
26509 .m(1)
26510 .n(4)
26511 .k(8)
26512 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080026513 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026514 }
26515
26516 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_eq_8_subtile) {
26517 TEST_REQUIRES_X86_AVX;
Zhi An Ng83844ae2022-01-14 09:52:25 -080026518 for (uint32_t n = 1; n <= 4; n++) {
26519 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026520 GemmMicrokernelTester()
26521 .mr(1)
26522 .nr(4)
26523 .kr(2)
26524 .sr(1)
26525 .m(m)
26526 .n(n)
26527 .k(8)
26528 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080026529 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026530 }
26531 }
26532 }
26533
26534 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_eq_8_subtile_m) {
26535 TEST_REQUIRES_X86_AVX;
26536 for (uint32_t m = 1; m <= 1; m++) {
26537 GemmMicrokernelTester()
26538 .mr(1)
26539 .nr(4)
26540 .kr(2)
26541 .sr(1)
26542 .m(m)
26543 .n(4)
26544 .k(8)
26545 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080026546 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026547 }
26548 }
26549
26550 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_eq_8_subtile_n) {
26551 TEST_REQUIRES_X86_AVX;
26552 for (uint32_t n = 1; n <= 4; n++) {
26553 GemmMicrokernelTester()
26554 .mr(1)
26555 .nr(4)
26556 .kr(2)
26557 .sr(1)
26558 .m(1)
26559 .n(n)
26560 .k(8)
26561 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080026562 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026563 }
26564 }
26565
26566 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_lt_8) {
26567 TEST_REQUIRES_X86_AVX;
26568 for (size_t k = 1; k < 8; k++) {
26569 GemmMicrokernelTester()
26570 .mr(1)
26571 .nr(4)
26572 .kr(2)
26573 .sr(1)
26574 .m(1)
26575 .n(4)
26576 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080026577 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026578 }
26579 }
26580
26581 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_lt_8_strided_a) {
26582 TEST_REQUIRES_X86_AVX;
26583 for (size_t k = 1; k < 8; k++) {
26584 GemmMicrokernelTester()
26585 .mr(1)
26586 .nr(4)
26587 .kr(2)
26588 .sr(1)
26589 .m(1)
26590 .n(4)
26591 .k(k)
26592 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080026593 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026594 }
26595 }
26596
26597 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_lt_8_subtile) {
26598 TEST_REQUIRES_X86_AVX;
26599 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080026600 for (uint32_t n = 1; n <= 4; n++) {
26601 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026602 GemmMicrokernelTester()
26603 .mr(1)
26604 .nr(4)
26605 .kr(2)
26606 .sr(1)
26607 .m(m)
26608 .n(n)
26609 .k(k)
26610 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080026611 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026612 }
26613 }
26614 }
26615 }
26616
26617 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_gt_8) {
26618 TEST_REQUIRES_X86_AVX;
26619 for (size_t k = 9; k < 16; k++) {
26620 GemmMicrokernelTester()
26621 .mr(1)
26622 .nr(4)
26623 .kr(2)
26624 .sr(1)
26625 .m(1)
26626 .n(4)
26627 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080026628 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026629 }
26630 }
26631
26632 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_gt_8_strided_a) {
26633 TEST_REQUIRES_X86_AVX;
26634 for (size_t k = 9; k < 16; k++) {
26635 GemmMicrokernelTester()
26636 .mr(1)
26637 .nr(4)
26638 .kr(2)
26639 .sr(1)
26640 .m(1)
26641 .n(4)
26642 .k(k)
26643 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080026644 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026645 }
26646 }
26647
26648 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_gt_8_subtile) {
26649 TEST_REQUIRES_X86_AVX;
26650 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080026651 for (uint32_t n = 1; n <= 4; n++) {
26652 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026653 GemmMicrokernelTester()
26654 .mr(1)
26655 .nr(4)
26656 .kr(2)
26657 .sr(1)
26658 .m(m)
26659 .n(n)
26660 .k(k)
26661 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080026662 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026663 }
26664 }
26665 }
26666 }
26667
26668 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_div_8) {
26669 TEST_REQUIRES_X86_AVX;
26670 for (size_t k = 16; k <= 80; k += 8) {
26671 GemmMicrokernelTester()
26672 .mr(1)
26673 .nr(4)
26674 .kr(2)
26675 .sr(1)
26676 .m(1)
26677 .n(4)
26678 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080026679 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026680 }
26681 }
26682
26683 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_div_8_strided_a) {
26684 TEST_REQUIRES_X86_AVX;
26685 for (size_t k = 16; k <= 80; k += 8) {
26686 GemmMicrokernelTester()
26687 .mr(1)
26688 .nr(4)
26689 .kr(2)
26690 .sr(1)
26691 .m(1)
26692 .n(4)
26693 .k(k)
26694 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080026695 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026696 }
26697 }
26698
26699 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_div_8_subtile) {
26700 TEST_REQUIRES_X86_AVX;
26701 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080026702 for (uint32_t n = 1; n <= 4; n++) {
26703 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026704 GemmMicrokernelTester()
26705 .mr(1)
26706 .nr(4)
26707 .kr(2)
26708 .sr(1)
26709 .m(m)
26710 .n(n)
26711 .k(k)
26712 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080026713 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026714 }
26715 }
26716 }
26717 }
26718
26719 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, n_gt_4) {
26720 TEST_REQUIRES_X86_AVX;
26721 for (uint32_t n = 5; n < 8; n++) {
26722 for (size_t k = 1; k <= 40; k += 9) {
26723 GemmMicrokernelTester()
26724 .mr(1)
26725 .nr(4)
26726 .kr(2)
26727 .sr(1)
26728 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080026729 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026730 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080026731 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026732 }
26733 }
26734 }
26735
26736 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, n_gt_4_strided_cn) {
26737 TEST_REQUIRES_X86_AVX;
26738 for (uint32_t n = 5; n < 8; n++) {
26739 for (size_t k = 1; k <= 40; k += 9) {
26740 GemmMicrokernelTester()
26741 .mr(1)
26742 .nr(4)
26743 .kr(2)
26744 .sr(1)
26745 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080026746 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026747 .k(k)
26748 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080026749 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026750 }
26751 }
26752 }
26753
26754 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, n_gt_4_strided_a) {
26755 TEST_REQUIRES_X86_AVX;
26756 for (uint32_t n = 5; n < 8; n++) {
26757 for (size_t k = 1; k <= 40; k += 9) {
26758 GemmMicrokernelTester()
26759 .mr(1)
26760 .nr(4)
26761 .kr(2)
26762 .sr(1)
26763 .m(1)
26764 .n(n)
26765 .k(k)
26766 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080026767 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026768 }
26769 }
26770 }
26771
26772 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, n_gt_4_subtile) {
26773 TEST_REQUIRES_X86_AVX;
26774 for (uint32_t n = 5; n < 8; n++) {
26775 for (size_t k = 1; k <= 40; k += 9) {
26776 for (uint32_t m = 1; m <= 1; m++) {
26777 GemmMicrokernelTester()
26778 .mr(1)
26779 .nr(4)
26780 .kr(2)
26781 .sr(1)
26782 .m(m)
26783 .n(n)
26784 .k(k)
26785 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080026786 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026787 }
26788 }
26789 }
26790 }
26791
26792 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, n_div_4) {
26793 TEST_REQUIRES_X86_AVX;
26794 for (uint32_t n = 8; n <= 12; n += 4) {
26795 for (size_t k = 1; k <= 40; k += 9) {
26796 GemmMicrokernelTester()
26797 .mr(1)
26798 .nr(4)
26799 .kr(2)
26800 .sr(1)
26801 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080026802 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026803 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080026804 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026805 }
26806 }
26807 }
26808
26809 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, n_div_4_strided_cn) {
26810 TEST_REQUIRES_X86_AVX;
26811 for (uint32_t n = 8; n <= 12; n += 4) {
26812 for (size_t k = 1; k <= 40; k += 9) {
26813 GemmMicrokernelTester()
26814 .mr(1)
26815 .nr(4)
26816 .kr(2)
26817 .sr(1)
26818 .m(1)
26819 .n(n)
26820 .k(k)
26821 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080026822 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026823 }
26824 }
26825 }
26826
26827 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, n_div_4_strided_a) {
26828 TEST_REQUIRES_X86_AVX;
26829 for (uint32_t n = 8; n <= 12; n += 4) {
26830 for (size_t k = 1; k <= 40; k += 9) {
26831 GemmMicrokernelTester()
26832 .mr(1)
26833 .nr(4)
26834 .kr(2)
26835 .sr(1)
26836 .m(1)
26837 .n(n)
26838 .k(k)
26839 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080026840 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026841 }
26842 }
26843 }
26844
26845 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, n_div_4_subtile) {
26846 TEST_REQUIRES_X86_AVX;
26847 for (uint32_t n = 8; n <= 12; n += 4) {
26848 for (size_t k = 1; k <= 40; k += 9) {
26849 for (uint32_t m = 1; m <= 1; m++) {
26850 GemmMicrokernelTester()
26851 .mr(1)
26852 .nr(4)
26853 .kr(2)
26854 .sr(1)
26855 .m(m)
26856 .n(n)
26857 .k(k)
26858 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080026859 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026860 }
26861 }
26862 }
26863 }
26864
26865 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, strided_cm_subtile) {
26866 TEST_REQUIRES_X86_AVX;
26867 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080026868 for (uint32_t n = 1; n <= 4; n++) {
26869 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026870 GemmMicrokernelTester()
26871 .mr(1)
26872 .nr(4)
26873 .kr(2)
26874 .sr(1)
26875 .m(m)
26876 .n(n)
26877 .k(k)
26878 .cm_stride(7)
26879 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080026880 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026881 }
26882 }
26883 }
26884 }
26885
26886 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, qmin) {
26887 TEST_REQUIRES_X86_AVX;
26888 GemmMicrokernelTester()
26889 .mr(1)
26890 .nr(4)
26891 .kr(2)
26892 .sr(1)
26893 .m(1)
26894 .n(4)
26895 .k(8)
26896 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080026897 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026898 }
26899
26900 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, qmax) {
26901 TEST_REQUIRES_X86_AVX;
26902 GemmMicrokernelTester()
26903 .mr(1)
26904 .nr(4)
26905 .kr(2)
26906 .sr(1)
26907 .m(1)
26908 .n(4)
26909 .k(8)
26910 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080026911 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026912 }
26913
26914 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, strided_cm) {
26915 TEST_REQUIRES_X86_AVX;
26916 GemmMicrokernelTester()
26917 .mr(1)
26918 .nr(4)
26919 .kr(2)
26920 .sr(1)
26921 .m(1)
26922 .n(4)
26923 .k(8)
26924 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080026925 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026926 }
26927#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
26928
26929
26930#if XNN_ARCH_X86 || XNN_ARCH_X86_64
26931 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_eq_8) {
26932 TEST_REQUIRES_X86_XOP;
26933 GemmMicrokernelTester()
26934 .mr(1)
26935 .nr(4)
26936 .kr(2)
26937 .sr(1)
26938 .m(1)
26939 .n(4)
26940 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080026941 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026942 }
26943
26944 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, strided_cn) {
26945 TEST_REQUIRES_X86_XOP;
26946 GemmMicrokernelTester()
26947 .mr(1)
26948 .nr(4)
26949 .kr(2)
26950 .sr(1)
26951 .m(1)
26952 .n(4)
26953 .k(8)
26954 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080026955 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026956 }
26957
26958 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_eq_8_strided_a) {
26959 TEST_REQUIRES_X86_XOP;
26960 GemmMicrokernelTester()
26961 .mr(1)
26962 .nr(4)
26963 .kr(2)
26964 .sr(1)
26965 .m(1)
26966 .n(4)
26967 .k(8)
26968 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080026969 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026970 }
26971
26972 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_eq_8_subtile) {
26973 TEST_REQUIRES_X86_XOP;
Zhi An Ng83844ae2022-01-14 09:52:25 -080026974 for (uint32_t n = 1; n <= 4; n++) {
26975 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026976 GemmMicrokernelTester()
26977 .mr(1)
26978 .nr(4)
26979 .kr(2)
26980 .sr(1)
26981 .m(m)
26982 .n(n)
26983 .k(8)
26984 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080026985 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026986 }
26987 }
26988 }
26989
26990 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_eq_8_subtile_m) {
26991 TEST_REQUIRES_X86_XOP;
26992 for (uint32_t m = 1; m <= 1; m++) {
26993 GemmMicrokernelTester()
26994 .mr(1)
26995 .nr(4)
26996 .kr(2)
26997 .sr(1)
26998 .m(m)
26999 .n(4)
27000 .k(8)
27001 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080027002 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027003 }
27004 }
27005
27006 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_eq_8_subtile_n) {
27007 TEST_REQUIRES_X86_XOP;
27008 for (uint32_t n = 1; n <= 4; n++) {
27009 GemmMicrokernelTester()
27010 .mr(1)
27011 .nr(4)
27012 .kr(2)
27013 .sr(1)
27014 .m(1)
27015 .n(n)
27016 .k(8)
27017 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080027018 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027019 }
27020 }
27021
27022 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_lt_8) {
27023 TEST_REQUIRES_X86_XOP;
27024 for (size_t k = 1; k < 8; k++) {
27025 GemmMicrokernelTester()
27026 .mr(1)
27027 .nr(4)
27028 .kr(2)
27029 .sr(1)
27030 .m(1)
27031 .n(4)
27032 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080027033 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027034 }
27035 }
27036
27037 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_lt_8_strided_a) {
27038 TEST_REQUIRES_X86_XOP;
27039 for (size_t k = 1; k < 8; k++) {
27040 GemmMicrokernelTester()
27041 .mr(1)
27042 .nr(4)
27043 .kr(2)
27044 .sr(1)
27045 .m(1)
27046 .n(4)
27047 .k(k)
27048 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080027049 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027050 }
27051 }
27052
27053 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_lt_8_subtile) {
27054 TEST_REQUIRES_X86_XOP;
27055 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080027056 for (uint32_t n = 1; n <= 4; n++) {
27057 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027058 GemmMicrokernelTester()
27059 .mr(1)
27060 .nr(4)
27061 .kr(2)
27062 .sr(1)
27063 .m(m)
27064 .n(n)
27065 .k(k)
27066 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080027067 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027068 }
27069 }
27070 }
27071 }
27072
27073 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_gt_8) {
27074 TEST_REQUIRES_X86_XOP;
27075 for (size_t k = 9; k < 16; k++) {
27076 GemmMicrokernelTester()
27077 .mr(1)
27078 .nr(4)
27079 .kr(2)
27080 .sr(1)
27081 .m(1)
27082 .n(4)
27083 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080027084 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027085 }
27086 }
27087
27088 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_gt_8_strided_a) {
27089 TEST_REQUIRES_X86_XOP;
27090 for (size_t k = 9; k < 16; k++) {
27091 GemmMicrokernelTester()
27092 .mr(1)
27093 .nr(4)
27094 .kr(2)
27095 .sr(1)
27096 .m(1)
27097 .n(4)
27098 .k(k)
27099 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080027100 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027101 }
27102 }
27103
27104 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_gt_8_subtile) {
27105 TEST_REQUIRES_X86_XOP;
27106 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080027107 for (uint32_t n = 1; n <= 4; n++) {
27108 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027109 GemmMicrokernelTester()
27110 .mr(1)
27111 .nr(4)
27112 .kr(2)
27113 .sr(1)
27114 .m(m)
27115 .n(n)
27116 .k(k)
27117 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080027118 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027119 }
27120 }
27121 }
27122 }
27123
27124 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_div_8) {
27125 TEST_REQUIRES_X86_XOP;
27126 for (size_t k = 16; k <= 80; k += 8) {
27127 GemmMicrokernelTester()
27128 .mr(1)
27129 .nr(4)
27130 .kr(2)
27131 .sr(1)
27132 .m(1)
27133 .n(4)
27134 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080027135 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027136 }
27137 }
27138
27139 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_div_8_strided_a) {
27140 TEST_REQUIRES_X86_XOP;
27141 for (size_t k = 16; k <= 80; k += 8) {
27142 GemmMicrokernelTester()
27143 .mr(1)
27144 .nr(4)
27145 .kr(2)
27146 .sr(1)
27147 .m(1)
27148 .n(4)
27149 .k(k)
27150 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080027151 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027152 }
27153 }
27154
27155 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_div_8_subtile) {
27156 TEST_REQUIRES_X86_XOP;
27157 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080027158 for (uint32_t n = 1; n <= 4; n++) {
27159 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027160 GemmMicrokernelTester()
27161 .mr(1)
27162 .nr(4)
27163 .kr(2)
27164 .sr(1)
27165 .m(m)
27166 .n(n)
27167 .k(k)
27168 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080027169 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027170 }
27171 }
27172 }
27173 }
27174
27175 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, n_gt_4) {
27176 TEST_REQUIRES_X86_XOP;
27177 for (uint32_t n = 5; n < 8; n++) {
27178 for (size_t k = 1; k <= 40; k += 9) {
27179 GemmMicrokernelTester()
27180 .mr(1)
27181 .nr(4)
27182 .kr(2)
27183 .sr(1)
27184 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080027185 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027186 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080027187 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027188 }
27189 }
27190 }
27191
27192 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, n_gt_4_strided_cn) {
27193 TEST_REQUIRES_X86_XOP;
27194 for (uint32_t n = 5; n < 8; n++) {
27195 for (size_t k = 1; k <= 40; k += 9) {
27196 GemmMicrokernelTester()
27197 .mr(1)
27198 .nr(4)
27199 .kr(2)
27200 .sr(1)
27201 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080027202 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027203 .k(k)
27204 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080027205 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027206 }
27207 }
27208 }
27209
27210 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, n_gt_4_strided_a) {
27211 TEST_REQUIRES_X86_XOP;
27212 for (uint32_t n = 5; n < 8; n++) {
27213 for (size_t k = 1; k <= 40; k += 9) {
27214 GemmMicrokernelTester()
27215 .mr(1)
27216 .nr(4)
27217 .kr(2)
27218 .sr(1)
27219 .m(1)
27220 .n(n)
27221 .k(k)
27222 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080027223 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027224 }
27225 }
27226 }
27227
27228 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, n_gt_4_subtile) {
27229 TEST_REQUIRES_X86_XOP;
27230 for (uint32_t n = 5; n < 8; n++) {
27231 for (size_t k = 1; k <= 40; k += 9) {
27232 for (uint32_t m = 1; m <= 1; m++) {
27233 GemmMicrokernelTester()
27234 .mr(1)
27235 .nr(4)
27236 .kr(2)
27237 .sr(1)
27238 .m(m)
27239 .n(n)
27240 .k(k)
27241 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080027242 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027243 }
27244 }
27245 }
27246 }
27247
27248 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, n_div_4) {
27249 TEST_REQUIRES_X86_XOP;
27250 for (uint32_t n = 8; n <= 12; n += 4) {
27251 for (size_t k = 1; k <= 40; k += 9) {
27252 GemmMicrokernelTester()
27253 .mr(1)
27254 .nr(4)
27255 .kr(2)
27256 .sr(1)
27257 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080027258 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027259 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080027260 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027261 }
27262 }
27263 }
27264
27265 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, n_div_4_strided_cn) {
27266 TEST_REQUIRES_X86_XOP;
27267 for (uint32_t n = 8; n <= 12; n += 4) {
27268 for (size_t k = 1; k <= 40; k += 9) {
27269 GemmMicrokernelTester()
27270 .mr(1)
27271 .nr(4)
27272 .kr(2)
27273 .sr(1)
27274 .m(1)
27275 .n(n)
27276 .k(k)
27277 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080027278 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027279 }
27280 }
27281 }
27282
27283 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, n_div_4_strided_a) {
27284 TEST_REQUIRES_X86_XOP;
27285 for (uint32_t n = 8; n <= 12; n += 4) {
27286 for (size_t k = 1; k <= 40; k += 9) {
27287 GemmMicrokernelTester()
27288 .mr(1)
27289 .nr(4)
27290 .kr(2)
27291 .sr(1)
27292 .m(1)
27293 .n(n)
27294 .k(k)
27295 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080027296 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027297 }
27298 }
27299 }
27300
27301 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, n_div_4_subtile) {
27302 TEST_REQUIRES_X86_XOP;
27303 for (uint32_t n = 8; n <= 12; n += 4) {
27304 for (size_t k = 1; k <= 40; k += 9) {
27305 for (uint32_t m = 1; m <= 1; m++) {
27306 GemmMicrokernelTester()
27307 .mr(1)
27308 .nr(4)
27309 .kr(2)
27310 .sr(1)
27311 .m(m)
27312 .n(n)
27313 .k(k)
27314 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080027315 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027316 }
27317 }
27318 }
27319 }
27320
27321 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, strided_cm_subtile) {
27322 TEST_REQUIRES_X86_XOP;
27323 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080027324 for (uint32_t n = 1; n <= 4; n++) {
27325 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027326 GemmMicrokernelTester()
27327 .mr(1)
27328 .nr(4)
27329 .kr(2)
27330 .sr(1)
27331 .m(m)
27332 .n(n)
27333 .k(k)
27334 .cm_stride(7)
27335 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080027336 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027337 }
27338 }
27339 }
27340 }
27341
27342 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, qmin) {
27343 TEST_REQUIRES_X86_XOP;
27344 GemmMicrokernelTester()
27345 .mr(1)
27346 .nr(4)
27347 .kr(2)
27348 .sr(1)
27349 .m(1)
27350 .n(4)
27351 .k(8)
27352 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080027353 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027354 }
27355
27356 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, qmax) {
27357 TEST_REQUIRES_X86_XOP;
27358 GemmMicrokernelTester()
27359 .mr(1)
27360 .nr(4)
27361 .kr(2)
27362 .sr(1)
27363 .m(1)
27364 .n(4)
27365 .k(8)
27366 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080027367 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027368 }
27369
27370 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, strided_cm) {
27371 TEST_REQUIRES_X86_XOP;
27372 GemmMicrokernelTester()
27373 .mr(1)
27374 .nr(4)
27375 .kr(2)
27376 .sr(1)
27377 .m(1)
27378 .n(4)
27379 .k(8)
27380 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080027381 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027382 }
27383#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
27384
27385
27386#if XNN_ARCH_X86 || XNN_ARCH_X86_64
27387 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_eq_8) {
27388 TEST_REQUIRES_X86_XOP;
27389 GemmMicrokernelTester()
27390 .mr(3)
27391 .nr(4)
27392 .kr(2)
27393 .sr(1)
27394 .m(3)
27395 .n(4)
27396 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080027397 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027398 }
27399
27400 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, strided_cn) {
27401 TEST_REQUIRES_X86_XOP;
27402 GemmMicrokernelTester()
27403 .mr(3)
27404 .nr(4)
27405 .kr(2)
27406 .sr(1)
27407 .m(3)
27408 .n(4)
27409 .k(8)
27410 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080027411 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027412 }
27413
27414 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_eq_8_strided_a) {
27415 TEST_REQUIRES_X86_XOP;
27416 GemmMicrokernelTester()
27417 .mr(3)
27418 .nr(4)
27419 .kr(2)
27420 .sr(1)
27421 .m(3)
27422 .n(4)
27423 .k(8)
27424 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080027425 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027426 }
27427
27428 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_eq_8_subtile) {
27429 TEST_REQUIRES_X86_XOP;
Zhi An Ng83844ae2022-01-14 09:52:25 -080027430 for (uint32_t n = 1; n <= 4; n++) {
27431 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027432 GemmMicrokernelTester()
27433 .mr(3)
27434 .nr(4)
27435 .kr(2)
27436 .sr(1)
27437 .m(m)
27438 .n(n)
27439 .k(8)
27440 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080027441 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027442 }
27443 }
27444 }
27445
27446 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_eq_8_subtile_m) {
27447 TEST_REQUIRES_X86_XOP;
27448 for (uint32_t m = 1; m <= 3; m++) {
27449 GemmMicrokernelTester()
27450 .mr(3)
27451 .nr(4)
27452 .kr(2)
27453 .sr(1)
27454 .m(m)
27455 .n(4)
27456 .k(8)
27457 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080027458 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027459 }
27460 }
27461
27462 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_eq_8_subtile_n) {
27463 TEST_REQUIRES_X86_XOP;
27464 for (uint32_t n = 1; n <= 4; n++) {
27465 GemmMicrokernelTester()
27466 .mr(3)
27467 .nr(4)
27468 .kr(2)
27469 .sr(1)
27470 .m(3)
27471 .n(n)
27472 .k(8)
27473 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080027474 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027475 }
27476 }
27477
27478 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_lt_8) {
27479 TEST_REQUIRES_X86_XOP;
27480 for (size_t k = 1; k < 8; k++) {
27481 GemmMicrokernelTester()
27482 .mr(3)
27483 .nr(4)
27484 .kr(2)
27485 .sr(1)
27486 .m(3)
27487 .n(4)
27488 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080027489 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027490 }
27491 }
27492
27493 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_lt_8_strided_a) {
27494 TEST_REQUIRES_X86_XOP;
27495 for (size_t k = 1; k < 8; k++) {
27496 GemmMicrokernelTester()
27497 .mr(3)
27498 .nr(4)
27499 .kr(2)
27500 .sr(1)
27501 .m(3)
27502 .n(4)
27503 .k(k)
27504 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080027505 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027506 }
27507 }
27508
27509 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_lt_8_subtile) {
27510 TEST_REQUIRES_X86_XOP;
27511 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080027512 for (uint32_t n = 1; n <= 4; n++) {
27513 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027514 GemmMicrokernelTester()
27515 .mr(3)
27516 .nr(4)
27517 .kr(2)
27518 .sr(1)
27519 .m(m)
27520 .n(n)
27521 .k(k)
27522 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080027523 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027524 }
27525 }
27526 }
27527 }
27528
27529 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_gt_8) {
27530 TEST_REQUIRES_X86_XOP;
27531 for (size_t k = 9; k < 16; k++) {
27532 GemmMicrokernelTester()
27533 .mr(3)
27534 .nr(4)
27535 .kr(2)
27536 .sr(1)
27537 .m(3)
27538 .n(4)
27539 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080027540 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027541 }
27542 }
27543
27544 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_gt_8_strided_a) {
27545 TEST_REQUIRES_X86_XOP;
27546 for (size_t k = 9; k < 16; k++) {
27547 GemmMicrokernelTester()
27548 .mr(3)
27549 .nr(4)
27550 .kr(2)
27551 .sr(1)
27552 .m(3)
27553 .n(4)
27554 .k(k)
27555 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080027556 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027557 }
27558 }
27559
27560 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_gt_8_subtile) {
27561 TEST_REQUIRES_X86_XOP;
27562 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080027563 for (uint32_t n = 1; n <= 4; n++) {
27564 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027565 GemmMicrokernelTester()
27566 .mr(3)
27567 .nr(4)
27568 .kr(2)
27569 .sr(1)
27570 .m(m)
27571 .n(n)
27572 .k(k)
27573 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080027574 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027575 }
27576 }
27577 }
27578 }
27579
27580 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_div_8) {
27581 TEST_REQUIRES_X86_XOP;
27582 for (size_t k = 16; k <= 80; k += 8) {
27583 GemmMicrokernelTester()
27584 .mr(3)
27585 .nr(4)
27586 .kr(2)
27587 .sr(1)
27588 .m(3)
27589 .n(4)
27590 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080027591 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027592 }
27593 }
27594
27595 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_div_8_strided_a) {
27596 TEST_REQUIRES_X86_XOP;
27597 for (size_t k = 16; k <= 80; k += 8) {
27598 GemmMicrokernelTester()
27599 .mr(3)
27600 .nr(4)
27601 .kr(2)
27602 .sr(1)
27603 .m(3)
27604 .n(4)
27605 .k(k)
27606 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080027607 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027608 }
27609 }
27610
27611 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_div_8_subtile) {
27612 TEST_REQUIRES_X86_XOP;
27613 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080027614 for (uint32_t n = 1; n <= 4; n++) {
27615 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027616 GemmMicrokernelTester()
27617 .mr(3)
27618 .nr(4)
27619 .kr(2)
27620 .sr(1)
27621 .m(m)
27622 .n(n)
27623 .k(k)
27624 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080027625 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027626 }
27627 }
27628 }
27629 }
27630
27631 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, n_gt_4) {
27632 TEST_REQUIRES_X86_XOP;
27633 for (uint32_t n = 5; n < 8; n++) {
27634 for (size_t k = 1; k <= 40; k += 9) {
27635 GemmMicrokernelTester()
27636 .mr(3)
27637 .nr(4)
27638 .kr(2)
27639 .sr(1)
27640 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080027641 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027642 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080027643 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027644 }
27645 }
27646 }
27647
27648 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, n_gt_4_strided_cn) {
27649 TEST_REQUIRES_X86_XOP;
27650 for (uint32_t n = 5; n < 8; n++) {
27651 for (size_t k = 1; k <= 40; k += 9) {
27652 GemmMicrokernelTester()
27653 .mr(3)
27654 .nr(4)
27655 .kr(2)
27656 .sr(1)
27657 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080027658 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027659 .k(k)
27660 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080027661 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027662 }
27663 }
27664 }
27665
27666 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, n_gt_4_strided_a) {
27667 TEST_REQUIRES_X86_XOP;
27668 for (uint32_t n = 5; n < 8; n++) {
27669 for (size_t k = 1; k <= 40; k += 9) {
27670 GemmMicrokernelTester()
27671 .mr(3)
27672 .nr(4)
27673 .kr(2)
27674 .sr(1)
27675 .m(3)
27676 .n(n)
27677 .k(k)
27678 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080027679 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027680 }
27681 }
27682 }
27683
27684 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, n_gt_4_subtile) {
27685 TEST_REQUIRES_X86_XOP;
27686 for (uint32_t n = 5; n < 8; n++) {
27687 for (size_t k = 1; k <= 40; k += 9) {
27688 for (uint32_t m = 1; m <= 3; m++) {
27689 GemmMicrokernelTester()
27690 .mr(3)
27691 .nr(4)
27692 .kr(2)
27693 .sr(1)
27694 .m(m)
27695 .n(n)
27696 .k(k)
27697 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080027698 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027699 }
27700 }
27701 }
27702 }
27703
27704 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, n_div_4) {
27705 TEST_REQUIRES_X86_XOP;
27706 for (uint32_t n = 8; n <= 12; n += 4) {
27707 for (size_t k = 1; k <= 40; k += 9) {
27708 GemmMicrokernelTester()
27709 .mr(3)
27710 .nr(4)
27711 .kr(2)
27712 .sr(1)
27713 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080027714 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027715 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080027716 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027717 }
27718 }
27719 }
27720
27721 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, n_div_4_strided_cn) {
27722 TEST_REQUIRES_X86_XOP;
27723 for (uint32_t n = 8; n <= 12; n += 4) {
27724 for (size_t k = 1; k <= 40; k += 9) {
27725 GemmMicrokernelTester()
27726 .mr(3)
27727 .nr(4)
27728 .kr(2)
27729 .sr(1)
27730 .m(3)
27731 .n(n)
27732 .k(k)
27733 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080027734 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027735 }
27736 }
27737 }
27738
27739 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, n_div_4_strided_a) {
27740 TEST_REQUIRES_X86_XOP;
27741 for (uint32_t n = 8; n <= 12; n += 4) {
27742 for (size_t k = 1; k <= 40; k += 9) {
27743 GemmMicrokernelTester()
27744 .mr(3)
27745 .nr(4)
27746 .kr(2)
27747 .sr(1)
27748 .m(3)
27749 .n(n)
27750 .k(k)
27751 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080027752 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027753 }
27754 }
27755 }
27756
27757 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, n_div_4_subtile) {
27758 TEST_REQUIRES_X86_XOP;
27759 for (uint32_t n = 8; n <= 12; n += 4) {
27760 for (size_t k = 1; k <= 40; k += 9) {
27761 for (uint32_t m = 1; m <= 3; m++) {
27762 GemmMicrokernelTester()
27763 .mr(3)
27764 .nr(4)
27765 .kr(2)
27766 .sr(1)
27767 .m(m)
27768 .n(n)
27769 .k(k)
27770 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080027771 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027772 }
27773 }
27774 }
27775 }
27776
27777 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, strided_cm_subtile) {
27778 TEST_REQUIRES_X86_XOP;
27779 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080027780 for (uint32_t n = 1; n <= 4; n++) {
27781 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027782 GemmMicrokernelTester()
27783 .mr(3)
27784 .nr(4)
27785 .kr(2)
27786 .sr(1)
27787 .m(m)
27788 .n(n)
27789 .k(k)
27790 .cm_stride(7)
27791 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080027792 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027793 }
27794 }
27795 }
27796 }
27797
27798 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, qmin) {
27799 TEST_REQUIRES_X86_XOP;
27800 GemmMicrokernelTester()
27801 .mr(3)
27802 .nr(4)
27803 .kr(2)
27804 .sr(1)
27805 .m(3)
27806 .n(4)
27807 .k(8)
27808 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080027809 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027810 }
27811
27812 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, qmax) {
27813 TEST_REQUIRES_X86_XOP;
27814 GemmMicrokernelTester()
27815 .mr(3)
27816 .nr(4)
27817 .kr(2)
27818 .sr(1)
27819 .m(3)
27820 .n(4)
27821 .k(8)
27822 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080027823 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027824 }
27825
27826 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, strided_cm) {
27827 TEST_REQUIRES_X86_XOP;
27828 GemmMicrokernelTester()
27829 .mr(3)
27830 .nr(4)
27831 .kr(2)
27832 .sr(1)
27833 .m(3)
27834 .n(4)
27835 .k(8)
27836 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080027837 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027838 }
27839#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
27840
27841
27842#if XNN_ARCH_X86 || XNN_ARCH_X86_64
27843 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_eq_8) {
27844 TEST_REQUIRES_X86_XOP;
27845 GemmMicrokernelTester()
27846 .mr(4)
27847 .nr(4)
27848 .kr(2)
27849 .sr(1)
27850 .m(4)
27851 .n(4)
27852 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080027853 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027854 }
27855
27856 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, strided_cn) {
27857 TEST_REQUIRES_X86_XOP;
27858 GemmMicrokernelTester()
27859 .mr(4)
27860 .nr(4)
27861 .kr(2)
27862 .sr(1)
27863 .m(4)
27864 .n(4)
27865 .k(8)
27866 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080027867 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027868 }
27869
27870 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_eq_8_strided_a) {
27871 TEST_REQUIRES_X86_XOP;
27872 GemmMicrokernelTester()
27873 .mr(4)
27874 .nr(4)
27875 .kr(2)
27876 .sr(1)
27877 .m(4)
27878 .n(4)
27879 .k(8)
27880 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080027881 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027882 }
27883
27884 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_eq_8_subtile) {
27885 TEST_REQUIRES_X86_XOP;
Zhi An Ng83844ae2022-01-14 09:52:25 -080027886 for (uint32_t n = 1; n <= 4; n++) {
27887 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027888 GemmMicrokernelTester()
27889 .mr(4)
27890 .nr(4)
27891 .kr(2)
27892 .sr(1)
27893 .m(m)
27894 .n(n)
27895 .k(8)
27896 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080027897 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027898 }
27899 }
27900 }
27901
27902 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_eq_8_subtile_m) {
27903 TEST_REQUIRES_X86_XOP;
27904 for (uint32_t m = 1; m <= 4; m++) {
27905 GemmMicrokernelTester()
27906 .mr(4)
27907 .nr(4)
27908 .kr(2)
27909 .sr(1)
27910 .m(m)
27911 .n(4)
27912 .k(8)
27913 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080027914 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027915 }
27916 }
27917
27918 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_eq_8_subtile_n) {
27919 TEST_REQUIRES_X86_XOP;
27920 for (uint32_t n = 1; n <= 4; n++) {
27921 GemmMicrokernelTester()
27922 .mr(4)
27923 .nr(4)
27924 .kr(2)
27925 .sr(1)
27926 .m(4)
27927 .n(n)
27928 .k(8)
27929 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080027930 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027931 }
27932 }
27933
27934 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_lt_8) {
27935 TEST_REQUIRES_X86_XOP;
27936 for (size_t k = 1; k < 8; k++) {
27937 GemmMicrokernelTester()
27938 .mr(4)
27939 .nr(4)
27940 .kr(2)
27941 .sr(1)
27942 .m(4)
27943 .n(4)
27944 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080027945 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027946 }
27947 }
27948
27949 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_lt_8_strided_a) {
27950 TEST_REQUIRES_X86_XOP;
27951 for (size_t k = 1; k < 8; k++) {
27952 GemmMicrokernelTester()
27953 .mr(4)
27954 .nr(4)
27955 .kr(2)
27956 .sr(1)
27957 .m(4)
27958 .n(4)
27959 .k(k)
27960 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080027961 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027962 }
27963 }
27964
27965 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_lt_8_subtile) {
27966 TEST_REQUIRES_X86_XOP;
27967 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080027968 for (uint32_t n = 1; n <= 4; n++) {
27969 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027970 GemmMicrokernelTester()
27971 .mr(4)
27972 .nr(4)
27973 .kr(2)
27974 .sr(1)
27975 .m(m)
27976 .n(n)
27977 .k(k)
27978 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080027979 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027980 }
27981 }
27982 }
27983 }
27984
27985 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_gt_8) {
27986 TEST_REQUIRES_X86_XOP;
27987 for (size_t k = 9; k < 16; k++) {
27988 GemmMicrokernelTester()
27989 .mr(4)
27990 .nr(4)
27991 .kr(2)
27992 .sr(1)
27993 .m(4)
27994 .n(4)
27995 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080027996 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027997 }
27998 }
27999
28000 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_gt_8_strided_a) {
28001 TEST_REQUIRES_X86_XOP;
28002 for (size_t k = 9; k < 16; k++) {
28003 GemmMicrokernelTester()
28004 .mr(4)
28005 .nr(4)
28006 .kr(2)
28007 .sr(1)
28008 .m(4)
28009 .n(4)
28010 .k(k)
28011 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080028012 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028013 }
28014 }
28015
28016 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_gt_8_subtile) {
28017 TEST_REQUIRES_X86_XOP;
28018 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080028019 for (uint32_t n = 1; n <= 4; n++) {
28020 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028021 GemmMicrokernelTester()
28022 .mr(4)
28023 .nr(4)
28024 .kr(2)
28025 .sr(1)
28026 .m(m)
28027 .n(n)
28028 .k(k)
28029 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080028030 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028031 }
28032 }
28033 }
28034 }
28035
28036 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_div_8) {
28037 TEST_REQUIRES_X86_XOP;
28038 for (size_t k = 16; k <= 80; k += 8) {
28039 GemmMicrokernelTester()
28040 .mr(4)
28041 .nr(4)
28042 .kr(2)
28043 .sr(1)
28044 .m(4)
28045 .n(4)
28046 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080028047 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028048 }
28049 }
28050
28051 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_div_8_strided_a) {
28052 TEST_REQUIRES_X86_XOP;
28053 for (size_t k = 16; k <= 80; k += 8) {
28054 GemmMicrokernelTester()
28055 .mr(4)
28056 .nr(4)
28057 .kr(2)
28058 .sr(1)
28059 .m(4)
28060 .n(4)
28061 .k(k)
28062 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080028063 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028064 }
28065 }
28066
28067 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_div_8_subtile) {
28068 TEST_REQUIRES_X86_XOP;
28069 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080028070 for (uint32_t n = 1; n <= 4; n++) {
28071 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028072 GemmMicrokernelTester()
28073 .mr(4)
28074 .nr(4)
28075 .kr(2)
28076 .sr(1)
28077 .m(m)
28078 .n(n)
28079 .k(k)
28080 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080028081 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028082 }
28083 }
28084 }
28085 }
28086
28087 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, n_gt_4) {
28088 TEST_REQUIRES_X86_XOP;
28089 for (uint32_t n = 5; n < 8; n++) {
28090 for (size_t k = 1; k <= 40; k += 9) {
28091 GemmMicrokernelTester()
28092 .mr(4)
28093 .nr(4)
28094 .kr(2)
28095 .sr(1)
28096 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080028097 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028098 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080028099 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028100 }
28101 }
28102 }
28103
28104 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, n_gt_4_strided_cn) {
28105 TEST_REQUIRES_X86_XOP;
28106 for (uint32_t n = 5; n < 8; n++) {
28107 for (size_t k = 1; k <= 40; k += 9) {
28108 GemmMicrokernelTester()
28109 .mr(4)
28110 .nr(4)
28111 .kr(2)
28112 .sr(1)
28113 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080028114 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028115 .k(k)
28116 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080028117 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028118 }
28119 }
28120 }
28121
28122 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, n_gt_4_strided_a) {
28123 TEST_REQUIRES_X86_XOP;
28124 for (uint32_t n = 5; n < 8; n++) {
28125 for (size_t k = 1; k <= 40; k += 9) {
28126 GemmMicrokernelTester()
28127 .mr(4)
28128 .nr(4)
28129 .kr(2)
28130 .sr(1)
28131 .m(4)
28132 .n(n)
28133 .k(k)
28134 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080028135 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028136 }
28137 }
28138 }
28139
28140 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, n_gt_4_subtile) {
28141 TEST_REQUIRES_X86_XOP;
28142 for (uint32_t n = 5; n < 8; n++) {
28143 for (size_t k = 1; k <= 40; k += 9) {
28144 for (uint32_t m = 1; m <= 4; m++) {
28145 GemmMicrokernelTester()
28146 .mr(4)
28147 .nr(4)
28148 .kr(2)
28149 .sr(1)
28150 .m(m)
28151 .n(n)
28152 .k(k)
28153 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080028154 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028155 }
28156 }
28157 }
28158 }
28159
28160 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, n_div_4) {
28161 TEST_REQUIRES_X86_XOP;
28162 for (uint32_t n = 8; n <= 12; n += 4) {
28163 for (size_t k = 1; k <= 40; k += 9) {
28164 GemmMicrokernelTester()
28165 .mr(4)
28166 .nr(4)
28167 .kr(2)
28168 .sr(1)
28169 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080028170 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028171 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080028172 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028173 }
28174 }
28175 }
28176
28177 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, n_div_4_strided_cn) {
28178 TEST_REQUIRES_X86_XOP;
28179 for (uint32_t n = 8; n <= 12; n += 4) {
28180 for (size_t k = 1; k <= 40; k += 9) {
28181 GemmMicrokernelTester()
28182 .mr(4)
28183 .nr(4)
28184 .kr(2)
28185 .sr(1)
28186 .m(4)
28187 .n(n)
28188 .k(k)
28189 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080028190 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028191 }
28192 }
28193 }
28194
28195 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, n_div_4_strided_a) {
28196 TEST_REQUIRES_X86_XOP;
28197 for (uint32_t n = 8; n <= 12; n += 4) {
28198 for (size_t k = 1; k <= 40; k += 9) {
28199 GemmMicrokernelTester()
28200 .mr(4)
28201 .nr(4)
28202 .kr(2)
28203 .sr(1)
28204 .m(4)
28205 .n(n)
28206 .k(k)
28207 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080028208 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028209 }
28210 }
28211 }
28212
28213 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, n_div_4_subtile) {
28214 TEST_REQUIRES_X86_XOP;
28215 for (uint32_t n = 8; n <= 12; n += 4) {
28216 for (size_t k = 1; k <= 40; k += 9) {
28217 for (uint32_t m = 1; m <= 4; m++) {
28218 GemmMicrokernelTester()
28219 .mr(4)
28220 .nr(4)
28221 .kr(2)
28222 .sr(1)
28223 .m(m)
28224 .n(n)
28225 .k(k)
28226 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080028227 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028228 }
28229 }
28230 }
28231 }
28232
28233 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, strided_cm_subtile) {
28234 TEST_REQUIRES_X86_XOP;
28235 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080028236 for (uint32_t n = 1; n <= 4; n++) {
28237 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028238 GemmMicrokernelTester()
28239 .mr(4)
28240 .nr(4)
28241 .kr(2)
28242 .sr(1)
28243 .m(m)
28244 .n(n)
28245 .k(k)
28246 .cm_stride(7)
28247 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080028248 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028249 }
28250 }
28251 }
28252 }
28253
28254 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, qmin) {
28255 TEST_REQUIRES_X86_XOP;
28256 GemmMicrokernelTester()
28257 .mr(4)
28258 .nr(4)
28259 .kr(2)
28260 .sr(1)
28261 .m(4)
28262 .n(4)
28263 .k(8)
28264 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080028265 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028266 }
28267
28268 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, qmax) {
28269 TEST_REQUIRES_X86_XOP;
28270 GemmMicrokernelTester()
28271 .mr(4)
28272 .nr(4)
28273 .kr(2)
28274 .sr(1)
28275 .m(4)
28276 .n(4)
28277 .k(8)
28278 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080028279 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028280 }
28281
28282 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, strided_cm) {
28283 TEST_REQUIRES_X86_XOP;
28284 GemmMicrokernelTester()
28285 .mr(4)
28286 .nr(4)
28287 .kr(2)
28288 .sr(1)
28289 .m(4)
28290 .n(4)
28291 .k(8)
28292 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080028293 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028294 }
28295#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
28296
28297
28298#if XNN_ARCH_X86 || XNN_ARCH_X86_64
28299 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_eq_8) {
28300 TEST_REQUIRES_X86_SSE2;
28301 GemmMicrokernelTester()
28302 .mr(1)
28303 .nr(4)
28304 .kr(8)
28305 .sr(1)
28306 .m(1)
28307 .n(4)
28308 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080028309 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028310 }
28311
28312 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, strided_cn) {
28313 TEST_REQUIRES_X86_SSE2;
28314 GemmMicrokernelTester()
28315 .mr(1)
28316 .nr(4)
28317 .kr(8)
28318 .sr(1)
28319 .m(1)
28320 .n(4)
28321 .k(8)
28322 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080028323 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028324 }
28325
28326 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_eq_8_strided_a) {
28327 TEST_REQUIRES_X86_SSE2;
28328 GemmMicrokernelTester()
28329 .mr(1)
28330 .nr(4)
28331 .kr(8)
28332 .sr(1)
28333 .m(1)
28334 .n(4)
28335 .k(8)
28336 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080028337 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028338 }
28339
28340 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_eq_8_subtile) {
28341 TEST_REQUIRES_X86_SSE2;
Zhi An Ng83844ae2022-01-14 09:52:25 -080028342 for (uint32_t n = 1; n <= 4; n++) {
28343 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028344 GemmMicrokernelTester()
28345 .mr(1)
28346 .nr(4)
28347 .kr(8)
28348 .sr(1)
28349 .m(m)
28350 .n(n)
28351 .k(8)
28352 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080028353 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028354 }
28355 }
28356 }
28357
28358 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_eq_8_subtile_m) {
28359 TEST_REQUIRES_X86_SSE2;
28360 for (uint32_t m = 1; m <= 1; m++) {
28361 GemmMicrokernelTester()
28362 .mr(1)
28363 .nr(4)
28364 .kr(8)
28365 .sr(1)
28366 .m(m)
28367 .n(4)
28368 .k(8)
28369 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080028370 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028371 }
28372 }
28373
28374 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_eq_8_subtile_n) {
28375 TEST_REQUIRES_X86_SSE2;
28376 for (uint32_t n = 1; n <= 4; n++) {
28377 GemmMicrokernelTester()
28378 .mr(1)
28379 .nr(4)
28380 .kr(8)
28381 .sr(1)
28382 .m(1)
28383 .n(n)
28384 .k(8)
28385 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080028386 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028387 }
28388 }
28389
28390 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_lt_8) {
28391 TEST_REQUIRES_X86_SSE2;
28392 for (size_t k = 1; k < 8; k++) {
28393 GemmMicrokernelTester()
28394 .mr(1)
28395 .nr(4)
28396 .kr(8)
28397 .sr(1)
28398 .m(1)
28399 .n(4)
28400 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080028401 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028402 }
28403 }
28404
28405 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_lt_8_strided_a) {
28406 TEST_REQUIRES_X86_SSE2;
28407 for (size_t k = 1; k < 8; k++) {
28408 GemmMicrokernelTester()
28409 .mr(1)
28410 .nr(4)
28411 .kr(8)
28412 .sr(1)
28413 .m(1)
28414 .n(4)
28415 .k(k)
28416 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080028417 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028418 }
28419 }
28420
28421 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_lt_8_subtile) {
28422 TEST_REQUIRES_X86_SSE2;
28423 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080028424 for (uint32_t n = 1; n <= 4; n++) {
28425 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028426 GemmMicrokernelTester()
28427 .mr(1)
28428 .nr(4)
28429 .kr(8)
28430 .sr(1)
28431 .m(m)
28432 .n(n)
28433 .k(k)
28434 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080028435 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028436 }
28437 }
28438 }
28439 }
28440
28441 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_gt_8) {
28442 TEST_REQUIRES_X86_SSE2;
28443 for (size_t k = 9; k < 16; k++) {
28444 GemmMicrokernelTester()
28445 .mr(1)
28446 .nr(4)
28447 .kr(8)
28448 .sr(1)
28449 .m(1)
28450 .n(4)
28451 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080028452 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028453 }
28454 }
28455
28456 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_gt_8_strided_a) {
28457 TEST_REQUIRES_X86_SSE2;
28458 for (size_t k = 9; k < 16; k++) {
28459 GemmMicrokernelTester()
28460 .mr(1)
28461 .nr(4)
28462 .kr(8)
28463 .sr(1)
28464 .m(1)
28465 .n(4)
28466 .k(k)
28467 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080028468 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028469 }
28470 }
28471
28472 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_gt_8_subtile) {
28473 TEST_REQUIRES_X86_SSE2;
28474 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080028475 for (uint32_t n = 1; n <= 4; n++) {
28476 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028477 GemmMicrokernelTester()
28478 .mr(1)
28479 .nr(4)
28480 .kr(8)
28481 .sr(1)
28482 .m(m)
28483 .n(n)
28484 .k(k)
28485 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080028486 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028487 }
28488 }
28489 }
28490 }
28491
28492 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_div_8) {
28493 TEST_REQUIRES_X86_SSE2;
28494 for (size_t k = 16; k <= 80; k += 8) {
28495 GemmMicrokernelTester()
28496 .mr(1)
28497 .nr(4)
28498 .kr(8)
28499 .sr(1)
28500 .m(1)
28501 .n(4)
28502 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080028503 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028504 }
28505 }
28506
28507 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_div_8_strided_a) {
28508 TEST_REQUIRES_X86_SSE2;
28509 for (size_t k = 16; k <= 80; k += 8) {
28510 GemmMicrokernelTester()
28511 .mr(1)
28512 .nr(4)
28513 .kr(8)
28514 .sr(1)
28515 .m(1)
28516 .n(4)
28517 .k(k)
28518 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080028519 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028520 }
28521 }
28522
28523 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_div_8_subtile) {
28524 TEST_REQUIRES_X86_SSE2;
28525 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080028526 for (uint32_t n = 1; n <= 4; n++) {
28527 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028528 GemmMicrokernelTester()
28529 .mr(1)
28530 .nr(4)
28531 .kr(8)
28532 .sr(1)
28533 .m(m)
28534 .n(n)
28535 .k(k)
28536 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080028537 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028538 }
28539 }
28540 }
28541 }
28542
28543 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, n_gt_4) {
28544 TEST_REQUIRES_X86_SSE2;
28545 for (uint32_t n = 5; n < 8; n++) {
28546 for (size_t k = 1; k <= 40; k += 9) {
28547 GemmMicrokernelTester()
28548 .mr(1)
28549 .nr(4)
28550 .kr(8)
28551 .sr(1)
28552 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080028553 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028554 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080028555 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028556 }
28557 }
28558 }
28559
28560 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, n_gt_4_strided_cn) {
28561 TEST_REQUIRES_X86_SSE2;
28562 for (uint32_t n = 5; n < 8; n++) {
28563 for (size_t k = 1; k <= 40; k += 9) {
28564 GemmMicrokernelTester()
28565 .mr(1)
28566 .nr(4)
28567 .kr(8)
28568 .sr(1)
28569 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080028570 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028571 .k(k)
28572 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080028573 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028574 }
28575 }
28576 }
28577
28578 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, n_gt_4_strided_a) {
28579 TEST_REQUIRES_X86_SSE2;
28580 for (uint32_t n = 5; n < 8; n++) {
28581 for (size_t k = 1; k <= 40; k += 9) {
28582 GemmMicrokernelTester()
28583 .mr(1)
28584 .nr(4)
28585 .kr(8)
28586 .sr(1)
28587 .m(1)
28588 .n(n)
28589 .k(k)
28590 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080028591 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028592 }
28593 }
28594 }
28595
28596 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, n_gt_4_subtile) {
28597 TEST_REQUIRES_X86_SSE2;
28598 for (uint32_t n = 5; n < 8; n++) {
28599 for (size_t k = 1; k <= 40; k += 9) {
28600 for (uint32_t m = 1; m <= 1; m++) {
28601 GemmMicrokernelTester()
28602 .mr(1)
28603 .nr(4)
28604 .kr(8)
28605 .sr(1)
28606 .m(m)
28607 .n(n)
28608 .k(k)
28609 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080028610 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028611 }
28612 }
28613 }
28614 }
28615
28616 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, n_div_4) {
28617 TEST_REQUIRES_X86_SSE2;
28618 for (uint32_t n = 8; n <= 12; n += 4) {
28619 for (size_t k = 1; k <= 40; k += 9) {
28620 GemmMicrokernelTester()
28621 .mr(1)
28622 .nr(4)
28623 .kr(8)
28624 .sr(1)
28625 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080028626 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028627 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080028628 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028629 }
28630 }
28631 }
28632
28633 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, n_div_4_strided_cn) {
28634 TEST_REQUIRES_X86_SSE2;
28635 for (uint32_t n = 8; n <= 12; n += 4) {
28636 for (size_t k = 1; k <= 40; k += 9) {
28637 GemmMicrokernelTester()
28638 .mr(1)
28639 .nr(4)
28640 .kr(8)
28641 .sr(1)
28642 .m(1)
28643 .n(n)
28644 .k(k)
28645 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080028646 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028647 }
28648 }
28649 }
28650
28651 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, n_div_4_strided_a) {
28652 TEST_REQUIRES_X86_SSE2;
28653 for (uint32_t n = 8; n <= 12; n += 4) {
28654 for (size_t k = 1; k <= 40; k += 9) {
28655 GemmMicrokernelTester()
28656 .mr(1)
28657 .nr(4)
28658 .kr(8)
28659 .sr(1)
28660 .m(1)
28661 .n(n)
28662 .k(k)
28663 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080028664 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028665 }
28666 }
28667 }
28668
28669 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, n_div_4_subtile) {
28670 TEST_REQUIRES_X86_SSE2;
28671 for (uint32_t n = 8; n <= 12; n += 4) {
28672 for (size_t k = 1; k <= 40; k += 9) {
28673 for (uint32_t m = 1; m <= 1; m++) {
28674 GemmMicrokernelTester()
28675 .mr(1)
28676 .nr(4)
28677 .kr(8)
28678 .sr(1)
28679 .m(m)
28680 .n(n)
28681 .k(k)
28682 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080028683 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028684 }
28685 }
28686 }
28687 }
28688
28689 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, strided_cm_subtile) {
28690 TEST_REQUIRES_X86_SSE2;
28691 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080028692 for (uint32_t n = 1; n <= 4; n++) {
28693 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028694 GemmMicrokernelTester()
28695 .mr(1)
28696 .nr(4)
28697 .kr(8)
28698 .sr(1)
28699 .m(m)
28700 .n(n)
28701 .k(k)
28702 .cm_stride(7)
28703 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080028704 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028705 }
28706 }
28707 }
28708 }
28709
28710 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, qmin) {
28711 TEST_REQUIRES_X86_SSE2;
28712 GemmMicrokernelTester()
28713 .mr(1)
28714 .nr(4)
28715 .kr(8)
28716 .sr(1)
28717 .m(1)
28718 .n(4)
28719 .k(8)
28720 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080028721 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028722 }
28723
28724 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, qmax) {
28725 TEST_REQUIRES_X86_SSE2;
28726 GemmMicrokernelTester()
28727 .mr(1)
28728 .nr(4)
28729 .kr(8)
28730 .sr(1)
28731 .m(1)
28732 .n(4)
28733 .k(8)
28734 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080028735 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028736 }
28737
28738 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, strided_cm) {
28739 TEST_REQUIRES_X86_SSE2;
28740 GemmMicrokernelTester()
28741 .mr(1)
28742 .nr(4)
28743 .kr(8)
28744 .sr(1)
28745 .m(1)
28746 .n(4)
28747 .k(8)
28748 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080028749 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028750 }
28751#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
28752
28753
28754#if XNN_ARCH_X86 || XNN_ARCH_X86_64
28755 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_eq_8) {
28756 TEST_REQUIRES_X86_SSE41;
28757 GemmMicrokernelTester()
28758 .mr(1)
28759 .nr(4)
28760 .kr(8)
28761 .sr(1)
28762 .m(1)
28763 .n(4)
28764 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080028765 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028766 }
28767
28768 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, strided_cn) {
28769 TEST_REQUIRES_X86_SSE41;
28770 GemmMicrokernelTester()
28771 .mr(1)
28772 .nr(4)
28773 .kr(8)
28774 .sr(1)
28775 .m(1)
28776 .n(4)
28777 .k(8)
28778 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080028779 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028780 }
28781
28782 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_eq_8_strided_a) {
28783 TEST_REQUIRES_X86_SSE41;
28784 GemmMicrokernelTester()
28785 .mr(1)
28786 .nr(4)
28787 .kr(8)
28788 .sr(1)
28789 .m(1)
28790 .n(4)
28791 .k(8)
28792 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080028793 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028794 }
28795
28796 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_eq_8_subtile) {
28797 TEST_REQUIRES_X86_SSE41;
Zhi An Ng83844ae2022-01-14 09:52:25 -080028798 for (uint32_t n = 1; n <= 4; n++) {
28799 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028800 GemmMicrokernelTester()
28801 .mr(1)
28802 .nr(4)
28803 .kr(8)
28804 .sr(1)
28805 .m(m)
28806 .n(n)
28807 .k(8)
28808 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080028809 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028810 }
28811 }
28812 }
28813
28814 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_eq_8_subtile_m) {
28815 TEST_REQUIRES_X86_SSE41;
28816 for (uint32_t m = 1; m <= 1; m++) {
28817 GemmMicrokernelTester()
28818 .mr(1)
28819 .nr(4)
28820 .kr(8)
28821 .sr(1)
28822 .m(m)
28823 .n(4)
28824 .k(8)
28825 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080028826 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028827 }
28828 }
28829
28830 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_eq_8_subtile_n) {
28831 TEST_REQUIRES_X86_SSE41;
28832 for (uint32_t n = 1; n <= 4; n++) {
28833 GemmMicrokernelTester()
28834 .mr(1)
28835 .nr(4)
28836 .kr(8)
28837 .sr(1)
28838 .m(1)
28839 .n(n)
28840 .k(8)
28841 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080028842 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028843 }
28844 }
28845
28846 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_lt_8) {
28847 TEST_REQUIRES_X86_SSE41;
28848 for (size_t k = 1; k < 8; k++) {
28849 GemmMicrokernelTester()
28850 .mr(1)
28851 .nr(4)
28852 .kr(8)
28853 .sr(1)
28854 .m(1)
28855 .n(4)
28856 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080028857 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028858 }
28859 }
28860
28861 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_lt_8_strided_a) {
28862 TEST_REQUIRES_X86_SSE41;
28863 for (size_t k = 1; k < 8; k++) {
28864 GemmMicrokernelTester()
28865 .mr(1)
28866 .nr(4)
28867 .kr(8)
28868 .sr(1)
28869 .m(1)
28870 .n(4)
28871 .k(k)
28872 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080028873 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028874 }
28875 }
28876
28877 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_lt_8_subtile) {
28878 TEST_REQUIRES_X86_SSE41;
28879 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080028880 for (uint32_t n = 1; n <= 4; n++) {
28881 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028882 GemmMicrokernelTester()
28883 .mr(1)
28884 .nr(4)
28885 .kr(8)
28886 .sr(1)
28887 .m(m)
28888 .n(n)
28889 .k(k)
28890 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080028891 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028892 }
28893 }
28894 }
28895 }
28896
28897 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_gt_8) {
28898 TEST_REQUIRES_X86_SSE41;
28899 for (size_t k = 9; k < 16; k++) {
28900 GemmMicrokernelTester()
28901 .mr(1)
28902 .nr(4)
28903 .kr(8)
28904 .sr(1)
28905 .m(1)
28906 .n(4)
28907 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080028908 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028909 }
28910 }
28911
28912 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_gt_8_strided_a) {
28913 TEST_REQUIRES_X86_SSE41;
28914 for (size_t k = 9; k < 16; k++) {
28915 GemmMicrokernelTester()
28916 .mr(1)
28917 .nr(4)
28918 .kr(8)
28919 .sr(1)
28920 .m(1)
28921 .n(4)
28922 .k(k)
28923 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080028924 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028925 }
28926 }
28927
28928 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_gt_8_subtile) {
28929 TEST_REQUIRES_X86_SSE41;
28930 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080028931 for (uint32_t n = 1; n <= 4; n++) {
28932 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028933 GemmMicrokernelTester()
28934 .mr(1)
28935 .nr(4)
28936 .kr(8)
28937 .sr(1)
28938 .m(m)
28939 .n(n)
28940 .k(k)
28941 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080028942 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028943 }
28944 }
28945 }
28946 }
28947
28948 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_div_8) {
28949 TEST_REQUIRES_X86_SSE41;
28950 for (size_t k = 16; k <= 80; k += 8) {
28951 GemmMicrokernelTester()
28952 .mr(1)
28953 .nr(4)
28954 .kr(8)
28955 .sr(1)
28956 .m(1)
28957 .n(4)
28958 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080028959 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028960 }
28961 }
28962
28963 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_div_8_strided_a) {
28964 TEST_REQUIRES_X86_SSE41;
28965 for (size_t k = 16; k <= 80; k += 8) {
28966 GemmMicrokernelTester()
28967 .mr(1)
28968 .nr(4)
28969 .kr(8)
28970 .sr(1)
28971 .m(1)
28972 .n(4)
28973 .k(k)
28974 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080028975 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028976 }
28977 }
28978
28979 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_div_8_subtile) {
28980 TEST_REQUIRES_X86_SSE41;
28981 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080028982 for (uint32_t n = 1; n <= 4; n++) {
28983 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028984 GemmMicrokernelTester()
28985 .mr(1)
28986 .nr(4)
28987 .kr(8)
28988 .sr(1)
28989 .m(m)
28990 .n(n)
28991 .k(k)
28992 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080028993 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028994 }
28995 }
28996 }
28997 }
28998
28999 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, n_gt_4) {
29000 TEST_REQUIRES_X86_SSE41;
29001 for (uint32_t n = 5; n < 8; n++) {
29002 for (size_t k = 1; k <= 40; k += 9) {
29003 GemmMicrokernelTester()
29004 .mr(1)
29005 .nr(4)
29006 .kr(8)
29007 .sr(1)
29008 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080029009 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029010 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080029011 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029012 }
29013 }
29014 }
29015
29016 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, n_gt_4_strided_cn) {
29017 TEST_REQUIRES_X86_SSE41;
29018 for (uint32_t n = 5; n < 8; n++) {
29019 for (size_t k = 1; k <= 40; k += 9) {
29020 GemmMicrokernelTester()
29021 .mr(1)
29022 .nr(4)
29023 .kr(8)
29024 .sr(1)
29025 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080029026 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029027 .k(k)
29028 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080029029 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029030 }
29031 }
29032 }
29033
29034 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, n_gt_4_strided_a) {
29035 TEST_REQUIRES_X86_SSE41;
29036 for (uint32_t n = 5; n < 8; n++) {
29037 for (size_t k = 1; k <= 40; k += 9) {
29038 GemmMicrokernelTester()
29039 .mr(1)
29040 .nr(4)
29041 .kr(8)
29042 .sr(1)
29043 .m(1)
29044 .n(n)
29045 .k(k)
29046 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080029047 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029048 }
29049 }
29050 }
29051
29052 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, n_gt_4_subtile) {
29053 TEST_REQUIRES_X86_SSE41;
29054 for (uint32_t n = 5; n < 8; n++) {
29055 for (size_t k = 1; k <= 40; k += 9) {
29056 for (uint32_t m = 1; m <= 1; m++) {
29057 GemmMicrokernelTester()
29058 .mr(1)
29059 .nr(4)
29060 .kr(8)
29061 .sr(1)
29062 .m(m)
29063 .n(n)
29064 .k(k)
29065 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080029066 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029067 }
29068 }
29069 }
29070 }
29071
29072 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, n_div_4) {
29073 TEST_REQUIRES_X86_SSE41;
29074 for (uint32_t n = 8; n <= 12; n += 4) {
29075 for (size_t k = 1; k <= 40; k += 9) {
29076 GemmMicrokernelTester()
29077 .mr(1)
29078 .nr(4)
29079 .kr(8)
29080 .sr(1)
29081 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080029082 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029083 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080029084 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029085 }
29086 }
29087 }
29088
29089 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, n_div_4_strided_cn) {
29090 TEST_REQUIRES_X86_SSE41;
29091 for (uint32_t n = 8; n <= 12; n += 4) {
29092 for (size_t k = 1; k <= 40; k += 9) {
29093 GemmMicrokernelTester()
29094 .mr(1)
29095 .nr(4)
29096 .kr(8)
29097 .sr(1)
29098 .m(1)
29099 .n(n)
29100 .k(k)
29101 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080029102 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029103 }
29104 }
29105 }
29106
29107 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, n_div_4_strided_a) {
29108 TEST_REQUIRES_X86_SSE41;
29109 for (uint32_t n = 8; n <= 12; n += 4) {
29110 for (size_t k = 1; k <= 40; k += 9) {
29111 GemmMicrokernelTester()
29112 .mr(1)
29113 .nr(4)
29114 .kr(8)
29115 .sr(1)
29116 .m(1)
29117 .n(n)
29118 .k(k)
29119 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080029120 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029121 }
29122 }
29123 }
29124
29125 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, n_div_4_subtile) {
29126 TEST_REQUIRES_X86_SSE41;
29127 for (uint32_t n = 8; n <= 12; n += 4) {
29128 for (size_t k = 1; k <= 40; k += 9) {
29129 for (uint32_t m = 1; m <= 1; m++) {
29130 GemmMicrokernelTester()
29131 .mr(1)
29132 .nr(4)
29133 .kr(8)
29134 .sr(1)
29135 .m(m)
29136 .n(n)
29137 .k(k)
29138 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080029139 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029140 }
29141 }
29142 }
29143 }
29144
29145 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, strided_cm_subtile) {
29146 TEST_REQUIRES_X86_SSE41;
29147 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080029148 for (uint32_t n = 1; n <= 4; n++) {
29149 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029150 GemmMicrokernelTester()
29151 .mr(1)
29152 .nr(4)
29153 .kr(8)
29154 .sr(1)
29155 .m(m)
29156 .n(n)
29157 .k(k)
29158 .cm_stride(7)
29159 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080029160 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029161 }
29162 }
29163 }
29164 }
29165
29166 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, qmin) {
29167 TEST_REQUIRES_X86_SSE41;
29168 GemmMicrokernelTester()
29169 .mr(1)
29170 .nr(4)
29171 .kr(8)
29172 .sr(1)
29173 .m(1)
29174 .n(4)
29175 .k(8)
29176 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080029177 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029178 }
29179
29180 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, qmax) {
29181 TEST_REQUIRES_X86_SSE41;
29182 GemmMicrokernelTester()
29183 .mr(1)
29184 .nr(4)
29185 .kr(8)
29186 .sr(1)
29187 .m(1)
29188 .n(4)
29189 .k(8)
29190 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080029191 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029192 }
29193
29194 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, strided_cm) {
29195 TEST_REQUIRES_X86_SSE41;
29196 GemmMicrokernelTester()
29197 .mr(1)
29198 .nr(4)
29199 .kr(8)
29200 .sr(1)
29201 .m(1)
29202 .n(4)
29203 .k(8)
29204 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080029205 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029206 }
29207#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
29208
29209
29210#if XNN_ARCH_X86 || XNN_ARCH_X86_64
29211 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_eq_8) {
29212 TEST_REQUIRES_X86_SSE41;
29213 GemmMicrokernelTester()
29214 .mr(3)
29215 .nr(4)
29216 .kr(8)
29217 .sr(1)
29218 .m(3)
29219 .n(4)
29220 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080029221 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029222 }
29223
29224 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, strided_cn) {
29225 TEST_REQUIRES_X86_SSE41;
29226 GemmMicrokernelTester()
29227 .mr(3)
29228 .nr(4)
29229 .kr(8)
29230 .sr(1)
29231 .m(3)
29232 .n(4)
29233 .k(8)
29234 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080029235 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029236 }
29237
29238 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_eq_8_strided_a) {
29239 TEST_REQUIRES_X86_SSE41;
29240 GemmMicrokernelTester()
29241 .mr(3)
29242 .nr(4)
29243 .kr(8)
29244 .sr(1)
29245 .m(3)
29246 .n(4)
29247 .k(8)
29248 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080029249 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029250 }
29251
29252 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_eq_8_subtile) {
29253 TEST_REQUIRES_X86_SSE41;
Zhi An Ng83844ae2022-01-14 09:52:25 -080029254 for (uint32_t n = 1; n <= 4; n++) {
29255 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029256 GemmMicrokernelTester()
29257 .mr(3)
29258 .nr(4)
29259 .kr(8)
29260 .sr(1)
29261 .m(m)
29262 .n(n)
29263 .k(8)
29264 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080029265 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029266 }
29267 }
29268 }
29269
29270 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_eq_8_subtile_m) {
29271 TEST_REQUIRES_X86_SSE41;
29272 for (uint32_t m = 1; m <= 3; m++) {
29273 GemmMicrokernelTester()
29274 .mr(3)
29275 .nr(4)
29276 .kr(8)
29277 .sr(1)
29278 .m(m)
29279 .n(4)
29280 .k(8)
29281 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080029282 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029283 }
29284 }
29285
29286 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_eq_8_subtile_n) {
29287 TEST_REQUIRES_X86_SSE41;
29288 for (uint32_t n = 1; n <= 4; n++) {
29289 GemmMicrokernelTester()
29290 .mr(3)
29291 .nr(4)
29292 .kr(8)
29293 .sr(1)
29294 .m(3)
29295 .n(n)
29296 .k(8)
29297 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080029298 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029299 }
29300 }
29301
29302 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_lt_8) {
29303 TEST_REQUIRES_X86_SSE41;
29304 for (size_t k = 1; k < 8; k++) {
29305 GemmMicrokernelTester()
29306 .mr(3)
29307 .nr(4)
29308 .kr(8)
29309 .sr(1)
29310 .m(3)
29311 .n(4)
29312 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080029313 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029314 }
29315 }
29316
29317 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_lt_8_strided_a) {
29318 TEST_REQUIRES_X86_SSE41;
29319 for (size_t k = 1; k < 8; k++) {
29320 GemmMicrokernelTester()
29321 .mr(3)
29322 .nr(4)
29323 .kr(8)
29324 .sr(1)
29325 .m(3)
29326 .n(4)
29327 .k(k)
29328 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080029329 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029330 }
29331 }
29332
29333 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_lt_8_subtile) {
29334 TEST_REQUIRES_X86_SSE41;
29335 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080029336 for (uint32_t n = 1; n <= 4; n++) {
29337 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029338 GemmMicrokernelTester()
29339 .mr(3)
29340 .nr(4)
29341 .kr(8)
29342 .sr(1)
29343 .m(m)
29344 .n(n)
29345 .k(k)
29346 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080029347 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029348 }
29349 }
29350 }
29351 }
29352
29353 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_gt_8) {
29354 TEST_REQUIRES_X86_SSE41;
29355 for (size_t k = 9; k < 16; k++) {
29356 GemmMicrokernelTester()
29357 .mr(3)
29358 .nr(4)
29359 .kr(8)
29360 .sr(1)
29361 .m(3)
29362 .n(4)
29363 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080029364 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029365 }
29366 }
29367
29368 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_gt_8_strided_a) {
29369 TEST_REQUIRES_X86_SSE41;
29370 for (size_t k = 9; k < 16; k++) {
29371 GemmMicrokernelTester()
29372 .mr(3)
29373 .nr(4)
29374 .kr(8)
29375 .sr(1)
29376 .m(3)
29377 .n(4)
29378 .k(k)
29379 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080029380 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029381 }
29382 }
29383
29384 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_gt_8_subtile) {
29385 TEST_REQUIRES_X86_SSE41;
29386 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080029387 for (uint32_t n = 1; n <= 4; n++) {
29388 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029389 GemmMicrokernelTester()
29390 .mr(3)
29391 .nr(4)
29392 .kr(8)
29393 .sr(1)
29394 .m(m)
29395 .n(n)
29396 .k(k)
29397 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080029398 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029399 }
29400 }
29401 }
29402 }
29403
29404 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_div_8) {
29405 TEST_REQUIRES_X86_SSE41;
29406 for (size_t k = 16; k <= 80; k += 8) {
29407 GemmMicrokernelTester()
29408 .mr(3)
29409 .nr(4)
29410 .kr(8)
29411 .sr(1)
29412 .m(3)
29413 .n(4)
29414 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080029415 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029416 }
29417 }
29418
29419 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_div_8_strided_a) {
29420 TEST_REQUIRES_X86_SSE41;
29421 for (size_t k = 16; k <= 80; k += 8) {
29422 GemmMicrokernelTester()
29423 .mr(3)
29424 .nr(4)
29425 .kr(8)
29426 .sr(1)
29427 .m(3)
29428 .n(4)
29429 .k(k)
29430 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080029431 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029432 }
29433 }
29434
29435 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_div_8_subtile) {
29436 TEST_REQUIRES_X86_SSE41;
29437 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080029438 for (uint32_t n = 1; n <= 4; n++) {
29439 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029440 GemmMicrokernelTester()
29441 .mr(3)
29442 .nr(4)
29443 .kr(8)
29444 .sr(1)
29445 .m(m)
29446 .n(n)
29447 .k(k)
29448 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080029449 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029450 }
29451 }
29452 }
29453 }
29454
29455 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, n_gt_4) {
29456 TEST_REQUIRES_X86_SSE41;
29457 for (uint32_t n = 5; n < 8; n++) {
29458 for (size_t k = 1; k <= 40; k += 9) {
29459 GemmMicrokernelTester()
29460 .mr(3)
29461 .nr(4)
29462 .kr(8)
29463 .sr(1)
29464 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080029465 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029466 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080029467 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029468 }
29469 }
29470 }
29471
29472 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, n_gt_4_strided_cn) {
29473 TEST_REQUIRES_X86_SSE41;
29474 for (uint32_t n = 5; n < 8; n++) {
29475 for (size_t k = 1; k <= 40; k += 9) {
29476 GemmMicrokernelTester()
29477 .mr(3)
29478 .nr(4)
29479 .kr(8)
29480 .sr(1)
29481 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080029482 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029483 .k(k)
29484 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080029485 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029486 }
29487 }
29488 }
29489
29490 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, n_gt_4_strided_a) {
29491 TEST_REQUIRES_X86_SSE41;
29492 for (uint32_t n = 5; n < 8; n++) {
29493 for (size_t k = 1; k <= 40; k += 9) {
29494 GemmMicrokernelTester()
29495 .mr(3)
29496 .nr(4)
29497 .kr(8)
29498 .sr(1)
29499 .m(3)
29500 .n(n)
29501 .k(k)
29502 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080029503 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029504 }
29505 }
29506 }
29507
29508 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, n_gt_4_subtile) {
29509 TEST_REQUIRES_X86_SSE41;
29510 for (uint32_t n = 5; n < 8; n++) {
29511 for (size_t k = 1; k <= 40; k += 9) {
29512 for (uint32_t m = 1; m <= 3; m++) {
29513 GemmMicrokernelTester()
29514 .mr(3)
29515 .nr(4)
29516 .kr(8)
29517 .sr(1)
29518 .m(m)
29519 .n(n)
29520 .k(k)
29521 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080029522 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029523 }
29524 }
29525 }
29526 }
29527
29528 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, n_div_4) {
29529 TEST_REQUIRES_X86_SSE41;
29530 for (uint32_t n = 8; n <= 12; n += 4) {
29531 for (size_t k = 1; k <= 40; k += 9) {
29532 GemmMicrokernelTester()
29533 .mr(3)
29534 .nr(4)
29535 .kr(8)
29536 .sr(1)
29537 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080029538 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029539 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080029540 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029541 }
29542 }
29543 }
29544
29545 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, n_div_4_strided_cn) {
29546 TEST_REQUIRES_X86_SSE41;
29547 for (uint32_t n = 8; n <= 12; n += 4) {
29548 for (size_t k = 1; k <= 40; k += 9) {
29549 GemmMicrokernelTester()
29550 .mr(3)
29551 .nr(4)
29552 .kr(8)
29553 .sr(1)
29554 .m(3)
29555 .n(n)
29556 .k(k)
29557 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080029558 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029559 }
29560 }
29561 }
29562
29563 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, n_div_4_strided_a) {
29564 TEST_REQUIRES_X86_SSE41;
29565 for (uint32_t n = 8; n <= 12; n += 4) {
29566 for (size_t k = 1; k <= 40; k += 9) {
29567 GemmMicrokernelTester()
29568 .mr(3)
29569 .nr(4)
29570 .kr(8)
29571 .sr(1)
29572 .m(3)
29573 .n(n)
29574 .k(k)
29575 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080029576 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029577 }
29578 }
29579 }
29580
29581 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, n_div_4_subtile) {
29582 TEST_REQUIRES_X86_SSE41;
29583 for (uint32_t n = 8; n <= 12; n += 4) {
29584 for (size_t k = 1; k <= 40; k += 9) {
29585 for (uint32_t m = 1; m <= 3; m++) {
29586 GemmMicrokernelTester()
29587 .mr(3)
29588 .nr(4)
29589 .kr(8)
29590 .sr(1)
29591 .m(m)
29592 .n(n)
29593 .k(k)
29594 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080029595 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029596 }
29597 }
29598 }
29599 }
29600
29601 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, strided_cm_subtile) {
29602 TEST_REQUIRES_X86_SSE41;
29603 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080029604 for (uint32_t n = 1; n <= 4; n++) {
29605 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029606 GemmMicrokernelTester()
29607 .mr(3)
29608 .nr(4)
29609 .kr(8)
29610 .sr(1)
29611 .m(m)
29612 .n(n)
29613 .k(k)
29614 .cm_stride(7)
29615 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080029616 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029617 }
29618 }
29619 }
29620 }
29621
29622 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, qmin) {
29623 TEST_REQUIRES_X86_SSE41;
29624 GemmMicrokernelTester()
29625 .mr(3)
29626 .nr(4)
29627 .kr(8)
29628 .sr(1)
29629 .m(3)
29630 .n(4)
29631 .k(8)
29632 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080029633 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029634 }
29635
29636 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, qmax) {
29637 TEST_REQUIRES_X86_SSE41;
29638 GemmMicrokernelTester()
29639 .mr(3)
29640 .nr(4)
29641 .kr(8)
29642 .sr(1)
29643 .m(3)
29644 .n(4)
29645 .k(8)
29646 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080029647 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029648 }
29649
29650 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, strided_cm) {
29651 TEST_REQUIRES_X86_SSE41;
29652 GemmMicrokernelTester()
29653 .mr(3)
29654 .nr(4)
29655 .kr(8)
29656 .sr(1)
29657 .m(3)
29658 .n(4)
29659 .k(8)
29660 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080029661 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029662 }
29663#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
29664
29665
29666#if XNN_ARCH_X86 || XNN_ARCH_X86_64
29667 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_eq_8) {
29668 TEST_REQUIRES_X86_XOP;
29669 GemmMicrokernelTester()
29670 .mr(1)
29671 .nr(4)
29672 .kr(8)
29673 .sr(1)
29674 .m(1)
29675 .n(4)
29676 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080029677 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029678 }
29679
29680 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, strided_cn) {
29681 TEST_REQUIRES_X86_XOP;
29682 GemmMicrokernelTester()
29683 .mr(1)
29684 .nr(4)
29685 .kr(8)
29686 .sr(1)
29687 .m(1)
29688 .n(4)
29689 .k(8)
29690 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080029691 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029692 }
29693
29694 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_eq_8_strided_a) {
29695 TEST_REQUIRES_X86_XOP;
29696 GemmMicrokernelTester()
29697 .mr(1)
29698 .nr(4)
29699 .kr(8)
29700 .sr(1)
29701 .m(1)
29702 .n(4)
29703 .k(8)
29704 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080029705 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029706 }
29707
29708 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_eq_8_subtile) {
29709 TEST_REQUIRES_X86_XOP;
Zhi An Ng83844ae2022-01-14 09:52:25 -080029710 for (uint32_t n = 1; n <= 4; n++) {
29711 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029712 GemmMicrokernelTester()
29713 .mr(1)
29714 .nr(4)
29715 .kr(8)
29716 .sr(1)
29717 .m(m)
29718 .n(n)
29719 .k(8)
29720 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080029721 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029722 }
29723 }
29724 }
29725
29726 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_eq_8_subtile_m) {
29727 TEST_REQUIRES_X86_XOP;
29728 for (uint32_t m = 1; m <= 1; m++) {
29729 GemmMicrokernelTester()
29730 .mr(1)
29731 .nr(4)
29732 .kr(8)
29733 .sr(1)
29734 .m(m)
29735 .n(4)
29736 .k(8)
29737 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080029738 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029739 }
29740 }
29741
29742 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_eq_8_subtile_n) {
29743 TEST_REQUIRES_X86_XOP;
29744 for (uint32_t n = 1; n <= 4; n++) {
29745 GemmMicrokernelTester()
29746 .mr(1)
29747 .nr(4)
29748 .kr(8)
29749 .sr(1)
29750 .m(1)
29751 .n(n)
29752 .k(8)
29753 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080029754 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029755 }
29756 }
29757
29758 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_lt_8) {
29759 TEST_REQUIRES_X86_XOP;
29760 for (size_t k = 1; k < 8; k++) {
29761 GemmMicrokernelTester()
29762 .mr(1)
29763 .nr(4)
29764 .kr(8)
29765 .sr(1)
29766 .m(1)
29767 .n(4)
29768 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080029769 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029770 }
29771 }
29772
29773 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_lt_8_strided_a) {
29774 TEST_REQUIRES_X86_XOP;
29775 for (size_t k = 1; k < 8; k++) {
29776 GemmMicrokernelTester()
29777 .mr(1)
29778 .nr(4)
29779 .kr(8)
29780 .sr(1)
29781 .m(1)
29782 .n(4)
29783 .k(k)
29784 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080029785 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029786 }
29787 }
29788
29789 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_lt_8_subtile) {
29790 TEST_REQUIRES_X86_XOP;
29791 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080029792 for (uint32_t n = 1; n <= 4; n++) {
29793 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029794 GemmMicrokernelTester()
29795 .mr(1)
29796 .nr(4)
29797 .kr(8)
29798 .sr(1)
29799 .m(m)
29800 .n(n)
29801 .k(k)
29802 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080029803 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029804 }
29805 }
29806 }
29807 }
29808
29809 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_gt_8) {
29810 TEST_REQUIRES_X86_XOP;
29811 for (size_t k = 9; k < 16; k++) {
29812 GemmMicrokernelTester()
29813 .mr(1)
29814 .nr(4)
29815 .kr(8)
29816 .sr(1)
29817 .m(1)
29818 .n(4)
29819 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080029820 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029821 }
29822 }
29823
29824 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_gt_8_strided_a) {
29825 TEST_REQUIRES_X86_XOP;
29826 for (size_t k = 9; k < 16; k++) {
29827 GemmMicrokernelTester()
29828 .mr(1)
29829 .nr(4)
29830 .kr(8)
29831 .sr(1)
29832 .m(1)
29833 .n(4)
29834 .k(k)
29835 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080029836 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029837 }
29838 }
29839
29840 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_gt_8_subtile) {
29841 TEST_REQUIRES_X86_XOP;
29842 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080029843 for (uint32_t n = 1; n <= 4; n++) {
29844 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029845 GemmMicrokernelTester()
29846 .mr(1)
29847 .nr(4)
29848 .kr(8)
29849 .sr(1)
29850 .m(m)
29851 .n(n)
29852 .k(k)
29853 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080029854 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029855 }
29856 }
29857 }
29858 }
29859
29860 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_div_8) {
29861 TEST_REQUIRES_X86_XOP;
29862 for (size_t k = 16; k <= 80; k += 8) {
29863 GemmMicrokernelTester()
29864 .mr(1)
29865 .nr(4)
29866 .kr(8)
29867 .sr(1)
29868 .m(1)
29869 .n(4)
29870 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080029871 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029872 }
29873 }
29874
29875 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_div_8_strided_a) {
29876 TEST_REQUIRES_X86_XOP;
29877 for (size_t k = 16; k <= 80; k += 8) {
29878 GemmMicrokernelTester()
29879 .mr(1)
29880 .nr(4)
29881 .kr(8)
29882 .sr(1)
29883 .m(1)
29884 .n(4)
29885 .k(k)
29886 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080029887 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029888 }
29889 }
29890
29891 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_div_8_subtile) {
29892 TEST_REQUIRES_X86_XOP;
29893 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080029894 for (uint32_t n = 1; n <= 4; n++) {
29895 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029896 GemmMicrokernelTester()
29897 .mr(1)
29898 .nr(4)
29899 .kr(8)
29900 .sr(1)
29901 .m(m)
29902 .n(n)
29903 .k(k)
29904 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080029905 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029906 }
29907 }
29908 }
29909 }
29910
29911 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, n_gt_4) {
29912 TEST_REQUIRES_X86_XOP;
29913 for (uint32_t n = 5; n < 8; n++) {
29914 for (size_t k = 1; k <= 40; k += 9) {
29915 GemmMicrokernelTester()
29916 .mr(1)
29917 .nr(4)
29918 .kr(8)
29919 .sr(1)
29920 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080029921 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029922 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080029923 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029924 }
29925 }
29926 }
29927
29928 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, n_gt_4_strided_cn) {
29929 TEST_REQUIRES_X86_XOP;
29930 for (uint32_t n = 5; n < 8; n++) {
29931 for (size_t k = 1; k <= 40; k += 9) {
29932 GemmMicrokernelTester()
29933 .mr(1)
29934 .nr(4)
29935 .kr(8)
29936 .sr(1)
29937 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080029938 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029939 .k(k)
29940 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080029941 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029942 }
29943 }
29944 }
29945
29946 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, n_gt_4_strided_a) {
29947 TEST_REQUIRES_X86_XOP;
29948 for (uint32_t n = 5; n < 8; n++) {
29949 for (size_t k = 1; k <= 40; k += 9) {
29950 GemmMicrokernelTester()
29951 .mr(1)
29952 .nr(4)
29953 .kr(8)
29954 .sr(1)
29955 .m(1)
29956 .n(n)
29957 .k(k)
29958 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080029959 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029960 }
29961 }
29962 }
29963
29964 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, n_gt_4_subtile) {
29965 TEST_REQUIRES_X86_XOP;
29966 for (uint32_t n = 5; n < 8; n++) {
29967 for (size_t k = 1; k <= 40; k += 9) {
29968 for (uint32_t m = 1; m <= 1; m++) {
29969 GemmMicrokernelTester()
29970 .mr(1)
29971 .nr(4)
29972 .kr(8)
29973 .sr(1)
29974 .m(m)
29975 .n(n)
29976 .k(k)
29977 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080029978 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029979 }
29980 }
29981 }
29982 }
29983
29984 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, n_div_4) {
29985 TEST_REQUIRES_X86_XOP;
29986 for (uint32_t n = 8; n <= 12; n += 4) {
29987 for (size_t k = 1; k <= 40; k += 9) {
29988 GemmMicrokernelTester()
29989 .mr(1)
29990 .nr(4)
29991 .kr(8)
29992 .sr(1)
29993 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080029994 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029995 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080029996 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029997 }
29998 }
29999 }
30000
30001 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, n_div_4_strided_cn) {
30002 TEST_REQUIRES_X86_XOP;
30003 for (uint32_t n = 8; n <= 12; n += 4) {
30004 for (size_t k = 1; k <= 40; k += 9) {
30005 GemmMicrokernelTester()
30006 .mr(1)
30007 .nr(4)
30008 .kr(8)
30009 .sr(1)
30010 .m(1)
30011 .n(n)
30012 .k(k)
30013 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080030014 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030015 }
30016 }
30017 }
30018
30019 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, n_div_4_strided_a) {
30020 TEST_REQUIRES_X86_XOP;
30021 for (uint32_t n = 8; n <= 12; n += 4) {
30022 for (size_t k = 1; k <= 40; k += 9) {
30023 GemmMicrokernelTester()
30024 .mr(1)
30025 .nr(4)
30026 .kr(8)
30027 .sr(1)
30028 .m(1)
30029 .n(n)
30030 .k(k)
30031 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080030032 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030033 }
30034 }
30035 }
30036
30037 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, n_div_4_subtile) {
30038 TEST_REQUIRES_X86_XOP;
30039 for (uint32_t n = 8; n <= 12; n += 4) {
30040 for (size_t k = 1; k <= 40; k += 9) {
30041 for (uint32_t m = 1; m <= 1; m++) {
30042 GemmMicrokernelTester()
30043 .mr(1)
30044 .nr(4)
30045 .kr(8)
30046 .sr(1)
30047 .m(m)
30048 .n(n)
30049 .k(k)
30050 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080030051 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030052 }
30053 }
30054 }
30055 }
30056
30057 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, strided_cm_subtile) {
30058 TEST_REQUIRES_X86_XOP;
30059 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080030060 for (uint32_t n = 1; n <= 4; n++) {
30061 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030062 GemmMicrokernelTester()
30063 .mr(1)
30064 .nr(4)
30065 .kr(8)
30066 .sr(1)
30067 .m(m)
30068 .n(n)
30069 .k(k)
30070 .cm_stride(7)
30071 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080030072 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030073 }
30074 }
30075 }
30076 }
30077
30078 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, qmin) {
30079 TEST_REQUIRES_X86_XOP;
30080 GemmMicrokernelTester()
30081 .mr(1)
30082 .nr(4)
30083 .kr(8)
30084 .sr(1)
30085 .m(1)
30086 .n(4)
30087 .k(8)
30088 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080030089 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030090 }
30091
30092 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, qmax) {
30093 TEST_REQUIRES_X86_XOP;
30094 GemmMicrokernelTester()
30095 .mr(1)
30096 .nr(4)
30097 .kr(8)
30098 .sr(1)
30099 .m(1)
30100 .n(4)
30101 .k(8)
30102 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080030103 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030104 }
30105
30106 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, strided_cm) {
30107 TEST_REQUIRES_X86_XOP;
30108 GemmMicrokernelTester()
30109 .mr(1)
30110 .nr(4)
30111 .kr(8)
30112 .sr(1)
30113 .m(1)
30114 .n(4)
30115 .k(8)
30116 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080030117 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030118 }
30119#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
30120
30121
30122#if XNN_ARCH_X86 || XNN_ARCH_X86_64
30123 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_eq_8) {
30124 TEST_REQUIRES_X86_XOP;
30125 GemmMicrokernelTester()
30126 .mr(2)
30127 .nr(4)
30128 .kr(8)
30129 .sr(1)
30130 .m(2)
30131 .n(4)
30132 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080030133 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030134 }
30135
30136 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, strided_cn) {
30137 TEST_REQUIRES_X86_XOP;
30138 GemmMicrokernelTester()
30139 .mr(2)
30140 .nr(4)
30141 .kr(8)
30142 .sr(1)
30143 .m(2)
30144 .n(4)
30145 .k(8)
30146 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080030147 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030148 }
30149
30150 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_eq_8_strided_a) {
30151 TEST_REQUIRES_X86_XOP;
30152 GemmMicrokernelTester()
30153 .mr(2)
30154 .nr(4)
30155 .kr(8)
30156 .sr(1)
30157 .m(2)
30158 .n(4)
30159 .k(8)
30160 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080030161 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030162 }
30163
30164 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_eq_8_subtile) {
30165 TEST_REQUIRES_X86_XOP;
Zhi An Ng83844ae2022-01-14 09:52:25 -080030166 for (uint32_t n = 1; n <= 4; n++) {
30167 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030168 GemmMicrokernelTester()
30169 .mr(2)
30170 .nr(4)
30171 .kr(8)
30172 .sr(1)
30173 .m(m)
30174 .n(n)
30175 .k(8)
30176 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080030177 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030178 }
30179 }
30180 }
30181
30182 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_eq_8_subtile_m) {
30183 TEST_REQUIRES_X86_XOP;
30184 for (uint32_t m = 1; m <= 2; m++) {
30185 GemmMicrokernelTester()
30186 .mr(2)
30187 .nr(4)
30188 .kr(8)
30189 .sr(1)
30190 .m(m)
30191 .n(4)
30192 .k(8)
30193 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080030194 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030195 }
30196 }
30197
30198 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_eq_8_subtile_n) {
30199 TEST_REQUIRES_X86_XOP;
30200 for (uint32_t n = 1; n <= 4; n++) {
30201 GemmMicrokernelTester()
30202 .mr(2)
30203 .nr(4)
30204 .kr(8)
30205 .sr(1)
30206 .m(2)
30207 .n(n)
30208 .k(8)
30209 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080030210 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030211 }
30212 }
30213
30214 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_lt_8) {
30215 TEST_REQUIRES_X86_XOP;
30216 for (size_t k = 1; k < 8; k++) {
30217 GemmMicrokernelTester()
30218 .mr(2)
30219 .nr(4)
30220 .kr(8)
30221 .sr(1)
30222 .m(2)
30223 .n(4)
30224 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080030225 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030226 }
30227 }
30228
30229 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_lt_8_strided_a) {
30230 TEST_REQUIRES_X86_XOP;
30231 for (size_t k = 1; k < 8; k++) {
30232 GemmMicrokernelTester()
30233 .mr(2)
30234 .nr(4)
30235 .kr(8)
30236 .sr(1)
30237 .m(2)
30238 .n(4)
30239 .k(k)
30240 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080030241 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030242 }
30243 }
30244
30245 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_lt_8_subtile) {
30246 TEST_REQUIRES_X86_XOP;
30247 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080030248 for (uint32_t n = 1; n <= 4; n++) {
30249 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030250 GemmMicrokernelTester()
30251 .mr(2)
30252 .nr(4)
30253 .kr(8)
30254 .sr(1)
30255 .m(m)
30256 .n(n)
30257 .k(k)
30258 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080030259 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030260 }
30261 }
30262 }
30263 }
30264
30265 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_gt_8) {
30266 TEST_REQUIRES_X86_XOP;
30267 for (size_t k = 9; k < 16; k++) {
30268 GemmMicrokernelTester()
30269 .mr(2)
30270 .nr(4)
30271 .kr(8)
30272 .sr(1)
30273 .m(2)
30274 .n(4)
30275 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080030276 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030277 }
30278 }
30279
30280 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_gt_8_strided_a) {
30281 TEST_REQUIRES_X86_XOP;
30282 for (size_t k = 9; k < 16; k++) {
30283 GemmMicrokernelTester()
30284 .mr(2)
30285 .nr(4)
30286 .kr(8)
30287 .sr(1)
30288 .m(2)
30289 .n(4)
30290 .k(k)
30291 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080030292 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030293 }
30294 }
30295
30296 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_gt_8_subtile) {
30297 TEST_REQUIRES_X86_XOP;
30298 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080030299 for (uint32_t n = 1; n <= 4; n++) {
30300 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030301 GemmMicrokernelTester()
30302 .mr(2)
30303 .nr(4)
30304 .kr(8)
30305 .sr(1)
30306 .m(m)
30307 .n(n)
30308 .k(k)
30309 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080030310 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030311 }
30312 }
30313 }
30314 }
30315
30316 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_div_8) {
30317 TEST_REQUIRES_X86_XOP;
30318 for (size_t k = 16; k <= 80; k += 8) {
30319 GemmMicrokernelTester()
30320 .mr(2)
30321 .nr(4)
30322 .kr(8)
30323 .sr(1)
30324 .m(2)
30325 .n(4)
30326 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080030327 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030328 }
30329 }
30330
30331 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_div_8_strided_a) {
30332 TEST_REQUIRES_X86_XOP;
30333 for (size_t k = 16; k <= 80; k += 8) {
30334 GemmMicrokernelTester()
30335 .mr(2)
30336 .nr(4)
30337 .kr(8)
30338 .sr(1)
30339 .m(2)
30340 .n(4)
30341 .k(k)
30342 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080030343 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030344 }
30345 }
30346
30347 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_div_8_subtile) {
30348 TEST_REQUIRES_X86_XOP;
30349 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080030350 for (uint32_t n = 1; n <= 4; n++) {
30351 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030352 GemmMicrokernelTester()
30353 .mr(2)
30354 .nr(4)
30355 .kr(8)
30356 .sr(1)
30357 .m(m)
30358 .n(n)
30359 .k(k)
30360 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080030361 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030362 }
30363 }
30364 }
30365 }
30366
30367 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, n_gt_4) {
30368 TEST_REQUIRES_X86_XOP;
30369 for (uint32_t n = 5; n < 8; n++) {
30370 for (size_t k = 1; k <= 40; k += 9) {
30371 GemmMicrokernelTester()
30372 .mr(2)
30373 .nr(4)
30374 .kr(8)
30375 .sr(1)
30376 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080030377 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030378 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080030379 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030380 }
30381 }
30382 }
30383
30384 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, n_gt_4_strided_cn) {
30385 TEST_REQUIRES_X86_XOP;
30386 for (uint32_t n = 5; n < 8; n++) {
30387 for (size_t k = 1; k <= 40; k += 9) {
30388 GemmMicrokernelTester()
30389 .mr(2)
30390 .nr(4)
30391 .kr(8)
30392 .sr(1)
30393 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080030394 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030395 .k(k)
30396 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080030397 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030398 }
30399 }
30400 }
30401
30402 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, n_gt_4_strided_a) {
30403 TEST_REQUIRES_X86_XOP;
30404 for (uint32_t n = 5; n < 8; n++) {
30405 for (size_t k = 1; k <= 40; k += 9) {
30406 GemmMicrokernelTester()
30407 .mr(2)
30408 .nr(4)
30409 .kr(8)
30410 .sr(1)
30411 .m(2)
30412 .n(n)
30413 .k(k)
30414 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080030415 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030416 }
30417 }
30418 }
30419
30420 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, n_gt_4_subtile) {
30421 TEST_REQUIRES_X86_XOP;
30422 for (uint32_t n = 5; n < 8; n++) {
30423 for (size_t k = 1; k <= 40; k += 9) {
30424 for (uint32_t m = 1; m <= 2; m++) {
30425 GemmMicrokernelTester()
30426 .mr(2)
30427 .nr(4)
30428 .kr(8)
30429 .sr(1)
30430 .m(m)
30431 .n(n)
30432 .k(k)
30433 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080030434 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030435 }
30436 }
30437 }
30438 }
30439
30440 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, n_div_4) {
30441 TEST_REQUIRES_X86_XOP;
30442 for (uint32_t n = 8; n <= 12; n += 4) {
30443 for (size_t k = 1; k <= 40; k += 9) {
30444 GemmMicrokernelTester()
30445 .mr(2)
30446 .nr(4)
30447 .kr(8)
30448 .sr(1)
30449 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080030450 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030451 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080030452 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030453 }
30454 }
30455 }
30456
30457 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, n_div_4_strided_cn) {
30458 TEST_REQUIRES_X86_XOP;
30459 for (uint32_t n = 8; n <= 12; n += 4) {
30460 for (size_t k = 1; k <= 40; k += 9) {
30461 GemmMicrokernelTester()
30462 .mr(2)
30463 .nr(4)
30464 .kr(8)
30465 .sr(1)
30466 .m(2)
30467 .n(n)
30468 .k(k)
30469 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080030470 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030471 }
30472 }
30473 }
30474
30475 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, n_div_4_strided_a) {
30476 TEST_REQUIRES_X86_XOP;
30477 for (uint32_t n = 8; n <= 12; n += 4) {
30478 for (size_t k = 1; k <= 40; k += 9) {
30479 GemmMicrokernelTester()
30480 .mr(2)
30481 .nr(4)
30482 .kr(8)
30483 .sr(1)
30484 .m(2)
30485 .n(n)
30486 .k(k)
30487 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080030488 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030489 }
30490 }
30491 }
30492
30493 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, n_div_4_subtile) {
30494 TEST_REQUIRES_X86_XOP;
30495 for (uint32_t n = 8; n <= 12; n += 4) {
30496 for (size_t k = 1; k <= 40; k += 9) {
30497 for (uint32_t m = 1; m <= 2; m++) {
30498 GemmMicrokernelTester()
30499 .mr(2)
30500 .nr(4)
30501 .kr(8)
30502 .sr(1)
30503 .m(m)
30504 .n(n)
30505 .k(k)
30506 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080030507 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030508 }
30509 }
30510 }
30511 }
30512
30513 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, strided_cm_subtile) {
30514 TEST_REQUIRES_X86_XOP;
30515 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080030516 for (uint32_t n = 1; n <= 4; n++) {
30517 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030518 GemmMicrokernelTester()
30519 .mr(2)
30520 .nr(4)
30521 .kr(8)
30522 .sr(1)
30523 .m(m)
30524 .n(n)
30525 .k(k)
30526 .cm_stride(7)
30527 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080030528 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030529 }
30530 }
30531 }
30532 }
30533
30534 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, qmin) {
30535 TEST_REQUIRES_X86_XOP;
30536 GemmMicrokernelTester()
30537 .mr(2)
30538 .nr(4)
30539 .kr(8)
30540 .sr(1)
30541 .m(2)
30542 .n(4)
30543 .k(8)
30544 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080030545 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030546 }
30547
30548 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, qmax) {
30549 TEST_REQUIRES_X86_XOP;
30550 GemmMicrokernelTester()
30551 .mr(2)
30552 .nr(4)
30553 .kr(8)
30554 .sr(1)
30555 .m(2)
30556 .n(4)
30557 .k(8)
30558 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080030559 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030560 }
30561
30562 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, strided_cm) {
30563 TEST_REQUIRES_X86_XOP;
30564 GemmMicrokernelTester()
30565 .mr(2)
30566 .nr(4)
30567 .kr(8)
30568 .sr(1)
30569 .m(2)
30570 .n(4)
30571 .k(8)
30572 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080030573 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030574 }
30575#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
30576
30577
30578#if XNN_ARCH_X86 || XNN_ARCH_X86_64
30579 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_eq_8) {
30580 TEST_REQUIRES_X86_SSE2;
30581 GemmMicrokernelTester()
30582 .mr(1)
30583 .nr(4)
30584 .kr(8)
30585 .sr(1)
30586 .m(1)
30587 .n(4)
30588 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080030589 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030590 }
30591
30592 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, strided_cn) {
30593 TEST_REQUIRES_X86_SSE2;
30594 GemmMicrokernelTester()
30595 .mr(1)
30596 .nr(4)
30597 .kr(8)
30598 .sr(1)
30599 .m(1)
30600 .n(4)
30601 .k(8)
30602 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080030603 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030604 }
30605
30606 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_eq_8_strided_a) {
30607 TEST_REQUIRES_X86_SSE2;
30608 GemmMicrokernelTester()
30609 .mr(1)
30610 .nr(4)
30611 .kr(8)
30612 .sr(1)
30613 .m(1)
30614 .n(4)
30615 .k(8)
30616 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080030617 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030618 }
30619
30620 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_eq_8_subtile) {
30621 TEST_REQUIRES_X86_SSE2;
Zhi An Ng83844ae2022-01-14 09:52:25 -080030622 for (uint32_t n = 1; n <= 4; n++) {
30623 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030624 GemmMicrokernelTester()
30625 .mr(1)
30626 .nr(4)
30627 .kr(8)
30628 .sr(1)
30629 .m(m)
30630 .n(n)
30631 .k(8)
30632 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080030633 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030634 }
30635 }
30636 }
30637
30638 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_eq_8_subtile_m) {
30639 TEST_REQUIRES_X86_SSE2;
30640 for (uint32_t m = 1; m <= 1; m++) {
30641 GemmMicrokernelTester()
30642 .mr(1)
30643 .nr(4)
30644 .kr(8)
30645 .sr(1)
30646 .m(m)
30647 .n(4)
30648 .k(8)
30649 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080030650 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030651 }
30652 }
30653
30654 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_eq_8_subtile_n) {
30655 TEST_REQUIRES_X86_SSE2;
30656 for (uint32_t n = 1; n <= 4; n++) {
30657 GemmMicrokernelTester()
30658 .mr(1)
30659 .nr(4)
30660 .kr(8)
30661 .sr(1)
30662 .m(1)
30663 .n(n)
30664 .k(8)
30665 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080030666 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030667 }
30668 }
30669
30670 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_lt_8) {
30671 TEST_REQUIRES_X86_SSE2;
30672 for (size_t k = 1; k < 8; k++) {
30673 GemmMicrokernelTester()
30674 .mr(1)
30675 .nr(4)
30676 .kr(8)
30677 .sr(1)
30678 .m(1)
30679 .n(4)
30680 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080030681 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030682 }
30683 }
30684
30685 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_lt_8_strided_a) {
30686 TEST_REQUIRES_X86_SSE2;
30687 for (size_t k = 1; k < 8; k++) {
30688 GemmMicrokernelTester()
30689 .mr(1)
30690 .nr(4)
30691 .kr(8)
30692 .sr(1)
30693 .m(1)
30694 .n(4)
30695 .k(k)
30696 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080030697 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030698 }
30699 }
30700
30701 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_lt_8_subtile) {
30702 TEST_REQUIRES_X86_SSE2;
30703 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080030704 for (uint32_t n = 1; n <= 4; n++) {
30705 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030706 GemmMicrokernelTester()
30707 .mr(1)
30708 .nr(4)
30709 .kr(8)
30710 .sr(1)
30711 .m(m)
30712 .n(n)
30713 .k(k)
30714 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080030715 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030716 }
30717 }
30718 }
30719 }
30720
30721 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_gt_8) {
30722 TEST_REQUIRES_X86_SSE2;
30723 for (size_t k = 9; k < 16; k++) {
30724 GemmMicrokernelTester()
30725 .mr(1)
30726 .nr(4)
30727 .kr(8)
30728 .sr(1)
30729 .m(1)
30730 .n(4)
30731 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080030732 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030733 }
30734 }
30735
30736 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_gt_8_strided_a) {
30737 TEST_REQUIRES_X86_SSE2;
30738 for (size_t k = 9; k < 16; k++) {
30739 GemmMicrokernelTester()
30740 .mr(1)
30741 .nr(4)
30742 .kr(8)
30743 .sr(1)
30744 .m(1)
30745 .n(4)
30746 .k(k)
30747 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080030748 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030749 }
30750 }
30751
30752 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_gt_8_subtile) {
30753 TEST_REQUIRES_X86_SSE2;
30754 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080030755 for (uint32_t n = 1; n <= 4; n++) {
30756 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030757 GemmMicrokernelTester()
30758 .mr(1)
30759 .nr(4)
30760 .kr(8)
30761 .sr(1)
30762 .m(m)
30763 .n(n)
30764 .k(k)
30765 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080030766 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030767 }
30768 }
30769 }
30770 }
30771
30772 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_div_8) {
30773 TEST_REQUIRES_X86_SSE2;
30774 for (size_t k = 16; k <= 80; k += 8) {
30775 GemmMicrokernelTester()
30776 .mr(1)
30777 .nr(4)
30778 .kr(8)
30779 .sr(1)
30780 .m(1)
30781 .n(4)
30782 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080030783 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030784 }
30785 }
30786
30787 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_div_8_strided_a) {
30788 TEST_REQUIRES_X86_SSE2;
30789 for (size_t k = 16; k <= 80; k += 8) {
30790 GemmMicrokernelTester()
30791 .mr(1)
30792 .nr(4)
30793 .kr(8)
30794 .sr(1)
30795 .m(1)
30796 .n(4)
30797 .k(k)
30798 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080030799 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030800 }
30801 }
30802
30803 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_div_8_subtile) {
30804 TEST_REQUIRES_X86_SSE2;
30805 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080030806 for (uint32_t n = 1; n <= 4; n++) {
30807 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030808 GemmMicrokernelTester()
30809 .mr(1)
30810 .nr(4)
30811 .kr(8)
30812 .sr(1)
30813 .m(m)
30814 .n(n)
30815 .k(k)
30816 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080030817 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030818 }
30819 }
30820 }
30821 }
30822
30823 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, n_gt_4) {
30824 TEST_REQUIRES_X86_SSE2;
30825 for (uint32_t n = 5; n < 8; n++) {
30826 for (size_t k = 1; k <= 40; k += 9) {
30827 GemmMicrokernelTester()
30828 .mr(1)
30829 .nr(4)
30830 .kr(8)
30831 .sr(1)
30832 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080030833 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030834 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080030835 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030836 }
30837 }
30838 }
30839
30840 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, n_gt_4_strided_cn) {
30841 TEST_REQUIRES_X86_SSE2;
30842 for (uint32_t n = 5; n < 8; n++) {
30843 for (size_t k = 1; k <= 40; k += 9) {
30844 GemmMicrokernelTester()
30845 .mr(1)
30846 .nr(4)
30847 .kr(8)
30848 .sr(1)
30849 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080030850 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030851 .k(k)
30852 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080030853 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030854 }
30855 }
30856 }
30857
30858 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, n_gt_4_strided_a) {
30859 TEST_REQUIRES_X86_SSE2;
30860 for (uint32_t n = 5; n < 8; n++) {
30861 for (size_t k = 1; k <= 40; k += 9) {
30862 GemmMicrokernelTester()
30863 .mr(1)
30864 .nr(4)
30865 .kr(8)
30866 .sr(1)
30867 .m(1)
30868 .n(n)
30869 .k(k)
30870 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080030871 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030872 }
30873 }
30874 }
30875
30876 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, n_gt_4_subtile) {
30877 TEST_REQUIRES_X86_SSE2;
30878 for (uint32_t n = 5; n < 8; n++) {
30879 for (size_t k = 1; k <= 40; k += 9) {
30880 for (uint32_t m = 1; m <= 1; m++) {
30881 GemmMicrokernelTester()
30882 .mr(1)
30883 .nr(4)
30884 .kr(8)
30885 .sr(1)
30886 .m(m)
30887 .n(n)
30888 .k(k)
30889 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080030890 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030891 }
30892 }
30893 }
30894 }
30895
30896 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, n_div_4) {
30897 TEST_REQUIRES_X86_SSE2;
30898 for (uint32_t n = 8; n <= 12; n += 4) {
30899 for (size_t k = 1; k <= 40; k += 9) {
30900 GemmMicrokernelTester()
30901 .mr(1)
30902 .nr(4)
30903 .kr(8)
30904 .sr(1)
30905 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080030906 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030907 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080030908 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030909 }
30910 }
30911 }
30912
30913 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, n_div_4_strided_cn) {
30914 TEST_REQUIRES_X86_SSE2;
30915 for (uint32_t n = 8; n <= 12; n += 4) {
30916 for (size_t k = 1; k <= 40; k += 9) {
30917 GemmMicrokernelTester()
30918 .mr(1)
30919 .nr(4)
30920 .kr(8)
30921 .sr(1)
30922 .m(1)
30923 .n(n)
30924 .k(k)
30925 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080030926 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030927 }
30928 }
30929 }
30930
30931 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, n_div_4_strided_a) {
30932 TEST_REQUIRES_X86_SSE2;
30933 for (uint32_t n = 8; n <= 12; n += 4) {
30934 for (size_t k = 1; k <= 40; k += 9) {
30935 GemmMicrokernelTester()
30936 .mr(1)
30937 .nr(4)
30938 .kr(8)
30939 .sr(1)
30940 .m(1)
30941 .n(n)
30942 .k(k)
30943 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080030944 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030945 }
30946 }
30947 }
30948
30949 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, n_div_4_subtile) {
30950 TEST_REQUIRES_X86_SSE2;
30951 for (uint32_t n = 8; n <= 12; n += 4) {
30952 for (size_t k = 1; k <= 40; k += 9) {
30953 for (uint32_t m = 1; m <= 1; m++) {
30954 GemmMicrokernelTester()
30955 .mr(1)
30956 .nr(4)
30957 .kr(8)
30958 .sr(1)
30959 .m(m)
30960 .n(n)
30961 .k(k)
30962 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080030963 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030964 }
30965 }
30966 }
30967 }
30968
30969 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, strided_cm_subtile) {
30970 TEST_REQUIRES_X86_SSE2;
30971 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080030972 for (uint32_t n = 1; n <= 4; n++) {
30973 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030974 GemmMicrokernelTester()
30975 .mr(1)
30976 .nr(4)
30977 .kr(8)
30978 .sr(1)
30979 .m(m)
30980 .n(n)
30981 .k(k)
30982 .cm_stride(7)
30983 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080030984 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030985 }
30986 }
30987 }
30988 }
30989
30990 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, qmin) {
30991 TEST_REQUIRES_X86_SSE2;
30992 GemmMicrokernelTester()
30993 .mr(1)
30994 .nr(4)
30995 .kr(8)
30996 .sr(1)
30997 .m(1)
30998 .n(4)
30999 .k(8)
31000 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080031001 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031002 }
31003
31004 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, qmax) {
31005 TEST_REQUIRES_X86_SSE2;
31006 GemmMicrokernelTester()
31007 .mr(1)
31008 .nr(4)
31009 .kr(8)
31010 .sr(1)
31011 .m(1)
31012 .n(4)
31013 .k(8)
31014 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080031015 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031016 }
31017
31018 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, strided_cm) {
31019 TEST_REQUIRES_X86_SSE2;
31020 GemmMicrokernelTester()
31021 .mr(1)
31022 .nr(4)
31023 .kr(8)
31024 .sr(1)
31025 .m(1)
31026 .n(4)
31027 .k(8)
31028 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080031029 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031030 }
31031#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
31032
31033
31034#if XNN_ARCH_X86 || XNN_ARCH_X86_64
31035 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_eq_8) {
31036 TEST_REQUIRES_X86_SSE2;
31037 GemmMicrokernelTester()
31038 .mr(3)
31039 .nr(4)
31040 .kr(8)
31041 .sr(1)
31042 .m(3)
31043 .n(4)
31044 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080031045 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031046 }
31047
31048 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, strided_cn) {
31049 TEST_REQUIRES_X86_SSE2;
31050 GemmMicrokernelTester()
31051 .mr(3)
31052 .nr(4)
31053 .kr(8)
31054 .sr(1)
31055 .m(3)
31056 .n(4)
31057 .k(8)
31058 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080031059 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031060 }
31061
31062 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_eq_8_strided_a) {
31063 TEST_REQUIRES_X86_SSE2;
31064 GemmMicrokernelTester()
31065 .mr(3)
31066 .nr(4)
31067 .kr(8)
31068 .sr(1)
31069 .m(3)
31070 .n(4)
31071 .k(8)
31072 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080031073 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031074 }
31075
31076 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_eq_8_subtile) {
31077 TEST_REQUIRES_X86_SSE2;
Zhi An Ng83844ae2022-01-14 09:52:25 -080031078 for (uint32_t n = 1; n <= 4; n++) {
31079 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031080 GemmMicrokernelTester()
31081 .mr(3)
31082 .nr(4)
31083 .kr(8)
31084 .sr(1)
31085 .m(m)
31086 .n(n)
31087 .k(8)
31088 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080031089 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031090 }
31091 }
31092 }
31093
31094 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_eq_8_subtile_m) {
31095 TEST_REQUIRES_X86_SSE2;
31096 for (uint32_t m = 1; m <= 3; m++) {
31097 GemmMicrokernelTester()
31098 .mr(3)
31099 .nr(4)
31100 .kr(8)
31101 .sr(1)
31102 .m(m)
31103 .n(4)
31104 .k(8)
31105 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080031106 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031107 }
31108 }
31109
31110 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_eq_8_subtile_n) {
31111 TEST_REQUIRES_X86_SSE2;
31112 for (uint32_t n = 1; n <= 4; n++) {
31113 GemmMicrokernelTester()
31114 .mr(3)
31115 .nr(4)
31116 .kr(8)
31117 .sr(1)
31118 .m(3)
31119 .n(n)
31120 .k(8)
31121 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080031122 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031123 }
31124 }
31125
31126 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_lt_8) {
31127 TEST_REQUIRES_X86_SSE2;
31128 for (size_t k = 1; k < 8; k++) {
31129 GemmMicrokernelTester()
31130 .mr(3)
31131 .nr(4)
31132 .kr(8)
31133 .sr(1)
31134 .m(3)
31135 .n(4)
31136 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080031137 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031138 }
31139 }
31140
31141 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_lt_8_strided_a) {
31142 TEST_REQUIRES_X86_SSE2;
31143 for (size_t k = 1; k < 8; k++) {
31144 GemmMicrokernelTester()
31145 .mr(3)
31146 .nr(4)
31147 .kr(8)
31148 .sr(1)
31149 .m(3)
31150 .n(4)
31151 .k(k)
31152 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080031153 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031154 }
31155 }
31156
31157 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_lt_8_subtile) {
31158 TEST_REQUIRES_X86_SSE2;
31159 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080031160 for (uint32_t n = 1; n <= 4; n++) {
31161 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031162 GemmMicrokernelTester()
31163 .mr(3)
31164 .nr(4)
31165 .kr(8)
31166 .sr(1)
31167 .m(m)
31168 .n(n)
31169 .k(k)
31170 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080031171 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031172 }
31173 }
31174 }
31175 }
31176
31177 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_gt_8) {
31178 TEST_REQUIRES_X86_SSE2;
31179 for (size_t k = 9; k < 16; k++) {
31180 GemmMicrokernelTester()
31181 .mr(3)
31182 .nr(4)
31183 .kr(8)
31184 .sr(1)
31185 .m(3)
31186 .n(4)
31187 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080031188 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031189 }
31190 }
31191
31192 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_gt_8_strided_a) {
31193 TEST_REQUIRES_X86_SSE2;
31194 for (size_t k = 9; k < 16; k++) {
31195 GemmMicrokernelTester()
31196 .mr(3)
31197 .nr(4)
31198 .kr(8)
31199 .sr(1)
31200 .m(3)
31201 .n(4)
31202 .k(k)
31203 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080031204 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031205 }
31206 }
31207
31208 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_gt_8_subtile) {
31209 TEST_REQUIRES_X86_SSE2;
31210 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080031211 for (uint32_t n = 1; n <= 4; n++) {
31212 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031213 GemmMicrokernelTester()
31214 .mr(3)
31215 .nr(4)
31216 .kr(8)
31217 .sr(1)
31218 .m(m)
31219 .n(n)
31220 .k(k)
31221 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080031222 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031223 }
31224 }
31225 }
31226 }
31227
31228 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_div_8) {
31229 TEST_REQUIRES_X86_SSE2;
31230 for (size_t k = 16; k <= 80; k += 8) {
31231 GemmMicrokernelTester()
31232 .mr(3)
31233 .nr(4)
31234 .kr(8)
31235 .sr(1)
31236 .m(3)
31237 .n(4)
31238 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080031239 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031240 }
31241 }
31242
31243 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_div_8_strided_a) {
31244 TEST_REQUIRES_X86_SSE2;
31245 for (size_t k = 16; k <= 80; k += 8) {
31246 GemmMicrokernelTester()
31247 .mr(3)
31248 .nr(4)
31249 .kr(8)
31250 .sr(1)
31251 .m(3)
31252 .n(4)
31253 .k(k)
31254 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080031255 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031256 }
31257 }
31258
31259 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_div_8_subtile) {
31260 TEST_REQUIRES_X86_SSE2;
31261 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080031262 for (uint32_t n = 1; n <= 4; n++) {
31263 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031264 GemmMicrokernelTester()
31265 .mr(3)
31266 .nr(4)
31267 .kr(8)
31268 .sr(1)
31269 .m(m)
31270 .n(n)
31271 .k(k)
31272 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080031273 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031274 }
31275 }
31276 }
31277 }
31278
31279 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, n_gt_4) {
31280 TEST_REQUIRES_X86_SSE2;
31281 for (uint32_t n = 5; n < 8; n++) {
31282 for (size_t k = 1; k <= 40; k += 9) {
31283 GemmMicrokernelTester()
31284 .mr(3)
31285 .nr(4)
31286 .kr(8)
31287 .sr(1)
31288 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080031289 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031290 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080031291 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031292 }
31293 }
31294 }
31295
31296 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, n_gt_4_strided_cn) {
31297 TEST_REQUIRES_X86_SSE2;
31298 for (uint32_t n = 5; n < 8; n++) {
31299 for (size_t k = 1; k <= 40; k += 9) {
31300 GemmMicrokernelTester()
31301 .mr(3)
31302 .nr(4)
31303 .kr(8)
31304 .sr(1)
31305 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080031306 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031307 .k(k)
31308 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080031309 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031310 }
31311 }
31312 }
31313
31314 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, n_gt_4_strided_a) {
31315 TEST_REQUIRES_X86_SSE2;
31316 for (uint32_t n = 5; n < 8; n++) {
31317 for (size_t k = 1; k <= 40; k += 9) {
31318 GemmMicrokernelTester()
31319 .mr(3)
31320 .nr(4)
31321 .kr(8)
31322 .sr(1)
31323 .m(3)
31324 .n(n)
31325 .k(k)
31326 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080031327 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031328 }
31329 }
31330 }
31331
31332 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, n_gt_4_subtile) {
31333 TEST_REQUIRES_X86_SSE2;
31334 for (uint32_t n = 5; n < 8; n++) {
31335 for (size_t k = 1; k <= 40; k += 9) {
31336 for (uint32_t m = 1; m <= 3; m++) {
31337 GemmMicrokernelTester()
31338 .mr(3)
31339 .nr(4)
31340 .kr(8)
31341 .sr(1)
31342 .m(m)
31343 .n(n)
31344 .k(k)
31345 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080031346 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031347 }
31348 }
31349 }
31350 }
31351
31352 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, n_div_4) {
31353 TEST_REQUIRES_X86_SSE2;
31354 for (uint32_t n = 8; n <= 12; n += 4) {
31355 for (size_t k = 1; k <= 40; k += 9) {
31356 GemmMicrokernelTester()
31357 .mr(3)
31358 .nr(4)
31359 .kr(8)
31360 .sr(1)
31361 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080031362 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031363 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080031364 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031365 }
31366 }
31367 }
31368
31369 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, n_div_4_strided_cn) {
31370 TEST_REQUIRES_X86_SSE2;
31371 for (uint32_t n = 8; n <= 12; n += 4) {
31372 for (size_t k = 1; k <= 40; k += 9) {
31373 GemmMicrokernelTester()
31374 .mr(3)
31375 .nr(4)
31376 .kr(8)
31377 .sr(1)
31378 .m(3)
31379 .n(n)
31380 .k(k)
31381 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080031382 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031383 }
31384 }
31385 }
31386
31387 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, n_div_4_strided_a) {
31388 TEST_REQUIRES_X86_SSE2;
31389 for (uint32_t n = 8; n <= 12; n += 4) {
31390 for (size_t k = 1; k <= 40; k += 9) {
31391 GemmMicrokernelTester()
31392 .mr(3)
31393 .nr(4)
31394 .kr(8)
31395 .sr(1)
31396 .m(3)
31397 .n(n)
31398 .k(k)
31399 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080031400 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031401 }
31402 }
31403 }
31404
31405 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, n_div_4_subtile) {
31406 TEST_REQUIRES_X86_SSE2;
31407 for (uint32_t n = 8; n <= 12; n += 4) {
31408 for (size_t k = 1; k <= 40; k += 9) {
31409 for (uint32_t m = 1; m <= 3; m++) {
31410 GemmMicrokernelTester()
31411 .mr(3)
31412 .nr(4)
31413 .kr(8)
31414 .sr(1)
31415 .m(m)
31416 .n(n)
31417 .k(k)
31418 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080031419 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031420 }
31421 }
31422 }
31423 }
31424
31425 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, strided_cm_subtile) {
31426 TEST_REQUIRES_X86_SSE2;
31427 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080031428 for (uint32_t n = 1; n <= 4; n++) {
31429 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031430 GemmMicrokernelTester()
31431 .mr(3)
31432 .nr(4)
31433 .kr(8)
31434 .sr(1)
31435 .m(m)
31436 .n(n)
31437 .k(k)
31438 .cm_stride(7)
31439 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080031440 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031441 }
31442 }
31443 }
31444 }
31445
31446 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, qmin) {
31447 TEST_REQUIRES_X86_SSE2;
31448 GemmMicrokernelTester()
31449 .mr(3)
31450 .nr(4)
31451 .kr(8)
31452 .sr(1)
31453 .m(3)
31454 .n(4)
31455 .k(8)
31456 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080031457 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031458 }
31459
31460 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, qmax) {
31461 TEST_REQUIRES_X86_SSE2;
31462 GemmMicrokernelTester()
31463 .mr(3)
31464 .nr(4)
31465 .kr(8)
31466 .sr(1)
31467 .m(3)
31468 .n(4)
31469 .k(8)
31470 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080031471 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031472 }
31473
31474 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, strided_cm) {
31475 TEST_REQUIRES_X86_SSE2;
31476 GemmMicrokernelTester()
31477 .mr(3)
31478 .nr(4)
31479 .kr(8)
31480 .sr(1)
31481 .m(3)
31482 .n(4)
31483 .k(8)
31484 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080031485 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031486 }
31487#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
31488
31489
31490#if XNN_ARCH_X86 || XNN_ARCH_X86_64
31491 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_eq_8) {
31492 TEST_REQUIRES_X86_SSE41;
31493 GemmMicrokernelTester()
31494 .mr(1)
31495 .nr(4)
31496 .kr(8)
31497 .sr(1)
31498 .m(1)
31499 .n(4)
31500 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080031501 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031502 }
31503
31504 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, strided_cn) {
31505 TEST_REQUIRES_X86_SSE41;
31506 GemmMicrokernelTester()
31507 .mr(1)
31508 .nr(4)
31509 .kr(8)
31510 .sr(1)
31511 .m(1)
31512 .n(4)
31513 .k(8)
31514 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080031515 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031516 }
31517
31518 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_eq_8_strided_a) {
31519 TEST_REQUIRES_X86_SSE41;
31520 GemmMicrokernelTester()
31521 .mr(1)
31522 .nr(4)
31523 .kr(8)
31524 .sr(1)
31525 .m(1)
31526 .n(4)
31527 .k(8)
31528 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080031529 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031530 }
31531
31532 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_eq_8_subtile) {
31533 TEST_REQUIRES_X86_SSE41;
Zhi An Ng83844ae2022-01-14 09:52:25 -080031534 for (uint32_t n = 1; n <= 4; n++) {
31535 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031536 GemmMicrokernelTester()
31537 .mr(1)
31538 .nr(4)
31539 .kr(8)
31540 .sr(1)
31541 .m(m)
31542 .n(n)
31543 .k(8)
31544 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080031545 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031546 }
31547 }
31548 }
31549
31550 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_eq_8_subtile_m) {
31551 TEST_REQUIRES_X86_SSE41;
31552 for (uint32_t m = 1; m <= 1; m++) {
31553 GemmMicrokernelTester()
31554 .mr(1)
31555 .nr(4)
31556 .kr(8)
31557 .sr(1)
31558 .m(m)
31559 .n(4)
31560 .k(8)
31561 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080031562 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031563 }
31564 }
31565
31566 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_eq_8_subtile_n) {
31567 TEST_REQUIRES_X86_SSE41;
31568 for (uint32_t n = 1; n <= 4; n++) {
31569 GemmMicrokernelTester()
31570 .mr(1)
31571 .nr(4)
31572 .kr(8)
31573 .sr(1)
31574 .m(1)
31575 .n(n)
31576 .k(8)
31577 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080031578 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031579 }
31580 }
31581
31582 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_lt_8) {
31583 TEST_REQUIRES_X86_SSE41;
31584 for (size_t k = 1; k < 8; k++) {
31585 GemmMicrokernelTester()
31586 .mr(1)
31587 .nr(4)
31588 .kr(8)
31589 .sr(1)
31590 .m(1)
31591 .n(4)
31592 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080031593 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031594 }
31595 }
31596
31597 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_lt_8_strided_a) {
31598 TEST_REQUIRES_X86_SSE41;
31599 for (size_t k = 1; k < 8; k++) {
31600 GemmMicrokernelTester()
31601 .mr(1)
31602 .nr(4)
31603 .kr(8)
31604 .sr(1)
31605 .m(1)
31606 .n(4)
31607 .k(k)
31608 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080031609 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031610 }
31611 }
31612
31613 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_lt_8_subtile) {
31614 TEST_REQUIRES_X86_SSE41;
31615 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080031616 for (uint32_t n = 1; n <= 4; n++) {
31617 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031618 GemmMicrokernelTester()
31619 .mr(1)
31620 .nr(4)
31621 .kr(8)
31622 .sr(1)
31623 .m(m)
31624 .n(n)
31625 .k(k)
31626 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080031627 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031628 }
31629 }
31630 }
31631 }
31632
31633 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_gt_8) {
31634 TEST_REQUIRES_X86_SSE41;
31635 for (size_t k = 9; k < 16; k++) {
31636 GemmMicrokernelTester()
31637 .mr(1)
31638 .nr(4)
31639 .kr(8)
31640 .sr(1)
31641 .m(1)
31642 .n(4)
31643 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080031644 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031645 }
31646 }
31647
31648 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_gt_8_strided_a) {
31649 TEST_REQUIRES_X86_SSE41;
31650 for (size_t k = 9; k < 16; k++) {
31651 GemmMicrokernelTester()
31652 .mr(1)
31653 .nr(4)
31654 .kr(8)
31655 .sr(1)
31656 .m(1)
31657 .n(4)
31658 .k(k)
31659 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080031660 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031661 }
31662 }
31663
31664 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_gt_8_subtile) {
31665 TEST_REQUIRES_X86_SSE41;
31666 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080031667 for (uint32_t n = 1; n <= 4; n++) {
31668 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031669 GemmMicrokernelTester()
31670 .mr(1)
31671 .nr(4)
31672 .kr(8)
31673 .sr(1)
31674 .m(m)
31675 .n(n)
31676 .k(k)
31677 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080031678 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031679 }
31680 }
31681 }
31682 }
31683
31684 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_div_8) {
31685 TEST_REQUIRES_X86_SSE41;
31686 for (size_t k = 16; k <= 80; k += 8) {
31687 GemmMicrokernelTester()
31688 .mr(1)
31689 .nr(4)
31690 .kr(8)
31691 .sr(1)
31692 .m(1)
31693 .n(4)
31694 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080031695 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031696 }
31697 }
31698
31699 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_div_8_strided_a) {
31700 TEST_REQUIRES_X86_SSE41;
31701 for (size_t k = 16; k <= 80; k += 8) {
31702 GemmMicrokernelTester()
31703 .mr(1)
31704 .nr(4)
31705 .kr(8)
31706 .sr(1)
31707 .m(1)
31708 .n(4)
31709 .k(k)
31710 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080031711 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031712 }
31713 }
31714
31715 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_div_8_subtile) {
31716 TEST_REQUIRES_X86_SSE41;
31717 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080031718 for (uint32_t n = 1; n <= 4; n++) {
31719 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031720 GemmMicrokernelTester()
31721 .mr(1)
31722 .nr(4)
31723 .kr(8)
31724 .sr(1)
31725 .m(m)
31726 .n(n)
31727 .k(k)
31728 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080031729 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031730 }
31731 }
31732 }
31733 }
31734
31735 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, n_gt_4) {
31736 TEST_REQUIRES_X86_SSE41;
31737 for (uint32_t n = 5; n < 8; n++) {
31738 for (size_t k = 1; k <= 40; k += 9) {
31739 GemmMicrokernelTester()
31740 .mr(1)
31741 .nr(4)
31742 .kr(8)
31743 .sr(1)
31744 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080031745 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031746 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080031747 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031748 }
31749 }
31750 }
31751
31752 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, n_gt_4_strided_cn) {
31753 TEST_REQUIRES_X86_SSE41;
31754 for (uint32_t n = 5; n < 8; n++) {
31755 for (size_t k = 1; k <= 40; k += 9) {
31756 GemmMicrokernelTester()
31757 .mr(1)
31758 .nr(4)
31759 .kr(8)
31760 .sr(1)
31761 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080031762 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031763 .k(k)
31764 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080031765 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031766 }
31767 }
31768 }
31769
31770 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, n_gt_4_strided_a) {
31771 TEST_REQUIRES_X86_SSE41;
31772 for (uint32_t n = 5; n < 8; n++) {
31773 for (size_t k = 1; k <= 40; k += 9) {
31774 GemmMicrokernelTester()
31775 .mr(1)
31776 .nr(4)
31777 .kr(8)
31778 .sr(1)
31779 .m(1)
31780 .n(n)
31781 .k(k)
31782 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080031783 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031784 }
31785 }
31786 }
31787
31788 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, n_gt_4_subtile) {
31789 TEST_REQUIRES_X86_SSE41;
31790 for (uint32_t n = 5; n < 8; n++) {
31791 for (size_t k = 1; k <= 40; k += 9) {
31792 for (uint32_t m = 1; m <= 1; m++) {
31793 GemmMicrokernelTester()
31794 .mr(1)
31795 .nr(4)
31796 .kr(8)
31797 .sr(1)
31798 .m(m)
31799 .n(n)
31800 .k(k)
31801 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080031802 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031803 }
31804 }
31805 }
31806 }
31807
31808 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, n_div_4) {
31809 TEST_REQUIRES_X86_SSE41;
31810 for (uint32_t n = 8; n <= 12; n += 4) {
31811 for (size_t k = 1; k <= 40; k += 9) {
31812 GemmMicrokernelTester()
31813 .mr(1)
31814 .nr(4)
31815 .kr(8)
31816 .sr(1)
31817 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080031818 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031819 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080031820 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031821 }
31822 }
31823 }
31824
31825 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, n_div_4_strided_cn) {
31826 TEST_REQUIRES_X86_SSE41;
31827 for (uint32_t n = 8; n <= 12; n += 4) {
31828 for (size_t k = 1; k <= 40; k += 9) {
31829 GemmMicrokernelTester()
31830 .mr(1)
31831 .nr(4)
31832 .kr(8)
31833 .sr(1)
31834 .m(1)
31835 .n(n)
31836 .k(k)
31837 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080031838 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031839 }
31840 }
31841 }
31842
31843 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, n_div_4_strided_a) {
31844 TEST_REQUIRES_X86_SSE41;
31845 for (uint32_t n = 8; n <= 12; n += 4) {
31846 for (size_t k = 1; k <= 40; k += 9) {
31847 GemmMicrokernelTester()
31848 .mr(1)
31849 .nr(4)
31850 .kr(8)
31851 .sr(1)
31852 .m(1)
31853 .n(n)
31854 .k(k)
31855 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080031856 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031857 }
31858 }
31859 }
31860
31861 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, n_div_4_subtile) {
31862 TEST_REQUIRES_X86_SSE41;
31863 for (uint32_t n = 8; n <= 12; n += 4) {
31864 for (size_t k = 1; k <= 40; k += 9) {
31865 for (uint32_t m = 1; m <= 1; m++) {
31866 GemmMicrokernelTester()
31867 .mr(1)
31868 .nr(4)
31869 .kr(8)
31870 .sr(1)
31871 .m(m)
31872 .n(n)
31873 .k(k)
31874 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080031875 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031876 }
31877 }
31878 }
31879 }
31880
31881 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, strided_cm_subtile) {
31882 TEST_REQUIRES_X86_SSE41;
31883 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080031884 for (uint32_t n = 1; n <= 4; n++) {
31885 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031886 GemmMicrokernelTester()
31887 .mr(1)
31888 .nr(4)
31889 .kr(8)
31890 .sr(1)
31891 .m(m)
31892 .n(n)
31893 .k(k)
31894 .cm_stride(7)
31895 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080031896 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031897 }
31898 }
31899 }
31900 }
31901
31902 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, qmin) {
31903 TEST_REQUIRES_X86_SSE41;
31904 GemmMicrokernelTester()
31905 .mr(1)
31906 .nr(4)
31907 .kr(8)
31908 .sr(1)
31909 .m(1)
31910 .n(4)
31911 .k(8)
31912 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080031913 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031914 }
31915
31916 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, qmax) {
31917 TEST_REQUIRES_X86_SSE41;
31918 GemmMicrokernelTester()
31919 .mr(1)
31920 .nr(4)
31921 .kr(8)
31922 .sr(1)
31923 .m(1)
31924 .n(4)
31925 .k(8)
31926 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080031927 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031928 }
31929
31930 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, strided_cm) {
31931 TEST_REQUIRES_X86_SSE41;
31932 GemmMicrokernelTester()
31933 .mr(1)
31934 .nr(4)
31935 .kr(8)
31936 .sr(1)
31937 .m(1)
31938 .n(4)
31939 .k(8)
31940 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080031941 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031942 }
31943#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
31944
31945
31946#if XNN_ARCH_X86 || XNN_ARCH_X86_64
31947 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_eq_8) {
31948 TEST_REQUIRES_X86_AVX;
31949 GemmMicrokernelTester()
31950 .mr(1)
31951 .nr(4)
31952 .kr(8)
31953 .sr(1)
31954 .m(1)
31955 .n(4)
31956 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080031957 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031958 }
31959
31960 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, strided_cn) {
31961 TEST_REQUIRES_X86_AVX;
31962 GemmMicrokernelTester()
31963 .mr(1)
31964 .nr(4)
31965 .kr(8)
31966 .sr(1)
31967 .m(1)
31968 .n(4)
31969 .k(8)
31970 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080031971 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031972 }
31973
31974 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_eq_8_strided_a) {
31975 TEST_REQUIRES_X86_AVX;
31976 GemmMicrokernelTester()
31977 .mr(1)
31978 .nr(4)
31979 .kr(8)
31980 .sr(1)
31981 .m(1)
31982 .n(4)
31983 .k(8)
31984 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080031985 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031986 }
31987
31988 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_eq_8_subtile) {
31989 TEST_REQUIRES_X86_AVX;
Zhi An Ng83844ae2022-01-14 09:52:25 -080031990 for (uint32_t n = 1; n <= 4; n++) {
31991 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031992 GemmMicrokernelTester()
31993 .mr(1)
31994 .nr(4)
31995 .kr(8)
31996 .sr(1)
31997 .m(m)
31998 .n(n)
31999 .k(8)
32000 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080032001 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032002 }
32003 }
32004 }
32005
32006 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_eq_8_subtile_m) {
32007 TEST_REQUIRES_X86_AVX;
32008 for (uint32_t m = 1; m <= 1; m++) {
32009 GemmMicrokernelTester()
32010 .mr(1)
32011 .nr(4)
32012 .kr(8)
32013 .sr(1)
32014 .m(m)
32015 .n(4)
32016 .k(8)
32017 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080032018 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032019 }
32020 }
32021
32022 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_eq_8_subtile_n) {
32023 TEST_REQUIRES_X86_AVX;
32024 for (uint32_t n = 1; n <= 4; n++) {
32025 GemmMicrokernelTester()
32026 .mr(1)
32027 .nr(4)
32028 .kr(8)
32029 .sr(1)
32030 .m(1)
32031 .n(n)
32032 .k(8)
32033 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080032034 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032035 }
32036 }
32037
32038 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_lt_8) {
32039 TEST_REQUIRES_X86_AVX;
32040 for (size_t k = 1; k < 8; k++) {
32041 GemmMicrokernelTester()
32042 .mr(1)
32043 .nr(4)
32044 .kr(8)
32045 .sr(1)
32046 .m(1)
32047 .n(4)
32048 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080032049 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032050 }
32051 }
32052
32053 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_lt_8_strided_a) {
32054 TEST_REQUIRES_X86_AVX;
32055 for (size_t k = 1; k < 8; k++) {
32056 GemmMicrokernelTester()
32057 .mr(1)
32058 .nr(4)
32059 .kr(8)
32060 .sr(1)
32061 .m(1)
32062 .n(4)
32063 .k(k)
32064 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080032065 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032066 }
32067 }
32068
32069 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_lt_8_subtile) {
32070 TEST_REQUIRES_X86_AVX;
32071 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080032072 for (uint32_t n = 1; n <= 4; n++) {
32073 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032074 GemmMicrokernelTester()
32075 .mr(1)
32076 .nr(4)
32077 .kr(8)
32078 .sr(1)
32079 .m(m)
32080 .n(n)
32081 .k(k)
32082 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080032083 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032084 }
32085 }
32086 }
32087 }
32088
32089 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_gt_8) {
32090 TEST_REQUIRES_X86_AVX;
32091 for (size_t k = 9; k < 16; k++) {
32092 GemmMicrokernelTester()
32093 .mr(1)
32094 .nr(4)
32095 .kr(8)
32096 .sr(1)
32097 .m(1)
32098 .n(4)
32099 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080032100 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032101 }
32102 }
32103
32104 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_gt_8_strided_a) {
32105 TEST_REQUIRES_X86_AVX;
32106 for (size_t k = 9; k < 16; k++) {
32107 GemmMicrokernelTester()
32108 .mr(1)
32109 .nr(4)
32110 .kr(8)
32111 .sr(1)
32112 .m(1)
32113 .n(4)
32114 .k(k)
32115 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080032116 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032117 }
32118 }
32119
32120 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_gt_8_subtile) {
32121 TEST_REQUIRES_X86_AVX;
32122 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080032123 for (uint32_t n = 1; n <= 4; n++) {
32124 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032125 GemmMicrokernelTester()
32126 .mr(1)
32127 .nr(4)
32128 .kr(8)
32129 .sr(1)
32130 .m(m)
32131 .n(n)
32132 .k(k)
32133 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080032134 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032135 }
32136 }
32137 }
32138 }
32139
32140 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_div_8) {
32141 TEST_REQUIRES_X86_AVX;
32142 for (size_t k = 16; k <= 80; k += 8) {
32143 GemmMicrokernelTester()
32144 .mr(1)
32145 .nr(4)
32146 .kr(8)
32147 .sr(1)
32148 .m(1)
32149 .n(4)
32150 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080032151 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032152 }
32153 }
32154
32155 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_div_8_strided_a) {
32156 TEST_REQUIRES_X86_AVX;
32157 for (size_t k = 16; k <= 80; k += 8) {
32158 GemmMicrokernelTester()
32159 .mr(1)
32160 .nr(4)
32161 .kr(8)
32162 .sr(1)
32163 .m(1)
32164 .n(4)
32165 .k(k)
32166 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080032167 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032168 }
32169 }
32170
32171 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_div_8_subtile) {
32172 TEST_REQUIRES_X86_AVX;
32173 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080032174 for (uint32_t n = 1; n <= 4; n++) {
32175 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032176 GemmMicrokernelTester()
32177 .mr(1)
32178 .nr(4)
32179 .kr(8)
32180 .sr(1)
32181 .m(m)
32182 .n(n)
32183 .k(k)
32184 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080032185 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032186 }
32187 }
32188 }
32189 }
32190
32191 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, n_gt_4) {
32192 TEST_REQUIRES_X86_AVX;
32193 for (uint32_t n = 5; n < 8; n++) {
32194 for (size_t k = 1; k <= 40; k += 9) {
32195 GemmMicrokernelTester()
32196 .mr(1)
32197 .nr(4)
32198 .kr(8)
32199 .sr(1)
32200 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080032201 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032202 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080032203 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032204 }
32205 }
32206 }
32207
32208 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, n_gt_4_strided_cn) {
32209 TEST_REQUIRES_X86_AVX;
32210 for (uint32_t n = 5; n < 8; n++) {
32211 for (size_t k = 1; k <= 40; k += 9) {
32212 GemmMicrokernelTester()
32213 .mr(1)
32214 .nr(4)
32215 .kr(8)
32216 .sr(1)
32217 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080032218 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032219 .k(k)
32220 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080032221 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032222 }
32223 }
32224 }
32225
32226 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, n_gt_4_strided_a) {
32227 TEST_REQUIRES_X86_AVX;
32228 for (uint32_t n = 5; n < 8; n++) {
32229 for (size_t k = 1; k <= 40; k += 9) {
32230 GemmMicrokernelTester()
32231 .mr(1)
32232 .nr(4)
32233 .kr(8)
32234 .sr(1)
32235 .m(1)
32236 .n(n)
32237 .k(k)
32238 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080032239 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032240 }
32241 }
32242 }
32243
32244 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, n_gt_4_subtile) {
32245 TEST_REQUIRES_X86_AVX;
32246 for (uint32_t n = 5; n < 8; n++) {
32247 for (size_t k = 1; k <= 40; k += 9) {
32248 for (uint32_t m = 1; m <= 1; m++) {
32249 GemmMicrokernelTester()
32250 .mr(1)
32251 .nr(4)
32252 .kr(8)
32253 .sr(1)
32254 .m(m)
32255 .n(n)
32256 .k(k)
32257 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080032258 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032259 }
32260 }
32261 }
32262 }
32263
32264 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, n_div_4) {
32265 TEST_REQUIRES_X86_AVX;
32266 for (uint32_t n = 8; n <= 12; n += 4) {
32267 for (size_t k = 1; k <= 40; k += 9) {
32268 GemmMicrokernelTester()
32269 .mr(1)
32270 .nr(4)
32271 .kr(8)
32272 .sr(1)
32273 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080032274 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032275 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080032276 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032277 }
32278 }
32279 }
32280
32281 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, n_div_4_strided_cn) {
32282 TEST_REQUIRES_X86_AVX;
32283 for (uint32_t n = 8; n <= 12; n += 4) {
32284 for (size_t k = 1; k <= 40; k += 9) {
32285 GemmMicrokernelTester()
32286 .mr(1)
32287 .nr(4)
32288 .kr(8)
32289 .sr(1)
32290 .m(1)
32291 .n(n)
32292 .k(k)
32293 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080032294 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032295 }
32296 }
32297 }
32298
32299 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, n_div_4_strided_a) {
32300 TEST_REQUIRES_X86_AVX;
32301 for (uint32_t n = 8; n <= 12; n += 4) {
32302 for (size_t k = 1; k <= 40; k += 9) {
32303 GemmMicrokernelTester()
32304 .mr(1)
32305 .nr(4)
32306 .kr(8)
32307 .sr(1)
32308 .m(1)
32309 .n(n)
32310 .k(k)
32311 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080032312 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032313 }
32314 }
32315 }
32316
32317 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, n_div_4_subtile) {
32318 TEST_REQUIRES_X86_AVX;
32319 for (uint32_t n = 8; n <= 12; n += 4) {
32320 for (size_t k = 1; k <= 40; k += 9) {
32321 for (uint32_t m = 1; m <= 1; m++) {
32322 GemmMicrokernelTester()
32323 .mr(1)
32324 .nr(4)
32325 .kr(8)
32326 .sr(1)
32327 .m(m)
32328 .n(n)
32329 .k(k)
32330 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080032331 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032332 }
32333 }
32334 }
32335 }
32336
32337 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, strided_cm_subtile) {
32338 TEST_REQUIRES_X86_AVX;
32339 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080032340 for (uint32_t n = 1; n <= 4; n++) {
32341 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032342 GemmMicrokernelTester()
32343 .mr(1)
32344 .nr(4)
32345 .kr(8)
32346 .sr(1)
32347 .m(m)
32348 .n(n)
32349 .k(k)
32350 .cm_stride(7)
32351 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080032352 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032353 }
32354 }
32355 }
32356 }
32357
32358 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, qmin) {
32359 TEST_REQUIRES_X86_AVX;
32360 GemmMicrokernelTester()
32361 .mr(1)
32362 .nr(4)
32363 .kr(8)
32364 .sr(1)
32365 .m(1)
32366 .n(4)
32367 .k(8)
32368 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080032369 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032370 }
32371
32372 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, qmax) {
32373 TEST_REQUIRES_X86_AVX;
32374 GemmMicrokernelTester()
32375 .mr(1)
32376 .nr(4)
32377 .kr(8)
32378 .sr(1)
32379 .m(1)
32380 .n(4)
32381 .k(8)
32382 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080032383 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032384 }
32385
32386 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, strided_cm) {
32387 TEST_REQUIRES_X86_AVX;
32388 GemmMicrokernelTester()
32389 .mr(1)
32390 .nr(4)
32391 .kr(8)
32392 .sr(1)
32393 .m(1)
32394 .n(4)
32395 .k(8)
32396 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080032397 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032398 }
32399#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
32400
32401
32402#if XNN_ARCH_X86 || XNN_ARCH_X86_64
32403 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_eq_8) {
32404 TEST_REQUIRES_X86_XOP;
32405 GemmMicrokernelTester()
32406 .mr(3)
32407 .nr(4)
32408 .kr(8)
32409 .sr(1)
32410 .m(3)
32411 .n(4)
32412 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080032413 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032414 }
32415
32416 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, strided_cn) {
32417 TEST_REQUIRES_X86_XOP;
32418 GemmMicrokernelTester()
32419 .mr(3)
32420 .nr(4)
32421 .kr(8)
32422 .sr(1)
32423 .m(3)
32424 .n(4)
32425 .k(8)
32426 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080032427 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032428 }
32429
32430 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_eq_8_strided_a) {
32431 TEST_REQUIRES_X86_XOP;
32432 GemmMicrokernelTester()
32433 .mr(3)
32434 .nr(4)
32435 .kr(8)
32436 .sr(1)
32437 .m(3)
32438 .n(4)
32439 .k(8)
32440 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080032441 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032442 }
32443
32444 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_eq_8_subtile) {
32445 TEST_REQUIRES_X86_XOP;
Zhi An Ng83844ae2022-01-14 09:52:25 -080032446 for (uint32_t n = 1; n <= 4; n++) {
32447 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032448 GemmMicrokernelTester()
32449 .mr(3)
32450 .nr(4)
32451 .kr(8)
32452 .sr(1)
32453 .m(m)
32454 .n(n)
32455 .k(8)
32456 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080032457 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032458 }
32459 }
32460 }
32461
32462 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_eq_8_subtile_m) {
32463 TEST_REQUIRES_X86_XOP;
32464 for (uint32_t m = 1; m <= 3; m++) {
32465 GemmMicrokernelTester()
32466 .mr(3)
32467 .nr(4)
32468 .kr(8)
32469 .sr(1)
32470 .m(m)
32471 .n(4)
32472 .k(8)
32473 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080032474 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032475 }
32476 }
32477
32478 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_eq_8_subtile_n) {
32479 TEST_REQUIRES_X86_XOP;
32480 for (uint32_t n = 1; n <= 4; n++) {
32481 GemmMicrokernelTester()
32482 .mr(3)
32483 .nr(4)
32484 .kr(8)
32485 .sr(1)
32486 .m(3)
32487 .n(n)
32488 .k(8)
32489 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080032490 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032491 }
32492 }
32493
32494 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_lt_8) {
32495 TEST_REQUIRES_X86_XOP;
32496 for (size_t k = 1; k < 8; k++) {
32497 GemmMicrokernelTester()
32498 .mr(3)
32499 .nr(4)
32500 .kr(8)
32501 .sr(1)
32502 .m(3)
32503 .n(4)
32504 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080032505 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032506 }
32507 }
32508
32509 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_lt_8_strided_a) {
32510 TEST_REQUIRES_X86_XOP;
32511 for (size_t k = 1; k < 8; k++) {
32512 GemmMicrokernelTester()
32513 .mr(3)
32514 .nr(4)
32515 .kr(8)
32516 .sr(1)
32517 .m(3)
32518 .n(4)
32519 .k(k)
32520 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080032521 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032522 }
32523 }
32524
32525 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_lt_8_subtile) {
32526 TEST_REQUIRES_X86_XOP;
32527 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080032528 for (uint32_t n = 1; n <= 4; n++) {
32529 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032530 GemmMicrokernelTester()
32531 .mr(3)
32532 .nr(4)
32533 .kr(8)
32534 .sr(1)
32535 .m(m)
32536 .n(n)
32537 .k(k)
32538 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080032539 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032540 }
32541 }
32542 }
32543 }
32544
32545 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_gt_8) {
32546 TEST_REQUIRES_X86_XOP;
32547 for (size_t k = 9; k < 16; k++) {
32548 GemmMicrokernelTester()
32549 .mr(3)
32550 .nr(4)
32551 .kr(8)
32552 .sr(1)
32553 .m(3)
32554 .n(4)
32555 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080032556 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032557 }
32558 }
32559
32560 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_gt_8_strided_a) {
32561 TEST_REQUIRES_X86_XOP;
32562 for (size_t k = 9; k < 16; k++) {
32563 GemmMicrokernelTester()
32564 .mr(3)
32565 .nr(4)
32566 .kr(8)
32567 .sr(1)
32568 .m(3)
32569 .n(4)
32570 .k(k)
32571 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080032572 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032573 }
32574 }
32575
32576 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_gt_8_subtile) {
32577 TEST_REQUIRES_X86_XOP;
32578 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080032579 for (uint32_t n = 1; n <= 4; n++) {
32580 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032581 GemmMicrokernelTester()
32582 .mr(3)
32583 .nr(4)
32584 .kr(8)
32585 .sr(1)
32586 .m(m)
32587 .n(n)
32588 .k(k)
32589 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080032590 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032591 }
32592 }
32593 }
32594 }
32595
32596 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_div_8) {
32597 TEST_REQUIRES_X86_XOP;
32598 for (size_t k = 16; k <= 80; k += 8) {
32599 GemmMicrokernelTester()
32600 .mr(3)
32601 .nr(4)
32602 .kr(8)
32603 .sr(1)
32604 .m(3)
32605 .n(4)
32606 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080032607 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032608 }
32609 }
32610
32611 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_div_8_strided_a) {
32612 TEST_REQUIRES_X86_XOP;
32613 for (size_t k = 16; k <= 80; k += 8) {
32614 GemmMicrokernelTester()
32615 .mr(3)
32616 .nr(4)
32617 .kr(8)
32618 .sr(1)
32619 .m(3)
32620 .n(4)
32621 .k(k)
32622 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080032623 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032624 }
32625 }
32626
32627 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_div_8_subtile) {
32628 TEST_REQUIRES_X86_XOP;
32629 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080032630 for (uint32_t n = 1; n <= 4; n++) {
32631 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032632 GemmMicrokernelTester()
32633 .mr(3)
32634 .nr(4)
32635 .kr(8)
32636 .sr(1)
32637 .m(m)
32638 .n(n)
32639 .k(k)
32640 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080032641 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032642 }
32643 }
32644 }
32645 }
32646
32647 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, n_gt_4) {
32648 TEST_REQUIRES_X86_XOP;
32649 for (uint32_t n = 5; n < 8; n++) {
32650 for (size_t k = 1; k <= 40; k += 9) {
32651 GemmMicrokernelTester()
32652 .mr(3)
32653 .nr(4)
32654 .kr(8)
32655 .sr(1)
32656 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080032657 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032658 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080032659 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032660 }
32661 }
32662 }
32663
32664 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, n_gt_4_strided_cn) {
32665 TEST_REQUIRES_X86_XOP;
32666 for (uint32_t n = 5; n < 8; n++) {
32667 for (size_t k = 1; k <= 40; k += 9) {
32668 GemmMicrokernelTester()
32669 .mr(3)
32670 .nr(4)
32671 .kr(8)
32672 .sr(1)
32673 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080032674 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032675 .k(k)
32676 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080032677 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032678 }
32679 }
32680 }
32681
32682 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, n_gt_4_strided_a) {
32683 TEST_REQUIRES_X86_XOP;
32684 for (uint32_t n = 5; n < 8; n++) {
32685 for (size_t k = 1; k <= 40; k += 9) {
32686 GemmMicrokernelTester()
32687 .mr(3)
32688 .nr(4)
32689 .kr(8)
32690 .sr(1)
32691 .m(3)
32692 .n(n)
32693 .k(k)
32694 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080032695 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032696 }
32697 }
32698 }
32699
32700 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, n_gt_4_subtile) {
32701 TEST_REQUIRES_X86_XOP;
32702 for (uint32_t n = 5; n < 8; n++) {
32703 for (size_t k = 1; k <= 40; k += 9) {
32704 for (uint32_t m = 1; m <= 3; m++) {
32705 GemmMicrokernelTester()
32706 .mr(3)
32707 .nr(4)
32708 .kr(8)
32709 .sr(1)
32710 .m(m)
32711 .n(n)
32712 .k(k)
32713 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080032714 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032715 }
32716 }
32717 }
32718 }
32719
32720 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, n_div_4) {
32721 TEST_REQUIRES_X86_XOP;
32722 for (uint32_t n = 8; n <= 12; n += 4) {
32723 for (size_t k = 1; k <= 40; k += 9) {
32724 GemmMicrokernelTester()
32725 .mr(3)
32726 .nr(4)
32727 .kr(8)
32728 .sr(1)
32729 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080032730 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032731 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080032732 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032733 }
32734 }
32735 }
32736
32737 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, n_div_4_strided_cn) {
32738 TEST_REQUIRES_X86_XOP;
32739 for (uint32_t n = 8; n <= 12; n += 4) {
32740 for (size_t k = 1; k <= 40; k += 9) {
32741 GemmMicrokernelTester()
32742 .mr(3)
32743 .nr(4)
32744 .kr(8)
32745 .sr(1)
32746 .m(3)
32747 .n(n)
32748 .k(k)
32749 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080032750 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032751 }
32752 }
32753 }
32754
32755 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, n_div_4_strided_a) {
32756 TEST_REQUIRES_X86_XOP;
32757 for (uint32_t n = 8; n <= 12; n += 4) {
32758 for (size_t k = 1; k <= 40; k += 9) {
32759 GemmMicrokernelTester()
32760 .mr(3)
32761 .nr(4)
32762 .kr(8)
32763 .sr(1)
32764 .m(3)
32765 .n(n)
32766 .k(k)
32767 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080032768 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032769 }
32770 }
32771 }
32772
32773 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, n_div_4_subtile) {
32774 TEST_REQUIRES_X86_XOP;
32775 for (uint32_t n = 8; n <= 12; n += 4) {
32776 for (size_t k = 1; k <= 40; k += 9) {
32777 for (uint32_t m = 1; m <= 3; m++) {
32778 GemmMicrokernelTester()
32779 .mr(3)
32780 .nr(4)
32781 .kr(8)
32782 .sr(1)
32783 .m(m)
32784 .n(n)
32785 .k(k)
32786 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080032787 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032788 }
32789 }
32790 }
32791 }
32792
32793 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, strided_cm_subtile) {
32794 TEST_REQUIRES_X86_XOP;
32795 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080032796 for (uint32_t n = 1; n <= 4; n++) {
32797 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032798 GemmMicrokernelTester()
32799 .mr(3)
32800 .nr(4)
32801 .kr(8)
32802 .sr(1)
32803 .m(m)
32804 .n(n)
32805 .k(k)
32806 .cm_stride(7)
32807 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080032808 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032809 }
32810 }
32811 }
32812 }
32813
32814 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, qmin) {
32815 TEST_REQUIRES_X86_XOP;
32816 GemmMicrokernelTester()
32817 .mr(3)
32818 .nr(4)
32819 .kr(8)
32820 .sr(1)
32821 .m(3)
32822 .n(4)
32823 .k(8)
32824 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080032825 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032826 }
32827
32828 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, qmax) {
32829 TEST_REQUIRES_X86_XOP;
32830 GemmMicrokernelTester()
32831 .mr(3)
32832 .nr(4)
32833 .kr(8)
32834 .sr(1)
32835 .m(3)
32836 .n(4)
32837 .k(8)
32838 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080032839 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032840 }
32841
32842 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, strided_cm) {
32843 TEST_REQUIRES_X86_XOP;
32844 GemmMicrokernelTester()
32845 .mr(3)
32846 .nr(4)
32847 .kr(8)
32848 .sr(1)
32849 .m(3)
32850 .n(4)
32851 .k(8)
32852 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080032853 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032854 }
32855#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
32856
32857
32858#if XNN_ARCH_X86 || XNN_ARCH_X86_64
32859 TEST(QC8_GEMM_XW_MINMAX_FP32_2X8C8__AVX2, k_eq_8) {
32860 TEST_REQUIRES_X86_AVX2;
32861 GemmMicrokernelTester()
32862 .extended_weights(true)
32863 .mr(2)
32864 .nr(8)
32865 .kr(8)
32866 .sr(1)
32867 .m(2)
32868 .n(8)
32869 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080032870 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032871 }
32872
32873 TEST(QC8_GEMM_XW_MINMAX_FP32_2X8C8__AVX2, strided_cn) {
32874 TEST_REQUIRES_X86_AVX2;
32875 GemmMicrokernelTester()
32876 .extended_weights(true)
32877 .mr(2)
32878 .nr(8)
32879 .kr(8)
32880 .sr(1)
32881 .m(2)
32882 .n(8)
32883 .k(8)
32884 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080032885 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032886 }
32887
32888 TEST(QC8_GEMM_XW_MINMAX_FP32_2X8C8__AVX2, k_eq_8_strided_a) {
32889 TEST_REQUIRES_X86_AVX2;
32890 GemmMicrokernelTester()
32891 .extended_weights(true)
32892 .mr(2)
32893 .nr(8)
32894 .kr(8)
32895 .sr(1)
32896 .m(2)
32897 .n(8)
32898 .k(8)
32899 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080032900 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032901 }
32902
32903 TEST(QC8_GEMM_XW_MINMAX_FP32_2X8C8__AVX2, k_eq_8_subtile) {
32904 TEST_REQUIRES_X86_AVX2;
Zhi An Ng83844ae2022-01-14 09:52:25 -080032905 for (uint32_t n = 1; n <= 8; n++) {
32906 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032907 GemmMicrokernelTester()
32908 .extended_weights(true)
32909 .mr(2)
32910 .nr(8)
32911 .kr(8)
32912 .sr(1)
32913 .m(m)
32914 .n(n)
32915 .k(8)
32916 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080032917 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032918 }
32919 }
32920 }
32921
32922 TEST(QC8_GEMM_XW_MINMAX_FP32_2X8C8__AVX2, k_eq_8_subtile_m) {
32923 TEST_REQUIRES_X86_AVX2;
32924 for (uint32_t m = 1; m <= 2; m++) {
32925 GemmMicrokernelTester()
32926 .extended_weights(true)
32927 .mr(2)
32928 .nr(8)
32929 .kr(8)
32930 .sr(1)
32931 .m(m)
32932 .n(8)
32933 .k(8)
32934 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080032935 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032936 }
32937 }
32938
32939 TEST(QC8_GEMM_XW_MINMAX_FP32_2X8C8__AVX2, k_eq_8_subtile_n) {
32940 TEST_REQUIRES_X86_AVX2;
32941 for (uint32_t n = 1; n <= 8; n++) {
32942 GemmMicrokernelTester()
32943 .extended_weights(true)
32944 .mr(2)
32945 .nr(8)
32946 .kr(8)
32947 .sr(1)
32948 .m(2)
32949 .n(n)
32950 .k(8)
32951 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080032952 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032953 }
32954 }
32955
32956 TEST(QC8_GEMM_XW_MINMAX_FP32_2X8C8__AVX2, k_lt_8) {
32957 TEST_REQUIRES_X86_AVX2;
32958 for (size_t k = 1; k < 8; k++) {
32959 GemmMicrokernelTester()
32960 .extended_weights(true)
32961 .mr(2)
32962 .nr(8)
32963 .kr(8)
32964 .sr(1)
32965 .m(2)
32966 .n(8)
32967 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080032968 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032969 }
32970 }
32971
32972 TEST(QC8_GEMM_XW_MINMAX_FP32_2X8C8__AVX2, k_lt_8_strided_a) {
32973 TEST_REQUIRES_X86_AVX2;
32974 for (size_t k = 1; k < 8; k++) {
32975 GemmMicrokernelTester()
32976 .extended_weights(true)
32977 .mr(2)
32978 .nr(8)
32979 .kr(8)
32980 .sr(1)
32981 .m(2)
32982 .n(8)
32983 .k(k)
32984 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080032985 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032986 }
32987 }
32988
32989 TEST(QC8_GEMM_XW_MINMAX_FP32_2X8C8__AVX2, k_lt_8_subtile) {
32990 TEST_REQUIRES_X86_AVX2;
32991 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080032992 for (uint32_t n = 1; n <= 8; n++) {
32993 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080032994 GemmMicrokernelTester()
32995 .extended_weights(true)
32996 .mr(2)
32997 .nr(8)
32998 .kr(8)
32999 .sr(1)
33000 .m(m)
33001 .n(n)
33002 .k(k)
33003 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080033004 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033005 }
33006 }
33007 }
33008 }
33009
33010 TEST(QC8_GEMM_XW_MINMAX_FP32_2X8C8__AVX2, k_gt_8) {
33011 TEST_REQUIRES_X86_AVX2;
33012 for (size_t k = 9; k < 16; k++) {
33013 GemmMicrokernelTester()
33014 .extended_weights(true)
33015 .mr(2)
33016 .nr(8)
33017 .kr(8)
33018 .sr(1)
33019 .m(2)
33020 .n(8)
33021 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080033022 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033023 }
33024 }
33025
33026 TEST(QC8_GEMM_XW_MINMAX_FP32_2X8C8__AVX2, k_gt_8_strided_a) {
33027 TEST_REQUIRES_X86_AVX2;
33028 for (size_t k = 9; k < 16; k++) {
33029 GemmMicrokernelTester()
33030 .extended_weights(true)
33031 .mr(2)
33032 .nr(8)
33033 .kr(8)
33034 .sr(1)
33035 .m(2)
33036 .n(8)
33037 .k(k)
33038 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080033039 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033040 }
33041 }
33042
33043 TEST(QC8_GEMM_XW_MINMAX_FP32_2X8C8__AVX2, k_gt_8_subtile) {
33044 TEST_REQUIRES_X86_AVX2;
33045 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080033046 for (uint32_t n = 1; n <= 8; n++) {
33047 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033048 GemmMicrokernelTester()
33049 .extended_weights(true)
33050 .mr(2)
33051 .nr(8)
33052 .kr(8)
33053 .sr(1)
33054 .m(m)
33055 .n(n)
33056 .k(k)
33057 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080033058 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033059 }
33060 }
33061 }
33062 }
33063
33064 TEST(QC8_GEMM_XW_MINMAX_FP32_2X8C8__AVX2, k_div_8) {
33065 TEST_REQUIRES_X86_AVX2;
33066 for (size_t k = 16; k <= 80; k += 8) {
33067 GemmMicrokernelTester()
33068 .extended_weights(true)
33069 .mr(2)
33070 .nr(8)
33071 .kr(8)
33072 .sr(1)
33073 .m(2)
33074 .n(8)
33075 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080033076 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033077 }
33078 }
33079
33080 TEST(QC8_GEMM_XW_MINMAX_FP32_2X8C8__AVX2, k_div_8_strided_a) {
33081 TEST_REQUIRES_X86_AVX2;
33082 for (size_t k = 16; k <= 80; k += 8) {
33083 GemmMicrokernelTester()
33084 .extended_weights(true)
33085 .mr(2)
33086 .nr(8)
33087 .kr(8)
33088 .sr(1)
33089 .m(2)
33090 .n(8)
33091 .k(k)
33092 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080033093 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033094 }
33095 }
33096
33097 TEST(QC8_GEMM_XW_MINMAX_FP32_2X8C8__AVX2, k_div_8_subtile) {
33098 TEST_REQUIRES_X86_AVX2;
33099 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080033100 for (uint32_t n = 1; n <= 8; n++) {
33101 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033102 GemmMicrokernelTester()
33103 .extended_weights(true)
33104 .mr(2)
33105 .nr(8)
33106 .kr(8)
33107 .sr(1)
33108 .m(m)
33109 .n(n)
33110 .k(k)
33111 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080033112 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033113 }
33114 }
33115 }
33116 }
33117
33118 TEST(QC8_GEMM_XW_MINMAX_FP32_2X8C8__AVX2, n_gt_8) {
33119 TEST_REQUIRES_X86_AVX2;
33120 for (uint32_t n = 9; n < 16; n++) {
33121 for (size_t k = 1; k <= 40; k += 9) {
33122 GemmMicrokernelTester()
33123 .extended_weights(true)
33124 .mr(2)
33125 .nr(8)
33126 .kr(8)
33127 .sr(1)
33128 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080033129 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033130 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080033131 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033132 }
33133 }
33134 }
33135
33136 TEST(QC8_GEMM_XW_MINMAX_FP32_2X8C8__AVX2, n_gt_8_strided_cn) {
33137 TEST_REQUIRES_X86_AVX2;
33138 for (uint32_t n = 9; n < 16; n++) {
33139 for (size_t k = 1; k <= 40; k += 9) {
33140 GemmMicrokernelTester()
33141 .extended_weights(true)
33142 .mr(2)
33143 .nr(8)
33144 .kr(8)
33145 .sr(1)
33146 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080033147 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033148 .k(k)
33149 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080033150 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033151 }
33152 }
33153 }
33154
33155 TEST(QC8_GEMM_XW_MINMAX_FP32_2X8C8__AVX2, n_gt_8_strided_a) {
33156 TEST_REQUIRES_X86_AVX2;
33157 for (uint32_t n = 9; n < 16; n++) {
33158 for (size_t k = 1; k <= 40; k += 9) {
33159 GemmMicrokernelTester()
33160 .extended_weights(true)
33161 .mr(2)
33162 .nr(8)
33163 .kr(8)
33164 .sr(1)
33165 .m(2)
33166 .n(n)
33167 .k(k)
33168 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080033169 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033170 }
33171 }
33172 }
33173
33174 TEST(QC8_GEMM_XW_MINMAX_FP32_2X8C8__AVX2, n_gt_8_subtile) {
33175 TEST_REQUIRES_X86_AVX2;
33176 for (uint32_t n = 9; n < 16; n++) {
33177 for (size_t k = 1; k <= 40; k += 9) {
33178 for (uint32_t m = 1; m <= 2; m++) {
33179 GemmMicrokernelTester()
33180 .extended_weights(true)
33181 .mr(2)
33182 .nr(8)
33183 .kr(8)
33184 .sr(1)
33185 .m(m)
33186 .n(n)
33187 .k(k)
33188 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080033189 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033190 }
33191 }
33192 }
33193 }
33194
33195 TEST(QC8_GEMM_XW_MINMAX_FP32_2X8C8__AVX2, n_div_8) {
33196 TEST_REQUIRES_X86_AVX2;
33197 for (uint32_t n = 16; n <= 24; n += 8) {
33198 for (size_t k = 1; k <= 40; k += 9) {
33199 GemmMicrokernelTester()
33200 .extended_weights(true)
33201 .mr(2)
33202 .nr(8)
33203 .kr(8)
33204 .sr(1)
33205 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080033206 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033207 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080033208 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033209 }
33210 }
33211 }
33212
33213 TEST(QC8_GEMM_XW_MINMAX_FP32_2X8C8__AVX2, n_div_8_strided_cn) {
33214 TEST_REQUIRES_X86_AVX2;
33215 for (uint32_t n = 16; n <= 24; n += 8) {
33216 for (size_t k = 1; k <= 40; k += 9) {
33217 GemmMicrokernelTester()
33218 .extended_weights(true)
33219 .mr(2)
33220 .nr(8)
33221 .kr(8)
33222 .sr(1)
33223 .m(2)
33224 .n(n)
33225 .k(k)
33226 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080033227 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033228 }
33229 }
33230 }
33231
33232 TEST(QC8_GEMM_XW_MINMAX_FP32_2X8C8__AVX2, n_div_8_strided_a) {
33233 TEST_REQUIRES_X86_AVX2;
33234 for (uint32_t n = 16; n <= 24; n += 8) {
33235 for (size_t k = 1; k <= 40; k += 9) {
33236 GemmMicrokernelTester()
33237 .extended_weights(true)
33238 .mr(2)
33239 .nr(8)
33240 .kr(8)
33241 .sr(1)
33242 .m(2)
33243 .n(n)
33244 .k(k)
33245 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080033246 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033247 }
33248 }
33249 }
33250
33251 TEST(QC8_GEMM_XW_MINMAX_FP32_2X8C8__AVX2, n_div_8_subtile) {
33252 TEST_REQUIRES_X86_AVX2;
33253 for (uint32_t n = 16; n <= 24; n += 8) {
33254 for (size_t k = 1; k <= 40; k += 9) {
33255 for (uint32_t m = 1; m <= 2; m++) {
33256 GemmMicrokernelTester()
33257 .extended_weights(true)
33258 .mr(2)
33259 .nr(8)
33260 .kr(8)
33261 .sr(1)
33262 .m(m)
33263 .n(n)
33264 .k(k)
33265 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080033266 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033267 }
33268 }
33269 }
33270 }
33271
33272 TEST(QC8_GEMM_XW_MINMAX_FP32_2X8C8__AVX2, strided_cm_subtile) {
33273 TEST_REQUIRES_X86_AVX2;
33274 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080033275 for (uint32_t n = 1; n <= 8; n++) {
33276 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033277 GemmMicrokernelTester()
33278 .extended_weights(true)
33279 .mr(2)
33280 .nr(8)
33281 .kr(8)
33282 .sr(1)
33283 .m(m)
33284 .n(n)
33285 .k(k)
33286 .cm_stride(11)
33287 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080033288 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033289 }
33290 }
33291 }
33292 }
33293
33294 TEST(QC8_GEMM_XW_MINMAX_FP32_2X8C8__AVX2, strided_cm) {
33295 TEST_REQUIRES_X86_AVX2;
33296 GemmMicrokernelTester()
33297 .extended_weights(true)
33298 .mr(2)
33299 .nr(8)
33300 .kr(8)
33301 .sr(1)
33302 .m(2)
33303 .n(8)
33304 .k(8)
33305 .cm_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080033306 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033307 }
33308#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
33309
33310
33311#if XNN_ARCH_X86 || XNN_ARCH_X86_64
33312 TEST(QC8_GEMM_MINMAX_FP32_3X16C8__AVX512SKX, k_eq_8) {
33313 TEST_REQUIRES_X86_AVX512SKX;
33314 GemmMicrokernelTester()
33315 .mr(3)
33316 .nr(16)
33317 .kr(8)
33318 .sr(1)
33319 .m(3)
33320 .n(16)
33321 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080033322 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033323 }
33324
33325 TEST(QC8_GEMM_MINMAX_FP32_3X16C8__AVX512SKX, strided_cn) {
33326 TEST_REQUIRES_X86_AVX512SKX;
33327 GemmMicrokernelTester()
33328 .mr(3)
33329 .nr(16)
33330 .kr(8)
33331 .sr(1)
33332 .m(3)
33333 .n(16)
33334 .k(8)
33335 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080033336 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033337 }
33338
33339 TEST(QC8_GEMM_MINMAX_FP32_3X16C8__AVX512SKX, k_eq_8_strided_a) {
33340 TEST_REQUIRES_X86_AVX512SKX;
33341 GemmMicrokernelTester()
33342 .mr(3)
33343 .nr(16)
33344 .kr(8)
33345 .sr(1)
33346 .m(3)
33347 .n(16)
33348 .k(8)
33349 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080033350 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033351 }
33352
33353 TEST(QC8_GEMM_MINMAX_FP32_3X16C8__AVX512SKX, k_eq_8_subtile) {
33354 TEST_REQUIRES_X86_AVX512SKX;
Zhi An Ng83844ae2022-01-14 09:52:25 -080033355 for (uint32_t n = 1; n <= 16; n++) {
33356 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033357 GemmMicrokernelTester()
33358 .mr(3)
33359 .nr(16)
33360 .kr(8)
33361 .sr(1)
33362 .m(m)
33363 .n(n)
33364 .k(8)
33365 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080033366 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033367 }
33368 }
33369 }
33370
33371 TEST(QC8_GEMM_MINMAX_FP32_3X16C8__AVX512SKX, k_eq_8_subtile_m) {
33372 TEST_REQUIRES_X86_AVX512SKX;
33373 for (uint32_t m = 1; m <= 3; m++) {
33374 GemmMicrokernelTester()
33375 .mr(3)
33376 .nr(16)
33377 .kr(8)
33378 .sr(1)
33379 .m(m)
33380 .n(16)
33381 .k(8)
33382 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080033383 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033384 }
33385 }
33386
33387 TEST(QC8_GEMM_MINMAX_FP32_3X16C8__AVX512SKX, k_eq_8_subtile_n) {
33388 TEST_REQUIRES_X86_AVX512SKX;
33389 for (uint32_t n = 1; n <= 16; n++) {
33390 GemmMicrokernelTester()
33391 .mr(3)
33392 .nr(16)
33393 .kr(8)
33394 .sr(1)
33395 .m(3)
33396 .n(n)
33397 .k(8)
33398 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080033399 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033400 }
33401 }
33402
33403 TEST(QC8_GEMM_MINMAX_FP32_3X16C8__AVX512SKX, k_lt_8) {
33404 TEST_REQUIRES_X86_AVX512SKX;
33405 for (size_t k = 1; k < 8; k++) {
33406 GemmMicrokernelTester()
33407 .mr(3)
33408 .nr(16)
33409 .kr(8)
33410 .sr(1)
33411 .m(3)
33412 .n(16)
33413 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080033414 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033415 }
33416 }
33417
33418 TEST(QC8_GEMM_MINMAX_FP32_3X16C8__AVX512SKX, k_lt_8_strided_a) {
33419 TEST_REQUIRES_X86_AVX512SKX;
33420 for (size_t k = 1; k < 8; k++) {
33421 GemmMicrokernelTester()
33422 .mr(3)
33423 .nr(16)
33424 .kr(8)
33425 .sr(1)
33426 .m(3)
33427 .n(16)
33428 .k(k)
33429 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080033430 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033431 }
33432 }
33433
33434 TEST(QC8_GEMM_MINMAX_FP32_3X16C8__AVX512SKX, k_lt_8_subtile) {
33435 TEST_REQUIRES_X86_AVX512SKX;
33436 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080033437 for (uint32_t n = 1; n <= 16; n++) {
33438 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033439 GemmMicrokernelTester()
33440 .mr(3)
33441 .nr(16)
33442 .kr(8)
33443 .sr(1)
33444 .m(m)
33445 .n(n)
33446 .k(k)
33447 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080033448 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033449 }
33450 }
33451 }
33452 }
33453
33454 TEST(QC8_GEMM_MINMAX_FP32_3X16C8__AVX512SKX, k_gt_8) {
33455 TEST_REQUIRES_X86_AVX512SKX;
33456 for (size_t k = 9; k < 16; k++) {
33457 GemmMicrokernelTester()
33458 .mr(3)
33459 .nr(16)
33460 .kr(8)
33461 .sr(1)
33462 .m(3)
33463 .n(16)
33464 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080033465 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033466 }
33467 }
33468
33469 TEST(QC8_GEMM_MINMAX_FP32_3X16C8__AVX512SKX, k_gt_8_strided_a) {
33470 TEST_REQUIRES_X86_AVX512SKX;
33471 for (size_t k = 9; k < 16; k++) {
33472 GemmMicrokernelTester()
33473 .mr(3)
33474 .nr(16)
33475 .kr(8)
33476 .sr(1)
33477 .m(3)
33478 .n(16)
33479 .k(k)
33480 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080033481 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033482 }
33483 }
33484
33485 TEST(QC8_GEMM_MINMAX_FP32_3X16C8__AVX512SKX, k_gt_8_subtile) {
33486 TEST_REQUIRES_X86_AVX512SKX;
33487 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080033488 for (uint32_t n = 1; n <= 16; n++) {
33489 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033490 GemmMicrokernelTester()
33491 .mr(3)
33492 .nr(16)
33493 .kr(8)
33494 .sr(1)
33495 .m(m)
33496 .n(n)
33497 .k(k)
33498 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080033499 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033500 }
33501 }
33502 }
33503 }
33504
33505 TEST(QC8_GEMM_MINMAX_FP32_3X16C8__AVX512SKX, k_div_8) {
33506 TEST_REQUIRES_X86_AVX512SKX;
33507 for (size_t k = 16; k <= 80; k += 8) {
33508 GemmMicrokernelTester()
33509 .mr(3)
33510 .nr(16)
33511 .kr(8)
33512 .sr(1)
33513 .m(3)
33514 .n(16)
33515 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080033516 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033517 }
33518 }
33519
33520 TEST(QC8_GEMM_MINMAX_FP32_3X16C8__AVX512SKX, k_div_8_strided_a) {
33521 TEST_REQUIRES_X86_AVX512SKX;
33522 for (size_t k = 16; k <= 80; k += 8) {
33523 GemmMicrokernelTester()
33524 .mr(3)
33525 .nr(16)
33526 .kr(8)
33527 .sr(1)
33528 .m(3)
33529 .n(16)
33530 .k(k)
33531 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080033532 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033533 }
33534 }
33535
33536 TEST(QC8_GEMM_MINMAX_FP32_3X16C8__AVX512SKX, k_div_8_subtile) {
33537 TEST_REQUIRES_X86_AVX512SKX;
33538 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080033539 for (uint32_t n = 1; n <= 16; n++) {
33540 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033541 GemmMicrokernelTester()
33542 .mr(3)
33543 .nr(16)
33544 .kr(8)
33545 .sr(1)
33546 .m(m)
33547 .n(n)
33548 .k(k)
33549 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080033550 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033551 }
33552 }
33553 }
33554 }
33555
33556 TEST(QC8_GEMM_MINMAX_FP32_3X16C8__AVX512SKX, n_gt_16) {
33557 TEST_REQUIRES_X86_AVX512SKX;
33558 for (uint32_t n = 17; n < 32; n++) {
33559 for (size_t k = 1; k <= 40; k += 9) {
33560 GemmMicrokernelTester()
33561 .mr(3)
33562 .nr(16)
33563 .kr(8)
33564 .sr(1)
33565 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080033566 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033567 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080033568 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033569 }
33570 }
33571 }
33572
33573 TEST(QC8_GEMM_MINMAX_FP32_3X16C8__AVX512SKX, n_gt_16_strided_cn) {
33574 TEST_REQUIRES_X86_AVX512SKX;
33575 for (uint32_t n = 17; n < 32; n++) {
33576 for (size_t k = 1; k <= 40; k += 9) {
33577 GemmMicrokernelTester()
33578 .mr(3)
33579 .nr(16)
33580 .kr(8)
33581 .sr(1)
33582 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080033583 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033584 .k(k)
33585 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080033586 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033587 }
33588 }
33589 }
33590
33591 TEST(QC8_GEMM_MINMAX_FP32_3X16C8__AVX512SKX, n_gt_16_strided_a) {
33592 TEST_REQUIRES_X86_AVX512SKX;
33593 for (uint32_t n = 17; n < 32; n++) {
33594 for (size_t k = 1; k <= 40; k += 9) {
33595 GemmMicrokernelTester()
33596 .mr(3)
33597 .nr(16)
33598 .kr(8)
33599 .sr(1)
33600 .m(3)
33601 .n(n)
33602 .k(k)
33603 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080033604 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033605 }
33606 }
33607 }
33608
33609 TEST(QC8_GEMM_MINMAX_FP32_3X16C8__AVX512SKX, n_gt_16_subtile) {
33610 TEST_REQUIRES_X86_AVX512SKX;
33611 for (uint32_t n = 17; n < 32; n++) {
33612 for (size_t k = 1; k <= 40; k += 9) {
33613 for (uint32_t m = 1; m <= 3; m++) {
33614 GemmMicrokernelTester()
33615 .mr(3)
33616 .nr(16)
33617 .kr(8)
33618 .sr(1)
33619 .m(m)
33620 .n(n)
33621 .k(k)
33622 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080033623 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033624 }
33625 }
33626 }
33627 }
33628
33629 TEST(QC8_GEMM_MINMAX_FP32_3X16C8__AVX512SKX, n_div_16) {
33630 TEST_REQUIRES_X86_AVX512SKX;
33631 for (uint32_t n = 32; n <= 48; n += 16) {
33632 for (size_t k = 1; k <= 40; k += 9) {
33633 GemmMicrokernelTester()
33634 .mr(3)
33635 .nr(16)
33636 .kr(8)
33637 .sr(1)
33638 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080033639 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033640 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080033641 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033642 }
33643 }
33644 }
33645
33646 TEST(QC8_GEMM_MINMAX_FP32_3X16C8__AVX512SKX, n_div_16_strided_cn) {
33647 TEST_REQUIRES_X86_AVX512SKX;
33648 for (uint32_t n = 32; n <= 48; n += 16) {
33649 for (size_t k = 1; k <= 40; k += 9) {
33650 GemmMicrokernelTester()
33651 .mr(3)
33652 .nr(16)
33653 .kr(8)
33654 .sr(1)
33655 .m(3)
33656 .n(n)
33657 .k(k)
33658 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080033659 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033660 }
33661 }
33662 }
33663
33664 TEST(QC8_GEMM_MINMAX_FP32_3X16C8__AVX512SKX, n_div_16_strided_a) {
33665 TEST_REQUIRES_X86_AVX512SKX;
33666 for (uint32_t n = 32; n <= 48; n += 16) {
33667 for (size_t k = 1; k <= 40; k += 9) {
33668 GemmMicrokernelTester()
33669 .mr(3)
33670 .nr(16)
33671 .kr(8)
33672 .sr(1)
33673 .m(3)
33674 .n(n)
33675 .k(k)
33676 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080033677 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033678 }
33679 }
33680 }
33681
33682 TEST(QC8_GEMM_MINMAX_FP32_3X16C8__AVX512SKX, n_div_16_subtile) {
33683 TEST_REQUIRES_X86_AVX512SKX;
33684 for (uint32_t n = 32; n <= 48; n += 16) {
33685 for (size_t k = 1; k <= 40; k += 9) {
33686 for (uint32_t m = 1; m <= 3; m++) {
33687 GemmMicrokernelTester()
33688 .mr(3)
33689 .nr(16)
33690 .kr(8)
33691 .sr(1)
33692 .m(m)
33693 .n(n)
33694 .k(k)
33695 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080033696 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033697 }
33698 }
33699 }
33700 }
33701
33702 TEST(QC8_GEMM_MINMAX_FP32_3X16C8__AVX512SKX, strided_cm_subtile) {
33703 TEST_REQUIRES_X86_AVX512SKX;
33704 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080033705 for (uint32_t n = 1; n <= 16; n++) {
33706 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033707 GemmMicrokernelTester()
33708 .mr(3)
33709 .nr(16)
33710 .kr(8)
33711 .sr(1)
33712 .m(m)
33713 .n(n)
33714 .k(k)
33715 .cm_stride(19)
33716 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080033717 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033718 }
33719 }
33720 }
33721 }
33722
33723 TEST(QC8_GEMM_MINMAX_FP32_3X16C8__AVX512SKX, qmin) {
33724 TEST_REQUIRES_X86_AVX512SKX;
33725 GemmMicrokernelTester()
33726 .mr(3)
33727 .nr(16)
33728 .kr(8)
33729 .sr(1)
33730 .m(3)
33731 .n(16)
33732 .k(8)
33733 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080033734 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033735 }
33736
33737 TEST(QC8_GEMM_MINMAX_FP32_3X16C8__AVX512SKX, qmax) {
33738 TEST_REQUIRES_X86_AVX512SKX;
33739 GemmMicrokernelTester()
33740 .mr(3)
33741 .nr(16)
33742 .kr(8)
33743 .sr(1)
33744 .m(3)
33745 .n(16)
33746 .k(8)
33747 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080033748 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033749 }
33750
33751 TEST(QC8_GEMM_MINMAX_FP32_3X16C8__AVX512SKX, strided_cm) {
33752 TEST_REQUIRES_X86_AVX512SKX;
33753 GemmMicrokernelTester()
33754 .mr(3)
33755 .nr(16)
33756 .kr(8)
33757 .sr(1)
33758 .m(3)
33759 .n(16)
33760 .k(8)
33761 .cm_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080033762 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033763 }
33764#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
33765
33766
33767#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
33768 TEST(QC8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD64, k_eq_8) {
33769 GemmMicrokernelTester()
33770 .mr(2)
33771 .nr(4)
33772 .kr(2)
33773 .sr(4)
33774 .m(2)
33775 .n(4)
33776 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080033777 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033778 }
33779
33780 TEST(QC8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD64, strided_cn) {
33781 GemmMicrokernelTester()
33782 .mr(2)
33783 .nr(4)
33784 .kr(2)
33785 .sr(4)
33786 .m(2)
33787 .n(4)
33788 .k(8)
33789 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080033790 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033791 }
33792
33793 TEST(QC8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD64, k_eq_8_strided_a) {
33794 GemmMicrokernelTester()
33795 .mr(2)
33796 .nr(4)
33797 .kr(2)
33798 .sr(4)
33799 .m(2)
33800 .n(4)
33801 .k(8)
33802 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080033803 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033804 }
33805
33806 TEST(QC8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD64, k_eq_8_subtile) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080033807 for (uint32_t n = 1; n <= 4; n++) {
33808 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033809 GemmMicrokernelTester()
33810 .mr(2)
33811 .nr(4)
33812 .kr(2)
33813 .sr(4)
33814 .m(m)
33815 .n(n)
33816 .k(8)
33817 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080033818 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033819 }
33820 }
33821 }
33822
33823 TEST(QC8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD64, k_eq_8_subtile_m) {
33824 for (uint32_t m = 1; m <= 2; m++) {
33825 GemmMicrokernelTester()
33826 .mr(2)
33827 .nr(4)
33828 .kr(2)
33829 .sr(4)
33830 .m(m)
33831 .n(4)
33832 .k(8)
33833 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080033834 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033835 }
33836 }
33837
33838 TEST(QC8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD64, k_eq_8_subtile_n) {
33839 for (uint32_t n = 1; n <= 4; n++) {
33840 GemmMicrokernelTester()
33841 .mr(2)
33842 .nr(4)
33843 .kr(2)
33844 .sr(4)
33845 .m(2)
33846 .n(n)
33847 .k(8)
33848 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080033849 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033850 }
33851 }
33852
33853 TEST(QC8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD64, k_lt_8) {
33854 for (size_t k = 1; k < 8; k++) {
33855 GemmMicrokernelTester()
33856 .mr(2)
33857 .nr(4)
33858 .kr(2)
33859 .sr(4)
33860 .m(2)
33861 .n(4)
33862 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080033863 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033864 }
33865 }
33866
33867 TEST(QC8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD64, k_lt_8_strided_a) {
33868 for (size_t k = 1; k < 8; k++) {
33869 GemmMicrokernelTester()
33870 .mr(2)
33871 .nr(4)
33872 .kr(2)
33873 .sr(4)
33874 .m(2)
33875 .n(4)
33876 .k(k)
33877 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080033878 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033879 }
33880 }
33881
33882 TEST(QC8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD64, k_lt_8_subtile) {
33883 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080033884 for (uint32_t n = 1; n <= 4; n++) {
33885 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033886 GemmMicrokernelTester()
33887 .mr(2)
33888 .nr(4)
33889 .kr(2)
33890 .sr(4)
33891 .m(m)
33892 .n(n)
33893 .k(k)
33894 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080033895 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033896 }
33897 }
33898 }
33899 }
33900
33901 TEST(QC8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD64, k_gt_8) {
33902 for (size_t k = 9; k < 16; k++) {
33903 GemmMicrokernelTester()
33904 .mr(2)
33905 .nr(4)
33906 .kr(2)
33907 .sr(4)
33908 .m(2)
33909 .n(4)
33910 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080033911 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033912 }
33913 }
33914
33915 TEST(QC8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD64, k_gt_8_strided_a) {
33916 for (size_t k = 9; k < 16; k++) {
33917 GemmMicrokernelTester()
33918 .mr(2)
33919 .nr(4)
33920 .kr(2)
33921 .sr(4)
33922 .m(2)
33923 .n(4)
33924 .k(k)
33925 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080033926 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033927 }
33928 }
33929
33930 TEST(QC8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD64, k_gt_8_subtile) {
33931 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080033932 for (uint32_t n = 1; n <= 4; n++) {
33933 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033934 GemmMicrokernelTester()
33935 .mr(2)
33936 .nr(4)
33937 .kr(2)
33938 .sr(4)
33939 .m(m)
33940 .n(n)
33941 .k(k)
33942 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080033943 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033944 }
33945 }
33946 }
33947 }
33948
33949 TEST(QC8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD64, k_div_8) {
33950 for (size_t k = 16; k <= 80; k += 8) {
33951 GemmMicrokernelTester()
33952 .mr(2)
33953 .nr(4)
33954 .kr(2)
33955 .sr(4)
33956 .m(2)
33957 .n(4)
33958 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080033959 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033960 }
33961 }
33962
33963 TEST(QC8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD64, k_div_8_strided_a) {
33964 for (size_t k = 16; k <= 80; k += 8) {
33965 GemmMicrokernelTester()
33966 .mr(2)
33967 .nr(4)
33968 .kr(2)
33969 .sr(4)
33970 .m(2)
33971 .n(4)
33972 .k(k)
33973 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080033974 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033975 }
33976 }
33977
33978 TEST(QC8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD64, k_div_8_subtile) {
33979 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080033980 for (uint32_t n = 1; n <= 4; n++) {
33981 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033982 GemmMicrokernelTester()
33983 .mr(2)
33984 .nr(4)
33985 .kr(2)
33986 .sr(4)
33987 .m(m)
33988 .n(n)
33989 .k(k)
33990 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080033991 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080033992 }
33993 }
33994 }
33995 }
33996
33997 TEST(QC8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD64, n_gt_4) {
33998 for (uint32_t n = 5; n < 8; n++) {
33999 for (size_t k = 1; k <= 40; k += 9) {
34000 GemmMicrokernelTester()
34001 .mr(2)
34002 .nr(4)
34003 .kr(2)
34004 .sr(4)
34005 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080034006 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034007 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080034008 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034009 }
34010 }
34011 }
34012
34013 TEST(QC8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD64, n_gt_4_strided_cn) {
34014 for (uint32_t n = 5; n < 8; n++) {
34015 for (size_t k = 1; k <= 40; k += 9) {
34016 GemmMicrokernelTester()
34017 .mr(2)
34018 .nr(4)
34019 .kr(2)
34020 .sr(4)
34021 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080034022 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034023 .k(k)
34024 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080034025 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034026 }
34027 }
34028 }
34029
34030 TEST(QC8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD64, n_gt_4_strided_a) {
34031 for (uint32_t n = 5; n < 8; n++) {
34032 for (size_t k = 1; k <= 40; k += 9) {
34033 GemmMicrokernelTester()
34034 .mr(2)
34035 .nr(4)
34036 .kr(2)
34037 .sr(4)
34038 .m(2)
34039 .n(n)
34040 .k(k)
34041 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080034042 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034043 }
34044 }
34045 }
34046
34047 TEST(QC8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD64, n_gt_4_subtile) {
34048 for (uint32_t n = 5; n < 8; n++) {
34049 for (size_t k = 1; k <= 40; k += 9) {
34050 for (uint32_t m = 1; m <= 2; m++) {
34051 GemmMicrokernelTester()
34052 .mr(2)
34053 .nr(4)
34054 .kr(2)
34055 .sr(4)
34056 .m(m)
34057 .n(n)
34058 .k(k)
34059 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080034060 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034061 }
34062 }
34063 }
34064 }
34065
34066 TEST(QC8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD64, n_div_4) {
34067 for (uint32_t n = 8; n <= 12; n += 4) {
34068 for (size_t k = 1; k <= 40; k += 9) {
34069 GemmMicrokernelTester()
34070 .mr(2)
34071 .nr(4)
34072 .kr(2)
34073 .sr(4)
34074 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080034075 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034076 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080034077 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034078 }
34079 }
34080 }
34081
34082 TEST(QC8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD64, n_div_4_strided_cn) {
34083 for (uint32_t n = 8; n <= 12; n += 4) {
34084 for (size_t k = 1; k <= 40; k += 9) {
34085 GemmMicrokernelTester()
34086 .mr(2)
34087 .nr(4)
34088 .kr(2)
34089 .sr(4)
34090 .m(2)
34091 .n(n)
34092 .k(k)
34093 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080034094 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034095 }
34096 }
34097 }
34098
34099 TEST(QC8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD64, n_div_4_strided_a) {
34100 for (uint32_t n = 8; n <= 12; n += 4) {
34101 for (size_t k = 1; k <= 40; k += 9) {
34102 GemmMicrokernelTester()
34103 .mr(2)
34104 .nr(4)
34105 .kr(2)
34106 .sr(4)
34107 .m(2)
34108 .n(n)
34109 .k(k)
34110 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080034111 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034112 }
34113 }
34114 }
34115
34116 TEST(QC8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD64, n_div_4_subtile) {
34117 for (uint32_t n = 8; n <= 12; n += 4) {
34118 for (size_t k = 1; k <= 40; k += 9) {
34119 for (uint32_t m = 1; m <= 2; m++) {
34120 GemmMicrokernelTester()
34121 .mr(2)
34122 .nr(4)
34123 .kr(2)
34124 .sr(4)
34125 .m(m)
34126 .n(n)
34127 .k(k)
34128 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080034129 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034130 }
34131 }
34132 }
34133 }
34134
34135 TEST(QC8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD64, strided_cm_subtile) {
34136 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080034137 for (uint32_t n = 1; n <= 4; n++) {
34138 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034139 GemmMicrokernelTester()
34140 .mr(2)
34141 .nr(4)
34142 .kr(2)
34143 .sr(4)
34144 .m(m)
34145 .n(n)
34146 .k(k)
34147 .cm_stride(7)
34148 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080034149 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034150 }
34151 }
34152 }
34153 }
34154
34155 TEST(QC8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD64, qmin) {
34156 GemmMicrokernelTester()
34157 .mr(2)
34158 .nr(4)
34159 .kr(2)
34160 .sr(4)
34161 .m(2)
34162 .n(4)
34163 .k(8)
34164 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080034165 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034166 }
34167
34168 TEST(QC8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD64, qmax) {
34169 GemmMicrokernelTester()
34170 .mr(2)
34171 .nr(4)
34172 .kr(2)
34173 .sr(4)
34174 .m(2)
34175 .n(4)
34176 .k(8)
34177 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080034178 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034179 }
34180
34181 TEST(QC8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD64, strided_cm) {
34182 GemmMicrokernelTester()
34183 .mr(2)
34184 .nr(4)
34185 .kr(2)
34186 .sr(4)
34187 .m(2)
34188 .n(4)
34189 .k(8)
34190 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080034191 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034192 }
34193#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
34194
34195
34196#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
34197 TEST(QC8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD64, k_eq_8) {
34198 GemmMicrokernelTester()
34199 .mr(3)
34200 .nr(4)
34201 .kr(2)
34202 .sr(4)
34203 .m(3)
34204 .n(4)
34205 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080034206 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034207 }
34208
34209 TEST(QC8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD64, strided_cn) {
34210 GemmMicrokernelTester()
34211 .mr(3)
34212 .nr(4)
34213 .kr(2)
34214 .sr(4)
34215 .m(3)
34216 .n(4)
34217 .k(8)
34218 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080034219 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034220 }
34221
34222 TEST(QC8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD64, k_eq_8_strided_a) {
34223 GemmMicrokernelTester()
34224 .mr(3)
34225 .nr(4)
34226 .kr(2)
34227 .sr(4)
34228 .m(3)
34229 .n(4)
34230 .k(8)
34231 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080034232 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034233 }
34234
34235 TEST(QC8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD64, k_eq_8_subtile) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080034236 for (uint32_t n = 1; n <= 4; n++) {
34237 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034238 GemmMicrokernelTester()
34239 .mr(3)
34240 .nr(4)
34241 .kr(2)
34242 .sr(4)
34243 .m(m)
34244 .n(n)
34245 .k(8)
34246 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080034247 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034248 }
34249 }
34250 }
34251
34252 TEST(QC8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD64, k_eq_8_subtile_m) {
34253 for (uint32_t m = 1; m <= 3; m++) {
34254 GemmMicrokernelTester()
34255 .mr(3)
34256 .nr(4)
34257 .kr(2)
34258 .sr(4)
34259 .m(m)
34260 .n(4)
34261 .k(8)
34262 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080034263 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034264 }
34265 }
34266
34267 TEST(QC8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD64, k_eq_8_subtile_n) {
34268 for (uint32_t n = 1; n <= 4; n++) {
34269 GemmMicrokernelTester()
34270 .mr(3)
34271 .nr(4)
34272 .kr(2)
34273 .sr(4)
34274 .m(3)
34275 .n(n)
34276 .k(8)
34277 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080034278 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034279 }
34280 }
34281
34282 TEST(QC8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD64, k_lt_8) {
34283 for (size_t k = 1; k < 8; k++) {
34284 GemmMicrokernelTester()
34285 .mr(3)
34286 .nr(4)
34287 .kr(2)
34288 .sr(4)
34289 .m(3)
34290 .n(4)
34291 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080034292 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034293 }
34294 }
34295
34296 TEST(QC8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD64, k_lt_8_strided_a) {
34297 for (size_t k = 1; k < 8; k++) {
34298 GemmMicrokernelTester()
34299 .mr(3)
34300 .nr(4)
34301 .kr(2)
34302 .sr(4)
34303 .m(3)
34304 .n(4)
34305 .k(k)
34306 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080034307 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034308 }
34309 }
34310
34311 TEST(QC8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD64, k_lt_8_subtile) {
34312 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080034313 for (uint32_t n = 1; n <= 4; n++) {
34314 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034315 GemmMicrokernelTester()
34316 .mr(3)
34317 .nr(4)
34318 .kr(2)
34319 .sr(4)
34320 .m(m)
34321 .n(n)
34322 .k(k)
34323 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080034324 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034325 }
34326 }
34327 }
34328 }
34329
34330 TEST(QC8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD64, k_gt_8) {
34331 for (size_t k = 9; k < 16; k++) {
34332 GemmMicrokernelTester()
34333 .mr(3)
34334 .nr(4)
34335 .kr(2)
34336 .sr(4)
34337 .m(3)
34338 .n(4)
34339 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080034340 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034341 }
34342 }
34343
34344 TEST(QC8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD64, k_gt_8_strided_a) {
34345 for (size_t k = 9; k < 16; k++) {
34346 GemmMicrokernelTester()
34347 .mr(3)
34348 .nr(4)
34349 .kr(2)
34350 .sr(4)
34351 .m(3)
34352 .n(4)
34353 .k(k)
34354 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080034355 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034356 }
34357 }
34358
34359 TEST(QC8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD64, k_gt_8_subtile) {
34360 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080034361 for (uint32_t n = 1; n <= 4; n++) {
34362 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034363 GemmMicrokernelTester()
34364 .mr(3)
34365 .nr(4)
34366 .kr(2)
34367 .sr(4)
34368 .m(m)
34369 .n(n)
34370 .k(k)
34371 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080034372 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034373 }
34374 }
34375 }
34376 }
34377
34378 TEST(QC8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD64, k_div_8) {
34379 for (size_t k = 16; k <= 80; k += 8) {
34380 GemmMicrokernelTester()
34381 .mr(3)
34382 .nr(4)
34383 .kr(2)
34384 .sr(4)
34385 .m(3)
34386 .n(4)
34387 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080034388 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034389 }
34390 }
34391
34392 TEST(QC8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD64, k_div_8_strided_a) {
34393 for (size_t k = 16; k <= 80; k += 8) {
34394 GemmMicrokernelTester()
34395 .mr(3)
34396 .nr(4)
34397 .kr(2)
34398 .sr(4)
34399 .m(3)
34400 .n(4)
34401 .k(k)
34402 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080034403 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034404 }
34405 }
34406
34407 TEST(QC8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD64, k_div_8_subtile) {
34408 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080034409 for (uint32_t n = 1; n <= 4; n++) {
34410 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034411 GemmMicrokernelTester()
34412 .mr(3)
34413 .nr(4)
34414 .kr(2)
34415 .sr(4)
34416 .m(m)
34417 .n(n)
34418 .k(k)
34419 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080034420 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034421 }
34422 }
34423 }
34424 }
34425
34426 TEST(QC8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD64, n_gt_4) {
34427 for (uint32_t n = 5; n < 8; n++) {
34428 for (size_t k = 1; k <= 40; k += 9) {
34429 GemmMicrokernelTester()
34430 .mr(3)
34431 .nr(4)
34432 .kr(2)
34433 .sr(4)
34434 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080034435 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034436 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080034437 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034438 }
34439 }
34440 }
34441
34442 TEST(QC8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD64, n_gt_4_strided_cn) {
34443 for (uint32_t n = 5; n < 8; n++) {
34444 for (size_t k = 1; k <= 40; k += 9) {
34445 GemmMicrokernelTester()
34446 .mr(3)
34447 .nr(4)
34448 .kr(2)
34449 .sr(4)
34450 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080034451 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034452 .k(k)
34453 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080034454 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034455 }
34456 }
34457 }
34458
34459 TEST(QC8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD64, n_gt_4_strided_a) {
34460 for (uint32_t n = 5; n < 8; n++) {
34461 for (size_t k = 1; k <= 40; k += 9) {
34462 GemmMicrokernelTester()
34463 .mr(3)
34464 .nr(4)
34465 .kr(2)
34466 .sr(4)
34467 .m(3)
34468 .n(n)
34469 .k(k)
34470 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080034471 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034472 }
34473 }
34474 }
34475
34476 TEST(QC8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD64, n_gt_4_subtile) {
34477 for (uint32_t n = 5; n < 8; n++) {
34478 for (size_t k = 1; k <= 40; k += 9) {
34479 for (uint32_t m = 1; m <= 3; m++) {
34480 GemmMicrokernelTester()
34481 .mr(3)
34482 .nr(4)
34483 .kr(2)
34484 .sr(4)
34485 .m(m)
34486 .n(n)
34487 .k(k)
34488 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080034489 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034490 }
34491 }
34492 }
34493 }
34494
34495 TEST(QC8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD64, n_div_4) {
34496 for (uint32_t n = 8; n <= 12; n += 4) {
34497 for (size_t k = 1; k <= 40; k += 9) {
34498 GemmMicrokernelTester()
34499 .mr(3)
34500 .nr(4)
34501 .kr(2)
34502 .sr(4)
34503 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080034504 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034505 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080034506 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034507 }
34508 }
34509 }
34510
34511 TEST(QC8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD64, n_div_4_strided_cn) {
34512 for (uint32_t n = 8; n <= 12; n += 4) {
34513 for (size_t k = 1; k <= 40; k += 9) {
34514 GemmMicrokernelTester()
34515 .mr(3)
34516 .nr(4)
34517 .kr(2)
34518 .sr(4)
34519 .m(3)
34520 .n(n)
34521 .k(k)
34522 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080034523 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034524 }
34525 }
34526 }
34527
34528 TEST(QC8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD64, n_div_4_strided_a) {
34529 for (uint32_t n = 8; n <= 12; n += 4) {
34530 for (size_t k = 1; k <= 40; k += 9) {
34531 GemmMicrokernelTester()
34532 .mr(3)
34533 .nr(4)
34534 .kr(2)
34535 .sr(4)
34536 .m(3)
34537 .n(n)
34538 .k(k)
34539 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080034540 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034541 }
34542 }
34543 }
34544
34545 TEST(QC8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD64, n_div_4_subtile) {
34546 for (uint32_t n = 8; n <= 12; n += 4) {
34547 for (size_t k = 1; k <= 40; k += 9) {
34548 for (uint32_t m = 1; m <= 3; m++) {
34549 GemmMicrokernelTester()
34550 .mr(3)
34551 .nr(4)
34552 .kr(2)
34553 .sr(4)
34554 .m(m)
34555 .n(n)
34556 .k(k)
34557 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080034558 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034559 }
34560 }
34561 }
34562 }
34563
34564 TEST(QC8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD64, strided_cm_subtile) {
34565 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080034566 for (uint32_t n = 1; n <= 4; n++) {
34567 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034568 GemmMicrokernelTester()
34569 .mr(3)
34570 .nr(4)
34571 .kr(2)
34572 .sr(4)
34573 .m(m)
34574 .n(n)
34575 .k(k)
34576 .cm_stride(7)
34577 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080034578 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034579 }
34580 }
34581 }
34582 }
34583
34584 TEST(QC8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD64, qmin) {
34585 GemmMicrokernelTester()
34586 .mr(3)
34587 .nr(4)
34588 .kr(2)
34589 .sr(4)
34590 .m(3)
34591 .n(4)
34592 .k(8)
34593 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080034594 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034595 }
34596
34597 TEST(QC8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD64, qmax) {
34598 GemmMicrokernelTester()
34599 .mr(3)
34600 .nr(4)
34601 .kr(2)
34602 .sr(4)
34603 .m(3)
34604 .n(4)
34605 .k(8)
34606 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080034607 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034608 }
34609
34610 TEST(QC8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD64, strided_cm) {
34611 GemmMicrokernelTester()
34612 .mr(3)
34613 .nr(4)
34614 .kr(2)
34615 .sr(4)
34616 .m(3)
34617 .n(4)
34618 .k(8)
34619 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080034620 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034621 }
34622#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
34623
34624
34625#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
34626 TEST(QC8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD64, k_eq_8) {
34627 GemmMicrokernelTester()
34628 .mr(4)
34629 .nr(4)
34630 .kr(2)
34631 .sr(4)
34632 .m(4)
34633 .n(4)
34634 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080034635 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034636 }
34637
34638 TEST(QC8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD64, strided_cn) {
34639 GemmMicrokernelTester()
34640 .mr(4)
34641 .nr(4)
34642 .kr(2)
34643 .sr(4)
34644 .m(4)
34645 .n(4)
34646 .k(8)
34647 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080034648 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034649 }
34650
34651 TEST(QC8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD64, k_eq_8_strided_a) {
34652 GemmMicrokernelTester()
34653 .mr(4)
34654 .nr(4)
34655 .kr(2)
34656 .sr(4)
34657 .m(4)
34658 .n(4)
34659 .k(8)
34660 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080034661 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034662 }
34663
34664 TEST(QC8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD64, k_eq_8_subtile) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080034665 for (uint32_t n = 1; n <= 4; n++) {
34666 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034667 GemmMicrokernelTester()
34668 .mr(4)
34669 .nr(4)
34670 .kr(2)
34671 .sr(4)
34672 .m(m)
34673 .n(n)
34674 .k(8)
34675 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080034676 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034677 }
34678 }
34679 }
34680
34681 TEST(QC8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD64, k_eq_8_subtile_m) {
34682 for (uint32_t m = 1; m <= 4; m++) {
34683 GemmMicrokernelTester()
34684 .mr(4)
34685 .nr(4)
34686 .kr(2)
34687 .sr(4)
34688 .m(m)
34689 .n(4)
34690 .k(8)
34691 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080034692 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034693 }
34694 }
34695
34696 TEST(QC8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD64, k_eq_8_subtile_n) {
34697 for (uint32_t n = 1; n <= 4; n++) {
34698 GemmMicrokernelTester()
34699 .mr(4)
34700 .nr(4)
34701 .kr(2)
34702 .sr(4)
34703 .m(4)
34704 .n(n)
34705 .k(8)
34706 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080034707 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034708 }
34709 }
34710
34711 TEST(QC8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD64, k_lt_8) {
34712 for (size_t k = 1; k < 8; k++) {
34713 GemmMicrokernelTester()
34714 .mr(4)
34715 .nr(4)
34716 .kr(2)
34717 .sr(4)
34718 .m(4)
34719 .n(4)
34720 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080034721 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034722 }
34723 }
34724
34725 TEST(QC8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD64, k_lt_8_strided_a) {
34726 for (size_t k = 1; k < 8; k++) {
34727 GemmMicrokernelTester()
34728 .mr(4)
34729 .nr(4)
34730 .kr(2)
34731 .sr(4)
34732 .m(4)
34733 .n(4)
34734 .k(k)
34735 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080034736 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034737 }
34738 }
34739
34740 TEST(QC8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD64, k_lt_8_subtile) {
34741 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080034742 for (uint32_t n = 1; n <= 4; n++) {
34743 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034744 GemmMicrokernelTester()
34745 .mr(4)
34746 .nr(4)
34747 .kr(2)
34748 .sr(4)
34749 .m(m)
34750 .n(n)
34751 .k(k)
34752 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080034753 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034754 }
34755 }
34756 }
34757 }
34758
34759 TEST(QC8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD64, k_gt_8) {
34760 for (size_t k = 9; k < 16; k++) {
34761 GemmMicrokernelTester()
34762 .mr(4)
34763 .nr(4)
34764 .kr(2)
34765 .sr(4)
34766 .m(4)
34767 .n(4)
34768 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080034769 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034770 }
34771 }
34772
34773 TEST(QC8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD64, k_gt_8_strided_a) {
34774 for (size_t k = 9; k < 16; k++) {
34775 GemmMicrokernelTester()
34776 .mr(4)
34777 .nr(4)
34778 .kr(2)
34779 .sr(4)
34780 .m(4)
34781 .n(4)
34782 .k(k)
34783 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080034784 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034785 }
34786 }
34787
34788 TEST(QC8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD64, k_gt_8_subtile) {
34789 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080034790 for (uint32_t n = 1; n <= 4; n++) {
34791 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034792 GemmMicrokernelTester()
34793 .mr(4)
34794 .nr(4)
34795 .kr(2)
34796 .sr(4)
34797 .m(m)
34798 .n(n)
34799 .k(k)
34800 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080034801 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034802 }
34803 }
34804 }
34805 }
34806
34807 TEST(QC8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD64, k_div_8) {
34808 for (size_t k = 16; k <= 80; k += 8) {
34809 GemmMicrokernelTester()
34810 .mr(4)
34811 .nr(4)
34812 .kr(2)
34813 .sr(4)
34814 .m(4)
34815 .n(4)
34816 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080034817 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034818 }
34819 }
34820
34821 TEST(QC8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD64, k_div_8_strided_a) {
34822 for (size_t k = 16; k <= 80; k += 8) {
34823 GemmMicrokernelTester()
34824 .mr(4)
34825 .nr(4)
34826 .kr(2)
34827 .sr(4)
34828 .m(4)
34829 .n(4)
34830 .k(k)
34831 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080034832 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034833 }
34834 }
34835
34836 TEST(QC8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD64, k_div_8_subtile) {
34837 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080034838 for (uint32_t n = 1; n <= 4; n++) {
34839 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034840 GemmMicrokernelTester()
34841 .mr(4)
34842 .nr(4)
34843 .kr(2)
34844 .sr(4)
34845 .m(m)
34846 .n(n)
34847 .k(k)
34848 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080034849 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034850 }
34851 }
34852 }
34853 }
34854
34855 TEST(QC8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD64, n_gt_4) {
34856 for (uint32_t n = 5; n < 8; n++) {
34857 for (size_t k = 1; k <= 40; k += 9) {
34858 GemmMicrokernelTester()
34859 .mr(4)
34860 .nr(4)
34861 .kr(2)
34862 .sr(4)
34863 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080034864 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034865 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080034866 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034867 }
34868 }
34869 }
34870
34871 TEST(QC8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD64, n_gt_4_strided_cn) {
34872 for (uint32_t n = 5; n < 8; n++) {
34873 for (size_t k = 1; k <= 40; k += 9) {
34874 GemmMicrokernelTester()
34875 .mr(4)
34876 .nr(4)
34877 .kr(2)
34878 .sr(4)
34879 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080034880 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034881 .k(k)
34882 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080034883 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034884 }
34885 }
34886 }
34887
34888 TEST(QC8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD64, n_gt_4_strided_a) {
34889 for (uint32_t n = 5; n < 8; n++) {
34890 for (size_t k = 1; k <= 40; k += 9) {
34891 GemmMicrokernelTester()
34892 .mr(4)
34893 .nr(4)
34894 .kr(2)
34895 .sr(4)
34896 .m(4)
34897 .n(n)
34898 .k(k)
34899 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080034900 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034901 }
34902 }
34903 }
34904
34905 TEST(QC8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD64, n_gt_4_subtile) {
34906 for (uint32_t n = 5; n < 8; n++) {
34907 for (size_t k = 1; k <= 40; k += 9) {
34908 for (uint32_t m = 1; m <= 4; m++) {
34909 GemmMicrokernelTester()
34910 .mr(4)
34911 .nr(4)
34912 .kr(2)
34913 .sr(4)
34914 .m(m)
34915 .n(n)
34916 .k(k)
34917 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080034918 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034919 }
34920 }
34921 }
34922 }
34923
34924 TEST(QC8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD64, n_div_4) {
34925 for (uint32_t n = 8; n <= 12; n += 4) {
34926 for (size_t k = 1; k <= 40; k += 9) {
34927 GemmMicrokernelTester()
34928 .mr(4)
34929 .nr(4)
34930 .kr(2)
34931 .sr(4)
34932 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080034933 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034934 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080034935 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034936 }
34937 }
34938 }
34939
34940 TEST(QC8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD64, n_div_4_strided_cn) {
34941 for (uint32_t n = 8; n <= 12; n += 4) {
34942 for (size_t k = 1; k <= 40; k += 9) {
34943 GemmMicrokernelTester()
34944 .mr(4)
34945 .nr(4)
34946 .kr(2)
34947 .sr(4)
34948 .m(4)
34949 .n(n)
34950 .k(k)
34951 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080034952 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034953 }
34954 }
34955 }
34956
34957 TEST(QC8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD64, n_div_4_strided_a) {
34958 for (uint32_t n = 8; n <= 12; n += 4) {
34959 for (size_t k = 1; k <= 40; k += 9) {
34960 GemmMicrokernelTester()
34961 .mr(4)
34962 .nr(4)
34963 .kr(2)
34964 .sr(4)
34965 .m(4)
34966 .n(n)
34967 .k(k)
34968 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080034969 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034970 }
34971 }
34972 }
34973
34974 TEST(QC8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD64, n_div_4_subtile) {
34975 for (uint32_t n = 8; n <= 12; n += 4) {
34976 for (size_t k = 1; k <= 40; k += 9) {
34977 for (uint32_t m = 1; m <= 4; m++) {
34978 GemmMicrokernelTester()
34979 .mr(4)
34980 .nr(4)
34981 .kr(2)
34982 .sr(4)
34983 .m(m)
34984 .n(n)
34985 .k(k)
34986 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080034987 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034988 }
34989 }
34990 }
34991 }
34992
34993 TEST(QC8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD64, strided_cm_subtile) {
34994 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080034995 for (uint32_t n = 1; n <= 4; n++) {
34996 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080034997 GemmMicrokernelTester()
34998 .mr(4)
34999 .nr(4)
35000 .kr(2)
35001 .sr(4)
35002 .m(m)
35003 .n(n)
35004 .k(k)
35005 .cm_stride(7)
35006 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080035007 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035008 }
35009 }
35010 }
35011 }
35012
35013 TEST(QC8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD64, qmin) {
35014 GemmMicrokernelTester()
35015 .mr(4)
35016 .nr(4)
35017 .kr(2)
35018 .sr(4)
35019 .m(4)
35020 .n(4)
35021 .k(8)
35022 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080035023 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035024 }
35025
35026 TEST(QC8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD64, qmax) {
35027 GemmMicrokernelTester()
35028 .mr(4)
35029 .nr(4)
35030 .kr(2)
35031 .sr(4)
35032 .m(4)
35033 .n(4)
35034 .k(8)
35035 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080035036 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035037 }
35038
35039 TEST(QC8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD64, strided_cm) {
35040 GemmMicrokernelTester()
35041 .mr(4)
35042 .nr(4)
35043 .kr(2)
35044 .sr(4)
35045 .m(4)
35046 .n(4)
35047 .k(8)
35048 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080035049 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035050 }
35051#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
35052
35053
35054#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
35055 TEST(QC8_GEMM_MINMAX_FP32_1X4C2S4__WASMSIMD_DOT16X2_LD128, k_eq_8) {
35056 GemmMicrokernelTester()
35057 .mr(1)
35058 .nr(4)
35059 .kr(2)
35060 .sr(4)
35061 .m(1)
35062 .n(4)
35063 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080035064 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035065 }
35066
35067 TEST(QC8_GEMM_MINMAX_FP32_1X4C2S4__WASMSIMD_DOT16X2_LD128, strided_cn) {
35068 GemmMicrokernelTester()
35069 .mr(1)
35070 .nr(4)
35071 .kr(2)
35072 .sr(4)
35073 .m(1)
35074 .n(4)
35075 .k(8)
35076 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080035077 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035078 }
35079
35080 TEST(QC8_GEMM_MINMAX_FP32_1X4C2S4__WASMSIMD_DOT16X2_LD128, k_eq_8_strided_a) {
35081 GemmMicrokernelTester()
35082 .mr(1)
35083 .nr(4)
35084 .kr(2)
35085 .sr(4)
35086 .m(1)
35087 .n(4)
35088 .k(8)
35089 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080035090 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035091 }
35092
35093 TEST(QC8_GEMM_MINMAX_FP32_1X4C2S4__WASMSIMD_DOT16X2_LD128, k_eq_8_subtile) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080035094 for (uint32_t n = 1; n <= 4; n++) {
35095 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035096 GemmMicrokernelTester()
35097 .mr(1)
35098 .nr(4)
35099 .kr(2)
35100 .sr(4)
35101 .m(m)
35102 .n(n)
35103 .k(8)
35104 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080035105 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035106 }
35107 }
35108 }
35109
35110 TEST(QC8_GEMM_MINMAX_FP32_1X4C2S4__WASMSIMD_DOT16X2_LD128, k_eq_8_subtile_m) {
35111 for (uint32_t m = 1; m <= 1; m++) {
35112 GemmMicrokernelTester()
35113 .mr(1)
35114 .nr(4)
35115 .kr(2)
35116 .sr(4)
35117 .m(m)
35118 .n(4)
35119 .k(8)
35120 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080035121 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035122 }
35123 }
35124
35125 TEST(QC8_GEMM_MINMAX_FP32_1X4C2S4__WASMSIMD_DOT16X2_LD128, k_eq_8_subtile_n) {
35126 for (uint32_t n = 1; n <= 4; n++) {
35127 GemmMicrokernelTester()
35128 .mr(1)
35129 .nr(4)
35130 .kr(2)
35131 .sr(4)
35132 .m(1)
35133 .n(n)
35134 .k(8)
35135 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080035136 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035137 }
35138 }
35139
35140 TEST(QC8_GEMM_MINMAX_FP32_1X4C2S4__WASMSIMD_DOT16X2_LD128, k_lt_8) {
35141 for (size_t k = 1; k < 8; k++) {
35142 GemmMicrokernelTester()
35143 .mr(1)
35144 .nr(4)
35145 .kr(2)
35146 .sr(4)
35147 .m(1)
35148 .n(4)
35149 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080035150 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035151 }
35152 }
35153
35154 TEST(QC8_GEMM_MINMAX_FP32_1X4C2S4__WASMSIMD_DOT16X2_LD128, k_lt_8_strided_a) {
35155 for (size_t k = 1; k < 8; k++) {
35156 GemmMicrokernelTester()
35157 .mr(1)
35158 .nr(4)
35159 .kr(2)
35160 .sr(4)
35161 .m(1)
35162 .n(4)
35163 .k(k)
35164 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080035165 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035166 }
35167 }
35168
35169 TEST(QC8_GEMM_MINMAX_FP32_1X4C2S4__WASMSIMD_DOT16X2_LD128, k_lt_8_subtile) {
35170 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080035171 for (uint32_t n = 1; n <= 4; n++) {
35172 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035173 GemmMicrokernelTester()
35174 .mr(1)
35175 .nr(4)
35176 .kr(2)
35177 .sr(4)
35178 .m(m)
35179 .n(n)
35180 .k(k)
35181 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080035182 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035183 }
35184 }
35185 }
35186 }
35187
35188 TEST(QC8_GEMM_MINMAX_FP32_1X4C2S4__WASMSIMD_DOT16X2_LD128, k_gt_8) {
35189 for (size_t k = 9; k < 16; k++) {
35190 GemmMicrokernelTester()
35191 .mr(1)
35192 .nr(4)
35193 .kr(2)
35194 .sr(4)
35195 .m(1)
35196 .n(4)
35197 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080035198 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035199 }
35200 }
35201
35202 TEST(QC8_GEMM_MINMAX_FP32_1X4C2S4__WASMSIMD_DOT16X2_LD128, k_gt_8_strided_a) {
35203 for (size_t k = 9; k < 16; k++) {
35204 GemmMicrokernelTester()
35205 .mr(1)
35206 .nr(4)
35207 .kr(2)
35208 .sr(4)
35209 .m(1)
35210 .n(4)
35211 .k(k)
35212 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080035213 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035214 }
35215 }
35216
35217 TEST(QC8_GEMM_MINMAX_FP32_1X4C2S4__WASMSIMD_DOT16X2_LD128, k_gt_8_subtile) {
35218 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080035219 for (uint32_t n = 1; n <= 4; n++) {
35220 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035221 GemmMicrokernelTester()
35222 .mr(1)
35223 .nr(4)
35224 .kr(2)
35225 .sr(4)
35226 .m(m)
35227 .n(n)
35228 .k(k)
35229 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080035230 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035231 }
35232 }
35233 }
35234 }
35235
35236 TEST(QC8_GEMM_MINMAX_FP32_1X4C2S4__WASMSIMD_DOT16X2_LD128, k_div_8) {
35237 for (size_t k = 16; k <= 80; k += 8) {
35238 GemmMicrokernelTester()
35239 .mr(1)
35240 .nr(4)
35241 .kr(2)
35242 .sr(4)
35243 .m(1)
35244 .n(4)
35245 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080035246 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035247 }
35248 }
35249
35250 TEST(QC8_GEMM_MINMAX_FP32_1X4C2S4__WASMSIMD_DOT16X2_LD128, k_div_8_strided_a) {
35251 for (size_t k = 16; k <= 80; k += 8) {
35252 GemmMicrokernelTester()
35253 .mr(1)
35254 .nr(4)
35255 .kr(2)
35256 .sr(4)
35257 .m(1)
35258 .n(4)
35259 .k(k)
35260 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080035261 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035262 }
35263 }
35264
35265 TEST(QC8_GEMM_MINMAX_FP32_1X4C2S4__WASMSIMD_DOT16X2_LD128, k_div_8_subtile) {
35266 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080035267 for (uint32_t n = 1; n <= 4; n++) {
35268 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035269 GemmMicrokernelTester()
35270 .mr(1)
35271 .nr(4)
35272 .kr(2)
35273 .sr(4)
35274 .m(m)
35275 .n(n)
35276 .k(k)
35277 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080035278 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035279 }
35280 }
35281 }
35282 }
35283
35284 TEST(QC8_GEMM_MINMAX_FP32_1X4C2S4__WASMSIMD_DOT16X2_LD128, n_gt_4) {
35285 for (uint32_t n = 5; n < 8; n++) {
35286 for (size_t k = 1; k <= 40; k += 9) {
35287 GemmMicrokernelTester()
35288 .mr(1)
35289 .nr(4)
35290 .kr(2)
35291 .sr(4)
35292 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080035293 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035294 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080035295 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035296 }
35297 }
35298 }
35299
35300 TEST(QC8_GEMM_MINMAX_FP32_1X4C2S4__WASMSIMD_DOT16X2_LD128, n_gt_4_strided_cn) {
35301 for (uint32_t n = 5; n < 8; n++) {
35302 for (size_t k = 1; k <= 40; k += 9) {
35303 GemmMicrokernelTester()
35304 .mr(1)
35305 .nr(4)
35306 .kr(2)
35307 .sr(4)
35308 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080035309 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035310 .k(k)
35311 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080035312 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035313 }
35314 }
35315 }
35316
35317 TEST(QC8_GEMM_MINMAX_FP32_1X4C2S4__WASMSIMD_DOT16X2_LD128, n_gt_4_strided_a) {
35318 for (uint32_t n = 5; n < 8; n++) {
35319 for (size_t k = 1; k <= 40; k += 9) {
35320 GemmMicrokernelTester()
35321 .mr(1)
35322 .nr(4)
35323 .kr(2)
35324 .sr(4)
35325 .m(1)
35326 .n(n)
35327 .k(k)
35328 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080035329 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035330 }
35331 }
35332 }
35333
35334 TEST(QC8_GEMM_MINMAX_FP32_1X4C2S4__WASMSIMD_DOT16X2_LD128, n_gt_4_subtile) {
35335 for (uint32_t n = 5; n < 8; n++) {
35336 for (size_t k = 1; k <= 40; k += 9) {
35337 for (uint32_t m = 1; m <= 1; m++) {
35338 GemmMicrokernelTester()
35339 .mr(1)
35340 .nr(4)
35341 .kr(2)
35342 .sr(4)
35343 .m(m)
35344 .n(n)
35345 .k(k)
35346 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080035347 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035348 }
35349 }
35350 }
35351 }
35352
35353 TEST(QC8_GEMM_MINMAX_FP32_1X4C2S4__WASMSIMD_DOT16X2_LD128, n_div_4) {
35354 for (uint32_t n = 8; n <= 12; n += 4) {
35355 for (size_t k = 1; k <= 40; k += 9) {
35356 GemmMicrokernelTester()
35357 .mr(1)
35358 .nr(4)
35359 .kr(2)
35360 .sr(4)
35361 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080035362 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035363 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080035364 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035365 }
35366 }
35367 }
35368
35369 TEST(QC8_GEMM_MINMAX_FP32_1X4C2S4__WASMSIMD_DOT16X2_LD128, n_div_4_strided_cn) {
35370 for (uint32_t n = 8; n <= 12; n += 4) {
35371 for (size_t k = 1; k <= 40; k += 9) {
35372 GemmMicrokernelTester()
35373 .mr(1)
35374 .nr(4)
35375 .kr(2)
35376 .sr(4)
35377 .m(1)
35378 .n(n)
35379 .k(k)
35380 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080035381 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035382 }
35383 }
35384 }
35385
35386 TEST(QC8_GEMM_MINMAX_FP32_1X4C2S4__WASMSIMD_DOT16X2_LD128, n_div_4_strided_a) {
35387 for (uint32_t n = 8; n <= 12; n += 4) {
35388 for (size_t k = 1; k <= 40; k += 9) {
35389 GemmMicrokernelTester()
35390 .mr(1)
35391 .nr(4)
35392 .kr(2)
35393 .sr(4)
35394 .m(1)
35395 .n(n)
35396 .k(k)
35397 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080035398 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035399 }
35400 }
35401 }
35402
35403 TEST(QC8_GEMM_MINMAX_FP32_1X4C2S4__WASMSIMD_DOT16X2_LD128, n_div_4_subtile) {
35404 for (uint32_t n = 8; n <= 12; n += 4) {
35405 for (size_t k = 1; k <= 40; k += 9) {
35406 for (uint32_t m = 1; m <= 1; m++) {
35407 GemmMicrokernelTester()
35408 .mr(1)
35409 .nr(4)
35410 .kr(2)
35411 .sr(4)
35412 .m(m)
35413 .n(n)
35414 .k(k)
35415 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080035416 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035417 }
35418 }
35419 }
35420 }
35421
35422 TEST(QC8_GEMM_MINMAX_FP32_1X4C2S4__WASMSIMD_DOT16X2_LD128, strided_cm_subtile) {
35423 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080035424 for (uint32_t n = 1; n <= 4; n++) {
35425 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035426 GemmMicrokernelTester()
35427 .mr(1)
35428 .nr(4)
35429 .kr(2)
35430 .sr(4)
35431 .m(m)
35432 .n(n)
35433 .k(k)
35434 .cm_stride(7)
35435 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080035436 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035437 }
35438 }
35439 }
35440 }
35441
35442 TEST(QC8_GEMM_MINMAX_FP32_1X4C2S4__WASMSIMD_DOT16X2_LD128, qmin) {
35443 GemmMicrokernelTester()
35444 .mr(1)
35445 .nr(4)
35446 .kr(2)
35447 .sr(4)
35448 .m(1)
35449 .n(4)
35450 .k(8)
35451 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080035452 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035453 }
35454
35455 TEST(QC8_GEMM_MINMAX_FP32_1X4C2S4__WASMSIMD_DOT16X2_LD128, qmax) {
35456 GemmMicrokernelTester()
35457 .mr(1)
35458 .nr(4)
35459 .kr(2)
35460 .sr(4)
35461 .m(1)
35462 .n(4)
35463 .k(8)
35464 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080035465 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035466 }
35467
35468 TEST(QC8_GEMM_MINMAX_FP32_1X4C2S4__WASMSIMD_DOT16X2_LD128, strided_cm) {
35469 GemmMicrokernelTester()
35470 .mr(1)
35471 .nr(4)
35472 .kr(2)
35473 .sr(4)
35474 .m(1)
35475 .n(4)
35476 .k(8)
35477 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080035478 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035479 }
35480#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
35481
35482
35483#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
35484 TEST(QC8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD128, k_eq_8) {
35485 GemmMicrokernelTester()
35486 .mr(3)
35487 .nr(4)
35488 .kr(2)
35489 .sr(4)
35490 .m(3)
35491 .n(4)
35492 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080035493 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035494 }
35495
35496 TEST(QC8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD128, strided_cn) {
35497 GemmMicrokernelTester()
35498 .mr(3)
35499 .nr(4)
35500 .kr(2)
35501 .sr(4)
35502 .m(3)
35503 .n(4)
35504 .k(8)
35505 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080035506 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035507 }
35508
35509 TEST(QC8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD128, k_eq_8_strided_a) {
35510 GemmMicrokernelTester()
35511 .mr(3)
35512 .nr(4)
35513 .kr(2)
35514 .sr(4)
35515 .m(3)
35516 .n(4)
35517 .k(8)
35518 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080035519 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035520 }
35521
35522 TEST(QC8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD128, k_eq_8_subtile) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080035523 for (uint32_t n = 1; n <= 4; n++) {
35524 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035525 GemmMicrokernelTester()
35526 .mr(3)
35527 .nr(4)
35528 .kr(2)
35529 .sr(4)
35530 .m(m)
35531 .n(n)
35532 .k(8)
35533 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080035534 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035535 }
35536 }
35537 }
35538
35539 TEST(QC8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD128, k_eq_8_subtile_m) {
35540 for (uint32_t m = 1; m <= 3; m++) {
35541 GemmMicrokernelTester()
35542 .mr(3)
35543 .nr(4)
35544 .kr(2)
35545 .sr(4)
35546 .m(m)
35547 .n(4)
35548 .k(8)
35549 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080035550 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035551 }
35552 }
35553
35554 TEST(QC8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD128, k_eq_8_subtile_n) {
35555 for (uint32_t n = 1; n <= 4; n++) {
35556 GemmMicrokernelTester()
35557 .mr(3)
35558 .nr(4)
35559 .kr(2)
35560 .sr(4)
35561 .m(3)
35562 .n(n)
35563 .k(8)
35564 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080035565 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035566 }
35567 }
35568
35569 TEST(QC8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD128, k_lt_8) {
35570 for (size_t k = 1; k < 8; k++) {
35571 GemmMicrokernelTester()
35572 .mr(3)
35573 .nr(4)
35574 .kr(2)
35575 .sr(4)
35576 .m(3)
35577 .n(4)
35578 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080035579 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035580 }
35581 }
35582
35583 TEST(QC8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD128, k_lt_8_strided_a) {
35584 for (size_t k = 1; k < 8; k++) {
35585 GemmMicrokernelTester()
35586 .mr(3)
35587 .nr(4)
35588 .kr(2)
35589 .sr(4)
35590 .m(3)
35591 .n(4)
35592 .k(k)
35593 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080035594 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035595 }
35596 }
35597
35598 TEST(QC8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD128, k_lt_8_subtile) {
35599 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080035600 for (uint32_t n = 1; n <= 4; n++) {
35601 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035602 GemmMicrokernelTester()
35603 .mr(3)
35604 .nr(4)
35605 .kr(2)
35606 .sr(4)
35607 .m(m)
35608 .n(n)
35609 .k(k)
35610 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080035611 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035612 }
35613 }
35614 }
35615 }
35616
35617 TEST(QC8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD128, k_gt_8) {
35618 for (size_t k = 9; k < 16; k++) {
35619 GemmMicrokernelTester()
35620 .mr(3)
35621 .nr(4)
35622 .kr(2)
35623 .sr(4)
35624 .m(3)
35625 .n(4)
35626 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080035627 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035628 }
35629 }
35630
35631 TEST(QC8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD128, k_gt_8_strided_a) {
35632 for (size_t k = 9; k < 16; k++) {
35633 GemmMicrokernelTester()
35634 .mr(3)
35635 .nr(4)
35636 .kr(2)
35637 .sr(4)
35638 .m(3)
35639 .n(4)
35640 .k(k)
35641 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080035642 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035643 }
35644 }
35645
35646 TEST(QC8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD128, k_gt_8_subtile) {
35647 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080035648 for (uint32_t n = 1; n <= 4; n++) {
35649 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035650 GemmMicrokernelTester()
35651 .mr(3)
35652 .nr(4)
35653 .kr(2)
35654 .sr(4)
35655 .m(m)
35656 .n(n)
35657 .k(k)
35658 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080035659 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035660 }
35661 }
35662 }
35663 }
35664
35665 TEST(QC8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD128, k_div_8) {
35666 for (size_t k = 16; k <= 80; k += 8) {
35667 GemmMicrokernelTester()
35668 .mr(3)
35669 .nr(4)
35670 .kr(2)
35671 .sr(4)
35672 .m(3)
35673 .n(4)
35674 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080035675 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035676 }
35677 }
35678
35679 TEST(QC8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD128, k_div_8_strided_a) {
35680 for (size_t k = 16; k <= 80; k += 8) {
35681 GemmMicrokernelTester()
35682 .mr(3)
35683 .nr(4)
35684 .kr(2)
35685 .sr(4)
35686 .m(3)
35687 .n(4)
35688 .k(k)
35689 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080035690 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035691 }
35692 }
35693
35694 TEST(QC8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD128, k_div_8_subtile) {
35695 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080035696 for (uint32_t n = 1; n <= 4; n++) {
35697 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035698 GemmMicrokernelTester()
35699 .mr(3)
35700 .nr(4)
35701 .kr(2)
35702 .sr(4)
35703 .m(m)
35704 .n(n)
35705 .k(k)
35706 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080035707 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035708 }
35709 }
35710 }
35711 }
35712
35713 TEST(QC8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD128, n_gt_4) {
35714 for (uint32_t n = 5; n < 8; n++) {
35715 for (size_t k = 1; k <= 40; k += 9) {
35716 GemmMicrokernelTester()
35717 .mr(3)
35718 .nr(4)
35719 .kr(2)
35720 .sr(4)
35721 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080035722 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035723 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080035724 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035725 }
35726 }
35727 }
35728
35729 TEST(QC8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD128, n_gt_4_strided_cn) {
35730 for (uint32_t n = 5; n < 8; n++) {
35731 for (size_t k = 1; k <= 40; k += 9) {
35732 GemmMicrokernelTester()
35733 .mr(3)
35734 .nr(4)
35735 .kr(2)
35736 .sr(4)
35737 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080035738 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035739 .k(k)
35740 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080035741 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035742 }
35743 }
35744 }
35745
35746 TEST(QC8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD128, n_gt_4_strided_a) {
35747 for (uint32_t n = 5; n < 8; n++) {
35748 for (size_t k = 1; k <= 40; k += 9) {
35749 GemmMicrokernelTester()
35750 .mr(3)
35751 .nr(4)
35752 .kr(2)
35753 .sr(4)
35754 .m(3)
35755 .n(n)
35756 .k(k)
35757 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080035758 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035759 }
35760 }
35761 }
35762
35763 TEST(QC8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD128, n_gt_4_subtile) {
35764 for (uint32_t n = 5; n < 8; n++) {
35765 for (size_t k = 1; k <= 40; k += 9) {
35766 for (uint32_t m = 1; m <= 3; m++) {
35767 GemmMicrokernelTester()
35768 .mr(3)
35769 .nr(4)
35770 .kr(2)
35771 .sr(4)
35772 .m(m)
35773 .n(n)
35774 .k(k)
35775 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080035776 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035777 }
35778 }
35779 }
35780 }
35781
35782 TEST(QC8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD128, n_div_4) {
35783 for (uint32_t n = 8; n <= 12; n += 4) {
35784 for (size_t k = 1; k <= 40; k += 9) {
35785 GemmMicrokernelTester()
35786 .mr(3)
35787 .nr(4)
35788 .kr(2)
35789 .sr(4)
35790 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080035791 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035792 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080035793 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035794 }
35795 }
35796 }
35797
35798 TEST(QC8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD128, n_div_4_strided_cn) {
35799 for (uint32_t n = 8; n <= 12; n += 4) {
35800 for (size_t k = 1; k <= 40; k += 9) {
35801 GemmMicrokernelTester()
35802 .mr(3)
35803 .nr(4)
35804 .kr(2)
35805 .sr(4)
35806 .m(3)
35807 .n(n)
35808 .k(k)
35809 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080035810 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035811 }
35812 }
35813 }
35814
35815 TEST(QC8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD128, n_div_4_strided_a) {
35816 for (uint32_t n = 8; n <= 12; n += 4) {
35817 for (size_t k = 1; k <= 40; k += 9) {
35818 GemmMicrokernelTester()
35819 .mr(3)
35820 .nr(4)
35821 .kr(2)
35822 .sr(4)
35823 .m(3)
35824 .n(n)
35825 .k(k)
35826 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080035827 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035828 }
35829 }
35830 }
35831
35832 TEST(QC8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD128, n_div_4_subtile) {
35833 for (uint32_t n = 8; n <= 12; n += 4) {
35834 for (size_t k = 1; k <= 40; k += 9) {
35835 for (uint32_t m = 1; m <= 3; m++) {
35836 GemmMicrokernelTester()
35837 .mr(3)
35838 .nr(4)
35839 .kr(2)
35840 .sr(4)
35841 .m(m)
35842 .n(n)
35843 .k(k)
35844 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080035845 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035846 }
35847 }
35848 }
35849 }
35850
35851 TEST(QC8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD128, strided_cm_subtile) {
35852 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080035853 for (uint32_t n = 1; n <= 4; n++) {
35854 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035855 GemmMicrokernelTester()
35856 .mr(3)
35857 .nr(4)
35858 .kr(2)
35859 .sr(4)
35860 .m(m)
35861 .n(n)
35862 .k(k)
35863 .cm_stride(7)
35864 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080035865 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035866 }
35867 }
35868 }
35869 }
35870
35871 TEST(QC8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD128, qmin) {
35872 GemmMicrokernelTester()
35873 .mr(3)
35874 .nr(4)
35875 .kr(2)
35876 .sr(4)
35877 .m(3)
35878 .n(4)
35879 .k(8)
35880 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080035881 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035882 }
35883
35884 TEST(QC8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD128, qmax) {
35885 GemmMicrokernelTester()
35886 .mr(3)
35887 .nr(4)
35888 .kr(2)
35889 .sr(4)
35890 .m(3)
35891 .n(4)
35892 .k(8)
35893 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080035894 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035895 }
35896
35897 TEST(QC8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD128, strided_cm) {
35898 GemmMicrokernelTester()
35899 .mr(3)
35900 .nr(4)
35901 .kr(2)
35902 .sr(4)
35903 .m(3)
35904 .n(4)
35905 .k(8)
35906 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080035907 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035908 }
35909#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
35910
35911
35912#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
35913 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_DOT16X2_LD64, k_eq_8) {
35914 GemmMicrokernelTester()
35915 .mr(1)
35916 .nr(4)
35917 .kr(8)
35918 .sr(1)
35919 .m(1)
35920 .n(4)
35921 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080035922 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035923 }
35924
35925 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_DOT16X2_LD64, strided_cn) {
35926 GemmMicrokernelTester()
35927 .mr(1)
35928 .nr(4)
35929 .kr(8)
35930 .sr(1)
35931 .m(1)
35932 .n(4)
35933 .k(8)
35934 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080035935 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035936 }
35937
35938 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_DOT16X2_LD64, k_eq_8_strided_a) {
35939 GemmMicrokernelTester()
35940 .mr(1)
35941 .nr(4)
35942 .kr(8)
35943 .sr(1)
35944 .m(1)
35945 .n(4)
35946 .k(8)
35947 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080035948 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035949 }
35950
35951 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_DOT16X2_LD64, k_eq_8_subtile) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080035952 for (uint32_t n = 1; n <= 4; n++) {
35953 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035954 GemmMicrokernelTester()
35955 .mr(1)
35956 .nr(4)
35957 .kr(8)
35958 .sr(1)
35959 .m(m)
35960 .n(n)
35961 .k(8)
35962 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080035963 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035964 }
35965 }
35966 }
35967
35968 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_DOT16X2_LD64, k_eq_8_subtile_m) {
35969 for (uint32_t m = 1; m <= 1; m++) {
35970 GemmMicrokernelTester()
35971 .mr(1)
35972 .nr(4)
35973 .kr(8)
35974 .sr(1)
35975 .m(m)
35976 .n(4)
35977 .k(8)
35978 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080035979 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035980 }
35981 }
35982
35983 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_DOT16X2_LD64, k_eq_8_subtile_n) {
35984 for (uint32_t n = 1; n <= 4; n++) {
35985 GemmMicrokernelTester()
35986 .mr(1)
35987 .nr(4)
35988 .kr(8)
35989 .sr(1)
35990 .m(1)
35991 .n(n)
35992 .k(8)
35993 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080035994 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080035995 }
35996 }
35997
35998 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_DOT16X2_LD64, k_lt_8) {
35999 for (size_t k = 1; k < 8; k++) {
36000 GemmMicrokernelTester()
36001 .mr(1)
36002 .nr(4)
36003 .kr(8)
36004 .sr(1)
36005 .m(1)
36006 .n(4)
36007 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080036008 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036009 }
36010 }
36011
36012 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_DOT16X2_LD64, k_lt_8_strided_a) {
36013 for (size_t k = 1; k < 8; k++) {
36014 GemmMicrokernelTester()
36015 .mr(1)
36016 .nr(4)
36017 .kr(8)
36018 .sr(1)
36019 .m(1)
36020 .n(4)
36021 .k(k)
36022 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080036023 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036024 }
36025 }
36026
36027 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_DOT16X2_LD64, k_lt_8_subtile) {
36028 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080036029 for (uint32_t n = 1; n <= 4; n++) {
36030 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036031 GemmMicrokernelTester()
36032 .mr(1)
36033 .nr(4)
36034 .kr(8)
36035 .sr(1)
36036 .m(m)
36037 .n(n)
36038 .k(k)
36039 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080036040 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036041 }
36042 }
36043 }
36044 }
36045
36046 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_DOT16X2_LD64, k_gt_8) {
36047 for (size_t k = 9; k < 16; k++) {
36048 GemmMicrokernelTester()
36049 .mr(1)
36050 .nr(4)
36051 .kr(8)
36052 .sr(1)
36053 .m(1)
36054 .n(4)
36055 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080036056 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036057 }
36058 }
36059
36060 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_DOT16X2_LD64, k_gt_8_strided_a) {
36061 for (size_t k = 9; k < 16; k++) {
36062 GemmMicrokernelTester()
36063 .mr(1)
36064 .nr(4)
36065 .kr(8)
36066 .sr(1)
36067 .m(1)
36068 .n(4)
36069 .k(k)
36070 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080036071 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036072 }
36073 }
36074
36075 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_DOT16X2_LD64, k_gt_8_subtile) {
36076 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080036077 for (uint32_t n = 1; n <= 4; n++) {
36078 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036079 GemmMicrokernelTester()
36080 .mr(1)
36081 .nr(4)
36082 .kr(8)
36083 .sr(1)
36084 .m(m)
36085 .n(n)
36086 .k(k)
36087 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080036088 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036089 }
36090 }
36091 }
36092 }
36093
36094 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_DOT16X2_LD64, k_div_8) {
36095 for (size_t k = 16; k <= 80; k += 8) {
36096 GemmMicrokernelTester()
36097 .mr(1)
36098 .nr(4)
36099 .kr(8)
36100 .sr(1)
36101 .m(1)
36102 .n(4)
36103 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080036104 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036105 }
36106 }
36107
36108 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_DOT16X2_LD64, k_div_8_strided_a) {
36109 for (size_t k = 16; k <= 80; k += 8) {
36110 GemmMicrokernelTester()
36111 .mr(1)
36112 .nr(4)
36113 .kr(8)
36114 .sr(1)
36115 .m(1)
36116 .n(4)
36117 .k(k)
36118 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080036119 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036120 }
36121 }
36122
36123 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_DOT16X2_LD64, k_div_8_subtile) {
36124 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080036125 for (uint32_t n = 1; n <= 4; n++) {
36126 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036127 GemmMicrokernelTester()
36128 .mr(1)
36129 .nr(4)
36130 .kr(8)
36131 .sr(1)
36132 .m(m)
36133 .n(n)
36134 .k(k)
36135 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080036136 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036137 }
36138 }
36139 }
36140 }
36141
36142 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_DOT16X2_LD64, n_gt_4) {
36143 for (uint32_t n = 5; n < 8; n++) {
36144 for (size_t k = 1; k <= 40; k += 9) {
36145 GemmMicrokernelTester()
36146 .mr(1)
36147 .nr(4)
36148 .kr(8)
36149 .sr(1)
36150 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080036151 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036152 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080036153 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036154 }
36155 }
36156 }
36157
36158 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_DOT16X2_LD64, n_gt_4_strided_cn) {
36159 for (uint32_t n = 5; n < 8; n++) {
36160 for (size_t k = 1; k <= 40; k += 9) {
36161 GemmMicrokernelTester()
36162 .mr(1)
36163 .nr(4)
36164 .kr(8)
36165 .sr(1)
36166 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080036167 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036168 .k(k)
36169 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080036170 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036171 }
36172 }
36173 }
36174
36175 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_DOT16X2_LD64, n_gt_4_strided_a) {
36176 for (uint32_t n = 5; n < 8; n++) {
36177 for (size_t k = 1; k <= 40; k += 9) {
36178 GemmMicrokernelTester()
36179 .mr(1)
36180 .nr(4)
36181 .kr(8)
36182 .sr(1)
36183 .m(1)
36184 .n(n)
36185 .k(k)
36186 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080036187 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036188 }
36189 }
36190 }
36191
36192 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_DOT16X2_LD64, n_gt_4_subtile) {
36193 for (uint32_t n = 5; n < 8; n++) {
36194 for (size_t k = 1; k <= 40; k += 9) {
36195 for (uint32_t m = 1; m <= 1; m++) {
36196 GemmMicrokernelTester()
36197 .mr(1)
36198 .nr(4)
36199 .kr(8)
36200 .sr(1)
36201 .m(m)
36202 .n(n)
36203 .k(k)
36204 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080036205 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036206 }
36207 }
36208 }
36209 }
36210
36211 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_DOT16X2_LD64, n_div_4) {
36212 for (uint32_t n = 8; n <= 12; n += 4) {
36213 for (size_t k = 1; k <= 40; k += 9) {
36214 GemmMicrokernelTester()
36215 .mr(1)
36216 .nr(4)
36217 .kr(8)
36218 .sr(1)
36219 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080036220 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036221 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080036222 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036223 }
36224 }
36225 }
36226
36227 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_DOT16X2_LD64, n_div_4_strided_cn) {
36228 for (uint32_t n = 8; n <= 12; n += 4) {
36229 for (size_t k = 1; k <= 40; k += 9) {
36230 GemmMicrokernelTester()
36231 .mr(1)
36232 .nr(4)
36233 .kr(8)
36234 .sr(1)
36235 .m(1)
36236 .n(n)
36237 .k(k)
36238 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080036239 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036240 }
36241 }
36242 }
36243
36244 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_DOT16X2_LD64, n_div_4_strided_a) {
36245 for (uint32_t n = 8; n <= 12; n += 4) {
36246 for (size_t k = 1; k <= 40; k += 9) {
36247 GemmMicrokernelTester()
36248 .mr(1)
36249 .nr(4)
36250 .kr(8)
36251 .sr(1)
36252 .m(1)
36253 .n(n)
36254 .k(k)
36255 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080036256 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036257 }
36258 }
36259 }
36260
36261 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_DOT16X2_LD64, n_div_4_subtile) {
36262 for (uint32_t n = 8; n <= 12; n += 4) {
36263 for (size_t k = 1; k <= 40; k += 9) {
36264 for (uint32_t m = 1; m <= 1; m++) {
36265 GemmMicrokernelTester()
36266 .mr(1)
36267 .nr(4)
36268 .kr(8)
36269 .sr(1)
36270 .m(m)
36271 .n(n)
36272 .k(k)
36273 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080036274 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036275 }
36276 }
36277 }
36278 }
36279
36280 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_DOT16X2_LD64, strided_cm_subtile) {
36281 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080036282 for (uint32_t n = 1; n <= 4; n++) {
36283 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036284 GemmMicrokernelTester()
36285 .mr(1)
36286 .nr(4)
36287 .kr(8)
36288 .sr(1)
36289 .m(m)
36290 .n(n)
36291 .k(k)
36292 .cm_stride(7)
36293 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080036294 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036295 }
36296 }
36297 }
36298 }
36299
36300 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_DOT16X2_LD64, qmin) {
36301 GemmMicrokernelTester()
36302 .mr(1)
36303 .nr(4)
36304 .kr(8)
36305 .sr(1)
36306 .m(1)
36307 .n(4)
36308 .k(8)
36309 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080036310 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036311 }
36312
36313 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_DOT16X2_LD64, qmax) {
36314 GemmMicrokernelTester()
36315 .mr(1)
36316 .nr(4)
36317 .kr(8)
36318 .sr(1)
36319 .m(1)
36320 .n(4)
36321 .k(8)
36322 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080036323 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036324 }
36325
36326 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_DOT16X2_LD64, strided_cm) {
36327 GemmMicrokernelTester()
36328 .mr(1)
36329 .nr(4)
36330 .kr(8)
36331 .sr(1)
36332 .m(1)
36333 .n(4)
36334 .k(8)
36335 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080036336 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036337 }
36338#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
36339
36340
36341#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
36342 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD64, k_eq_8) {
36343 GemmMicrokernelTester()
36344 .mr(2)
36345 .nr(4)
36346 .kr(8)
36347 .sr(1)
36348 .m(2)
36349 .n(4)
36350 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080036351 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036352 }
36353
36354 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD64, strided_cn) {
36355 GemmMicrokernelTester()
36356 .mr(2)
36357 .nr(4)
36358 .kr(8)
36359 .sr(1)
36360 .m(2)
36361 .n(4)
36362 .k(8)
36363 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080036364 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036365 }
36366
36367 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD64, k_eq_8_strided_a) {
36368 GemmMicrokernelTester()
36369 .mr(2)
36370 .nr(4)
36371 .kr(8)
36372 .sr(1)
36373 .m(2)
36374 .n(4)
36375 .k(8)
36376 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080036377 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036378 }
36379
36380 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD64, k_eq_8_subtile) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080036381 for (uint32_t n = 1; n <= 4; n++) {
36382 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036383 GemmMicrokernelTester()
36384 .mr(2)
36385 .nr(4)
36386 .kr(8)
36387 .sr(1)
36388 .m(m)
36389 .n(n)
36390 .k(8)
36391 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080036392 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036393 }
36394 }
36395 }
36396
36397 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD64, k_eq_8_subtile_m) {
36398 for (uint32_t m = 1; m <= 2; m++) {
36399 GemmMicrokernelTester()
36400 .mr(2)
36401 .nr(4)
36402 .kr(8)
36403 .sr(1)
36404 .m(m)
36405 .n(4)
36406 .k(8)
36407 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080036408 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036409 }
36410 }
36411
36412 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD64, k_eq_8_subtile_n) {
36413 for (uint32_t n = 1; n <= 4; n++) {
36414 GemmMicrokernelTester()
36415 .mr(2)
36416 .nr(4)
36417 .kr(8)
36418 .sr(1)
36419 .m(2)
36420 .n(n)
36421 .k(8)
36422 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080036423 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036424 }
36425 }
36426
36427 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD64, k_lt_8) {
36428 for (size_t k = 1; k < 8; k++) {
36429 GemmMicrokernelTester()
36430 .mr(2)
36431 .nr(4)
36432 .kr(8)
36433 .sr(1)
36434 .m(2)
36435 .n(4)
36436 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080036437 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036438 }
36439 }
36440
36441 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD64, k_lt_8_strided_a) {
36442 for (size_t k = 1; k < 8; k++) {
36443 GemmMicrokernelTester()
36444 .mr(2)
36445 .nr(4)
36446 .kr(8)
36447 .sr(1)
36448 .m(2)
36449 .n(4)
36450 .k(k)
36451 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080036452 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036453 }
36454 }
36455
36456 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD64, k_lt_8_subtile) {
36457 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080036458 for (uint32_t n = 1; n <= 4; n++) {
36459 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036460 GemmMicrokernelTester()
36461 .mr(2)
36462 .nr(4)
36463 .kr(8)
36464 .sr(1)
36465 .m(m)
36466 .n(n)
36467 .k(k)
36468 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080036469 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036470 }
36471 }
36472 }
36473 }
36474
36475 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD64, k_gt_8) {
36476 for (size_t k = 9; k < 16; k++) {
36477 GemmMicrokernelTester()
36478 .mr(2)
36479 .nr(4)
36480 .kr(8)
36481 .sr(1)
36482 .m(2)
36483 .n(4)
36484 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080036485 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036486 }
36487 }
36488
36489 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD64, k_gt_8_strided_a) {
36490 for (size_t k = 9; k < 16; k++) {
36491 GemmMicrokernelTester()
36492 .mr(2)
36493 .nr(4)
36494 .kr(8)
36495 .sr(1)
36496 .m(2)
36497 .n(4)
36498 .k(k)
36499 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080036500 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036501 }
36502 }
36503
36504 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD64, k_gt_8_subtile) {
36505 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080036506 for (uint32_t n = 1; n <= 4; n++) {
36507 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036508 GemmMicrokernelTester()
36509 .mr(2)
36510 .nr(4)
36511 .kr(8)
36512 .sr(1)
36513 .m(m)
36514 .n(n)
36515 .k(k)
36516 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080036517 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036518 }
36519 }
36520 }
36521 }
36522
36523 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD64, k_div_8) {
36524 for (size_t k = 16; k <= 80; k += 8) {
36525 GemmMicrokernelTester()
36526 .mr(2)
36527 .nr(4)
36528 .kr(8)
36529 .sr(1)
36530 .m(2)
36531 .n(4)
36532 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080036533 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036534 }
36535 }
36536
36537 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD64, k_div_8_strided_a) {
36538 for (size_t k = 16; k <= 80; k += 8) {
36539 GemmMicrokernelTester()
36540 .mr(2)
36541 .nr(4)
36542 .kr(8)
36543 .sr(1)
36544 .m(2)
36545 .n(4)
36546 .k(k)
36547 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080036548 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036549 }
36550 }
36551
36552 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD64, k_div_8_subtile) {
36553 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080036554 for (uint32_t n = 1; n <= 4; n++) {
36555 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036556 GemmMicrokernelTester()
36557 .mr(2)
36558 .nr(4)
36559 .kr(8)
36560 .sr(1)
36561 .m(m)
36562 .n(n)
36563 .k(k)
36564 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080036565 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036566 }
36567 }
36568 }
36569 }
36570
36571 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD64, n_gt_4) {
36572 for (uint32_t n = 5; n < 8; n++) {
36573 for (size_t k = 1; k <= 40; k += 9) {
36574 GemmMicrokernelTester()
36575 .mr(2)
36576 .nr(4)
36577 .kr(8)
36578 .sr(1)
36579 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080036580 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036581 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080036582 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036583 }
36584 }
36585 }
36586
36587 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD64, n_gt_4_strided_cn) {
36588 for (uint32_t n = 5; n < 8; n++) {
36589 for (size_t k = 1; k <= 40; k += 9) {
36590 GemmMicrokernelTester()
36591 .mr(2)
36592 .nr(4)
36593 .kr(8)
36594 .sr(1)
36595 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080036596 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036597 .k(k)
36598 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080036599 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036600 }
36601 }
36602 }
36603
36604 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD64, n_gt_4_strided_a) {
36605 for (uint32_t n = 5; n < 8; n++) {
36606 for (size_t k = 1; k <= 40; k += 9) {
36607 GemmMicrokernelTester()
36608 .mr(2)
36609 .nr(4)
36610 .kr(8)
36611 .sr(1)
36612 .m(2)
36613 .n(n)
36614 .k(k)
36615 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080036616 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036617 }
36618 }
36619 }
36620
36621 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD64, n_gt_4_subtile) {
36622 for (uint32_t n = 5; n < 8; n++) {
36623 for (size_t k = 1; k <= 40; k += 9) {
36624 for (uint32_t m = 1; m <= 2; m++) {
36625 GemmMicrokernelTester()
36626 .mr(2)
36627 .nr(4)
36628 .kr(8)
36629 .sr(1)
36630 .m(m)
36631 .n(n)
36632 .k(k)
36633 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080036634 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036635 }
36636 }
36637 }
36638 }
36639
36640 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD64, n_div_4) {
36641 for (uint32_t n = 8; n <= 12; n += 4) {
36642 for (size_t k = 1; k <= 40; k += 9) {
36643 GemmMicrokernelTester()
36644 .mr(2)
36645 .nr(4)
36646 .kr(8)
36647 .sr(1)
36648 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080036649 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036650 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080036651 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036652 }
36653 }
36654 }
36655
36656 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD64, n_div_4_strided_cn) {
36657 for (uint32_t n = 8; n <= 12; n += 4) {
36658 for (size_t k = 1; k <= 40; k += 9) {
36659 GemmMicrokernelTester()
36660 .mr(2)
36661 .nr(4)
36662 .kr(8)
36663 .sr(1)
36664 .m(2)
36665 .n(n)
36666 .k(k)
36667 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080036668 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036669 }
36670 }
36671 }
36672
36673 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD64, n_div_4_strided_a) {
36674 for (uint32_t n = 8; n <= 12; n += 4) {
36675 for (size_t k = 1; k <= 40; k += 9) {
36676 GemmMicrokernelTester()
36677 .mr(2)
36678 .nr(4)
36679 .kr(8)
36680 .sr(1)
36681 .m(2)
36682 .n(n)
36683 .k(k)
36684 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080036685 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036686 }
36687 }
36688 }
36689
36690 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD64, n_div_4_subtile) {
36691 for (uint32_t n = 8; n <= 12; n += 4) {
36692 for (size_t k = 1; k <= 40; k += 9) {
36693 for (uint32_t m = 1; m <= 2; m++) {
36694 GemmMicrokernelTester()
36695 .mr(2)
36696 .nr(4)
36697 .kr(8)
36698 .sr(1)
36699 .m(m)
36700 .n(n)
36701 .k(k)
36702 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080036703 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036704 }
36705 }
36706 }
36707 }
36708
36709 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD64, strided_cm_subtile) {
36710 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080036711 for (uint32_t n = 1; n <= 4; n++) {
36712 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036713 GemmMicrokernelTester()
36714 .mr(2)
36715 .nr(4)
36716 .kr(8)
36717 .sr(1)
36718 .m(m)
36719 .n(n)
36720 .k(k)
36721 .cm_stride(7)
36722 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080036723 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036724 }
36725 }
36726 }
36727 }
36728
36729 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD64, qmin) {
36730 GemmMicrokernelTester()
36731 .mr(2)
36732 .nr(4)
36733 .kr(8)
36734 .sr(1)
36735 .m(2)
36736 .n(4)
36737 .k(8)
36738 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080036739 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036740 }
36741
36742 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD64, qmax) {
36743 GemmMicrokernelTester()
36744 .mr(2)
36745 .nr(4)
36746 .kr(8)
36747 .sr(1)
36748 .m(2)
36749 .n(4)
36750 .k(8)
36751 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080036752 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036753 }
36754
36755 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD64, strided_cm) {
36756 GemmMicrokernelTester()
36757 .mr(2)
36758 .nr(4)
36759 .kr(8)
36760 .sr(1)
36761 .m(2)
36762 .n(4)
36763 .k(8)
36764 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080036765 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036766 }
36767#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
36768
36769
36770#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
36771 TEST(QC8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD64, k_eq_8) {
36772 GemmMicrokernelTester()
36773 .mr(4)
36774 .nr(4)
36775 .kr(8)
36776 .sr(1)
36777 .m(4)
36778 .n(4)
36779 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080036780 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036781 }
36782
36783 TEST(QC8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD64, strided_cn) {
36784 GemmMicrokernelTester()
36785 .mr(4)
36786 .nr(4)
36787 .kr(8)
36788 .sr(1)
36789 .m(4)
36790 .n(4)
36791 .k(8)
36792 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080036793 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036794 }
36795
36796 TEST(QC8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD64, k_eq_8_strided_a) {
36797 GemmMicrokernelTester()
36798 .mr(4)
36799 .nr(4)
36800 .kr(8)
36801 .sr(1)
36802 .m(4)
36803 .n(4)
36804 .k(8)
36805 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080036806 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036807 }
36808
36809 TEST(QC8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD64, k_eq_8_subtile) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080036810 for (uint32_t n = 1; n <= 4; n++) {
36811 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036812 GemmMicrokernelTester()
36813 .mr(4)
36814 .nr(4)
36815 .kr(8)
36816 .sr(1)
36817 .m(m)
36818 .n(n)
36819 .k(8)
36820 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080036821 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036822 }
36823 }
36824 }
36825
36826 TEST(QC8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD64, k_eq_8_subtile_m) {
36827 for (uint32_t m = 1; m <= 4; m++) {
36828 GemmMicrokernelTester()
36829 .mr(4)
36830 .nr(4)
36831 .kr(8)
36832 .sr(1)
36833 .m(m)
36834 .n(4)
36835 .k(8)
36836 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080036837 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036838 }
36839 }
36840
36841 TEST(QC8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD64, k_eq_8_subtile_n) {
36842 for (uint32_t n = 1; n <= 4; n++) {
36843 GemmMicrokernelTester()
36844 .mr(4)
36845 .nr(4)
36846 .kr(8)
36847 .sr(1)
36848 .m(4)
36849 .n(n)
36850 .k(8)
36851 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080036852 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036853 }
36854 }
36855
36856 TEST(QC8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD64, k_lt_8) {
36857 for (size_t k = 1; k < 8; k++) {
36858 GemmMicrokernelTester()
36859 .mr(4)
36860 .nr(4)
36861 .kr(8)
36862 .sr(1)
36863 .m(4)
36864 .n(4)
36865 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080036866 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036867 }
36868 }
36869
36870 TEST(QC8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD64, k_lt_8_strided_a) {
36871 for (size_t k = 1; k < 8; k++) {
36872 GemmMicrokernelTester()
36873 .mr(4)
36874 .nr(4)
36875 .kr(8)
36876 .sr(1)
36877 .m(4)
36878 .n(4)
36879 .k(k)
36880 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080036881 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036882 }
36883 }
36884
36885 TEST(QC8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD64, k_lt_8_subtile) {
36886 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080036887 for (uint32_t n = 1; n <= 4; n++) {
36888 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036889 GemmMicrokernelTester()
36890 .mr(4)
36891 .nr(4)
36892 .kr(8)
36893 .sr(1)
36894 .m(m)
36895 .n(n)
36896 .k(k)
36897 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080036898 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036899 }
36900 }
36901 }
36902 }
36903
36904 TEST(QC8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD64, k_gt_8) {
36905 for (size_t k = 9; k < 16; k++) {
36906 GemmMicrokernelTester()
36907 .mr(4)
36908 .nr(4)
36909 .kr(8)
36910 .sr(1)
36911 .m(4)
36912 .n(4)
36913 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080036914 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036915 }
36916 }
36917
36918 TEST(QC8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD64, k_gt_8_strided_a) {
36919 for (size_t k = 9; k < 16; k++) {
36920 GemmMicrokernelTester()
36921 .mr(4)
36922 .nr(4)
36923 .kr(8)
36924 .sr(1)
36925 .m(4)
36926 .n(4)
36927 .k(k)
36928 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080036929 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036930 }
36931 }
36932
36933 TEST(QC8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD64, k_gt_8_subtile) {
36934 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080036935 for (uint32_t n = 1; n <= 4; n++) {
36936 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036937 GemmMicrokernelTester()
36938 .mr(4)
36939 .nr(4)
36940 .kr(8)
36941 .sr(1)
36942 .m(m)
36943 .n(n)
36944 .k(k)
36945 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080036946 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036947 }
36948 }
36949 }
36950 }
36951
36952 TEST(QC8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD64, k_div_8) {
36953 for (size_t k = 16; k <= 80; k += 8) {
36954 GemmMicrokernelTester()
36955 .mr(4)
36956 .nr(4)
36957 .kr(8)
36958 .sr(1)
36959 .m(4)
36960 .n(4)
36961 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080036962 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036963 }
36964 }
36965
36966 TEST(QC8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD64, k_div_8_strided_a) {
36967 for (size_t k = 16; k <= 80; k += 8) {
36968 GemmMicrokernelTester()
36969 .mr(4)
36970 .nr(4)
36971 .kr(8)
36972 .sr(1)
36973 .m(4)
36974 .n(4)
36975 .k(k)
36976 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080036977 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036978 }
36979 }
36980
36981 TEST(QC8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD64, k_div_8_subtile) {
36982 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080036983 for (uint32_t n = 1; n <= 4; n++) {
36984 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036985 GemmMicrokernelTester()
36986 .mr(4)
36987 .nr(4)
36988 .kr(8)
36989 .sr(1)
36990 .m(m)
36991 .n(n)
36992 .k(k)
36993 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080036994 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080036995 }
36996 }
36997 }
36998 }
36999
37000 TEST(QC8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD64, n_gt_4) {
37001 for (uint32_t n = 5; n < 8; n++) {
37002 for (size_t k = 1; k <= 40; k += 9) {
37003 GemmMicrokernelTester()
37004 .mr(4)
37005 .nr(4)
37006 .kr(8)
37007 .sr(1)
37008 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080037009 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037010 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080037011 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037012 }
37013 }
37014 }
37015
37016 TEST(QC8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD64, n_gt_4_strided_cn) {
37017 for (uint32_t n = 5; n < 8; n++) {
37018 for (size_t k = 1; k <= 40; k += 9) {
37019 GemmMicrokernelTester()
37020 .mr(4)
37021 .nr(4)
37022 .kr(8)
37023 .sr(1)
37024 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080037025 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037026 .k(k)
37027 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080037028 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037029 }
37030 }
37031 }
37032
37033 TEST(QC8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD64, n_gt_4_strided_a) {
37034 for (uint32_t n = 5; n < 8; n++) {
37035 for (size_t k = 1; k <= 40; k += 9) {
37036 GemmMicrokernelTester()
37037 .mr(4)
37038 .nr(4)
37039 .kr(8)
37040 .sr(1)
37041 .m(4)
37042 .n(n)
37043 .k(k)
37044 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080037045 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037046 }
37047 }
37048 }
37049
37050 TEST(QC8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD64, n_gt_4_subtile) {
37051 for (uint32_t n = 5; n < 8; n++) {
37052 for (size_t k = 1; k <= 40; k += 9) {
37053 for (uint32_t m = 1; m <= 4; m++) {
37054 GemmMicrokernelTester()
37055 .mr(4)
37056 .nr(4)
37057 .kr(8)
37058 .sr(1)
37059 .m(m)
37060 .n(n)
37061 .k(k)
37062 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080037063 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037064 }
37065 }
37066 }
37067 }
37068
37069 TEST(QC8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD64, n_div_4) {
37070 for (uint32_t n = 8; n <= 12; n += 4) {
37071 for (size_t k = 1; k <= 40; k += 9) {
37072 GemmMicrokernelTester()
37073 .mr(4)
37074 .nr(4)
37075 .kr(8)
37076 .sr(1)
37077 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080037078 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037079 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080037080 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037081 }
37082 }
37083 }
37084
37085 TEST(QC8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD64, n_div_4_strided_cn) {
37086 for (uint32_t n = 8; n <= 12; n += 4) {
37087 for (size_t k = 1; k <= 40; k += 9) {
37088 GemmMicrokernelTester()
37089 .mr(4)
37090 .nr(4)
37091 .kr(8)
37092 .sr(1)
37093 .m(4)
37094 .n(n)
37095 .k(k)
37096 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080037097 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037098 }
37099 }
37100 }
37101
37102 TEST(QC8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD64, n_div_4_strided_a) {
37103 for (uint32_t n = 8; n <= 12; n += 4) {
37104 for (size_t k = 1; k <= 40; k += 9) {
37105 GemmMicrokernelTester()
37106 .mr(4)
37107 .nr(4)
37108 .kr(8)
37109 .sr(1)
37110 .m(4)
37111 .n(n)
37112 .k(k)
37113 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080037114 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037115 }
37116 }
37117 }
37118
37119 TEST(QC8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD64, n_div_4_subtile) {
37120 for (uint32_t n = 8; n <= 12; n += 4) {
37121 for (size_t k = 1; k <= 40; k += 9) {
37122 for (uint32_t m = 1; m <= 4; m++) {
37123 GemmMicrokernelTester()
37124 .mr(4)
37125 .nr(4)
37126 .kr(8)
37127 .sr(1)
37128 .m(m)
37129 .n(n)
37130 .k(k)
37131 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080037132 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037133 }
37134 }
37135 }
37136 }
37137
37138 TEST(QC8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD64, strided_cm_subtile) {
37139 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080037140 for (uint32_t n = 1; n <= 4; n++) {
37141 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037142 GemmMicrokernelTester()
37143 .mr(4)
37144 .nr(4)
37145 .kr(8)
37146 .sr(1)
37147 .m(m)
37148 .n(n)
37149 .k(k)
37150 .cm_stride(7)
37151 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080037152 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037153 }
37154 }
37155 }
37156 }
37157
37158 TEST(QC8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD64, qmin) {
37159 GemmMicrokernelTester()
37160 .mr(4)
37161 .nr(4)
37162 .kr(8)
37163 .sr(1)
37164 .m(4)
37165 .n(4)
37166 .k(8)
37167 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080037168 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037169 }
37170
37171 TEST(QC8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD64, qmax) {
37172 GemmMicrokernelTester()
37173 .mr(4)
37174 .nr(4)
37175 .kr(8)
37176 .sr(1)
37177 .m(4)
37178 .n(4)
37179 .k(8)
37180 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080037181 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037182 }
37183
37184 TEST(QC8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD64, strided_cm) {
37185 GemmMicrokernelTester()
37186 .mr(4)
37187 .nr(4)
37188 .kr(8)
37189 .sr(1)
37190 .m(4)
37191 .n(4)
37192 .k(8)
37193 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080037194 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037195 }
37196#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
37197
37198
37199#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
37200 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD128, k_eq_8) {
37201 GemmMicrokernelTester()
37202 .mr(3)
37203 .nr(4)
37204 .kr(8)
37205 .sr(1)
37206 .m(3)
37207 .n(4)
37208 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080037209 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037210 }
37211
37212 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD128, strided_cn) {
37213 GemmMicrokernelTester()
37214 .mr(3)
37215 .nr(4)
37216 .kr(8)
37217 .sr(1)
37218 .m(3)
37219 .n(4)
37220 .k(8)
37221 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080037222 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037223 }
37224
37225 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD128, k_eq_8_strided_a) {
37226 GemmMicrokernelTester()
37227 .mr(3)
37228 .nr(4)
37229 .kr(8)
37230 .sr(1)
37231 .m(3)
37232 .n(4)
37233 .k(8)
37234 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080037235 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037236 }
37237
37238 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD128, k_eq_8_subtile) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080037239 for (uint32_t n = 1; n <= 4; n++) {
37240 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037241 GemmMicrokernelTester()
37242 .mr(3)
37243 .nr(4)
37244 .kr(8)
37245 .sr(1)
37246 .m(m)
37247 .n(n)
37248 .k(8)
37249 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080037250 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037251 }
37252 }
37253 }
37254
37255 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD128, k_eq_8_subtile_m) {
37256 for (uint32_t m = 1; m <= 3; m++) {
37257 GemmMicrokernelTester()
37258 .mr(3)
37259 .nr(4)
37260 .kr(8)
37261 .sr(1)
37262 .m(m)
37263 .n(4)
37264 .k(8)
37265 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080037266 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037267 }
37268 }
37269
37270 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD128, k_eq_8_subtile_n) {
37271 for (uint32_t n = 1; n <= 4; n++) {
37272 GemmMicrokernelTester()
37273 .mr(3)
37274 .nr(4)
37275 .kr(8)
37276 .sr(1)
37277 .m(3)
37278 .n(n)
37279 .k(8)
37280 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080037281 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037282 }
37283 }
37284
37285 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD128, k_lt_8) {
37286 for (size_t k = 1; k < 8; k++) {
37287 GemmMicrokernelTester()
37288 .mr(3)
37289 .nr(4)
37290 .kr(8)
37291 .sr(1)
37292 .m(3)
37293 .n(4)
37294 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080037295 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037296 }
37297 }
37298
37299 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD128, k_lt_8_strided_a) {
37300 for (size_t k = 1; k < 8; k++) {
37301 GemmMicrokernelTester()
37302 .mr(3)
37303 .nr(4)
37304 .kr(8)
37305 .sr(1)
37306 .m(3)
37307 .n(4)
37308 .k(k)
37309 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080037310 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037311 }
37312 }
37313
37314 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD128, k_lt_8_subtile) {
37315 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080037316 for (uint32_t n = 1; n <= 4; n++) {
37317 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037318 GemmMicrokernelTester()
37319 .mr(3)
37320 .nr(4)
37321 .kr(8)
37322 .sr(1)
37323 .m(m)
37324 .n(n)
37325 .k(k)
37326 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080037327 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037328 }
37329 }
37330 }
37331 }
37332
37333 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD128, k_gt_8) {
37334 for (size_t k = 9; k < 16; k++) {
37335 GemmMicrokernelTester()
37336 .mr(3)
37337 .nr(4)
37338 .kr(8)
37339 .sr(1)
37340 .m(3)
37341 .n(4)
37342 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080037343 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037344 }
37345 }
37346
37347 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD128, k_gt_8_strided_a) {
37348 for (size_t k = 9; k < 16; k++) {
37349 GemmMicrokernelTester()
37350 .mr(3)
37351 .nr(4)
37352 .kr(8)
37353 .sr(1)
37354 .m(3)
37355 .n(4)
37356 .k(k)
37357 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080037358 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037359 }
37360 }
37361
37362 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD128, k_gt_8_subtile) {
37363 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080037364 for (uint32_t n = 1; n <= 4; n++) {
37365 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037366 GemmMicrokernelTester()
37367 .mr(3)
37368 .nr(4)
37369 .kr(8)
37370 .sr(1)
37371 .m(m)
37372 .n(n)
37373 .k(k)
37374 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080037375 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037376 }
37377 }
37378 }
37379 }
37380
37381 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD128, k_div_8) {
37382 for (size_t k = 16; k <= 80; k += 8) {
37383 GemmMicrokernelTester()
37384 .mr(3)
37385 .nr(4)
37386 .kr(8)
37387 .sr(1)
37388 .m(3)
37389 .n(4)
37390 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080037391 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037392 }
37393 }
37394
37395 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD128, k_div_8_strided_a) {
37396 for (size_t k = 16; k <= 80; k += 8) {
37397 GemmMicrokernelTester()
37398 .mr(3)
37399 .nr(4)
37400 .kr(8)
37401 .sr(1)
37402 .m(3)
37403 .n(4)
37404 .k(k)
37405 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080037406 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037407 }
37408 }
37409
37410 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD128, k_div_8_subtile) {
37411 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080037412 for (uint32_t n = 1; n <= 4; n++) {
37413 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037414 GemmMicrokernelTester()
37415 .mr(3)
37416 .nr(4)
37417 .kr(8)
37418 .sr(1)
37419 .m(m)
37420 .n(n)
37421 .k(k)
37422 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080037423 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037424 }
37425 }
37426 }
37427 }
37428
37429 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD128, n_gt_4) {
37430 for (uint32_t n = 5; n < 8; n++) {
37431 for (size_t k = 1; k <= 40; k += 9) {
37432 GemmMicrokernelTester()
37433 .mr(3)
37434 .nr(4)
37435 .kr(8)
37436 .sr(1)
37437 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080037438 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037439 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080037440 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037441 }
37442 }
37443 }
37444
37445 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD128, n_gt_4_strided_cn) {
37446 for (uint32_t n = 5; n < 8; n++) {
37447 for (size_t k = 1; k <= 40; k += 9) {
37448 GemmMicrokernelTester()
37449 .mr(3)
37450 .nr(4)
37451 .kr(8)
37452 .sr(1)
37453 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080037454 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037455 .k(k)
37456 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080037457 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037458 }
37459 }
37460 }
37461
37462 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD128, n_gt_4_strided_a) {
37463 for (uint32_t n = 5; n < 8; n++) {
37464 for (size_t k = 1; k <= 40; k += 9) {
37465 GemmMicrokernelTester()
37466 .mr(3)
37467 .nr(4)
37468 .kr(8)
37469 .sr(1)
37470 .m(3)
37471 .n(n)
37472 .k(k)
37473 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080037474 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037475 }
37476 }
37477 }
37478
37479 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD128, n_gt_4_subtile) {
37480 for (uint32_t n = 5; n < 8; n++) {
37481 for (size_t k = 1; k <= 40; k += 9) {
37482 for (uint32_t m = 1; m <= 3; m++) {
37483 GemmMicrokernelTester()
37484 .mr(3)
37485 .nr(4)
37486 .kr(8)
37487 .sr(1)
37488 .m(m)
37489 .n(n)
37490 .k(k)
37491 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080037492 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037493 }
37494 }
37495 }
37496 }
37497
37498 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD128, n_div_4) {
37499 for (uint32_t n = 8; n <= 12; n += 4) {
37500 for (size_t k = 1; k <= 40; k += 9) {
37501 GemmMicrokernelTester()
37502 .mr(3)
37503 .nr(4)
37504 .kr(8)
37505 .sr(1)
37506 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080037507 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037508 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080037509 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037510 }
37511 }
37512 }
37513
37514 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD128, n_div_4_strided_cn) {
37515 for (uint32_t n = 8; n <= 12; n += 4) {
37516 for (size_t k = 1; k <= 40; k += 9) {
37517 GemmMicrokernelTester()
37518 .mr(3)
37519 .nr(4)
37520 .kr(8)
37521 .sr(1)
37522 .m(3)
37523 .n(n)
37524 .k(k)
37525 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080037526 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037527 }
37528 }
37529 }
37530
37531 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD128, n_div_4_strided_a) {
37532 for (uint32_t n = 8; n <= 12; n += 4) {
37533 for (size_t k = 1; k <= 40; k += 9) {
37534 GemmMicrokernelTester()
37535 .mr(3)
37536 .nr(4)
37537 .kr(8)
37538 .sr(1)
37539 .m(3)
37540 .n(n)
37541 .k(k)
37542 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080037543 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037544 }
37545 }
37546 }
37547
37548 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD128, n_div_4_subtile) {
37549 for (uint32_t n = 8; n <= 12; n += 4) {
37550 for (size_t k = 1; k <= 40; k += 9) {
37551 for (uint32_t m = 1; m <= 3; m++) {
37552 GemmMicrokernelTester()
37553 .mr(3)
37554 .nr(4)
37555 .kr(8)
37556 .sr(1)
37557 .m(m)
37558 .n(n)
37559 .k(k)
37560 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080037561 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037562 }
37563 }
37564 }
37565 }
37566
37567 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD128, strided_cm_subtile) {
37568 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080037569 for (uint32_t n = 1; n <= 4; n++) {
37570 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037571 GemmMicrokernelTester()
37572 .mr(3)
37573 .nr(4)
37574 .kr(8)
37575 .sr(1)
37576 .m(m)
37577 .n(n)
37578 .k(k)
37579 .cm_stride(7)
37580 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080037581 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037582 }
37583 }
37584 }
37585 }
37586
37587 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD128, qmin) {
37588 GemmMicrokernelTester()
37589 .mr(3)
37590 .nr(4)
37591 .kr(8)
37592 .sr(1)
37593 .m(3)
37594 .n(4)
37595 .k(8)
37596 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080037597 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037598 }
37599
37600 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD128, qmax) {
37601 GemmMicrokernelTester()
37602 .mr(3)
37603 .nr(4)
37604 .kr(8)
37605 .sr(1)
37606 .m(3)
37607 .n(4)
37608 .k(8)
37609 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080037610 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037611 }
37612
37613 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD128, strided_cm) {
37614 GemmMicrokernelTester()
37615 .mr(3)
37616 .nr(4)
37617 .kr(8)
37618 .sr(1)
37619 .m(3)
37620 .n(4)
37621 .k(8)
37622 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080037623 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037624 }
37625#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
37626
37627
37628#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
37629 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_eq_8) {
37630 GemmMicrokernelTester()
37631 .mr(3)
37632 .nr(4)
37633 .kr(8)
37634 .sr(1)
37635 .m(3)
37636 .n(4)
37637 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080037638 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037639 }
37640
37641 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, strided_cn) {
37642 GemmMicrokernelTester()
37643 .mr(3)
37644 .nr(4)
37645 .kr(8)
37646 .sr(1)
37647 .m(3)
37648 .n(4)
37649 .k(8)
37650 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080037651 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037652 }
37653
37654 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_eq_8_strided_a) {
37655 GemmMicrokernelTester()
37656 .mr(3)
37657 .nr(4)
37658 .kr(8)
37659 .sr(1)
37660 .m(3)
37661 .n(4)
37662 .k(8)
37663 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080037664 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037665 }
37666
37667 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_eq_8_subtile) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080037668 for (uint32_t n = 1; n <= 4; n++) {
37669 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037670 GemmMicrokernelTester()
37671 .mr(3)
37672 .nr(4)
37673 .kr(8)
37674 .sr(1)
37675 .m(m)
37676 .n(n)
37677 .k(8)
37678 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080037679 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037680 }
37681 }
37682 }
37683
37684 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_eq_8_subtile_m) {
37685 for (uint32_t m = 1; m <= 3; m++) {
37686 GemmMicrokernelTester()
37687 .mr(3)
37688 .nr(4)
37689 .kr(8)
37690 .sr(1)
37691 .m(m)
37692 .n(4)
37693 .k(8)
37694 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080037695 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037696 }
37697 }
37698
37699 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_eq_8_subtile_n) {
37700 for (uint32_t n = 1; n <= 4; n++) {
37701 GemmMicrokernelTester()
37702 .mr(3)
37703 .nr(4)
37704 .kr(8)
37705 .sr(1)
37706 .m(3)
37707 .n(n)
37708 .k(8)
37709 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080037710 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037711 }
37712 }
37713
37714 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_lt_8) {
37715 for (size_t k = 1; k < 8; k++) {
37716 GemmMicrokernelTester()
37717 .mr(3)
37718 .nr(4)
37719 .kr(8)
37720 .sr(1)
37721 .m(3)
37722 .n(4)
37723 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080037724 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037725 }
37726 }
37727
37728 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_lt_8_strided_a) {
37729 for (size_t k = 1; k < 8; k++) {
37730 GemmMicrokernelTester()
37731 .mr(3)
37732 .nr(4)
37733 .kr(8)
37734 .sr(1)
37735 .m(3)
37736 .n(4)
37737 .k(k)
37738 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080037739 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037740 }
37741 }
37742
37743 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_lt_8_subtile) {
37744 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080037745 for (uint32_t n = 1; n <= 4; n++) {
37746 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037747 GemmMicrokernelTester()
37748 .mr(3)
37749 .nr(4)
37750 .kr(8)
37751 .sr(1)
37752 .m(m)
37753 .n(n)
37754 .k(k)
37755 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080037756 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037757 }
37758 }
37759 }
37760 }
37761
37762 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_gt_8) {
37763 for (size_t k = 9; k < 16; k++) {
37764 GemmMicrokernelTester()
37765 .mr(3)
37766 .nr(4)
37767 .kr(8)
37768 .sr(1)
37769 .m(3)
37770 .n(4)
37771 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080037772 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037773 }
37774 }
37775
37776 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_gt_8_strided_a) {
37777 for (size_t k = 9; k < 16; k++) {
37778 GemmMicrokernelTester()
37779 .mr(3)
37780 .nr(4)
37781 .kr(8)
37782 .sr(1)
37783 .m(3)
37784 .n(4)
37785 .k(k)
37786 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080037787 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037788 }
37789 }
37790
37791 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_gt_8_subtile) {
37792 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080037793 for (uint32_t n = 1; n <= 4; n++) {
37794 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037795 GemmMicrokernelTester()
37796 .mr(3)
37797 .nr(4)
37798 .kr(8)
37799 .sr(1)
37800 .m(m)
37801 .n(n)
37802 .k(k)
37803 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080037804 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037805 }
37806 }
37807 }
37808 }
37809
37810 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_div_8) {
37811 for (size_t k = 16; k <= 80; k += 8) {
37812 GemmMicrokernelTester()
37813 .mr(3)
37814 .nr(4)
37815 .kr(8)
37816 .sr(1)
37817 .m(3)
37818 .n(4)
37819 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080037820 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037821 }
37822 }
37823
37824 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_div_8_strided_a) {
37825 for (size_t k = 16; k <= 80; k += 8) {
37826 GemmMicrokernelTester()
37827 .mr(3)
37828 .nr(4)
37829 .kr(8)
37830 .sr(1)
37831 .m(3)
37832 .n(4)
37833 .k(k)
37834 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080037835 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037836 }
37837 }
37838
37839 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_div_8_subtile) {
37840 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080037841 for (uint32_t n = 1; n <= 4; n++) {
37842 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037843 GemmMicrokernelTester()
37844 .mr(3)
37845 .nr(4)
37846 .kr(8)
37847 .sr(1)
37848 .m(m)
37849 .n(n)
37850 .k(k)
37851 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080037852 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037853 }
37854 }
37855 }
37856 }
37857
37858 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, n_gt_4) {
37859 for (uint32_t n = 5; n < 8; n++) {
37860 for (size_t k = 1; k <= 40; k += 9) {
37861 GemmMicrokernelTester()
37862 .mr(3)
37863 .nr(4)
37864 .kr(8)
37865 .sr(1)
37866 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080037867 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037868 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080037869 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037870 }
37871 }
37872 }
37873
37874 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, n_gt_4_strided_cn) {
37875 for (uint32_t n = 5; n < 8; n++) {
37876 for (size_t k = 1; k <= 40; k += 9) {
37877 GemmMicrokernelTester()
37878 .mr(3)
37879 .nr(4)
37880 .kr(8)
37881 .sr(1)
37882 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080037883 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037884 .k(k)
37885 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080037886 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037887 }
37888 }
37889 }
37890
37891 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, n_gt_4_strided_a) {
37892 for (uint32_t n = 5; n < 8; n++) {
37893 for (size_t k = 1; k <= 40; k += 9) {
37894 GemmMicrokernelTester()
37895 .mr(3)
37896 .nr(4)
37897 .kr(8)
37898 .sr(1)
37899 .m(3)
37900 .n(n)
37901 .k(k)
37902 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080037903 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037904 }
37905 }
37906 }
37907
37908 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, n_gt_4_subtile) {
37909 for (uint32_t n = 5; n < 8; n++) {
37910 for (size_t k = 1; k <= 40; k += 9) {
37911 for (uint32_t m = 1; m <= 3; m++) {
37912 GemmMicrokernelTester()
37913 .mr(3)
37914 .nr(4)
37915 .kr(8)
37916 .sr(1)
37917 .m(m)
37918 .n(n)
37919 .k(k)
37920 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080037921 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037922 }
37923 }
37924 }
37925 }
37926
37927 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, n_div_4) {
37928 for (uint32_t n = 8; n <= 12; n += 4) {
37929 for (size_t k = 1; k <= 40; k += 9) {
37930 GemmMicrokernelTester()
37931 .mr(3)
37932 .nr(4)
37933 .kr(8)
37934 .sr(1)
37935 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080037936 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037937 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080037938 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037939 }
37940 }
37941 }
37942
37943 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, n_div_4_strided_cn) {
37944 for (uint32_t n = 8; n <= 12; n += 4) {
37945 for (size_t k = 1; k <= 40; k += 9) {
37946 GemmMicrokernelTester()
37947 .mr(3)
37948 .nr(4)
37949 .kr(8)
37950 .sr(1)
37951 .m(3)
37952 .n(n)
37953 .k(k)
37954 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080037955 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037956 }
37957 }
37958 }
37959
37960 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, n_div_4_strided_a) {
37961 for (uint32_t n = 8; n <= 12; n += 4) {
37962 for (size_t k = 1; k <= 40; k += 9) {
37963 GemmMicrokernelTester()
37964 .mr(3)
37965 .nr(4)
37966 .kr(8)
37967 .sr(1)
37968 .m(3)
37969 .n(n)
37970 .k(k)
37971 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080037972 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037973 }
37974 }
37975 }
37976
37977 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, n_div_4_subtile) {
37978 for (uint32_t n = 8; n <= 12; n += 4) {
37979 for (size_t k = 1; k <= 40; k += 9) {
37980 for (uint32_t m = 1; m <= 3; m++) {
37981 GemmMicrokernelTester()
37982 .mr(3)
37983 .nr(4)
37984 .kr(8)
37985 .sr(1)
37986 .m(m)
37987 .n(n)
37988 .k(k)
37989 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080037990 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080037991 }
37992 }
37993 }
37994 }
37995
37996 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, strided_cm_subtile) {
37997 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080037998 for (uint32_t n = 1; n <= 4; n++) {
37999 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080038000 GemmMicrokernelTester()
38001 .mr(3)
38002 .nr(4)
38003 .kr(8)
38004 .sr(1)
38005 .m(m)
38006 .n(n)
38007 .k(k)
38008 .cm_stride(7)
38009 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080038010 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080038011 }
38012 }
38013 }
38014 }
38015
38016 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, qmin) {
38017 GemmMicrokernelTester()
38018 .mr(3)
38019 .nr(4)
38020 .kr(8)
38021 .sr(1)
38022 .m(3)
38023 .n(4)
38024 .k(8)
38025 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080038026 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080038027 }
38028
38029 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, qmax) {
38030 GemmMicrokernelTester()
38031 .mr(3)
38032 .nr(4)
38033 .kr(8)
38034 .sr(1)
38035 .m(3)
38036 .n(4)
38037 .k(8)
38038 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080038039 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080038040 }
38041
38042 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, strided_cm) {
38043 GemmMicrokernelTester()
38044 .mr(3)
38045 .nr(4)
38046 .kr(8)
38047 .sr(1)
38048 .m(3)
38049 .n(4)
38050 .k(8)
38051 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080038052 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080038053 }
38054#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
38055
38056
38057#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
38058 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_eq_8) {
38059 GemmMicrokernelTester()
38060 .mr(2)
38061 .nr(4)
38062 .kr(8)
38063 .sr(1)
38064 .m(2)
38065 .n(4)
38066 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080038067 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080038068 }
38069
38070 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, strided_cn) {
38071 GemmMicrokernelTester()
38072 .mr(2)
38073 .nr(4)
38074 .kr(8)
38075 .sr(1)
38076 .m(2)
38077 .n(4)
38078 .k(8)
38079 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080038080 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080038081 }
38082
38083 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_eq_8_strided_a) {
38084 GemmMicrokernelTester()
38085 .mr(2)
38086 .nr(4)
38087 .kr(8)
38088 .sr(1)
38089 .m(2)
38090 .n(4)
38091 .k(8)
38092 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080038093 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080038094 }
38095
38096 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_eq_8_subtile) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080038097 for (uint32_t n = 1; n <= 4; n++) {
38098 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080038099 GemmMicrokernelTester()
38100 .mr(2)
38101 .nr(4)
38102 .kr(8)
38103 .sr(1)
38104 .m(m)
38105 .n(n)
38106 .k(8)
38107 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080038108 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080038109 }
38110 }
38111 }
38112
38113 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_eq_8_subtile_m) {
38114 for (uint32_t m = 1; m <= 2; m++) {
38115 GemmMicrokernelTester()
38116 .mr(2)
38117 .nr(4)
38118 .kr(8)
38119 .sr(1)
38120 .m(m)
38121 .n(4)
38122 .k(8)
38123 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080038124 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080038125 }
38126 }
38127
38128 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_eq_8_subtile_n) {
38129 for (uint32_t n = 1; n <= 4; n++) {
38130 GemmMicrokernelTester()
38131 .mr(2)
38132 .nr(4)
38133 .kr(8)
38134 .sr(1)
38135 .m(2)
38136 .n(n)
38137 .k(8)
38138 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080038139 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080038140 }
38141 }
38142
38143 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_lt_8) {
38144 for (size_t k = 1; k < 8; k++) {
38145 GemmMicrokernelTester()
38146 .mr(2)
38147 .nr(4)
38148 .kr(8)
38149 .sr(1)
38150 .m(2)
38151 .n(4)
38152 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080038153 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080038154 }
38155 }
38156
38157 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_lt_8_strided_a) {
38158 for (size_t k = 1; k < 8; k++) {
38159 GemmMicrokernelTester()
38160 .mr(2)
38161 .nr(4)
38162 .kr(8)
38163 .sr(1)
38164 .m(2)
38165 .n(4)
38166 .k(k)
38167 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080038168 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080038169 }
38170 }
38171
38172 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_lt_8_subtile) {
38173 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080038174 for (uint32_t n = 1; n <= 4; n++) {
38175 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080038176 GemmMicrokernelTester()
38177 .mr(2)
38178 .nr(4)
38179 .kr(8)
38180 .sr(1)
38181 .m(m)
38182 .n(n)
38183 .k(k)
38184 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080038185 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080038186 }
38187 }
38188 }
38189 }
38190
38191 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_gt_8) {
38192 for (size_t k = 9; k < 16; k++) {
38193 GemmMicrokernelTester()
38194 .mr(2)
38195 .nr(4)
38196 .kr(8)
38197 .sr(1)
38198 .m(2)
38199 .n(4)
38200 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080038201 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080038202 }
38203 }
38204
38205 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_gt_8_strided_a) {
38206 for (size_t k = 9; k < 16; k++) {
38207 GemmMicrokernelTester()
38208 .mr(2)
38209 .nr(4)
38210 .kr(8)
38211 .sr(1)
38212 .m(2)
38213 .n(4)
38214 .k(k)
38215 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080038216 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080038217 }
38218 }
38219
38220 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_gt_8_subtile) {
38221 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080038222 for (uint32_t n = 1; n <= 4; n++) {
38223 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080038224 GemmMicrokernelTester()
38225 .mr(2)
38226 .nr(4)
38227 .kr(8)
38228 .sr(1)
38229 .m(m)
38230 .n(n)
38231 .k(k)
38232 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080038233 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080038234 }
38235 }
38236 }
38237 }
38238
38239 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_div_8) {
38240 for (size_t k = 16; k <= 80; k += 8) {
38241 GemmMicrokernelTester()
38242 .mr(2)
38243 .nr(4)
38244 .kr(8)
38245 .sr(1)
38246 .m(2)
38247 .n(4)
38248 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080038249 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080038250 }
38251 }
38252
38253 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_div_8_strided_a) {
38254 for (size_t k = 16; k <= 80; k += 8) {
38255 GemmMicrokernelTester()
38256 .mr(2)
38257 .nr(4)
38258 .kr(8)
38259 .sr(1)
38260 .m(2)
38261 .n(4)
38262 .k(k)
38263 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080038264 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080038265 }
38266 }
38267
38268 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_div_8_subtile) {
38269 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080038270 for (uint32_t n = 1; n <= 4; n++) {
38271 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080038272 GemmMicrokernelTester()
38273 .mr(2)
38274 .nr(4)
38275 .kr(8)
38276 .sr(1)
38277 .m(m)
38278 .n(n)
38279 .k(k)
38280 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080038281 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080038282 }
38283 }
38284 }
38285 }
38286
38287 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, n_gt_4) {
38288 for (uint32_t n = 5; n < 8; n++) {
38289 for (size_t k = 1; k <= 40; k += 9) {
38290 GemmMicrokernelTester()
38291 .mr(2)
38292 .nr(4)
38293 .kr(8)
38294 .sr(1)
38295 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080038296 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080038297 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080038298 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080038299 }
38300 }
38301 }
38302
38303 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, n_gt_4_strided_cn) {
38304 for (uint32_t n = 5; n < 8; n++) {
38305 for (size_t k = 1; k <= 40; k += 9) {
38306 GemmMicrokernelTester()
38307 .mr(2)
38308 .nr(4)
38309 .kr(8)
38310 .sr(1)
38311 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080038312 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080038313 .k(k)
38314 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080038315 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080038316 }
38317 }
38318 }
38319
38320 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, n_gt_4_strided_a) {
38321 for (uint32_t n = 5; n < 8; n++) {
38322 for (size_t k = 1; k <= 40; k += 9) {
38323 GemmMicrokernelTester()
38324 .mr(2)
38325 .nr(4)
38326 .kr(8)
38327 .sr(1)
38328 .m(2)
38329 .n(n)
38330 .k(k)
38331 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080038332 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080038333 }
38334 }
38335 }
38336
38337 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, n_gt_4_subtile) {
38338 for (uint32_t n = 5; n < 8; n++) {
38339 for (size_t k = 1; k <= 40; k += 9) {
38340 for (uint32_t m = 1; m <= 2; m++) {
38341 GemmMicrokernelTester()
38342 .mr(2)
38343 .nr(4)
38344 .kr(8)
38345 .sr(1)
38346 .m(m)
38347 .n(n)
38348 .k(k)
38349 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080038350 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080038351 }
38352 }
38353 }
38354 }
38355
38356 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, n_div_4) {
38357 for (uint32_t n = 8; n <= 12; n += 4) {
38358 for (size_t k = 1; k <= 40; k += 9) {
38359 GemmMicrokernelTester()
38360 .mr(2)
38361 .nr(4)
38362 .kr(8)
38363 .sr(1)
38364 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080038365 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080038366 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080038367 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080038368 }
38369 }
38370 }
38371
38372 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, n_div_4_strided_cn) {
38373 for (uint32_t n = 8; n <= 12; n += 4) {
38374 for (size_t k = 1; k <= 40; k += 9) {
38375 GemmMicrokernelTester()
38376 .mr(2)
38377 .nr(4)
38378 .kr(8)
38379 .sr(1)
38380 .m(2)
38381 .n(n)
38382 .k(k)
38383 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080038384 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080038385 }
38386 }
38387 }
38388
38389 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, n_div_4_strided_a) {
38390 for (uint32_t n = 8; n <= 12; n += 4) {
38391 for (size_t k = 1; k <= 40; k += 9) {
38392 GemmMicrokernelTester()
38393 .mr(2)
38394 .nr(4)
38395 .kr(8)
38396 .sr(1)
38397 .m(2)
38398 .n(n)
38399 .k(k)
38400 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080038401 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080038402 }
38403 }
38404 }
38405
38406 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, n_div_4_subtile) {
38407 for (uint32_t n = 8; n <= 12; n += 4) {
38408 for (size_t k = 1; k <= 40; k += 9) {
38409 for (uint32_t m = 1; m <= 2; m++) {
38410 GemmMicrokernelTester()
38411 .mr(2)
38412 .nr(4)
38413 .kr(8)
38414 .sr(1)
38415 .m(m)
38416 .n(n)
38417 .k(k)
38418 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080038419 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080038420 }
38421 }
38422 }
38423 }
38424
38425 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, strided_cm_subtile) {
38426 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080038427 for (uint32_t n = 1; n <= 4; n++) {
38428 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080038429 GemmMicrokernelTester()
38430 .mr(2)
38431 .nr(4)
38432 .kr(8)
38433 .sr(1)
38434 .m(m)
38435 .n(n)
38436 .k(k)
38437 .cm_stride(7)
38438 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080038439 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080038440 }
38441 }
38442 }
38443 }
38444
38445 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, qmin) {
38446 GemmMicrokernelTester()
38447 .mr(2)
38448 .nr(4)
38449 .kr(8)
38450 .sr(1)
38451 .m(2)
38452 .n(4)
38453 .k(8)
38454 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080038455 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080038456 }
38457
38458 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, qmax) {
38459 GemmMicrokernelTester()
38460 .mr(2)
38461 .nr(4)
38462 .kr(8)
38463 .sr(1)
38464 .m(2)
38465 .n(4)
38466 .k(8)
38467 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080038468 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080038469 }
38470
38471 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, strided_cm) {
38472 GemmMicrokernelTester()
38473 .mr(2)
38474 .nr(4)
38475 .kr(8)
38476 .sr(1)
38477 .m(2)
38478 .n(4)
38479 .k(8)
38480 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080038481 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080038482 }
38483#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
Marat Dukhan7c1115f2022-01-04 17:18:41 -080038484
38485
38486#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
38487 TEST(QC8_GEMM_MINMAX_FP32_4X2__WASM_FMAGIC, k_eq_1) {
38488 GemmMicrokernelTester()
38489 .mr(4)
38490 .nr(2)
38491 .kr(1)
38492 .sr(1)
38493 .m(4)
38494 .n(2)
38495 .k(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080038496 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080038497 }
38498
38499 TEST(QC8_GEMM_MINMAX_FP32_4X2__WASM_FMAGIC, strided_cn) {
38500 GemmMicrokernelTester()
38501 .mr(4)
38502 .nr(2)
38503 .kr(1)
38504 .sr(1)
38505 .m(4)
38506 .n(2)
38507 .k(1)
38508 .cn_stride(5)
Marat Dukhan50323b82022-01-11 00:12:01 -080038509 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080038510 }
38511
38512 TEST(QC8_GEMM_MINMAX_FP32_4X2__WASM_FMAGIC, k_eq_1_strided_a) {
38513 GemmMicrokernelTester()
38514 .mr(4)
38515 .nr(2)
38516 .kr(1)
38517 .sr(1)
38518 .m(4)
38519 .n(2)
38520 .k(1)
38521 .a_stride(3)
Marat Dukhan50323b82022-01-11 00:12:01 -080038522 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080038523 }
38524
38525 TEST(QC8_GEMM_MINMAX_FP32_4X2__WASM_FMAGIC, k_eq_1_subtile) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080038526 for (uint32_t n = 1; n <= 2; n++) {
38527 for (uint32_t m = 1; m <= 4; m++) {
Marat Dukhan7c1115f2022-01-04 17:18:41 -080038528 GemmMicrokernelTester()
38529 .mr(4)
38530 .nr(2)
38531 .kr(1)
38532 .sr(1)
38533 .m(m)
38534 .n(n)
38535 .k(1)
38536 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080038537 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080038538 }
38539 }
38540 }
38541
38542 TEST(QC8_GEMM_MINMAX_FP32_4X2__WASM_FMAGIC, k_eq_1_subtile_m) {
38543 for (uint32_t m = 1; m <= 4; m++) {
38544 GemmMicrokernelTester()
38545 .mr(4)
38546 .nr(2)
38547 .kr(1)
38548 .sr(1)
38549 .m(m)
38550 .n(2)
38551 .k(1)
38552 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080038553 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080038554 }
38555 }
38556
38557 TEST(QC8_GEMM_MINMAX_FP32_4X2__WASM_FMAGIC, k_eq_1_subtile_n) {
38558 for (uint32_t n = 1; n <= 2; n++) {
38559 GemmMicrokernelTester()
38560 .mr(4)
38561 .nr(2)
38562 .kr(1)
38563 .sr(1)
38564 .m(4)
38565 .n(n)
38566 .k(1)
38567 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080038568 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080038569 }
38570 }
38571
38572 TEST(QC8_GEMM_MINMAX_FP32_4X2__WASM_FMAGIC, k_gt_1) {
38573 for (size_t k = 2; k < 10; k++) {
38574 GemmMicrokernelTester()
38575 .mr(4)
38576 .nr(2)
38577 .kr(1)
38578 .sr(1)
38579 .m(4)
38580 .n(2)
38581 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080038582 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080038583 }
38584 }
38585
38586 TEST(QC8_GEMM_MINMAX_FP32_4X2__WASM_FMAGIC, k_gt_1_strided_a) {
38587 for (size_t k = 2; k < 10; k++) {
38588 GemmMicrokernelTester()
38589 .mr(4)
38590 .nr(2)
38591 .kr(1)
38592 .sr(1)
38593 .m(4)
38594 .n(2)
38595 .k(k)
38596 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080038597 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080038598 }
38599 }
38600
38601 TEST(QC8_GEMM_MINMAX_FP32_4X2__WASM_FMAGIC, k_gt_1_subtile) {
38602 for (size_t k = 2; k < 10; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080038603 for (uint32_t n = 1; n <= 2; n++) {
38604 for (uint32_t m = 1; m <= 4; m++) {
Marat Dukhan7c1115f2022-01-04 17:18:41 -080038605 GemmMicrokernelTester()
38606 .mr(4)
38607 .nr(2)
38608 .kr(1)
38609 .sr(1)
38610 .m(m)
38611 .n(n)
38612 .k(k)
38613 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080038614 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080038615 }
38616 }
38617 }
38618 }
38619
38620 TEST(QC8_GEMM_MINMAX_FP32_4X2__WASM_FMAGIC, n_gt_2) {
38621 for (uint32_t n = 3; n < 4; n++) {
38622 for (size_t k = 1; k <= 5; k += 2) {
38623 GemmMicrokernelTester()
38624 .mr(4)
38625 .nr(2)
38626 .kr(1)
38627 .sr(1)
38628 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080038629 .n(n)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080038630 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080038631 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080038632 }
38633 }
38634 }
38635
38636 TEST(QC8_GEMM_MINMAX_FP32_4X2__WASM_FMAGIC, n_gt_2_strided_cn) {
38637 for (uint32_t n = 3; n < 4; n++) {
38638 for (size_t k = 1; k <= 5; k += 2) {
38639 GemmMicrokernelTester()
38640 .mr(4)
38641 .nr(2)
38642 .kr(1)
38643 .sr(1)
38644 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080038645 .n(n)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080038646 .k(k)
38647 .cn_stride(5)
Marat Dukhan50323b82022-01-11 00:12:01 -080038648 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080038649 }
38650 }
38651 }
38652
38653 TEST(QC8_GEMM_MINMAX_FP32_4X2__WASM_FMAGIC, n_gt_2_strided_a) {
38654 for (uint32_t n = 3; n < 4; n++) {
38655 for (size_t k = 1; k <= 5; k += 2) {
38656 GemmMicrokernelTester()
38657 .mr(4)
38658 .nr(2)
38659 .kr(1)
38660 .sr(1)
38661 .m(4)
38662 .n(n)
38663 .k(k)
38664 .a_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080038665 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080038666 }
38667 }
38668 }
38669
38670 TEST(QC8_GEMM_MINMAX_FP32_4X2__WASM_FMAGIC, n_gt_2_subtile) {
38671 for (uint32_t n = 3; n < 4; n++) {
38672 for (size_t k = 1; k <= 5; k += 2) {
38673 for (uint32_t m = 1; m <= 4; m++) {
38674 GemmMicrokernelTester()
38675 .mr(4)
38676 .nr(2)
38677 .kr(1)
38678 .sr(1)
38679 .m(m)
38680 .n(n)
38681 .k(k)
38682 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080038683 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080038684 }
38685 }
38686 }
38687 }
38688
38689 TEST(QC8_GEMM_MINMAX_FP32_4X2__WASM_FMAGIC, n_div_2) {
38690 for (uint32_t n = 4; n <= 6; n += 2) {
38691 for (size_t k = 1; k <= 5; k += 2) {
38692 GemmMicrokernelTester()
38693 .mr(4)
38694 .nr(2)
38695 .kr(1)
38696 .sr(1)
38697 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080038698 .n(n)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080038699 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080038700 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080038701 }
38702 }
38703 }
38704
38705 TEST(QC8_GEMM_MINMAX_FP32_4X2__WASM_FMAGIC, n_div_2_strided_cn) {
38706 for (uint32_t n = 4; n <= 6; n += 2) {
38707 for (size_t k = 1; k <= 5; k += 2) {
38708 GemmMicrokernelTester()
38709 .mr(4)
38710 .nr(2)
38711 .kr(1)
38712 .sr(1)
38713 .m(4)
38714 .n(n)
38715 .k(k)
38716 .cn_stride(5)
Marat Dukhan50323b82022-01-11 00:12:01 -080038717 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080038718 }
38719 }
38720 }
38721
38722 TEST(QC8_GEMM_MINMAX_FP32_4X2__WASM_FMAGIC, n_div_2_strided_a) {
38723 for (uint32_t n = 4; n <= 6; n += 2) {
38724 for (size_t k = 1; k <= 5; k += 2) {
38725 GemmMicrokernelTester()
38726 .mr(4)
38727 .nr(2)
38728 .kr(1)
38729 .sr(1)
38730 .m(4)
38731 .n(n)
38732 .k(k)
38733 .a_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080038734 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080038735 }
38736 }
38737 }
38738
38739 TEST(QC8_GEMM_MINMAX_FP32_4X2__WASM_FMAGIC, n_div_2_subtile) {
38740 for (uint32_t n = 4; n <= 6; n += 2) {
38741 for (size_t k = 1; k <= 5; k += 2) {
38742 for (uint32_t m = 1; m <= 4; m++) {
38743 GemmMicrokernelTester()
38744 .mr(4)
38745 .nr(2)
38746 .kr(1)
38747 .sr(1)
38748 .m(m)
38749 .n(n)
38750 .k(k)
38751 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080038752 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080038753 }
38754 }
38755 }
38756 }
38757
38758 TEST(QC8_GEMM_MINMAX_FP32_4X2__WASM_FMAGIC, strided_cm_subtile) {
38759 for (size_t k = 1; k <= 5; k += 2) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080038760 for (uint32_t n = 1; n <= 2; n++) {
38761 for (uint32_t m = 1; m <= 4; m++) {
Marat Dukhan7c1115f2022-01-04 17:18:41 -080038762 GemmMicrokernelTester()
38763 .mr(4)
38764 .nr(2)
38765 .kr(1)
38766 .sr(1)
38767 .m(m)
38768 .n(n)
38769 .k(k)
38770 .cm_stride(5)
38771 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080038772 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080038773 }
38774 }
38775 }
38776 }
38777
38778 TEST(QC8_GEMM_MINMAX_FP32_4X2__WASM_FMAGIC, qmin) {
38779 GemmMicrokernelTester()
38780 .mr(4)
38781 .nr(2)
38782 .kr(1)
38783 .sr(1)
38784 .m(4)
38785 .n(2)
38786 .k(1)
38787 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080038788 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080038789 }
38790
38791 TEST(QC8_GEMM_MINMAX_FP32_4X2__WASM_FMAGIC, qmax) {
38792 GemmMicrokernelTester()
38793 .mr(4)
38794 .nr(2)
38795 .kr(1)
38796 .sr(1)
38797 .m(4)
38798 .n(2)
38799 .k(1)
38800 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080038801 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080038802 }
38803
38804 TEST(QC8_GEMM_MINMAX_FP32_4X2__WASM_FMAGIC, strided_cm) {
38805 GemmMicrokernelTester()
38806 .mr(4)
38807 .nr(2)
38808 .kr(1)
38809 .sr(1)
38810 .m(4)
38811 .n(2)
38812 .k(1)
38813 .cm_stride(5)
Marat Dukhan50323b82022-01-11 00:12:01 -080038814 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080038815 }
38816#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
38817
38818
38819#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
38820 TEST(QC8_GEMM_MINMAX_FP32_1X4__WASM_FMAGIC, k_eq_1) {
38821 GemmMicrokernelTester()
38822 .mr(1)
38823 .nr(4)
38824 .kr(1)
38825 .sr(1)
38826 .m(1)
38827 .n(4)
38828 .k(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080038829 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080038830 }
38831
38832 TEST(QC8_GEMM_MINMAX_FP32_1X4__WASM_FMAGIC, strided_cn) {
38833 GemmMicrokernelTester()
38834 .mr(1)
38835 .nr(4)
38836 .kr(1)
38837 .sr(1)
38838 .m(1)
38839 .n(4)
38840 .k(1)
38841 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080038842 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080038843 }
38844
38845 TEST(QC8_GEMM_MINMAX_FP32_1X4__WASM_FMAGIC, k_eq_1_strided_a) {
38846 GemmMicrokernelTester()
38847 .mr(1)
38848 .nr(4)
38849 .kr(1)
38850 .sr(1)
38851 .m(1)
38852 .n(4)
38853 .k(1)
38854 .a_stride(3)
Marat Dukhan50323b82022-01-11 00:12:01 -080038855 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080038856 }
38857
38858 TEST(QC8_GEMM_MINMAX_FP32_1X4__WASM_FMAGIC, k_eq_1_subtile) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080038859 for (uint32_t n = 1; n <= 4; n++) {
38860 for (uint32_t m = 1; m <= 1; m++) {
Marat Dukhan7c1115f2022-01-04 17:18:41 -080038861 GemmMicrokernelTester()
38862 .mr(1)
38863 .nr(4)
38864 .kr(1)
38865 .sr(1)
38866 .m(m)
38867 .n(n)
38868 .k(1)
38869 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080038870 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080038871 }
38872 }
38873 }
38874
38875 TEST(QC8_GEMM_MINMAX_FP32_1X4__WASM_FMAGIC, k_eq_1_subtile_m) {
38876 for (uint32_t m = 1; m <= 1; m++) {
38877 GemmMicrokernelTester()
38878 .mr(1)
38879 .nr(4)
38880 .kr(1)
38881 .sr(1)
38882 .m(m)
38883 .n(4)
38884 .k(1)
38885 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080038886 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080038887 }
38888 }
38889
38890 TEST(QC8_GEMM_MINMAX_FP32_1X4__WASM_FMAGIC, k_eq_1_subtile_n) {
38891 for (uint32_t n = 1; n <= 4; n++) {
38892 GemmMicrokernelTester()
38893 .mr(1)
38894 .nr(4)
38895 .kr(1)
38896 .sr(1)
38897 .m(1)
38898 .n(n)
38899 .k(1)
38900 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080038901 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080038902 }
38903 }
38904
38905 TEST(QC8_GEMM_MINMAX_FP32_1X4__WASM_FMAGIC, k_gt_1) {
38906 for (size_t k = 2; k < 10; k++) {
38907 GemmMicrokernelTester()
38908 .mr(1)
38909 .nr(4)
38910 .kr(1)
38911 .sr(1)
38912 .m(1)
38913 .n(4)
38914 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080038915 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080038916 }
38917 }
38918
38919 TEST(QC8_GEMM_MINMAX_FP32_1X4__WASM_FMAGIC, k_gt_1_strided_a) {
38920 for (size_t k = 2; k < 10; k++) {
38921 GemmMicrokernelTester()
38922 .mr(1)
38923 .nr(4)
38924 .kr(1)
38925 .sr(1)
38926 .m(1)
38927 .n(4)
38928 .k(k)
38929 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080038930 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080038931 }
38932 }
38933
38934 TEST(QC8_GEMM_MINMAX_FP32_1X4__WASM_FMAGIC, k_gt_1_subtile) {
38935 for (size_t k = 2; k < 10; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080038936 for (uint32_t n = 1; n <= 4; n++) {
38937 for (uint32_t m = 1; m <= 1; m++) {
Marat Dukhan7c1115f2022-01-04 17:18:41 -080038938 GemmMicrokernelTester()
38939 .mr(1)
38940 .nr(4)
38941 .kr(1)
38942 .sr(1)
38943 .m(m)
38944 .n(n)
38945 .k(k)
38946 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080038947 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080038948 }
38949 }
38950 }
38951 }
38952
38953 TEST(QC8_GEMM_MINMAX_FP32_1X4__WASM_FMAGIC, n_gt_4) {
38954 for (uint32_t n = 5; n < 8; n++) {
38955 for (size_t k = 1; k <= 5; k += 2) {
38956 GemmMicrokernelTester()
38957 .mr(1)
38958 .nr(4)
38959 .kr(1)
38960 .sr(1)
38961 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080038962 .n(n)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080038963 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080038964 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080038965 }
38966 }
38967 }
38968
38969 TEST(QC8_GEMM_MINMAX_FP32_1X4__WASM_FMAGIC, n_gt_4_strided_cn) {
38970 for (uint32_t n = 5; n < 8; n++) {
38971 for (size_t k = 1; k <= 5; k += 2) {
38972 GemmMicrokernelTester()
38973 .mr(1)
38974 .nr(4)
38975 .kr(1)
38976 .sr(1)
38977 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080038978 .n(n)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080038979 .k(k)
38980 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080038981 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080038982 }
38983 }
38984 }
38985
38986 TEST(QC8_GEMM_MINMAX_FP32_1X4__WASM_FMAGIC, n_gt_4_strided_a) {
38987 for (uint32_t n = 5; n < 8; n++) {
38988 for (size_t k = 1; k <= 5; k += 2) {
38989 GemmMicrokernelTester()
38990 .mr(1)
38991 .nr(4)
38992 .kr(1)
38993 .sr(1)
38994 .m(1)
38995 .n(n)
38996 .k(k)
38997 .a_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080038998 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080038999 }
39000 }
39001 }
39002
39003 TEST(QC8_GEMM_MINMAX_FP32_1X4__WASM_FMAGIC, n_gt_4_subtile) {
39004 for (uint32_t n = 5; n < 8; n++) {
39005 for (size_t k = 1; k <= 5; k += 2) {
39006 for (uint32_t m = 1; m <= 1; m++) {
39007 GemmMicrokernelTester()
39008 .mr(1)
39009 .nr(4)
39010 .kr(1)
39011 .sr(1)
39012 .m(m)
39013 .n(n)
39014 .k(k)
39015 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080039016 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039017 }
39018 }
39019 }
39020 }
39021
39022 TEST(QC8_GEMM_MINMAX_FP32_1X4__WASM_FMAGIC, n_div_4) {
39023 for (uint32_t n = 8; n <= 12; n += 4) {
39024 for (size_t k = 1; k <= 5; k += 2) {
39025 GemmMicrokernelTester()
39026 .mr(1)
39027 .nr(4)
39028 .kr(1)
39029 .sr(1)
39030 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080039031 .n(n)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039032 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080039033 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039034 }
39035 }
39036 }
39037
39038 TEST(QC8_GEMM_MINMAX_FP32_1X4__WASM_FMAGIC, n_div_4_strided_cn) {
39039 for (uint32_t n = 8; n <= 12; n += 4) {
39040 for (size_t k = 1; k <= 5; k += 2) {
39041 GemmMicrokernelTester()
39042 .mr(1)
39043 .nr(4)
39044 .kr(1)
39045 .sr(1)
39046 .m(1)
39047 .n(n)
39048 .k(k)
39049 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080039050 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039051 }
39052 }
39053 }
39054
39055 TEST(QC8_GEMM_MINMAX_FP32_1X4__WASM_FMAGIC, n_div_4_strided_a) {
39056 for (uint32_t n = 8; n <= 12; n += 4) {
39057 for (size_t k = 1; k <= 5; k += 2) {
39058 GemmMicrokernelTester()
39059 .mr(1)
39060 .nr(4)
39061 .kr(1)
39062 .sr(1)
39063 .m(1)
39064 .n(n)
39065 .k(k)
39066 .a_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080039067 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039068 }
39069 }
39070 }
39071
39072 TEST(QC8_GEMM_MINMAX_FP32_1X4__WASM_FMAGIC, n_div_4_subtile) {
39073 for (uint32_t n = 8; n <= 12; n += 4) {
39074 for (size_t k = 1; k <= 5; k += 2) {
39075 for (uint32_t m = 1; m <= 1; m++) {
39076 GemmMicrokernelTester()
39077 .mr(1)
39078 .nr(4)
39079 .kr(1)
39080 .sr(1)
39081 .m(m)
39082 .n(n)
39083 .k(k)
39084 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080039085 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039086 }
39087 }
39088 }
39089 }
39090
39091 TEST(QC8_GEMM_MINMAX_FP32_1X4__WASM_FMAGIC, strided_cm_subtile) {
39092 for (size_t k = 1; k <= 5; k += 2) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080039093 for (uint32_t n = 1; n <= 4; n++) {
39094 for (uint32_t m = 1; m <= 1; m++) {
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039095 GemmMicrokernelTester()
39096 .mr(1)
39097 .nr(4)
39098 .kr(1)
39099 .sr(1)
39100 .m(m)
39101 .n(n)
39102 .k(k)
39103 .cm_stride(7)
39104 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080039105 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039106 }
39107 }
39108 }
39109 }
39110
39111 TEST(QC8_GEMM_MINMAX_FP32_1X4__WASM_FMAGIC, qmin) {
39112 GemmMicrokernelTester()
39113 .mr(1)
39114 .nr(4)
39115 .kr(1)
39116 .sr(1)
39117 .m(1)
39118 .n(4)
39119 .k(1)
39120 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080039121 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039122 }
39123
39124 TEST(QC8_GEMM_MINMAX_FP32_1X4__WASM_FMAGIC, qmax) {
39125 GemmMicrokernelTester()
39126 .mr(1)
39127 .nr(4)
39128 .kr(1)
39129 .sr(1)
39130 .m(1)
39131 .n(4)
39132 .k(1)
39133 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080039134 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039135 }
39136
39137 TEST(QC8_GEMM_MINMAX_FP32_1X4__WASM_FMAGIC, strided_cm) {
39138 GemmMicrokernelTester()
39139 .mr(1)
39140 .nr(4)
39141 .kr(1)
39142 .sr(1)
39143 .m(1)
39144 .n(4)
39145 .k(1)
39146 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080039147 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039148 }
39149#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
39150
39151
39152#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
39153 TEST(QC8_GEMM_MINMAX_FP32_2X4__WASM_FMAGIC, k_eq_1) {
39154 GemmMicrokernelTester()
39155 .mr(2)
39156 .nr(4)
39157 .kr(1)
39158 .sr(1)
39159 .m(2)
39160 .n(4)
39161 .k(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080039162 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039163 }
39164
39165 TEST(QC8_GEMM_MINMAX_FP32_2X4__WASM_FMAGIC, strided_cn) {
39166 GemmMicrokernelTester()
39167 .mr(2)
39168 .nr(4)
39169 .kr(1)
39170 .sr(1)
39171 .m(2)
39172 .n(4)
39173 .k(1)
39174 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080039175 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039176 }
39177
39178 TEST(QC8_GEMM_MINMAX_FP32_2X4__WASM_FMAGIC, k_eq_1_strided_a) {
39179 GemmMicrokernelTester()
39180 .mr(2)
39181 .nr(4)
39182 .kr(1)
39183 .sr(1)
39184 .m(2)
39185 .n(4)
39186 .k(1)
39187 .a_stride(3)
Marat Dukhan50323b82022-01-11 00:12:01 -080039188 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039189 }
39190
39191 TEST(QC8_GEMM_MINMAX_FP32_2X4__WASM_FMAGIC, k_eq_1_subtile) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080039192 for (uint32_t n = 1; n <= 4; n++) {
39193 for (uint32_t m = 1; m <= 2; m++) {
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039194 GemmMicrokernelTester()
39195 .mr(2)
39196 .nr(4)
39197 .kr(1)
39198 .sr(1)
39199 .m(m)
39200 .n(n)
39201 .k(1)
39202 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080039203 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039204 }
39205 }
39206 }
39207
39208 TEST(QC8_GEMM_MINMAX_FP32_2X4__WASM_FMAGIC, k_eq_1_subtile_m) {
39209 for (uint32_t m = 1; m <= 2; m++) {
39210 GemmMicrokernelTester()
39211 .mr(2)
39212 .nr(4)
39213 .kr(1)
39214 .sr(1)
39215 .m(m)
39216 .n(4)
39217 .k(1)
39218 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080039219 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039220 }
39221 }
39222
39223 TEST(QC8_GEMM_MINMAX_FP32_2X4__WASM_FMAGIC, k_eq_1_subtile_n) {
39224 for (uint32_t n = 1; n <= 4; n++) {
39225 GemmMicrokernelTester()
39226 .mr(2)
39227 .nr(4)
39228 .kr(1)
39229 .sr(1)
39230 .m(2)
39231 .n(n)
39232 .k(1)
39233 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080039234 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039235 }
39236 }
39237
39238 TEST(QC8_GEMM_MINMAX_FP32_2X4__WASM_FMAGIC, k_gt_1) {
39239 for (size_t k = 2; k < 10; k++) {
39240 GemmMicrokernelTester()
39241 .mr(2)
39242 .nr(4)
39243 .kr(1)
39244 .sr(1)
39245 .m(2)
39246 .n(4)
39247 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080039248 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039249 }
39250 }
39251
39252 TEST(QC8_GEMM_MINMAX_FP32_2X4__WASM_FMAGIC, k_gt_1_strided_a) {
39253 for (size_t k = 2; k < 10; k++) {
39254 GemmMicrokernelTester()
39255 .mr(2)
39256 .nr(4)
39257 .kr(1)
39258 .sr(1)
39259 .m(2)
39260 .n(4)
39261 .k(k)
39262 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080039263 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039264 }
39265 }
39266
39267 TEST(QC8_GEMM_MINMAX_FP32_2X4__WASM_FMAGIC, k_gt_1_subtile) {
39268 for (size_t k = 2; k < 10; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080039269 for (uint32_t n = 1; n <= 4; n++) {
39270 for (uint32_t m = 1; m <= 2; m++) {
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039271 GemmMicrokernelTester()
39272 .mr(2)
39273 .nr(4)
39274 .kr(1)
39275 .sr(1)
39276 .m(m)
39277 .n(n)
39278 .k(k)
39279 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080039280 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039281 }
39282 }
39283 }
39284 }
39285
39286 TEST(QC8_GEMM_MINMAX_FP32_2X4__WASM_FMAGIC, n_gt_4) {
39287 for (uint32_t n = 5; n < 8; n++) {
39288 for (size_t k = 1; k <= 5; k += 2) {
39289 GemmMicrokernelTester()
39290 .mr(2)
39291 .nr(4)
39292 .kr(1)
39293 .sr(1)
39294 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080039295 .n(n)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039296 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080039297 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039298 }
39299 }
39300 }
39301
39302 TEST(QC8_GEMM_MINMAX_FP32_2X4__WASM_FMAGIC, n_gt_4_strided_cn) {
39303 for (uint32_t n = 5; n < 8; n++) {
39304 for (size_t k = 1; k <= 5; k += 2) {
39305 GemmMicrokernelTester()
39306 .mr(2)
39307 .nr(4)
39308 .kr(1)
39309 .sr(1)
39310 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080039311 .n(n)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039312 .k(k)
39313 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080039314 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039315 }
39316 }
39317 }
39318
39319 TEST(QC8_GEMM_MINMAX_FP32_2X4__WASM_FMAGIC, n_gt_4_strided_a) {
39320 for (uint32_t n = 5; n < 8; n++) {
39321 for (size_t k = 1; k <= 5; k += 2) {
39322 GemmMicrokernelTester()
39323 .mr(2)
39324 .nr(4)
39325 .kr(1)
39326 .sr(1)
39327 .m(2)
39328 .n(n)
39329 .k(k)
39330 .a_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080039331 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039332 }
39333 }
39334 }
39335
39336 TEST(QC8_GEMM_MINMAX_FP32_2X4__WASM_FMAGIC, n_gt_4_subtile) {
39337 for (uint32_t n = 5; n < 8; n++) {
39338 for (size_t k = 1; k <= 5; k += 2) {
39339 for (uint32_t m = 1; m <= 2; m++) {
39340 GemmMicrokernelTester()
39341 .mr(2)
39342 .nr(4)
39343 .kr(1)
39344 .sr(1)
39345 .m(m)
39346 .n(n)
39347 .k(k)
39348 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080039349 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039350 }
39351 }
39352 }
39353 }
39354
39355 TEST(QC8_GEMM_MINMAX_FP32_2X4__WASM_FMAGIC, n_div_4) {
39356 for (uint32_t n = 8; n <= 12; n += 4) {
39357 for (size_t k = 1; k <= 5; k += 2) {
39358 GemmMicrokernelTester()
39359 .mr(2)
39360 .nr(4)
39361 .kr(1)
39362 .sr(1)
39363 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080039364 .n(n)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039365 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080039366 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039367 }
39368 }
39369 }
39370
39371 TEST(QC8_GEMM_MINMAX_FP32_2X4__WASM_FMAGIC, n_div_4_strided_cn) {
39372 for (uint32_t n = 8; n <= 12; n += 4) {
39373 for (size_t k = 1; k <= 5; k += 2) {
39374 GemmMicrokernelTester()
39375 .mr(2)
39376 .nr(4)
39377 .kr(1)
39378 .sr(1)
39379 .m(2)
39380 .n(n)
39381 .k(k)
39382 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080039383 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039384 }
39385 }
39386 }
39387
39388 TEST(QC8_GEMM_MINMAX_FP32_2X4__WASM_FMAGIC, n_div_4_strided_a) {
39389 for (uint32_t n = 8; n <= 12; n += 4) {
39390 for (size_t k = 1; k <= 5; k += 2) {
39391 GemmMicrokernelTester()
39392 .mr(2)
39393 .nr(4)
39394 .kr(1)
39395 .sr(1)
39396 .m(2)
39397 .n(n)
39398 .k(k)
39399 .a_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080039400 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039401 }
39402 }
39403 }
39404
39405 TEST(QC8_GEMM_MINMAX_FP32_2X4__WASM_FMAGIC, n_div_4_subtile) {
39406 for (uint32_t n = 8; n <= 12; n += 4) {
39407 for (size_t k = 1; k <= 5; k += 2) {
39408 for (uint32_t m = 1; m <= 2; m++) {
39409 GemmMicrokernelTester()
39410 .mr(2)
39411 .nr(4)
39412 .kr(1)
39413 .sr(1)
39414 .m(m)
39415 .n(n)
39416 .k(k)
39417 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080039418 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039419 }
39420 }
39421 }
39422 }
39423
39424 TEST(QC8_GEMM_MINMAX_FP32_2X4__WASM_FMAGIC, strided_cm_subtile) {
39425 for (size_t k = 1; k <= 5; k += 2) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080039426 for (uint32_t n = 1; n <= 4; n++) {
39427 for (uint32_t m = 1; m <= 2; m++) {
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039428 GemmMicrokernelTester()
39429 .mr(2)
39430 .nr(4)
39431 .kr(1)
39432 .sr(1)
39433 .m(m)
39434 .n(n)
39435 .k(k)
39436 .cm_stride(7)
39437 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080039438 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039439 }
39440 }
39441 }
39442 }
39443
39444 TEST(QC8_GEMM_MINMAX_FP32_2X4__WASM_FMAGIC, qmin) {
39445 GemmMicrokernelTester()
39446 .mr(2)
39447 .nr(4)
39448 .kr(1)
39449 .sr(1)
39450 .m(2)
39451 .n(4)
39452 .k(1)
39453 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080039454 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039455 }
39456
39457 TEST(QC8_GEMM_MINMAX_FP32_2X4__WASM_FMAGIC, qmax) {
39458 GemmMicrokernelTester()
39459 .mr(2)
39460 .nr(4)
39461 .kr(1)
39462 .sr(1)
39463 .m(2)
39464 .n(4)
39465 .k(1)
39466 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080039467 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039468 }
39469
39470 TEST(QC8_GEMM_MINMAX_FP32_2X4__WASM_FMAGIC, strided_cm) {
39471 GemmMicrokernelTester()
39472 .mr(2)
39473 .nr(4)
39474 .kr(1)
39475 .sr(1)
39476 .m(2)
39477 .n(4)
39478 .k(1)
39479 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080039480 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039481 }
39482#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
39483
39484
39485#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039486 TEST(QC8_GEMM_MINMAX_FP32_4X4__WASM_FMAGIC, k_eq_1) {
39487 GemmMicrokernelTester()
39488 .mr(4)
39489 .nr(4)
39490 .kr(1)
39491 .sr(1)
39492 .m(4)
39493 .n(4)
39494 .k(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080039495 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039496 }
39497
39498 TEST(QC8_GEMM_MINMAX_FP32_4X4__WASM_FMAGIC, strided_cn) {
39499 GemmMicrokernelTester()
39500 .mr(4)
39501 .nr(4)
39502 .kr(1)
39503 .sr(1)
39504 .m(4)
39505 .n(4)
39506 .k(1)
39507 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080039508 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039509 }
39510
39511 TEST(QC8_GEMM_MINMAX_FP32_4X4__WASM_FMAGIC, k_eq_1_strided_a) {
39512 GemmMicrokernelTester()
39513 .mr(4)
39514 .nr(4)
39515 .kr(1)
39516 .sr(1)
39517 .m(4)
39518 .n(4)
39519 .k(1)
39520 .a_stride(3)
Marat Dukhan50323b82022-01-11 00:12:01 -080039521 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039522 }
39523
39524 TEST(QC8_GEMM_MINMAX_FP32_4X4__WASM_FMAGIC, k_eq_1_subtile) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080039525 for (uint32_t n = 1; n <= 4; n++) {
39526 for (uint32_t m = 1; m <= 4; m++) {
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039527 GemmMicrokernelTester()
39528 .mr(4)
39529 .nr(4)
39530 .kr(1)
39531 .sr(1)
39532 .m(m)
39533 .n(n)
39534 .k(1)
39535 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080039536 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039537 }
39538 }
39539 }
39540
39541 TEST(QC8_GEMM_MINMAX_FP32_4X4__WASM_FMAGIC, k_eq_1_subtile_m) {
39542 for (uint32_t m = 1; m <= 4; m++) {
39543 GemmMicrokernelTester()
39544 .mr(4)
39545 .nr(4)
39546 .kr(1)
39547 .sr(1)
39548 .m(m)
39549 .n(4)
39550 .k(1)
39551 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080039552 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039553 }
39554 }
39555
39556 TEST(QC8_GEMM_MINMAX_FP32_4X4__WASM_FMAGIC, k_eq_1_subtile_n) {
39557 for (uint32_t n = 1; n <= 4; n++) {
39558 GemmMicrokernelTester()
39559 .mr(4)
39560 .nr(4)
39561 .kr(1)
39562 .sr(1)
39563 .m(4)
39564 .n(n)
39565 .k(1)
39566 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080039567 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039568 }
39569 }
39570
39571 TEST(QC8_GEMM_MINMAX_FP32_4X4__WASM_FMAGIC, k_gt_1) {
39572 for (size_t k = 2; k < 10; k++) {
39573 GemmMicrokernelTester()
39574 .mr(4)
39575 .nr(4)
39576 .kr(1)
39577 .sr(1)
39578 .m(4)
39579 .n(4)
39580 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080039581 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039582 }
39583 }
39584
39585 TEST(QC8_GEMM_MINMAX_FP32_4X4__WASM_FMAGIC, k_gt_1_strided_a) {
39586 for (size_t k = 2; k < 10; k++) {
39587 GemmMicrokernelTester()
39588 .mr(4)
39589 .nr(4)
39590 .kr(1)
39591 .sr(1)
39592 .m(4)
39593 .n(4)
39594 .k(k)
39595 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080039596 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039597 }
39598 }
39599
39600 TEST(QC8_GEMM_MINMAX_FP32_4X4__WASM_FMAGIC, k_gt_1_subtile) {
39601 for (size_t k = 2; k < 10; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080039602 for (uint32_t n = 1; n <= 4; n++) {
39603 for (uint32_t m = 1; m <= 4; m++) {
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039604 GemmMicrokernelTester()
39605 .mr(4)
39606 .nr(4)
39607 .kr(1)
39608 .sr(1)
39609 .m(m)
39610 .n(n)
39611 .k(k)
39612 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080039613 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039614 }
39615 }
39616 }
39617 }
39618
39619 TEST(QC8_GEMM_MINMAX_FP32_4X4__WASM_FMAGIC, n_gt_4) {
39620 for (uint32_t n = 5; n < 8; n++) {
39621 for (size_t k = 1; k <= 5; k += 2) {
39622 GemmMicrokernelTester()
39623 .mr(4)
39624 .nr(4)
39625 .kr(1)
39626 .sr(1)
39627 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080039628 .n(n)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039629 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080039630 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039631 }
39632 }
39633 }
39634
39635 TEST(QC8_GEMM_MINMAX_FP32_4X4__WASM_FMAGIC, n_gt_4_strided_cn) {
39636 for (uint32_t n = 5; n < 8; n++) {
39637 for (size_t k = 1; k <= 5; k += 2) {
39638 GemmMicrokernelTester()
39639 .mr(4)
39640 .nr(4)
39641 .kr(1)
39642 .sr(1)
39643 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080039644 .n(n)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039645 .k(k)
39646 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080039647 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039648 }
39649 }
39650 }
39651
39652 TEST(QC8_GEMM_MINMAX_FP32_4X4__WASM_FMAGIC, n_gt_4_strided_a) {
39653 for (uint32_t n = 5; n < 8; n++) {
39654 for (size_t k = 1; k <= 5; k += 2) {
39655 GemmMicrokernelTester()
39656 .mr(4)
39657 .nr(4)
39658 .kr(1)
39659 .sr(1)
39660 .m(4)
39661 .n(n)
39662 .k(k)
39663 .a_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080039664 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039665 }
39666 }
39667 }
39668
39669 TEST(QC8_GEMM_MINMAX_FP32_4X4__WASM_FMAGIC, n_gt_4_subtile) {
39670 for (uint32_t n = 5; n < 8; n++) {
39671 for (size_t k = 1; k <= 5; k += 2) {
39672 for (uint32_t m = 1; m <= 4; m++) {
39673 GemmMicrokernelTester()
39674 .mr(4)
39675 .nr(4)
39676 .kr(1)
39677 .sr(1)
39678 .m(m)
39679 .n(n)
39680 .k(k)
39681 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080039682 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039683 }
39684 }
39685 }
39686 }
39687
39688 TEST(QC8_GEMM_MINMAX_FP32_4X4__WASM_FMAGIC, n_div_4) {
39689 for (uint32_t n = 8; n <= 12; n += 4) {
39690 for (size_t k = 1; k <= 5; k += 2) {
39691 GemmMicrokernelTester()
39692 .mr(4)
39693 .nr(4)
39694 .kr(1)
39695 .sr(1)
39696 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080039697 .n(n)
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039698 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080039699 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039700 }
39701 }
39702 }
39703
39704 TEST(QC8_GEMM_MINMAX_FP32_4X4__WASM_FMAGIC, n_div_4_strided_cn) {
39705 for (uint32_t n = 8; n <= 12; n += 4) {
39706 for (size_t k = 1; k <= 5; k += 2) {
39707 GemmMicrokernelTester()
39708 .mr(4)
39709 .nr(4)
39710 .kr(1)
39711 .sr(1)
39712 .m(4)
39713 .n(n)
39714 .k(k)
39715 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080039716 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039717 }
39718 }
39719 }
39720
39721 TEST(QC8_GEMM_MINMAX_FP32_4X4__WASM_FMAGIC, n_div_4_strided_a) {
39722 for (uint32_t n = 8; n <= 12; n += 4) {
39723 for (size_t k = 1; k <= 5; k += 2) {
39724 GemmMicrokernelTester()
39725 .mr(4)
39726 .nr(4)
39727 .kr(1)
39728 .sr(1)
39729 .m(4)
39730 .n(n)
39731 .k(k)
39732 .a_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080039733 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039734 }
39735 }
39736 }
39737
39738 TEST(QC8_GEMM_MINMAX_FP32_4X4__WASM_FMAGIC, n_div_4_subtile) {
39739 for (uint32_t n = 8; n <= 12; n += 4) {
39740 for (size_t k = 1; k <= 5; k += 2) {
39741 for (uint32_t m = 1; m <= 4; m++) {
39742 GemmMicrokernelTester()
39743 .mr(4)
39744 .nr(4)
39745 .kr(1)
39746 .sr(1)
39747 .m(m)
39748 .n(n)
39749 .k(k)
39750 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080039751 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039752 }
39753 }
39754 }
39755 }
39756
39757 TEST(QC8_GEMM_MINMAX_FP32_4X4__WASM_FMAGIC, strided_cm_subtile) {
39758 for (size_t k = 1; k <= 5; k += 2) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080039759 for (uint32_t n = 1; n <= 4; n++) {
39760 for (uint32_t m = 1; m <= 4; m++) {
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039761 GemmMicrokernelTester()
39762 .mr(4)
39763 .nr(4)
39764 .kr(1)
39765 .sr(1)
39766 .m(m)
39767 .n(n)
39768 .k(k)
39769 .cm_stride(7)
39770 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080039771 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039772 }
39773 }
39774 }
39775 }
39776
39777 TEST(QC8_GEMM_MINMAX_FP32_4X4__WASM_FMAGIC, qmin) {
39778 GemmMicrokernelTester()
39779 .mr(4)
39780 .nr(4)
39781 .kr(1)
39782 .sr(1)
39783 .m(4)
39784 .n(4)
39785 .k(1)
39786 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080039787 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039788 }
39789
39790 TEST(QC8_GEMM_MINMAX_FP32_4X4__WASM_FMAGIC, qmax) {
39791 GemmMicrokernelTester()
39792 .mr(4)
39793 .nr(4)
39794 .kr(1)
39795 .sr(1)
39796 .m(4)
39797 .n(4)
39798 .k(1)
39799 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080039800 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039801 }
39802
39803 TEST(QC8_GEMM_MINMAX_FP32_4X4__WASM_FMAGIC, strided_cm) {
39804 GemmMicrokernelTester()
39805 .mr(4)
39806 .nr(4)
39807 .kr(1)
39808 .sr(1)
39809 .m(4)
39810 .n(4)
39811 .k(1)
39812 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080039813 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan7c1115f2022-01-04 17:18:41 -080039814 }
39815#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
39816
39817
Marat Dukhan2ac722e2022-01-04 01:54:20 -080039818TEST(QC8_GEMM_MINMAX_FP32_1X2__SCALAR_FMAGIC, k_eq_1) {
Marat Dukhand6021542021-06-30 09:04:20 -070039819 GemmMicrokernelTester()
39820 .mr(1)
39821 .nr(2)
39822 .kr(1)
39823 .sr(1)
39824 .m(1)
39825 .n(2)
39826 .k(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080039827 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070039828}
39829
Marat Dukhan2ac722e2022-01-04 01:54:20 -080039830TEST(QC8_GEMM_MINMAX_FP32_1X2__SCALAR_FMAGIC, strided_cn) {
Marat Dukhand6021542021-06-30 09:04:20 -070039831 GemmMicrokernelTester()
39832 .mr(1)
39833 .nr(2)
39834 .kr(1)
39835 .sr(1)
39836 .m(1)
39837 .n(2)
39838 .k(1)
39839 .cn_stride(5)
Marat Dukhan50323b82022-01-11 00:12:01 -080039840 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070039841}
39842
Marat Dukhan2ac722e2022-01-04 01:54:20 -080039843TEST(QC8_GEMM_MINMAX_FP32_1X2__SCALAR_FMAGIC, k_eq_1_strided_a) {
Marat Dukhand6021542021-06-30 09:04:20 -070039844 GemmMicrokernelTester()
39845 .mr(1)
39846 .nr(2)
39847 .kr(1)
39848 .sr(1)
39849 .m(1)
39850 .n(2)
39851 .k(1)
39852 .a_stride(3)
Marat Dukhan50323b82022-01-11 00:12:01 -080039853 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070039854}
39855
Marat Dukhan2ac722e2022-01-04 01:54:20 -080039856TEST(QC8_GEMM_MINMAX_FP32_1X2__SCALAR_FMAGIC, k_eq_1_subtile) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080039857 for (uint32_t n = 1; n <= 2; n++) {
39858 for (uint32_t m = 1; m <= 1; m++) {
Marat Dukhand6021542021-06-30 09:04:20 -070039859 GemmMicrokernelTester()
39860 .mr(1)
39861 .nr(2)
39862 .kr(1)
39863 .sr(1)
39864 .m(m)
39865 .n(n)
39866 .k(1)
39867 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080039868 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070039869 }
39870 }
39871}
39872
Marat Dukhan2ac722e2022-01-04 01:54:20 -080039873TEST(QC8_GEMM_MINMAX_FP32_1X2__SCALAR_FMAGIC, k_eq_1_subtile_m) {
Marat Dukhand6021542021-06-30 09:04:20 -070039874 for (uint32_t m = 1; m <= 1; m++) {
39875 GemmMicrokernelTester()
39876 .mr(1)
39877 .nr(2)
39878 .kr(1)
39879 .sr(1)
39880 .m(m)
39881 .n(2)
39882 .k(1)
39883 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080039884 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070039885 }
39886}
39887
Marat Dukhan2ac722e2022-01-04 01:54:20 -080039888TEST(QC8_GEMM_MINMAX_FP32_1X2__SCALAR_FMAGIC, k_eq_1_subtile_n) {
Marat Dukhand6021542021-06-30 09:04:20 -070039889 for (uint32_t n = 1; n <= 2; n++) {
39890 GemmMicrokernelTester()
39891 .mr(1)
39892 .nr(2)
39893 .kr(1)
39894 .sr(1)
39895 .m(1)
39896 .n(n)
39897 .k(1)
39898 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080039899 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070039900 }
39901}
39902
Marat Dukhan2ac722e2022-01-04 01:54:20 -080039903TEST(QC8_GEMM_MINMAX_FP32_1X2__SCALAR_FMAGIC, k_gt_1) {
Marat Dukhand6021542021-06-30 09:04:20 -070039904 for (size_t k = 2; k < 10; k++) {
39905 GemmMicrokernelTester()
39906 .mr(1)
39907 .nr(2)
39908 .kr(1)
39909 .sr(1)
39910 .m(1)
39911 .n(2)
39912 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080039913 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070039914 }
39915}
39916
Marat Dukhan2ac722e2022-01-04 01:54:20 -080039917TEST(QC8_GEMM_MINMAX_FP32_1X2__SCALAR_FMAGIC, k_gt_1_strided_a) {
Marat Dukhand6021542021-06-30 09:04:20 -070039918 for (size_t k = 2; k < 10; k++) {
39919 GemmMicrokernelTester()
39920 .mr(1)
39921 .nr(2)
39922 .kr(1)
39923 .sr(1)
39924 .m(1)
39925 .n(2)
39926 .k(k)
39927 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080039928 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070039929 }
39930}
39931
Marat Dukhan2ac722e2022-01-04 01:54:20 -080039932TEST(QC8_GEMM_MINMAX_FP32_1X2__SCALAR_FMAGIC, k_gt_1_subtile) {
Marat Dukhand6021542021-06-30 09:04:20 -070039933 for (size_t k = 2; k < 10; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080039934 for (uint32_t n = 1; n <= 2; n++) {
39935 for (uint32_t m = 1; m <= 1; m++) {
Marat Dukhand6021542021-06-30 09:04:20 -070039936 GemmMicrokernelTester()
39937 .mr(1)
39938 .nr(2)
39939 .kr(1)
39940 .sr(1)
39941 .m(m)
39942 .n(n)
39943 .k(k)
39944 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080039945 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070039946 }
39947 }
39948 }
39949}
39950
Marat Dukhan2ac722e2022-01-04 01:54:20 -080039951TEST(QC8_GEMM_MINMAX_FP32_1X2__SCALAR_FMAGIC, n_gt_2) {
Marat Dukhand6021542021-06-30 09:04:20 -070039952 for (uint32_t n = 3; n < 4; n++) {
39953 for (size_t k = 1; k <= 5; k += 2) {
39954 GemmMicrokernelTester()
39955 .mr(1)
39956 .nr(2)
39957 .kr(1)
39958 .sr(1)
39959 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080039960 .n(n)
Marat Dukhand6021542021-06-30 09:04:20 -070039961 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080039962 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070039963 }
39964 }
39965}
39966
Marat Dukhan2ac722e2022-01-04 01:54:20 -080039967TEST(QC8_GEMM_MINMAX_FP32_1X2__SCALAR_FMAGIC, n_gt_2_strided_cn) {
Marat Dukhand6021542021-06-30 09:04:20 -070039968 for (uint32_t n = 3; n < 4; n++) {
39969 for (size_t k = 1; k <= 5; k += 2) {
39970 GemmMicrokernelTester()
39971 .mr(1)
39972 .nr(2)
39973 .kr(1)
39974 .sr(1)
39975 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080039976 .n(n)
Marat Dukhand6021542021-06-30 09:04:20 -070039977 .k(k)
39978 .cn_stride(5)
Marat Dukhan50323b82022-01-11 00:12:01 -080039979 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070039980 }
39981 }
39982}
39983
Marat Dukhan2ac722e2022-01-04 01:54:20 -080039984TEST(QC8_GEMM_MINMAX_FP32_1X2__SCALAR_FMAGIC, n_gt_2_strided_a) {
Marat Dukhand6021542021-06-30 09:04:20 -070039985 for (uint32_t n = 3; n < 4; n++) {
39986 for (size_t k = 1; k <= 5; k += 2) {
39987 GemmMicrokernelTester()
39988 .mr(1)
39989 .nr(2)
39990 .kr(1)
39991 .sr(1)
39992 .m(1)
39993 .n(n)
39994 .k(k)
39995 .a_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080039996 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070039997 }
39998 }
39999}
40000
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040001TEST(QC8_GEMM_MINMAX_FP32_1X2__SCALAR_FMAGIC, n_gt_2_subtile) {
Marat Dukhand6021542021-06-30 09:04:20 -070040002 for (uint32_t n = 3; n < 4; n++) {
40003 for (size_t k = 1; k <= 5; k += 2) {
40004 for (uint32_t m = 1; m <= 1; m++) {
40005 GemmMicrokernelTester()
40006 .mr(1)
40007 .nr(2)
40008 .kr(1)
40009 .sr(1)
40010 .m(m)
40011 .n(n)
40012 .k(k)
40013 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080040014 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040015 }
40016 }
40017 }
40018}
40019
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040020TEST(QC8_GEMM_MINMAX_FP32_1X2__SCALAR_FMAGIC, n_div_2) {
Marat Dukhand6021542021-06-30 09:04:20 -070040021 for (uint32_t n = 4; n <= 6; n += 2) {
40022 for (size_t k = 1; k <= 5; k += 2) {
40023 GemmMicrokernelTester()
40024 .mr(1)
40025 .nr(2)
40026 .kr(1)
40027 .sr(1)
40028 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080040029 .n(n)
Marat Dukhand6021542021-06-30 09:04:20 -070040030 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080040031 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040032 }
40033 }
40034}
40035
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040036TEST(QC8_GEMM_MINMAX_FP32_1X2__SCALAR_FMAGIC, n_div_2_strided_cn) {
Marat Dukhand6021542021-06-30 09:04:20 -070040037 for (uint32_t n = 4; n <= 6; n += 2) {
40038 for (size_t k = 1; k <= 5; k += 2) {
40039 GemmMicrokernelTester()
40040 .mr(1)
40041 .nr(2)
40042 .kr(1)
40043 .sr(1)
40044 .m(1)
40045 .n(n)
40046 .k(k)
40047 .cn_stride(5)
Marat Dukhan50323b82022-01-11 00:12:01 -080040048 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040049 }
40050 }
40051}
40052
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040053TEST(QC8_GEMM_MINMAX_FP32_1X2__SCALAR_FMAGIC, n_div_2_strided_a) {
Marat Dukhand6021542021-06-30 09:04:20 -070040054 for (uint32_t n = 4; n <= 6; n += 2) {
40055 for (size_t k = 1; k <= 5; k += 2) {
40056 GemmMicrokernelTester()
40057 .mr(1)
40058 .nr(2)
40059 .kr(1)
40060 .sr(1)
40061 .m(1)
40062 .n(n)
40063 .k(k)
40064 .a_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080040065 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040066 }
40067 }
40068}
40069
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040070TEST(QC8_GEMM_MINMAX_FP32_1X2__SCALAR_FMAGIC, n_div_2_subtile) {
Marat Dukhand6021542021-06-30 09:04:20 -070040071 for (uint32_t n = 4; n <= 6; n += 2) {
40072 for (size_t k = 1; k <= 5; k += 2) {
40073 for (uint32_t m = 1; m <= 1; m++) {
40074 GemmMicrokernelTester()
40075 .mr(1)
40076 .nr(2)
40077 .kr(1)
40078 .sr(1)
40079 .m(m)
40080 .n(n)
40081 .k(k)
40082 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080040083 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040084 }
40085 }
40086 }
40087}
40088
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040089TEST(QC8_GEMM_MINMAX_FP32_1X2__SCALAR_FMAGIC, strided_cm_subtile) {
Marat Dukhand6021542021-06-30 09:04:20 -070040090 for (size_t k = 1; k <= 5; k += 2) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080040091 for (uint32_t n = 1; n <= 2; n++) {
40092 for (uint32_t m = 1; m <= 1; m++) {
Marat Dukhand6021542021-06-30 09:04:20 -070040093 GemmMicrokernelTester()
40094 .mr(1)
40095 .nr(2)
40096 .kr(1)
40097 .sr(1)
40098 .m(m)
40099 .n(n)
40100 .k(k)
40101 .cm_stride(5)
40102 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080040103 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040104 }
40105 }
40106 }
40107}
40108
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040109TEST(QC8_GEMM_MINMAX_FP32_1X2__SCALAR_FMAGIC, qmin) {
Marat Dukhand6021542021-06-30 09:04:20 -070040110 GemmMicrokernelTester()
40111 .mr(1)
40112 .nr(2)
40113 .kr(1)
40114 .sr(1)
40115 .m(1)
40116 .n(2)
40117 .k(1)
40118 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080040119 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040120}
40121
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040122TEST(QC8_GEMM_MINMAX_FP32_1X2__SCALAR_FMAGIC, qmax) {
Marat Dukhand6021542021-06-30 09:04:20 -070040123 GemmMicrokernelTester()
40124 .mr(1)
40125 .nr(2)
40126 .kr(1)
40127 .sr(1)
40128 .m(1)
40129 .n(2)
40130 .k(1)
40131 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080040132 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040133}
40134
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040135TEST(QC8_GEMM_MINMAX_FP32_1X2__SCALAR_FMAGIC, strided_cm) {
Marat Dukhand6021542021-06-30 09:04:20 -070040136 GemmMicrokernelTester()
40137 .mr(1)
40138 .nr(2)
40139 .kr(1)
40140 .sr(1)
40141 .m(1)
40142 .n(2)
40143 .k(1)
40144 .cm_stride(5)
Marat Dukhan50323b82022-01-11 00:12:01 -080040145 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040146}
40147
40148
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040149TEST(QC8_GEMM_MINMAX_FP32_2X2__SCALAR_FMAGIC, k_eq_1) {
Marat Dukhand6021542021-06-30 09:04:20 -070040150 GemmMicrokernelTester()
40151 .mr(2)
40152 .nr(2)
40153 .kr(1)
40154 .sr(1)
40155 .m(2)
40156 .n(2)
40157 .k(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080040158 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040159}
40160
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040161TEST(QC8_GEMM_MINMAX_FP32_2X2__SCALAR_FMAGIC, strided_cn) {
Marat Dukhand6021542021-06-30 09:04:20 -070040162 GemmMicrokernelTester()
40163 .mr(2)
40164 .nr(2)
40165 .kr(1)
40166 .sr(1)
40167 .m(2)
40168 .n(2)
40169 .k(1)
40170 .cn_stride(5)
Marat Dukhan50323b82022-01-11 00:12:01 -080040171 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040172}
40173
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040174TEST(QC8_GEMM_MINMAX_FP32_2X2__SCALAR_FMAGIC, k_eq_1_strided_a) {
Marat Dukhand6021542021-06-30 09:04:20 -070040175 GemmMicrokernelTester()
40176 .mr(2)
40177 .nr(2)
40178 .kr(1)
40179 .sr(1)
40180 .m(2)
40181 .n(2)
40182 .k(1)
40183 .a_stride(3)
Marat Dukhan50323b82022-01-11 00:12:01 -080040184 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040185}
40186
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040187TEST(QC8_GEMM_MINMAX_FP32_2X2__SCALAR_FMAGIC, k_eq_1_subtile) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080040188 for (uint32_t n = 1; n <= 2; n++) {
40189 for (uint32_t m = 1; m <= 2; m++) {
Marat Dukhand6021542021-06-30 09:04:20 -070040190 GemmMicrokernelTester()
40191 .mr(2)
40192 .nr(2)
40193 .kr(1)
40194 .sr(1)
40195 .m(m)
40196 .n(n)
40197 .k(1)
40198 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080040199 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040200 }
40201 }
40202}
40203
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040204TEST(QC8_GEMM_MINMAX_FP32_2X2__SCALAR_FMAGIC, k_eq_1_subtile_m) {
Marat Dukhand6021542021-06-30 09:04:20 -070040205 for (uint32_t m = 1; m <= 2; m++) {
40206 GemmMicrokernelTester()
40207 .mr(2)
40208 .nr(2)
40209 .kr(1)
40210 .sr(1)
40211 .m(m)
40212 .n(2)
40213 .k(1)
40214 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080040215 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040216 }
40217}
40218
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040219TEST(QC8_GEMM_MINMAX_FP32_2X2__SCALAR_FMAGIC, k_eq_1_subtile_n) {
Marat Dukhand6021542021-06-30 09:04:20 -070040220 for (uint32_t n = 1; n <= 2; n++) {
40221 GemmMicrokernelTester()
40222 .mr(2)
40223 .nr(2)
40224 .kr(1)
40225 .sr(1)
40226 .m(2)
40227 .n(n)
40228 .k(1)
40229 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080040230 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040231 }
40232}
40233
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040234TEST(QC8_GEMM_MINMAX_FP32_2X2__SCALAR_FMAGIC, k_gt_1) {
Marat Dukhand6021542021-06-30 09:04:20 -070040235 for (size_t k = 2; k < 10; k++) {
40236 GemmMicrokernelTester()
40237 .mr(2)
40238 .nr(2)
40239 .kr(1)
40240 .sr(1)
40241 .m(2)
40242 .n(2)
40243 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080040244 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040245 }
40246}
40247
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040248TEST(QC8_GEMM_MINMAX_FP32_2X2__SCALAR_FMAGIC, k_gt_1_strided_a) {
Marat Dukhand6021542021-06-30 09:04:20 -070040249 for (size_t k = 2; k < 10; k++) {
40250 GemmMicrokernelTester()
40251 .mr(2)
40252 .nr(2)
40253 .kr(1)
40254 .sr(1)
40255 .m(2)
40256 .n(2)
40257 .k(k)
40258 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080040259 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040260 }
40261}
40262
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040263TEST(QC8_GEMM_MINMAX_FP32_2X2__SCALAR_FMAGIC, k_gt_1_subtile) {
Marat Dukhand6021542021-06-30 09:04:20 -070040264 for (size_t k = 2; k < 10; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080040265 for (uint32_t n = 1; n <= 2; n++) {
40266 for (uint32_t m = 1; m <= 2; m++) {
Marat Dukhand6021542021-06-30 09:04:20 -070040267 GemmMicrokernelTester()
40268 .mr(2)
40269 .nr(2)
40270 .kr(1)
40271 .sr(1)
40272 .m(m)
40273 .n(n)
40274 .k(k)
40275 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080040276 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040277 }
40278 }
40279 }
40280}
40281
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040282TEST(QC8_GEMM_MINMAX_FP32_2X2__SCALAR_FMAGIC, n_gt_2) {
Marat Dukhand6021542021-06-30 09:04:20 -070040283 for (uint32_t n = 3; n < 4; n++) {
40284 for (size_t k = 1; k <= 5; k += 2) {
40285 GemmMicrokernelTester()
40286 .mr(2)
40287 .nr(2)
40288 .kr(1)
40289 .sr(1)
40290 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080040291 .n(n)
Marat Dukhand6021542021-06-30 09:04:20 -070040292 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080040293 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040294 }
40295 }
40296}
40297
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040298TEST(QC8_GEMM_MINMAX_FP32_2X2__SCALAR_FMAGIC, n_gt_2_strided_cn) {
Marat Dukhand6021542021-06-30 09:04:20 -070040299 for (uint32_t n = 3; n < 4; n++) {
40300 for (size_t k = 1; k <= 5; k += 2) {
40301 GemmMicrokernelTester()
40302 .mr(2)
40303 .nr(2)
40304 .kr(1)
40305 .sr(1)
40306 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080040307 .n(n)
Marat Dukhand6021542021-06-30 09:04:20 -070040308 .k(k)
40309 .cn_stride(5)
Marat Dukhan50323b82022-01-11 00:12:01 -080040310 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040311 }
40312 }
40313}
40314
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040315TEST(QC8_GEMM_MINMAX_FP32_2X2__SCALAR_FMAGIC, n_gt_2_strided_a) {
Marat Dukhand6021542021-06-30 09:04:20 -070040316 for (uint32_t n = 3; n < 4; n++) {
40317 for (size_t k = 1; k <= 5; k += 2) {
40318 GemmMicrokernelTester()
40319 .mr(2)
40320 .nr(2)
40321 .kr(1)
40322 .sr(1)
40323 .m(2)
40324 .n(n)
40325 .k(k)
40326 .a_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080040327 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040328 }
40329 }
40330}
40331
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040332TEST(QC8_GEMM_MINMAX_FP32_2X2__SCALAR_FMAGIC, n_gt_2_subtile) {
Marat Dukhand6021542021-06-30 09:04:20 -070040333 for (uint32_t n = 3; n < 4; n++) {
40334 for (size_t k = 1; k <= 5; k += 2) {
40335 for (uint32_t m = 1; m <= 2; m++) {
40336 GemmMicrokernelTester()
40337 .mr(2)
40338 .nr(2)
40339 .kr(1)
40340 .sr(1)
40341 .m(m)
40342 .n(n)
40343 .k(k)
40344 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080040345 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040346 }
40347 }
40348 }
40349}
40350
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040351TEST(QC8_GEMM_MINMAX_FP32_2X2__SCALAR_FMAGIC, n_div_2) {
Marat Dukhand6021542021-06-30 09:04:20 -070040352 for (uint32_t n = 4; n <= 6; n += 2) {
40353 for (size_t k = 1; k <= 5; k += 2) {
40354 GemmMicrokernelTester()
40355 .mr(2)
40356 .nr(2)
40357 .kr(1)
40358 .sr(1)
40359 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080040360 .n(n)
Marat Dukhand6021542021-06-30 09:04:20 -070040361 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080040362 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040363 }
40364 }
40365}
40366
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040367TEST(QC8_GEMM_MINMAX_FP32_2X2__SCALAR_FMAGIC, n_div_2_strided_cn) {
Marat Dukhand6021542021-06-30 09:04:20 -070040368 for (uint32_t n = 4; n <= 6; n += 2) {
40369 for (size_t k = 1; k <= 5; k += 2) {
40370 GemmMicrokernelTester()
40371 .mr(2)
40372 .nr(2)
40373 .kr(1)
40374 .sr(1)
40375 .m(2)
40376 .n(n)
40377 .k(k)
40378 .cn_stride(5)
Marat Dukhan50323b82022-01-11 00:12:01 -080040379 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040380 }
40381 }
40382}
40383
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040384TEST(QC8_GEMM_MINMAX_FP32_2X2__SCALAR_FMAGIC, n_div_2_strided_a) {
Marat Dukhand6021542021-06-30 09:04:20 -070040385 for (uint32_t n = 4; n <= 6; n += 2) {
40386 for (size_t k = 1; k <= 5; k += 2) {
40387 GemmMicrokernelTester()
40388 .mr(2)
40389 .nr(2)
40390 .kr(1)
40391 .sr(1)
40392 .m(2)
40393 .n(n)
40394 .k(k)
40395 .a_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080040396 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040397 }
40398 }
40399}
40400
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040401TEST(QC8_GEMM_MINMAX_FP32_2X2__SCALAR_FMAGIC, n_div_2_subtile) {
Marat Dukhand6021542021-06-30 09:04:20 -070040402 for (uint32_t n = 4; n <= 6; n += 2) {
40403 for (size_t k = 1; k <= 5; k += 2) {
40404 for (uint32_t m = 1; m <= 2; m++) {
40405 GemmMicrokernelTester()
40406 .mr(2)
40407 .nr(2)
40408 .kr(1)
40409 .sr(1)
40410 .m(m)
40411 .n(n)
40412 .k(k)
40413 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080040414 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040415 }
40416 }
40417 }
40418}
40419
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040420TEST(QC8_GEMM_MINMAX_FP32_2X2__SCALAR_FMAGIC, strided_cm_subtile) {
Marat Dukhand6021542021-06-30 09:04:20 -070040421 for (size_t k = 1; k <= 5; k += 2) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080040422 for (uint32_t n = 1; n <= 2; n++) {
40423 for (uint32_t m = 1; m <= 2; m++) {
Marat Dukhand6021542021-06-30 09:04:20 -070040424 GemmMicrokernelTester()
40425 .mr(2)
40426 .nr(2)
40427 .kr(1)
40428 .sr(1)
40429 .m(m)
40430 .n(n)
40431 .k(k)
40432 .cm_stride(5)
40433 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080040434 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040435 }
40436 }
40437 }
40438}
40439
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040440TEST(QC8_GEMM_MINMAX_FP32_2X2__SCALAR_FMAGIC, qmin) {
Marat Dukhand6021542021-06-30 09:04:20 -070040441 GemmMicrokernelTester()
40442 .mr(2)
40443 .nr(2)
40444 .kr(1)
40445 .sr(1)
40446 .m(2)
40447 .n(2)
40448 .k(1)
40449 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080040450 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040451}
40452
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040453TEST(QC8_GEMM_MINMAX_FP32_2X2__SCALAR_FMAGIC, qmax) {
Marat Dukhand6021542021-06-30 09:04:20 -070040454 GemmMicrokernelTester()
40455 .mr(2)
40456 .nr(2)
40457 .kr(1)
40458 .sr(1)
40459 .m(2)
40460 .n(2)
40461 .k(1)
40462 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080040463 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040464}
40465
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040466TEST(QC8_GEMM_MINMAX_FP32_2X2__SCALAR_FMAGIC, strided_cm) {
Marat Dukhand6021542021-06-30 09:04:20 -070040467 GemmMicrokernelTester()
40468 .mr(2)
40469 .nr(2)
40470 .kr(1)
40471 .sr(1)
40472 .m(2)
40473 .n(2)
40474 .k(1)
40475 .cm_stride(5)
Marat Dukhan50323b82022-01-11 00:12:01 -080040476 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040477}
40478
40479
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040480TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_FMAGIC, k_eq_1) {
Marat Dukhand6021542021-06-30 09:04:20 -070040481 GemmMicrokernelTester()
40482 .mr(1)
40483 .nr(4)
40484 .kr(1)
40485 .sr(1)
40486 .m(1)
40487 .n(4)
40488 .k(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080040489 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040490}
40491
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040492TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_FMAGIC, strided_cn) {
Marat Dukhand6021542021-06-30 09:04:20 -070040493 GemmMicrokernelTester()
40494 .mr(1)
40495 .nr(4)
40496 .kr(1)
40497 .sr(1)
40498 .m(1)
40499 .n(4)
40500 .k(1)
40501 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080040502 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040503}
40504
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040505TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_FMAGIC, k_eq_1_strided_a) {
Marat Dukhand6021542021-06-30 09:04:20 -070040506 GemmMicrokernelTester()
40507 .mr(1)
40508 .nr(4)
40509 .kr(1)
40510 .sr(1)
40511 .m(1)
40512 .n(4)
40513 .k(1)
40514 .a_stride(3)
Marat Dukhan50323b82022-01-11 00:12:01 -080040515 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040516}
40517
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040518TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_FMAGIC, k_eq_1_subtile) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080040519 for (uint32_t n = 1; n <= 4; n++) {
40520 for (uint32_t m = 1; m <= 1; m++) {
Marat Dukhand6021542021-06-30 09:04:20 -070040521 GemmMicrokernelTester()
40522 .mr(1)
40523 .nr(4)
40524 .kr(1)
40525 .sr(1)
40526 .m(m)
40527 .n(n)
40528 .k(1)
40529 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080040530 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040531 }
40532 }
40533}
40534
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040535TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_FMAGIC, k_eq_1_subtile_m) {
Marat Dukhand6021542021-06-30 09:04:20 -070040536 for (uint32_t m = 1; m <= 1; m++) {
40537 GemmMicrokernelTester()
40538 .mr(1)
40539 .nr(4)
40540 .kr(1)
40541 .sr(1)
40542 .m(m)
40543 .n(4)
40544 .k(1)
40545 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080040546 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040547 }
40548}
40549
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040550TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_FMAGIC, k_eq_1_subtile_n) {
Marat Dukhand6021542021-06-30 09:04:20 -070040551 for (uint32_t n = 1; n <= 4; n++) {
40552 GemmMicrokernelTester()
40553 .mr(1)
40554 .nr(4)
40555 .kr(1)
40556 .sr(1)
40557 .m(1)
40558 .n(n)
40559 .k(1)
40560 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080040561 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040562 }
40563}
40564
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040565TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_FMAGIC, k_gt_1) {
Marat Dukhand6021542021-06-30 09:04:20 -070040566 for (size_t k = 2; k < 10; k++) {
40567 GemmMicrokernelTester()
40568 .mr(1)
40569 .nr(4)
40570 .kr(1)
40571 .sr(1)
40572 .m(1)
40573 .n(4)
40574 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080040575 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040576 }
40577}
40578
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040579TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_FMAGIC, k_gt_1_strided_a) {
Marat Dukhand6021542021-06-30 09:04:20 -070040580 for (size_t k = 2; k < 10; k++) {
40581 GemmMicrokernelTester()
40582 .mr(1)
40583 .nr(4)
40584 .kr(1)
40585 .sr(1)
40586 .m(1)
40587 .n(4)
40588 .k(k)
40589 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080040590 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040591 }
40592}
40593
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040594TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_FMAGIC, k_gt_1_subtile) {
Marat Dukhand6021542021-06-30 09:04:20 -070040595 for (size_t k = 2; k < 10; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080040596 for (uint32_t n = 1; n <= 4; n++) {
40597 for (uint32_t m = 1; m <= 1; m++) {
Marat Dukhand6021542021-06-30 09:04:20 -070040598 GemmMicrokernelTester()
40599 .mr(1)
40600 .nr(4)
40601 .kr(1)
40602 .sr(1)
40603 .m(m)
40604 .n(n)
40605 .k(k)
40606 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080040607 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040608 }
40609 }
40610 }
40611}
40612
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040613TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_FMAGIC, n_gt_4) {
Marat Dukhand6021542021-06-30 09:04:20 -070040614 for (uint32_t n = 5; n < 8; n++) {
40615 for (size_t k = 1; k <= 5; k += 2) {
40616 GemmMicrokernelTester()
40617 .mr(1)
40618 .nr(4)
40619 .kr(1)
40620 .sr(1)
40621 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080040622 .n(n)
Marat Dukhand6021542021-06-30 09:04:20 -070040623 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080040624 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040625 }
40626 }
40627}
40628
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040629TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_FMAGIC, n_gt_4_strided_cn) {
Marat Dukhand6021542021-06-30 09:04:20 -070040630 for (uint32_t n = 5; n < 8; n++) {
40631 for (size_t k = 1; k <= 5; k += 2) {
40632 GemmMicrokernelTester()
40633 .mr(1)
40634 .nr(4)
40635 .kr(1)
40636 .sr(1)
40637 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080040638 .n(n)
Marat Dukhand6021542021-06-30 09:04:20 -070040639 .k(k)
40640 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080040641 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040642 }
40643 }
40644}
40645
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040646TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_FMAGIC, n_gt_4_strided_a) {
Marat Dukhand6021542021-06-30 09:04:20 -070040647 for (uint32_t n = 5; n < 8; n++) {
40648 for (size_t k = 1; k <= 5; k += 2) {
40649 GemmMicrokernelTester()
40650 .mr(1)
40651 .nr(4)
40652 .kr(1)
40653 .sr(1)
40654 .m(1)
40655 .n(n)
40656 .k(k)
40657 .a_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080040658 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040659 }
40660 }
40661}
40662
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040663TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_FMAGIC, n_gt_4_subtile) {
Marat Dukhand6021542021-06-30 09:04:20 -070040664 for (uint32_t n = 5; n < 8; n++) {
40665 for (size_t k = 1; k <= 5; k += 2) {
40666 for (uint32_t m = 1; m <= 1; m++) {
40667 GemmMicrokernelTester()
40668 .mr(1)
40669 .nr(4)
40670 .kr(1)
40671 .sr(1)
40672 .m(m)
40673 .n(n)
40674 .k(k)
40675 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080040676 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040677 }
40678 }
40679 }
40680}
40681
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040682TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_FMAGIC, n_div_4) {
Marat Dukhand6021542021-06-30 09:04:20 -070040683 for (uint32_t n = 8; n <= 12; n += 4) {
40684 for (size_t k = 1; k <= 5; k += 2) {
40685 GemmMicrokernelTester()
40686 .mr(1)
40687 .nr(4)
40688 .kr(1)
40689 .sr(1)
40690 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080040691 .n(n)
Marat Dukhand6021542021-06-30 09:04:20 -070040692 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080040693 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040694 }
40695 }
40696}
40697
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040698TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_FMAGIC, n_div_4_strided_cn) {
Marat Dukhand6021542021-06-30 09:04:20 -070040699 for (uint32_t n = 8; n <= 12; n += 4) {
40700 for (size_t k = 1; k <= 5; k += 2) {
40701 GemmMicrokernelTester()
40702 .mr(1)
40703 .nr(4)
40704 .kr(1)
40705 .sr(1)
40706 .m(1)
40707 .n(n)
40708 .k(k)
40709 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080040710 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040711 }
40712 }
40713}
40714
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040715TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_FMAGIC, n_div_4_strided_a) {
Marat Dukhand6021542021-06-30 09:04:20 -070040716 for (uint32_t n = 8; n <= 12; n += 4) {
40717 for (size_t k = 1; k <= 5; k += 2) {
40718 GemmMicrokernelTester()
40719 .mr(1)
40720 .nr(4)
40721 .kr(1)
40722 .sr(1)
40723 .m(1)
40724 .n(n)
40725 .k(k)
40726 .a_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080040727 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040728 }
40729 }
40730}
40731
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040732TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_FMAGIC, n_div_4_subtile) {
Marat Dukhand6021542021-06-30 09:04:20 -070040733 for (uint32_t n = 8; n <= 12; n += 4) {
40734 for (size_t k = 1; k <= 5; k += 2) {
40735 for (uint32_t m = 1; m <= 1; m++) {
40736 GemmMicrokernelTester()
40737 .mr(1)
40738 .nr(4)
40739 .kr(1)
40740 .sr(1)
40741 .m(m)
40742 .n(n)
40743 .k(k)
40744 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080040745 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040746 }
40747 }
40748 }
40749}
40750
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040751TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_FMAGIC, strided_cm_subtile) {
Marat Dukhand6021542021-06-30 09:04:20 -070040752 for (size_t k = 1; k <= 5; k += 2) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080040753 for (uint32_t n = 1; n <= 4; n++) {
40754 for (uint32_t m = 1; m <= 1; m++) {
Marat Dukhand6021542021-06-30 09:04:20 -070040755 GemmMicrokernelTester()
40756 .mr(1)
40757 .nr(4)
40758 .kr(1)
40759 .sr(1)
40760 .m(m)
40761 .n(n)
40762 .k(k)
40763 .cm_stride(7)
40764 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080040765 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040766 }
40767 }
40768 }
40769}
40770
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040771TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_FMAGIC, qmin) {
Marat Dukhand6021542021-06-30 09:04:20 -070040772 GemmMicrokernelTester()
40773 .mr(1)
40774 .nr(4)
40775 .kr(1)
40776 .sr(1)
40777 .m(1)
40778 .n(4)
40779 .k(1)
40780 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080040781 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040782}
40783
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040784TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_FMAGIC, qmax) {
Marat Dukhand6021542021-06-30 09:04:20 -070040785 GemmMicrokernelTester()
40786 .mr(1)
40787 .nr(4)
40788 .kr(1)
40789 .sr(1)
40790 .m(1)
40791 .n(4)
40792 .k(1)
40793 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080040794 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040795}
40796
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040797TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_FMAGIC, strided_cm) {
Marat Dukhand6021542021-06-30 09:04:20 -070040798 GemmMicrokernelTester()
40799 .mr(1)
40800 .nr(4)
40801 .kr(1)
40802 .sr(1)
40803 .m(1)
40804 .n(4)
40805 .k(1)
40806 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080040807 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040808}
40809
40810
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040811TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_FMAGIC, k_eq_1) {
Marat Dukhand6021542021-06-30 09:04:20 -070040812 GemmMicrokernelTester()
40813 .mr(3)
40814 .nr(4)
40815 .kr(1)
40816 .sr(1)
40817 .m(3)
40818 .n(4)
40819 .k(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080040820 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040821}
40822
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040823TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_FMAGIC, strided_cn) {
Marat Dukhand6021542021-06-30 09:04:20 -070040824 GemmMicrokernelTester()
40825 .mr(3)
40826 .nr(4)
40827 .kr(1)
40828 .sr(1)
40829 .m(3)
40830 .n(4)
40831 .k(1)
40832 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080040833 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040834}
40835
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040836TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_FMAGIC, k_eq_1_strided_a) {
Marat Dukhand6021542021-06-30 09:04:20 -070040837 GemmMicrokernelTester()
40838 .mr(3)
40839 .nr(4)
40840 .kr(1)
40841 .sr(1)
40842 .m(3)
40843 .n(4)
40844 .k(1)
40845 .a_stride(3)
Marat Dukhan50323b82022-01-11 00:12:01 -080040846 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040847}
40848
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040849TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_FMAGIC, k_eq_1_subtile) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080040850 for (uint32_t n = 1; n <= 4; n++) {
40851 for (uint32_t m = 1; m <= 3; m++) {
Marat Dukhand6021542021-06-30 09:04:20 -070040852 GemmMicrokernelTester()
40853 .mr(3)
40854 .nr(4)
40855 .kr(1)
40856 .sr(1)
40857 .m(m)
40858 .n(n)
40859 .k(1)
40860 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080040861 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040862 }
40863 }
40864}
40865
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040866TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_FMAGIC, k_eq_1_subtile_m) {
Marat Dukhand6021542021-06-30 09:04:20 -070040867 for (uint32_t m = 1; m <= 3; m++) {
40868 GemmMicrokernelTester()
40869 .mr(3)
40870 .nr(4)
40871 .kr(1)
40872 .sr(1)
40873 .m(m)
40874 .n(4)
40875 .k(1)
40876 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080040877 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040878 }
40879}
40880
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040881TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_FMAGIC, k_eq_1_subtile_n) {
Marat Dukhand6021542021-06-30 09:04:20 -070040882 for (uint32_t n = 1; n <= 4; n++) {
40883 GemmMicrokernelTester()
40884 .mr(3)
40885 .nr(4)
40886 .kr(1)
40887 .sr(1)
40888 .m(3)
40889 .n(n)
40890 .k(1)
40891 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080040892 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040893 }
40894}
40895
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040896TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_FMAGIC, k_gt_1) {
Marat Dukhand6021542021-06-30 09:04:20 -070040897 for (size_t k = 2; k < 10; k++) {
40898 GemmMicrokernelTester()
40899 .mr(3)
40900 .nr(4)
40901 .kr(1)
40902 .sr(1)
40903 .m(3)
40904 .n(4)
40905 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080040906 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040907 }
40908}
40909
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040910TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_FMAGIC, k_gt_1_strided_a) {
Marat Dukhand6021542021-06-30 09:04:20 -070040911 for (size_t k = 2; k < 10; k++) {
40912 GemmMicrokernelTester()
40913 .mr(3)
40914 .nr(4)
40915 .kr(1)
40916 .sr(1)
40917 .m(3)
40918 .n(4)
40919 .k(k)
40920 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080040921 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040922 }
40923}
40924
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040925TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_FMAGIC, k_gt_1_subtile) {
Marat Dukhand6021542021-06-30 09:04:20 -070040926 for (size_t k = 2; k < 10; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080040927 for (uint32_t n = 1; n <= 4; n++) {
40928 for (uint32_t m = 1; m <= 3; m++) {
Marat Dukhand6021542021-06-30 09:04:20 -070040929 GemmMicrokernelTester()
40930 .mr(3)
40931 .nr(4)
40932 .kr(1)
40933 .sr(1)
40934 .m(m)
40935 .n(n)
40936 .k(k)
40937 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080040938 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040939 }
40940 }
40941 }
40942}
40943
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040944TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_FMAGIC, n_gt_4) {
Marat Dukhand6021542021-06-30 09:04:20 -070040945 for (uint32_t n = 5; n < 8; n++) {
40946 for (size_t k = 1; k <= 5; k += 2) {
40947 GemmMicrokernelTester()
40948 .mr(3)
40949 .nr(4)
40950 .kr(1)
40951 .sr(1)
40952 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080040953 .n(n)
Marat Dukhand6021542021-06-30 09:04:20 -070040954 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080040955 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040956 }
40957 }
40958}
40959
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040960TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_FMAGIC, n_gt_4_strided_cn) {
Marat Dukhand6021542021-06-30 09:04:20 -070040961 for (uint32_t n = 5; n < 8; n++) {
40962 for (size_t k = 1; k <= 5; k += 2) {
40963 GemmMicrokernelTester()
40964 .mr(3)
40965 .nr(4)
40966 .kr(1)
40967 .sr(1)
40968 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080040969 .n(n)
Marat Dukhand6021542021-06-30 09:04:20 -070040970 .k(k)
40971 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080040972 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040973 }
40974 }
40975}
40976
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040977TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_FMAGIC, n_gt_4_strided_a) {
Marat Dukhand6021542021-06-30 09:04:20 -070040978 for (uint32_t n = 5; n < 8; n++) {
40979 for (size_t k = 1; k <= 5; k += 2) {
40980 GemmMicrokernelTester()
40981 .mr(3)
40982 .nr(4)
40983 .kr(1)
40984 .sr(1)
40985 .m(3)
40986 .n(n)
40987 .k(k)
40988 .a_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080040989 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070040990 }
40991 }
40992}
40993
Marat Dukhan2ac722e2022-01-04 01:54:20 -080040994TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_FMAGIC, n_gt_4_subtile) {
Marat Dukhand6021542021-06-30 09:04:20 -070040995 for (uint32_t n = 5; n < 8; n++) {
40996 for (size_t k = 1; k <= 5; k += 2) {
40997 for (uint32_t m = 1; m <= 3; m++) {
40998 GemmMicrokernelTester()
40999 .mr(3)
41000 .nr(4)
41001 .kr(1)
41002 .sr(1)
41003 .m(m)
41004 .n(n)
41005 .k(k)
41006 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080041007 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070041008 }
41009 }
41010 }
41011}
41012
Marat Dukhan2ac722e2022-01-04 01:54:20 -080041013TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_FMAGIC, n_div_4) {
Marat Dukhand6021542021-06-30 09:04:20 -070041014 for (uint32_t n = 8; n <= 12; n += 4) {
41015 for (size_t k = 1; k <= 5; k += 2) {
41016 GemmMicrokernelTester()
41017 .mr(3)
41018 .nr(4)
41019 .kr(1)
41020 .sr(1)
41021 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080041022 .n(n)
Marat Dukhand6021542021-06-30 09:04:20 -070041023 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080041024 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070041025 }
41026 }
41027}
41028
Marat Dukhan2ac722e2022-01-04 01:54:20 -080041029TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_FMAGIC, n_div_4_strided_cn) {
Marat Dukhand6021542021-06-30 09:04:20 -070041030 for (uint32_t n = 8; n <= 12; n += 4) {
41031 for (size_t k = 1; k <= 5; k += 2) {
41032 GemmMicrokernelTester()
41033 .mr(3)
41034 .nr(4)
41035 .kr(1)
41036 .sr(1)
41037 .m(3)
41038 .n(n)
41039 .k(k)
41040 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080041041 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070041042 }
41043 }
41044}
41045
Marat Dukhan2ac722e2022-01-04 01:54:20 -080041046TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_FMAGIC, n_div_4_strided_a) {
Marat Dukhand6021542021-06-30 09:04:20 -070041047 for (uint32_t n = 8; n <= 12; n += 4) {
41048 for (size_t k = 1; k <= 5; k += 2) {
41049 GemmMicrokernelTester()
41050 .mr(3)
41051 .nr(4)
41052 .kr(1)
41053 .sr(1)
41054 .m(3)
41055 .n(n)
41056 .k(k)
41057 .a_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080041058 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070041059 }
41060 }
41061}
41062
Marat Dukhan2ac722e2022-01-04 01:54:20 -080041063TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_FMAGIC, n_div_4_subtile) {
Marat Dukhand6021542021-06-30 09:04:20 -070041064 for (uint32_t n = 8; n <= 12; n += 4) {
41065 for (size_t k = 1; k <= 5; k += 2) {
41066 for (uint32_t m = 1; m <= 3; m++) {
41067 GemmMicrokernelTester()
41068 .mr(3)
41069 .nr(4)
41070 .kr(1)
41071 .sr(1)
41072 .m(m)
41073 .n(n)
41074 .k(k)
41075 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080041076 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070041077 }
41078 }
41079 }
41080}
41081
Marat Dukhan2ac722e2022-01-04 01:54:20 -080041082TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_FMAGIC, strided_cm_subtile) {
Marat Dukhand6021542021-06-30 09:04:20 -070041083 for (size_t k = 1; k <= 5; k += 2) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080041084 for (uint32_t n = 1; n <= 4; n++) {
41085 for (uint32_t m = 1; m <= 3; m++) {
Marat Dukhand6021542021-06-30 09:04:20 -070041086 GemmMicrokernelTester()
41087 .mr(3)
41088 .nr(4)
41089 .kr(1)
41090 .sr(1)
41091 .m(m)
41092 .n(n)
41093 .k(k)
41094 .cm_stride(7)
41095 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080041096 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070041097 }
41098 }
41099 }
41100}
41101
Marat Dukhan2ac722e2022-01-04 01:54:20 -080041102TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_FMAGIC, qmin) {
Marat Dukhand6021542021-06-30 09:04:20 -070041103 GemmMicrokernelTester()
41104 .mr(3)
41105 .nr(4)
41106 .kr(1)
41107 .sr(1)
41108 .m(3)
41109 .n(4)
41110 .k(1)
41111 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080041112 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070041113}
41114
Marat Dukhan2ac722e2022-01-04 01:54:20 -080041115TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_FMAGIC, qmax) {
Marat Dukhand6021542021-06-30 09:04:20 -070041116 GemmMicrokernelTester()
41117 .mr(3)
41118 .nr(4)
41119 .kr(1)
41120 .sr(1)
41121 .m(3)
41122 .n(4)
41123 .k(1)
41124 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080041125 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070041126}
41127
Marat Dukhan2ac722e2022-01-04 01:54:20 -080041128TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_FMAGIC, strided_cm) {
Marat Dukhand6021542021-06-30 09:04:20 -070041129 GemmMicrokernelTester()
41130 .mr(3)
41131 .nr(4)
41132 .kr(1)
41133 .sr(1)
41134 .m(3)
41135 .n(4)
41136 .k(1)
41137 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080041138 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Marat Dukhand6021542021-06-30 09:04:20 -070041139}
41140
41141
Marat Dukhan272d4d92022-01-04 15:07:14 -080041142TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_IMAGIC, k_eq_1) {
41143 GemmMicrokernelTester()
41144 .mr(1)
41145 .nr(4)
41146 .kr(1)
41147 .sr(1)
41148 .m(1)
41149 .n(4)
41150 .k(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080041151 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan272d4d92022-01-04 15:07:14 -080041152}
41153
41154TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_IMAGIC, strided_cn) {
41155 GemmMicrokernelTester()
41156 .mr(1)
41157 .nr(4)
41158 .kr(1)
41159 .sr(1)
41160 .m(1)
41161 .n(4)
41162 .k(1)
41163 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080041164 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan272d4d92022-01-04 15:07:14 -080041165}
41166
41167TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_IMAGIC, k_eq_1_strided_a) {
41168 GemmMicrokernelTester()
41169 .mr(1)
41170 .nr(4)
41171 .kr(1)
41172 .sr(1)
41173 .m(1)
41174 .n(4)
41175 .k(1)
41176 .a_stride(3)
Marat Dukhan50323b82022-01-11 00:12:01 -080041177 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan272d4d92022-01-04 15:07:14 -080041178}
41179
41180TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_IMAGIC, k_eq_1_subtile) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080041181 for (uint32_t n = 1; n <= 4; n++) {
41182 for (uint32_t m = 1; m <= 1; m++) {
Marat Dukhan272d4d92022-01-04 15:07:14 -080041183 GemmMicrokernelTester()
41184 .mr(1)
41185 .nr(4)
41186 .kr(1)
41187 .sr(1)
41188 .m(m)
41189 .n(n)
41190 .k(1)
41191 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080041192 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan272d4d92022-01-04 15:07:14 -080041193 }
41194 }
41195}
41196
41197TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_IMAGIC, k_eq_1_subtile_m) {
41198 for (uint32_t m = 1; m <= 1; m++) {
41199 GemmMicrokernelTester()
41200 .mr(1)
41201 .nr(4)
41202 .kr(1)
41203 .sr(1)
41204 .m(m)
41205 .n(4)
41206 .k(1)
41207 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080041208 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan272d4d92022-01-04 15:07:14 -080041209 }
41210}
41211
41212TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_IMAGIC, k_eq_1_subtile_n) {
41213 for (uint32_t n = 1; n <= 4; n++) {
41214 GemmMicrokernelTester()
41215 .mr(1)
41216 .nr(4)
41217 .kr(1)
41218 .sr(1)
41219 .m(1)
41220 .n(n)
41221 .k(1)
41222 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080041223 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan272d4d92022-01-04 15:07:14 -080041224 }
41225}
41226
41227TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_IMAGIC, k_gt_1) {
41228 for (size_t k = 2; k < 10; k++) {
41229 GemmMicrokernelTester()
41230 .mr(1)
41231 .nr(4)
41232 .kr(1)
41233 .sr(1)
41234 .m(1)
41235 .n(4)
41236 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080041237 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan272d4d92022-01-04 15:07:14 -080041238 }
41239}
41240
41241TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_IMAGIC, k_gt_1_strided_a) {
41242 for (size_t k = 2; k < 10; k++) {
41243 GemmMicrokernelTester()
41244 .mr(1)
41245 .nr(4)
41246 .kr(1)
41247 .sr(1)
41248 .m(1)
41249 .n(4)
41250 .k(k)
41251 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080041252 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan272d4d92022-01-04 15:07:14 -080041253 }
41254}
41255
41256TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_IMAGIC, k_gt_1_subtile) {
41257 for (size_t k = 2; k < 10; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080041258 for (uint32_t n = 1; n <= 4; n++) {
41259 for (uint32_t m = 1; m <= 1; m++) {
Marat Dukhan272d4d92022-01-04 15:07:14 -080041260 GemmMicrokernelTester()
41261 .mr(1)
41262 .nr(4)
41263 .kr(1)
41264 .sr(1)
41265 .m(m)
41266 .n(n)
41267 .k(k)
41268 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080041269 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan272d4d92022-01-04 15:07:14 -080041270 }
41271 }
41272 }
41273}
41274
41275TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_IMAGIC, n_gt_4) {
41276 for (uint32_t n = 5; n < 8; n++) {
41277 for (size_t k = 1; k <= 5; k += 2) {
41278 GemmMicrokernelTester()
41279 .mr(1)
41280 .nr(4)
41281 .kr(1)
41282 .sr(1)
41283 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080041284 .n(n)
Marat Dukhan272d4d92022-01-04 15:07:14 -080041285 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080041286 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan272d4d92022-01-04 15:07:14 -080041287 }
41288 }
41289}
41290
41291TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_IMAGIC, n_gt_4_strided_cn) {
41292 for (uint32_t n = 5; n < 8; n++) {
41293 for (size_t k = 1; k <= 5; k += 2) {
41294 GemmMicrokernelTester()
41295 .mr(1)
41296 .nr(4)
41297 .kr(1)
41298 .sr(1)
41299 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080041300 .n(n)
Marat Dukhan272d4d92022-01-04 15:07:14 -080041301 .k(k)
41302 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080041303 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan272d4d92022-01-04 15:07:14 -080041304 }
41305 }
41306}
41307
41308TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_IMAGIC, n_gt_4_strided_a) {
41309 for (uint32_t n = 5; n < 8; n++) {
41310 for (size_t k = 1; k <= 5; k += 2) {
41311 GemmMicrokernelTester()
41312 .mr(1)
41313 .nr(4)
41314 .kr(1)
41315 .sr(1)
41316 .m(1)
41317 .n(n)
41318 .k(k)
41319 .a_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080041320 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan272d4d92022-01-04 15:07:14 -080041321 }
41322 }
41323}
41324
41325TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_IMAGIC, n_gt_4_subtile) {
41326 for (uint32_t n = 5; n < 8; n++) {
41327 for (size_t k = 1; k <= 5; k += 2) {
41328 for (uint32_t m = 1; m <= 1; m++) {
41329 GemmMicrokernelTester()
41330 .mr(1)
41331 .nr(4)
41332 .kr(1)
41333 .sr(1)
41334 .m(m)
41335 .n(n)
41336 .k(k)
41337 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080041338 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan272d4d92022-01-04 15:07:14 -080041339 }
41340 }
41341 }
41342}
41343
41344TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_IMAGIC, n_div_4) {
41345 for (uint32_t n = 8; n <= 12; n += 4) {
41346 for (size_t k = 1; k <= 5; k += 2) {
41347 GemmMicrokernelTester()
41348 .mr(1)
41349 .nr(4)
41350 .kr(1)
41351 .sr(1)
41352 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080041353 .n(n)
Marat Dukhan272d4d92022-01-04 15:07:14 -080041354 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080041355 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan272d4d92022-01-04 15:07:14 -080041356 }
41357 }
41358}
41359
41360TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_IMAGIC, n_div_4_strided_cn) {
41361 for (uint32_t n = 8; n <= 12; n += 4) {
41362 for (size_t k = 1; k <= 5; k += 2) {
41363 GemmMicrokernelTester()
41364 .mr(1)
41365 .nr(4)
41366 .kr(1)
41367 .sr(1)
41368 .m(1)
41369 .n(n)
41370 .k(k)
41371 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080041372 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan272d4d92022-01-04 15:07:14 -080041373 }
41374 }
41375}
41376
41377TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_IMAGIC, n_div_4_strided_a) {
41378 for (uint32_t n = 8; n <= 12; n += 4) {
41379 for (size_t k = 1; k <= 5; k += 2) {
41380 GemmMicrokernelTester()
41381 .mr(1)
41382 .nr(4)
41383 .kr(1)
41384 .sr(1)
41385 .m(1)
41386 .n(n)
41387 .k(k)
41388 .a_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080041389 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan272d4d92022-01-04 15:07:14 -080041390 }
41391 }
41392}
41393
41394TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_IMAGIC, n_div_4_subtile) {
41395 for (uint32_t n = 8; n <= 12; n += 4) {
41396 for (size_t k = 1; k <= 5; k += 2) {
41397 for (uint32_t m = 1; m <= 1; m++) {
41398 GemmMicrokernelTester()
41399 .mr(1)
41400 .nr(4)
41401 .kr(1)
41402 .sr(1)
41403 .m(m)
41404 .n(n)
41405 .k(k)
41406 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080041407 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan272d4d92022-01-04 15:07:14 -080041408 }
41409 }
41410 }
41411}
41412
41413TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_IMAGIC, strided_cm_subtile) {
41414 for (size_t k = 1; k <= 5; k += 2) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080041415 for (uint32_t n = 1; n <= 4; n++) {
41416 for (uint32_t m = 1; m <= 1; m++) {
Marat Dukhan272d4d92022-01-04 15:07:14 -080041417 GemmMicrokernelTester()
41418 .mr(1)
41419 .nr(4)
41420 .kr(1)
41421 .sr(1)
41422 .m(m)
41423 .n(n)
41424 .k(k)
41425 .cm_stride(7)
41426 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080041427 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan272d4d92022-01-04 15:07:14 -080041428 }
41429 }
41430 }
41431}
41432
41433TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_IMAGIC, qmin) {
41434 GemmMicrokernelTester()
41435 .mr(1)
41436 .nr(4)
41437 .kr(1)
41438 .sr(1)
41439 .m(1)
41440 .n(4)
41441 .k(1)
41442 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080041443 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan272d4d92022-01-04 15:07:14 -080041444}
41445
41446TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_IMAGIC, qmax) {
41447 GemmMicrokernelTester()
41448 .mr(1)
41449 .nr(4)
41450 .kr(1)
41451 .sr(1)
41452 .m(1)
41453 .n(4)
41454 .k(1)
41455 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080041456 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan272d4d92022-01-04 15:07:14 -080041457}
41458
41459TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_IMAGIC, strided_cm) {
41460 GemmMicrokernelTester()
41461 .mr(1)
41462 .nr(4)
41463 .kr(1)
41464 .sr(1)
41465 .m(1)
41466 .n(4)
41467 .k(1)
41468 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080041469 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Marat Dukhan272d4d92022-01-04 15:07:14 -080041470}
41471
41472
Marat Dukhan272d4d92022-01-04 15:07:14 -080041473TEST(QC8_GEMM_MINMAX_FP32_4X2__SCALAR_LRINTF, k_eq_1) {
41474 GemmMicrokernelTester()
41475 .mr(4)
41476 .nr(2)
41477 .kr(1)
41478 .sr(1)
41479 .m(4)
41480 .n(2)
41481 .k(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080041482 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Marat Dukhan272d4d92022-01-04 15:07:14 -080041483}
41484
41485TEST(QC8_GEMM_MINMAX_FP32_4X2__SCALAR_LRINTF, strided_cn) {
41486 GemmMicrokernelTester()
41487 .mr(4)
41488 .nr(2)
41489 .kr(1)
41490 .sr(1)
41491 .m(4)
41492 .n(2)
41493 .k(1)
41494 .cn_stride(5)
Marat Dukhan50323b82022-01-11 00:12:01 -080041495 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Marat Dukhan272d4d92022-01-04 15:07:14 -080041496}
41497
41498TEST(QC8_GEMM_MINMAX_FP32_4X2__SCALAR_LRINTF, k_eq_1_strided_a) {
41499 GemmMicrokernelTester()
41500 .mr(4)
41501 .nr(2)
41502 .kr(1)
41503 .sr(1)
41504 .m(4)
41505 .n(2)
41506 .k(1)
41507 .a_stride(3)
Marat Dukhan50323b82022-01-11 00:12:01 -080041508 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Marat Dukhan272d4d92022-01-04 15:07:14 -080041509}
41510
41511TEST(QC8_GEMM_MINMAX_FP32_4X2__SCALAR_LRINTF, k_eq_1_subtile) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080041512 for (uint32_t n = 1; n <= 2; n++) {
41513 for (uint32_t m = 1; m <= 4; m++) {
Marat Dukhan272d4d92022-01-04 15:07:14 -080041514 GemmMicrokernelTester()
41515 .mr(4)
41516 .nr(2)
41517 .kr(1)
41518 .sr(1)
41519 .m(m)
41520 .n(n)
41521 .k(1)
41522 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080041523 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Marat Dukhan272d4d92022-01-04 15:07:14 -080041524 }
41525 }
41526}
41527
41528TEST(QC8_GEMM_MINMAX_FP32_4X2__SCALAR_LRINTF, k_eq_1_subtile_m) {
41529 for (uint32_t m = 1; m <= 4; m++) {
41530 GemmMicrokernelTester()
41531 .mr(4)
41532 .nr(2)
41533 .kr(1)
41534 .sr(1)
41535 .m(m)
41536 .n(2)
41537 .k(1)
41538 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080041539 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Marat Dukhan272d4d92022-01-04 15:07:14 -080041540 }
41541}
41542
41543TEST(QC8_GEMM_MINMAX_FP32_4X2__SCALAR_LRINTF, k_eq_1_subtile_n) {
41544 for (uint32_t n = 1; n <= 2; n++) {
41545 GemmMicrokernelTester()
41546 .mr(4)
41547 .nr(2)
41548 .kr(1)
41549 .sr(1)
41550 .m(4)
41551 .n(n)
41552 .k(1)
41553 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080041554 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Marat Dukhan272d4d92022-01-04 15:07:14 -080041555 }
41556}
41557
41558TEST(QC8_GEMM_MINMAX_FP32_4X2__SCALAR_LRINTF, k_gt_1) {
41559 for (size_t k = 2; k < 10; k++) {
41560 GemmMicrokernelTester()
41561 .mr(4)
41562 .nr(2)
41563 .kr(1)
41564 .sr(1)
41565 .m(4)
41566 .n(2)
41567 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080041568 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Marat Dukhan272d4d92022-01-04 15:07:14 -080041569 }
41570}
41571
41572TEST(QC8_GEMM_MINMAX_FP32_4X2__SCALAR_LRINTF, k_gt_1_strided_a) {
41573 for (size_t k = 2; k < 10; k++) {
41574 GemmMicrokernelTester()
41575 .mr(4)
41576 .nr(2)
41577 .kr(1)
41578 .sr(1)
41579 .m(4)
41580 .n(2)
41581 .k(k)
41582 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080041583 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Marat Dukhan272d4d92022-01-04 15:07:14 -080041584 }
41585}
41586
41587TEST(QC8_GEMM_MINMAX_FP32_4X2__SCALAR_LRINTF, k_gt_1_subtile) {
41588 for (size_t k = 2; k < 10; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080041589 for (uint32_t n = 1; n <= 2; n++) {
41590 for (uint32_t m = 1; m <= 4; m++) {
Marat Dukhan272d4d92022-01-04 15:07:14 -080041591 GemmMicrokernelTester()
41592 .mr(4)
41593 .nr(2)
41594 .kr(1)
41595 .sr(1)
41596 .m(m)
41597 .n(n)
41598 .k(k)
41599 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080041600 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Marat Dukhan272d4d92022-01-04 15:07:14 -080041601 }
41602 }
41603 }
41604}
41605
41606TEST(QC8_GEMM_MINMAX_FP32_4X2__SCALAR_LRINTF, n_gt_2) {
41607 for (uint32_t n = 3; n < 4; n++) {
41608 for (size_t k = 1; k <= 5; k += 2) {
41609 GemmMicrokernelTester()
41610 .mr(4)
41611 .nr(2)
41612 .kr(1)
41613 .sr(1)
41614 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080041615 .n(n)
Marat Dukhan272d4d92022-01-04 15:07:14 -080041616 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080041617 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Marat Dukhan272d4d92022-01-04 15:07:14 -080041618 }
41619 }
41620}
41621
41622TEST(QC8_GEMM_MINMAX_FP32_4X2__SCALAR_LRINTF, n_gt_2_strided_cn) {
41623 for (uint32_t n = 3; n < 4; n++) {
41624 for (size_t k = 1; k <= 5; k += 2) {
41625 GemmMicrokernelTester()
41626 .mr(4)
41627 .nr(2)
41628 .kr(1)
41629 .sr(1)
41630 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080041631 .n(n)
Marat Dukhan272d4d92022-01-04 15:07:14 -080041632 .k(k)
41633 .cn_stride(5)
Marat Dukhan50323b82022-01-11 00:12:01 -080041634 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Marat Dukhan272d4d92022-01-04 15:07:14 -080041635 }
41636 }
41637}
41638
41639TEST(QC8_GEMM_MINMAX_FP32_4X2__SCALAR_LRINTF, n_gt_2_strided_a) {
41640 for (uint32_t n = 3; n < 4; n++) {
41641 for (size_t k = 1; k <= 5; k += 2) {
41642 GemmMicrokernelTester()
41643 .mr(4)
41644 .nr(2)
41645 .kr(1)
41646 .sr(1)
41647 .m(4)
41648 .n(n)
41649 .k(k)
41650 .a_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080041651 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Marat Dukhan272d4d92022-01-04 15:07:14 -080041652 }
41653 }
41654}
41655
41656TEST(QC8_GEMM_MINMAX_FP32_4X2__SCALAR_LRINTF, n_gt_2_subtile) {
41657 for (uint32_t n = 3; n < 4; n++) {
41658 for (size_t k = 1; k <= 5; k += 2) {
41659 for (uint32_t m = 1; m <= 4; m++) {
41660 GemmMicrokernelTester()
41661 .mr(4)
41662 .nr(2)
41663 .kr(1)
41664 .sr(1)
41665 .m(m)
41666 .n(n)
41667 .k(k)
41668 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080041669 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Marat Dukhan272d4d92022-01-04 15:07:14 -080041670 }
41671 }
41672 }
41673}
41674
41675TEST(QC8_GEMM_MINMAX_FP32_4X2__SCALAR_LRINTF, n_div_2) {
41676 for (uint32_t n = 4; n <= 6; n += 2) {
41677 for (size_t k = 1; k <= 5; k += 2) {
41678 GemmMicrokernelTester()
41679 .mr(4)
41680 .nr(2)
41681 .kr(1)
41682 .sr(1)
41683 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080041684 .n(n)
Marat Dukhan272d4d92022-01-04 15:07:14 -080041685 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080041686 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Marat Dukhan272d4d92022-01-04 15:07:14 -080041687 }
41688 }
41689}
41690
41691TEST(QC8_GEMM_MINMAX_FP32_4X2__SCALAR_LRINTF, n_div_2_strided_cn) {
41692 for (uint32_t n = 4; n <= 6; n += 2) {
41693 for (size_t k = 1; k <= 5; k += 2) {
41694 GemmMicrokernelTester()
41695 .mr(4)
41696 .nr(2)
41697 .kr(1)
41698 .sr(1)
41699 .m(4)
41700 .n(n)
41701 .k(k)
41702 .cn_stride(5)
Marat Dukhan50323b82022-01-11 00:12:01 -080041703 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Marat Dukhan272d4d92022-01-04 15:07:14 -080041704 }
41705 }
41706}
41707
41708TEST(QC8_GEMM_MINMAX_FP32_4X2__SCALAR_LRINTF, n_div_2_strided_a) {
41709 for (uint32_t n = 4; n <= 6; n += 2) {
41710 for (size_t k = 1; k <= 5; k += 2) {
41711 GemmMicrokernelTester()
41712 .mr(4)
41713 .nr(2)
41714 .kr(1)
41715 .sr(1)
41716 .m(4)
41717 .n(n)
41718 .k(k)
41719 .a_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080041720 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Marat Dukhan272d4d92022-01-04 15:07:14 -080041721 }
41722 }
41723}
41724
41725TEST(QC8_GEMM_MINMAX_FP32_4X2__SCALAR_LRINTF, n_div_2_subtile) {
41726 for (uint32_t n = 4; n <= 6; n += 2) {
41727 for (size_t k = 1; k <= 5; k += 2) {
41728 for (uint32_t m = 1; m <= 4; m++) {
41729 GemmMicrokernelTester()
41730 .mr(4)
41731 .nr(2)
41732 .kr(1)
41733 .sr(1)
41734 .m(m)
41735 .n(n)
41736 .k(k)
41737 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080041738 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Marat Dukhan272d4d92022-01-04 15:07:14 -080041739 }
41740 }
41741 }
41742}
41743
41744TEST(QC8_GEMM_MINMAX_FP32_4X2__SCALAR_LRINTF, strided_cm_subtile) {
41745 for (size_t k = 1; k <= 5; k += 2) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080041746 for (uint32_t n = 1; n <= 2; n++) {
41747 for (uint32_t m = 1; m <= 4; m++) {
Marat Dukhan272d4d92022-01-04 15:07:14 -080041748 GemmMicrokernelTester()
41749 .mr(4)
41750 .nr(2)
41751 .kr(1)
41752 .sr(1)
41753 .m(m)
41754 .n(n)
41755 .k(k)
41756 .cm_stride(5)
41757 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080041758 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Marat Dukhan272d4d92022-01-04 15:07:14 -080041759 }
41760 }
41761 }
41762}
41763
41764TEST(QC8_GEMM_MINMAX_FP32_4X2__SCALAR_LRINTF, qmin) {
41765 GemmMicrokernelTester()
41766 .mr(4)
41767 .nr(2)
41768 .kr(1)
41769 .sr(1)
41770 .m(4)
41771 .n(2)
41772 .k(1)
41773 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080041774 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Marat Dukhan272d4d92022-01-04 15:07:14 -080041775}
41776
41777TEST(QC8_GEMM_MINMAX_FP32_4X2__SCALAR_LRINTF, qmax) {
41778 GemmMicrokernelTester()
41779 .mr(4)
41780 .nr(2)
41781 .kr(1)
41782 .sr(1)
41783 .m(4)
41784 .n(2)
41785 .k(1)
41786 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080041787 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Marat Dukhan272d4d92022-01-04 15:07:14 -080041788}
41789
41790TEST(QC8_GEMM_MINMAX_FP32_4X2__SCALAR_LRINTF, strided_cm) {
41791 GemmMicrokernelTester()
41792 .mr(4)
41793 .nr(2)
41794 .kr(1)
41795 .sr(1)
41796 .m(4)
41797 .n(2)
41798 .k(1)
41799 .cm_stride(5)
Marat Dukhan50323b82022-01-11 00:12:01 -080041800 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Marat Dukhan272d4d92022-01-04 15:07:14 -080041801}
41802
41803
Zhi An Ng16b734c2022-01-06 13:54:40 -080041804#if XNN_ARCH_ARM && !XNN_PLATFORM_IOS && XNN_ENABLE_ASSEMBLY && XNN_PLATFORM_JIT
41805 TEST(GENERATE_QC8_GEMM_FP32_4X8C4__AARCH32_NEONDOT_LD64, k_eq_8) {
41806 TEST_REQUIRES_ARM_NEON_DOT;
Zhi An Ng16b734c2022-01-06 13:54:40 -080041807 GemmMicrokernelTester()
41808 .mr(4)
41809 .nr(8)
41810 .kr(4)
41811 .sr(1)
41812 .m(4)
41813 .n(8)
41814 .k(8)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080041815 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng16b734c2022-01-06 13:54:40 -080041816 }
41817
41818 TEST(GENERATE_QC8_GEMM_FP32_4X8C4__AARCH32_NEONDOT_LD64, strided_cn) {
41819 TEST_REQUIRES_ARM_NEON_DOT;
Zhi An Ng16b734c2022-01-06 13:54:40 -080041820 GemmMicrokernelTester()
41821 .mr(4)
41822 .nr(8)
41823 .kr(4)
41824 .sr(1)
41825 .m(4)
41826 .n(8)
41827 .k(8)
41828 .cn_stride(11)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080041829 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng16b734c2022-01-06 13:54:40 -080041830 }
41831
41832 TEST(GENERATE_QC8_GEMM_FP32_4X8C4__AARCH32_NEONDOT_LD64, k_eq_8_strided_a) {
41833 TEST_REQUIRES_ARM_NEON_DOT;
Zhi An Ng16b734c2022-01-06 13:54:40 -080041834 GemmMicrokernelTester()
41835 .mr(4)
41836 .nr(8)
41837 .kr(4)
41838 .sr(1)
41839 .m(4)
41840 .n(8)
41841 .k(8)
41842 .a_stride(11)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080041843 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng16b734c2022-01-06 13:54:40 -080041844 }
41845
41846 TEST(GENERATE_QC8_GEMM_FP32_4X8C4__AARCH32_NEONDOT_LD64, k_eq_8_subtile) {
41847 TEST_REQUIRES_ARM_NEON_DOT;
Zhi An Ng83844ae2022-01-14 09:52:25 -080041848 for (uint32_t n = 1; n <= 8; n++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080041849 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng16b734c2022-01-06 13:54:40 -080041850 GemmMicrokernelTester()
41851 .mr(4)
41852 .nr(8)
41853 .kr(4)
41854 .sr(1)
41855 .m(m)
41856 .n(n)
41857 .k(8)
41858 .iterations(1)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080041859 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng16b734c2022-01-06 13:54:40 -080041860 }
41861 }
Zhi An Ng16b734c2022-01-06 13:54:40 -080041862 }
41863
41864 TEST(GENERATE_QC8_GEMM_FP32_4X8C4__AARCH32_NEONDOT_LD64, k_eq_8_subtile_m) {
41865 TEST_REQUIRES_ARM_NEON_DOT;
Zhi An Ng16b734c2022-01-06 13:54:40 -080041866 for (uint32_t m = 1; m <= 4; m++) {
41867 GemmMicrokernelTester()
41868 .mr(4)
41869 .nr(8)
41870 .kr(4)
41871 .sr(1)
41872 .m(m)
41873 .n(8)
41874 .k(8)
41875 .iterations(1)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080041876 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng16b734c2022-01-06 13:54:40 -080041877 }
Zhi An Ng16b734c2022-01-06 13:54:40 -080041878 }
41879
41880 TEST(GENERATE_QC8_GEMM_FP32_4X8C4__AARCH32_NEONDOT_LD64, k_eq_8_subtile_n) {
41881 TEST_REQUIRES_ARM_NEON_DOT;
Zhi An Ng16b734c2022-01-06 13:54:40 -080041882 for (uint32_t n = 1; n <= 8; n++) {
41883 GemmMicrokernelTester()
41884 .mr(4)
41885 .nr(8)
41886 .kr(4)
41887 .sr(1)
41888 .m(4)
41889 .n(n)
41890 .k(8)
41891 .iterations(1)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080041892 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng16b734c2022-01-06 13:54:40 -080041893 }
Zhi An Ng16b734c2022-01-06 13:54:40 -080041894 }
41895
41896 TEST(GENERATE_QC8_GEMM_FP32_4X8C4__AARCH32_NEONDOT_LD64, k_lt_8) {
41897 TEST_REQUIRES_ARM_NEON_DOT;
Zhi An Ng16b734c2022-01-06 13:54:40 -080041898 for (size_t k = 1; k < 8; k++) {
41899 GemmMicrokernelTester()
41900 .mr(4)
41901 .nr(8)
41902 .kr(4)
41903 .sr(1)
41904 .m(4)
41905 .n(8)
41906 .k(k)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080041907 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng16b734c2022-01-06 13:54:40 -080041908 }
Zhi An Ng16b734c2022-01-06 13:54:40 -080041909 }
41910
41911 TEST(GENERATE_QC8_GEMM_FP32_4X8C4__AARCH32_NEONDOT_LD64, k_lt_8_strided_a) {
41912 TEST_REQUIRES_ARM_NEON_DOT;
Zhi An Ng16b734c2022-01-06 13:54:40 -080041913 for (size_t k = 1; k < 8; k++) {
41914 GemmMicrokernelTester()
41915 .mr(4)
41916 .nr(8)
41917 .kr(4)
41918 .sr(1)
41919 .m(4)
41920 .n(8)
41921 .k(k)
41922 .a_stride(11)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080041923 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng16b734c2022-01-06 13:54:40 -080041924 }
Zhi An Ng16b734c2022-01-06 13:54:40 -080041925 }
41926
41927 TEST(GENERATE_QC8_GEMM_FP32_4X8C4__AARCH32_NEONDOT_LD64, k_lt_8_subtile) {
41928 TEST_REQUIRES_ARM_NEON_DOT;
Zhi An Ng16b734c2022-01-06 13:54:40 -080041929 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080041930 for (uint32_t n = 1; n <= 8; n++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080041931 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng16b734c2022-01-06 13:54:40 -080041932 GemmMicrokernelTester()
41933 .mr(4)
41934 .nr(8)
41935 .kr(4)
41936 .sr(1)
41937 .m(m)
41938 .n(n)
41939 .k(k)
41940 .iterations(1)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080041941 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng16b734c2022-01-06 13:54:40 -080041942 }
41943 }
41944 }
Zhi An Ng16b734c2022-01-06 13:54:40 -080041945 }
41946
41947 TEST(GENERATE_QC8_GEMM_FP32_4X8C4__AARCH32_NEONDOT_LD64, k_gt_8) {
41948 TEST_REQUIRES_ARM_NEON_DOT;
Zhi An Ng16b734c2022-01-06 13:54:40 -080041949 for (size_t k = 9; k < 16; k++) {
41950 GemmMicrokernelTester()
41951 .mr(4)
41952 .nr(8)
41953 .kr(4)
41954 .sr(1)
41955 .m(4)
41956 .n(8)
41957 .k(k)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080041958 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng16b734c2022-01-06 13:54:40 -080041959 }
Zhi An Ng16b734c2022-01-06 13:54:40 -080041960 }
41961
41962 TEST(GENERATE_QC8_GEMM_FP32_4X8C4__AARCH32_NEONDOT_LD64, k_gt_8_strided_a) {
41963 TEST_REQUIRES_ARM_NEON_DOT;
Zhi An Ng16b734c2022-01-06 13:54:40 -080041964 for (size_t k = 9; k < 16; k++) {
41965 GemmMicrokernelTester()
41966 .mr(4)
41967 .nr(8)
41968 .kr(4)
41969 .sr(1)
41970 .m(4)
41971 .n(8)
41972 .k(k)
41973 .a_stride(19)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080041974 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng16b734c2022-01-06 13:54:40 -080041975 }
Zhi An Ng16b734c2022-01-06 13:54:40 -080041976 }
41977
41978 TEST(GENERATE_QC8_GEMM_FP32_4X8C4__AARCH32_NEONDOT_LD64, k_gt_8_subtile) {
41979 TEST_REQUIRES_ARM_NEON_DOT;
Zhi An Ng16b734c2022-01-06 13:54:40 -080041980 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080041981 for (uint32_t n = 1; n <= 8; n++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080041982 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng16b734c2022-01-06 13:54:40 -080041983 GemmMicrokernelTester()
41984 .mr(4)
41985 .nr(8)
41986 .kr(4)
41987 .sr(1)
41988 .m(m)
41989 .n(n)
41990 .k(k)
41991 .iterations(1)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080041992 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng16b734c2022-01-06 13:54:40 -080041993 }
41994 }
41995 }
Zhi An Ng16b734c2022-01-06 13:54:40 -080041996 }
41997
41998 TEST(GENERATE_QC8_GEMM_FP32_4X8C4__AARCH32_NEONDOT_LD64, k_div_8) {
41999 TEST_REQUIRES_ARM_NEON_DOT;
Zhi An Ng16b734c2022-01-06 13:54:40 -080042000 for (size_t k = 16; k <= 80; k += 8) {
42001 GemmMicrokernelTester()
42002 .mr(4)
42003 .nr(8)
42004 .kr(4)
42005 .sr(1)
42006 .m(4)
42007 .n(8)
42008 .k(k)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080042009 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng16b734c2022-01-06 13:54:40 -080042010 }
Zhi An Ng16b734c2022-01-06 13:54:40 -080042011 }
42012
42013 TEST(GENERATE_QC8_GEMM_FP32_4X8C4__AARCH32_NEONDOT_LD64, k_div_8_strided_a) {
42014 TEST_REQUIRES_ARM_NEON_DOT;
Zhi An Ng16b734c2022-01-06 13:54:40 -080042015 for (size_t k = 16; k <= 80; k += 8) {
42016 GemmMicrokernelTester()
42017 .mr(4)
42018 .nr(8)
42019 .kr(4)
42020 .sr(1)
42021 .m(4)
42022 .n(8)
42023 .k(k)
42024 .a_stride(83)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080042025 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng16b734c2022-01-06 13:54:40 -080042026 }
Zhi An Ng16b734c2022-01-06 13:54:40 -080042027 }
42028
42029 TEST(GENERATE_QC8_GEMM_FP32_4X8C4__AARCH32_NEONDOT_LD64, k_div_8_subtile) {
42030 TEST_REQUIRES_ARM_NEON_DOT;
Zhi An Ng16b734c2022-01-06 13:54:40 -080042031 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080042032 for (uint32_t n = 1; n <= 8; n++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080042033 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng16b734c2022-01-06 13:54:40 -080042034 GemmMicrokernelTester()
42035 .mr(4)
42036 .nr(8)
42037 .kr(4)
42038 .sr(1)
42039 .m(m)
42040 .n(n)
42041 .k(k)
42042 .iterations(1)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080042043 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng16b734c2022-01-06 13:54:40 -080042044 }
42045 }
42046 }
Zhi An Ng16b734c2022-01-06 13:54:40 -080042047 }
42048
42049 TEST(GENERATE_QC8_GEMM_FP32_4X8C4__AARCH32_NEONDOT_LD64, n_gt_8) {
42050 TEST_REQUIRES_ARM_NEON_DOT;
Zhi An Ng16b734c2022-01-06 13:54:40 -080042051 for (uint32_t n = 9; n < 16; n++) {
42052 for (size_t k = 1; k <= 40; k += 9) {
42053 GemmMicrokernelTester()
42054 .mr(4)
42055 .nr(8)
42056 .kr(4)
42057 .sr(1)
42058 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080042059 .n(n)
Zhi An Ng16b734c2022-01-06 13:54:40 -080042060 .k(k)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080042061 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng16b734c2022-01-06 13:54:40 -080042062 }
42063 }
Zhi An Ng16b734c2022-01-06 13:54:40 -080042064 }
42065
42066 TEST(GENERATE_QC8_GEMM_FP32_4X8C4__AARCH32_NEONDOT_LD64, n_gt_8_strided_cn) {
42067 TEST_REQUIRES_ARM_NEON_DOT;
Zhi An Ng16b734c2022-01-06 13:54:40 -080042068 for (uint32_t n = 9; n < 16; n++) {
42069 for (size_t k = 1; k <= 40; k += 9) {
42070 GemmMicrokernelTester()
42071 .mr(4)
42072 .nr(8)
42073 .kr(4)
42074 .sr(1)
42075 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080042076 .n(n)
Zhi An Ng16b734c2022-01-06 13:54:40 -080042077 .k(k)
42078 .cn_stride(11)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080042079 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng16b734c2022-01-06 13:54:40 -080042080 }
42081 }
Zhi An Ng16b734c2022-01-06 13:54:40 -080042082 }
42083
42084 TEST(GENERATE_QC8_GEMM_FP32_4X8C4__AARCH32_NEONDOT_LD64, n_gt_8_strided_a) {
42085 TEST_REQUIRES_ARM_NEON_DOT;
Zhi An Ng16b734c2022-01-06 13:54:40 -080042086 for (uint32_t n = 9; n < 16; n++) {
42087 for (size_t k = 1; k <= 40; k += 9) {
42088 GemmMicrokernelTester()
42089 .mr(4)
42090 .nr(8)
42091 .kr(4)
42092 .sr(1)
42093 .m(4)
42094 .n(n)
42095 .k(k)
42096 .a_stride(43)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080042097 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng16b734c2022-01-06 13:54:40 -080042098 }
42099 }
Zhi An Ng16b734c2022-01-06 13:54:40 -080042100 }
42101
42102 TEST(GENERATE_QC8_GEMM_FP32_4X8C4__AARCH32_NEONDOT_LD64, n_gt_8_subtile) {
42103 TEST_REQUIRES_ARM_NEON_DOT;
Zhi An Ng16b734c2022-01-06 13:54:40 -080042104 for (uint32_t n = 9; n < 16; n++) {
42105 for (size_t k = 1; k <= 40; k += 9) {
42106 for (uint32_t m = 1; m <= 4; m++) {
42107 GemmMicrokernelTester()
42108 .mr(4)
42109 .nr(8)
42110 .kr(4)
42111 .sr(1)
42112 .m(m)
42113 .n(n)
42114 .k(k)
42115 .iterations(1)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080042116 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng16b734c2022-01-06 13:54:40 -080042117 }
42118 }
42119 }
Zhi An Ng16b734c2022-01-06 13:54:40 -080042120 }
42121
42122 TEST(GENERATE_QC8_GEMM_FP32_4X8C4__AARCH32_NEONDOT_LD64, n_div_8) {
42123 TEST_REQUIRES_ARM_NEON_DOT;
Zhi An Ng16b734c2022-01-06 13:54:40 -080042124 for (uint32_t n = 16; n <= 24; n += 8) {
42125 for (size_t k = 1; k <= 40; k += 9) {
42126 GemmMicrokernelTester()
42127 .mr(4)
42128 .nr(8)
42129 .kr(4)
42130 .sr(1)
42131 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080042132 .n(n)
Zhi An Ng16b734c2022-01-06 13:54:40 -080042133 .k(k)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080042134 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng16b734c2022-01-06 13:54:40 -080042135 }
42136 }
Zhi An Ng16b734c2022-01-06 13:54:40 -080042137 }
42138
42139 TEST(GENERATE_QC8_GEMM_FP32_4X8C4__AARCH32_NEONDOT_LD64, n_div_8_strided_cn) {
42140 TEST_REQUIRES_ARM_NEON_DOT;
Zhi An Ng16b734c2022-01-06 13:54:40 -080042141 for (uint32_t n = 16; n <= 24; n += 8) {
42142 for (size_t k = 1; k <= 40; k += 9) {
42143 GemmMicrokernelTester()
42144 .mr(4)
42145 .nr(8)
42146 .kr(4)
42147 .sr(1)
42148 .m(4)
42149 .n(n)
42150 .k(k)
42151 .cn_stride(11)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080042152 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng16b734c2022-01-06 13:54:40 -080042153 }
42154 }
Zhi An Ng16b734c2022-01-06 13:54:40 -080042155 }
42156
42157 TEST(GENERATE_QC8_GEMM_FP32_4X8C4__AARCH32_NEONDOT_LD64, n_div_8_strided_a) {
42158 TEST_REQUIRES_ARM_NEON_DOT;
Zhi An Ng16b734c2022-01-06 13:54:40 -080042159 for (uint32_t n = 16; n <= 24; n += 8) {
42160 for (size_t k = 1; k <= 40; k += 9) {
42161 GemmMicrokernelTester()
42162 .mr(4)
42163 .nr(8)
42164 .kr(4)
42165 .sr(1)
42166 .m(4)
42167 .n(n)
42168 .k(k)
42169 .a_stride(43)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080042170 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng16b734c2022-01-06 13:54:40 -080042171 }
42172 }
Zhi An Ng16b734c2022-01-06 13:54:40 -080042173 }
42174
42175 TEST(GENERATE_QC8_GEMM_FP32_4X8C4__AARCH32_NEONDOT_LD64, n_div_8_subtile) {
42176 TEST_REQUIRES_ARM_NEON_DOT;
Zhi An Ng16b734c2022-01-06 13:54:40 -080042177 for (uint32_t n = 16; n <= 24; n += 8) {
42178 for (size_t k = 1; k <= 40; k += 9) {
42179 for (uint32_t m = 1; m <= 4; m++) {
42180 GemmMicrokernelTester()
42181 .mr(4)
42182 .nr(8)
42183 .kr(4)
42184 .sr(1)
42185 .m(m)
42186 .n(n)
42187 .k(k)
42188 .iterations(1)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080042189 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng16b734c2022-01-06 13:54:40 -080042190 }
42191 }
42192 }
Zhi An Ng16b734c2022-01-06 13:54:40 -080042193 }
42194
42195 TEST(GENERATE_QC8_GEMM_FP32_4X8C4__AARCH32_NEONDOT_LD64, strided_cm_subtile) {
42196 TEST_REQUIRES_ARM_NEON_DOT;
Zhi An Ng16b734c2022-01-06 13:54:40 -080042197 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080042198 for (uint32_t n = 1; n <= 8; n++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080042199 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng16b734c2022-01-06 13:54:40 -080042200 GemmMicrokernelTester()
42201 .mr(4)
42202 .nr(8)
42203 .kr(4)
42204 .sr(1)
42205 .m(m)
42206 .n(n)
42207 .k(k)
42208 .cm_stride(11)
42209 .iterations(1)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080042210 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng16b734c2022-01-06 13:54:40 -080042211 }
42212 }
42213 }
Zhi An Ng16b734c2022-01-06 13:54:40 -080042214 }
42215
42216 TEST(GENERATE_QC8_GEMM_FP32_4X8C4__AARCH32_NEONDOT_LD64, qmin) {
42217 TEST_REQUIRES_ARM_NEON_DOT;
Zhi An Ng16b734c2022-01-06 13:54:40 -080042218 GemmMicrokernelTester()
42219 .mr(4)
42220 .nr(8)
42221 .kr(4)
42222 .sr(1)
42223 .m(4)
42224 .n(8)
42225 .k(8)
42226 .qmin(128)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080042227 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng16b734c2022-01-06 13:54:40 -080042228 }
42229
42230 TEST(GENERATE_QC8_GEMM_FP32_4X8C4__AARCH32_NEONDOT_LD64, qmax) {
42231 TEST_REQUIRES_ARM_NEON_DOT;
Zhi An Ng16b734c2022-01-06 13:54:40 -080042232 GemmMicrokernelTester()
42233 .mr(4)
42234 .nr(8)
42235 .kr(4)
42236 .sr(1)
42237 .m(4)
42238 .n(8)
42239 .k(8)
42240 .qmax(128)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080042241 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng16b734c2022-01-06 13:54:40 -080042242 }
42243
42244 TEST(GENERATE_QC8_GEMM_FP32_4X8C4__AARCH32_NEONDOT_LD64, strided_cm) {
42245 TEST_REQUIRES_ARM_NEON_DOT;
Zhi An Ng16b734c2022-01-06 13:54:40 -080042246 GemmMicrokernelTester()
42247 .mr(4)
42248 .nr(8)
42249 .kr(4)
42250 .sr(1)
42251 .m(4)
42252 .n(8)
42253 .k(8)
42254 .cm_stride(11)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080042255 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng16b734c2022-01-06 13:54:40 -080042256 }
42257#endif // XNN_ARCH_ARM && !XNN_PLATFORM_IOS && XNN_ENABLE_ASSEMBLY && XNN_PLATFORM_JIT
42258
42259
42260#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY && XNN_PLATFORM_JIT
Zhi An Ng0afdfab2022-01-06 17:07:50 -080042261 TEST(GENERATE_QC8_GEMM_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_PRFM_LD64, k_eq_8) {
Zhi An Ng16b734c2022-01-06 13:54:40 -080042262 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng16b734c2022-01-06 13:54:40 -080042263 GemmMicrokernelTester()
42264 .mr(4)
42265 .nr(8)
42266 .kr(1)
42267 .sr(1)
42268 .m(4)
42269 .n(8)
42270 .k(8)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080042271 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng16b734c2022-01-06 13:54:40 -080042272 }
42273
Zhi An Ng0afdfab2022-01-06 17:07:50 -080042274 TEST(GENERATE_QC8_GEMM_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_PRFM_LD64, strided_cn) {
Zhi An Ng16b734c2022-01-06 13:54:40 -080042275 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng16b734c2022-01-06 13:54:40 -080042276 GemmMicrokernelTester()
42277 .mr(4)
42278 .nr(8)
42279 .kr(1)
42280 .sr(1)
42281 .m(4)
42282 .n(8)
42283 .k(8)
42284 .cn_stride(11)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080042285 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng16b734c2022-01-06 13:54:40 -080042286 }
42287
Zhi An Ng0afdfab2022-01-06 17:07:50 -080042288 TEST(GENERATE_QC8_GEMM_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_PRFM_LD64, k_eq_8_strided_a) {
Zhi An Ng16b734c2022-01-06 13:54:40 -080042289 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng0afdfab2022-01-06 17:07:50 -080042290 GemmMicrokernelTester()
42291 .mr(4)
42292 .nr(8)
42293 .kr(1)
42294 .sr(1)
42295 .m(4)
42296 .n(8)
42297 .k(8)
42298 .a_stride(11)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080042299 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng0afdfab2022-01-06 17:07:50 -080042300 }
42301
42302 TEST(GENERATE_QC8_GEMM_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_PRFM_LD64, k_eq_8_subtile) {
42303 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng83844ae2022-01-14 09:52:25 -080042304 for (uint32_t n = 1; n <= 8; n++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080042305 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng16b734c2022-01-06 13:54:40 -080042306 GemmMicrokernelTester()
42307 .mr(4)
42308 .nr(8)
42309 .kr(1)
42310 .sr(1)
42311 .m(m)
42312 .n(n)
42313 .k(8)
42314 .iterations(1)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080042315 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng16b734c2022-01-06 13:54:40 -080042316 }
42317 }
Zhi An Ng16b734c2022-01-06 13:54:40 -080042318 }
42319
Zhi An Ng0afdfab2022-01-06 17:07:50 -080042320 TEST(GENERATE_QC8_GEMM_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_PRFM_LD64, k_eq_8_subtile_m) {
Zhi An Ng16b734c2022-01-06 13:54:40 -080042321 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng16b734c2022-01-06 13:54:40 -080042322 for (uint32_t m = 1; m <= 4; m++) {
42323 GemmMicrokernelTester()
42324 .mr(4)
42325 .nr(8)
42326 .kr(1)
42327 .sr(1)
42328 .m(m)
42329 .n(8)
42330 .k(8)
42331 .iterations(1)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080042332 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng16b734c2022-01-06 13:54:40 -080042333 }
Zhi An Ng16b734c2022-01-06 13:54:40 -080042334 }
42335
Zhi An Ng0afdfab2022-01-06 17:07:50 -080042336 TEST(GENERATE_QC8_GEMM_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_PRFM_LD64, k_eq_8_subtile_n) {
Zhi An Ng16b734c2022-01-06 13:54:40 -080042337 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng16b734c2022-01-06 13:54:40 -080042338 for (uint32_t n = 1; n <= 8; n++) {
42339 GemmMicrokernelTester()
42340 .mr(4)
42341 .nr(8)
42342 .kr(1)
42343 .sr(1)
42344 .m(4)
42345 .n(n)
42346 .k(8)
42347 .iterations(1)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080042348 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng16b734c2022-01-06 13:54:40 -080042349 }
Zhi An Ng16b734c2022-01-06 13:54:40 -080042350 }
42351
Zhi An Ng0afdfab2022-01-06 17:07:50 -080042352 TEST(GENERATE_QC8_GEMM_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_PRFM_LD64, k_lt_8) {
Zhi An Ng16b734c2022-01-06 13:54:40 -080042353 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng16b734c2022-01-06 13:54:40 -080042354 for (size_t k = 1; k < 8; k++) {
42355 GemmMicrokernelTester()
42356 .mr(4)
42357 .nr(8)
42358 .kr(1)
42359 .sr(1)
42360 .m(4)
42361 .n(8)
42362 .k(k)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080042363 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng16b734c2022-01-06 13:54:40 -080042364 }
Zhi An Ng16b734c2022-01-06 13:54:40 -080042365 }
42366
Zhi An Ng0afdfab2022-01-06 17:07:50 -080042367 TEST(GENERATE_QC8_GEMM_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_PRFM_LD64, k_lt_8_strided_a) {
Zhi An Ng16b734c2022-01-06 13:54:40 -080042368 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng0afdfab2022-01-06 17:07:50 -080042369 for (size_t k = 1; k < 8; k++) {
42370 GemmMicrokernelTester()
42371 .mr(4)
42372 .nr(8)
42373 .kr(1)
42374 .sr(1)
42375 .m(4)
42376 .n(8)
42377 .k(k)
42378 .a_stride(11)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080042379 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng0afdfab2022-01-06 17:07:50 -080042380 }
Zhi An Ng0afdfab2022-01-06 17:07:50 -080042381 }
42382
42383 TEST(GENERATE_QC8_GEMM_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_PRFM_LD64, k_lt_8_subtile) {
42384 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng16b734c2022-01-06 13:54:40 -080042385 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080042386 for (uint32_t n = 1; n <= 8; n++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080042387 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng16b734c2022-01-06 13:54:40 -080042388 GemmMicrokernelTester()
42389 .mr(4)
42390 .nr(8)
42391 .kr(1)
42392 .sr(1)
42393 .m(m)
42394 .n(n)
42395 .k(k)
42396 .iterations(1)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080042397 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng16b734c2022-01-06 13:54:40 -080042398 }
42399 }
42400 }
Zhi An Ng16b734c2022-01-06 13:54:40 -080042401 }
42402
Zhi An Ng0afdfab2022-01-06 17:07:50 -080042403 TEST(GENERATE_QC8_GEMM_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_PRFM_LD64, k_gt_8) {
Zhi An Ng16b734c2022-01-06 13:54:40 -080042404 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng16b734c2022-01-06 13:54:40 -080042405 for (size_t k = 9; k < 16; k++) {
42406 GemmMicrokernelTester()
42407 .mr(4)
42408 .nr(8)
42409 .kr(1)
42410 .sr(1)
42411 .m(4)
42412 .n(8)
42413 .k(k)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080042414 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng16b734c2022-01-06 13:54:40 -080042415 }
Zhi An Ng16b734c2022-01-06 13:54:40 -080042416 }
42417
Zhi An Ng0afdfab2022-01-06 17:07:50 -080042418 TEST(GENERATE_QC8_GEMM_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_PRFM_LD64, k_gt_8_strided_a) {
Zhi An Ng16b734c2022-01-06 13:54:40 -080042419 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng0afdfab2022-01-06 17:07:50 -080042420 for (size_t k = 9; k < 16; k++) {
42421 GemmMicrokernelTester()
42422 .mr(4)
42423 .nr(8)
42424 .kr(1)
42425 .sr(1)
42426 .m(4)
42427 .n(8)
42428 .k(k)
42429 .a_stride(19)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080042430 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng0afdfab2022-01-06 17:07:50 -080042431 }
Zhi An Ng0afdfab2022-01-06 17:07:50 -080042432 }
42433
42434 TEST(GENERATE_QC8_GEMM_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_PRFM_LD64, k_gt_8_subtile) {
42435 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng16b734c2022-01-06 13:54:40 -080042436 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080042437 for (uint32_t n = 1; n <= 8; n++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080042438 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng16b734c2022-01-06 13:54:40 -080042439 GemmMicrokernelTester()
42440 .mr(4)
42441 .nr(8)
42442 .kr(1)
42443 .sr(1)
42444 .m(m)
42445 .n(n)
42446 .k(k)
42447 .iterations(1)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080042448 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng16b734c2022-01-06 13:54:40 -080042449 }
42450 }
42451 }
Zhi An Ng16b734c2022-01-06 13:54:40 -080042452 }
42453
Zhi An Ng0afdfab2022-01-06 17:07:50 -080042454 TEST(GENERATE_QC8_GEMM_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_PRFM_LD64, k_div_8) {
Zhi An Ng16b734c2022-01-06 13:54:40 -080042455 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng16b734c2022-01-06 13:54:40 -080042456 for (size_t k = 16; k <= 80; k += 8) {
42457 GemmMicrokernelTester()
42458 .mr(4)
42459 .nr(8)
42460 .kr(1)
42461 .sr(1)
42462 .m(4)
42463 .n(8)
42464 .k(k)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080042465 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng16b734c2022-01-06 13:54:40 -080042466 }
Zhi An Ng16b734c2022-01-06 13:54:40 -080042467 }
42468
Zhi An Ng0afdfab2022-01-06 17:07:50 -080042469 TEST(GENERATE_QC8_GEMM_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_PRFM_LD64, k_div_8_strided_a) {
Zhi An Ng16b734c2022-01-06 13:54:40 -080042470 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng0afdfab2022-01-06 17:07:50 -080042471 for (size_t k = 16; k <= 80; k += 8) {
42472 GemmMicrokernelTester()
42473 .mr(4)
42474 .nr(8)
42475 .kr(1)
42476 .sr(1)
42477 .m(4)
42478 .n(8)
42479 .k(k)
42480 .a_stride(83)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080042481 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng0afdfab2022-01-06 17:07:50 -080042482 }
Zhi An Ng0afdfab2022-01-06 17:07:50 -080042483 }
42484
42485 TEST(GENERATE_QC8_GEMM_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_PRFM_LD64, k_div_8_subtile) {
42486 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng16b734c2022-01-06 13:54:40 -080042487 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080042488 for (uint32_t n = 1; n <= 8; n++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080042489 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng16b734c2022-01-06 13:54:40 -080042490 GemmMicrokernelTester()
42491 .mr(4)
42492 .nr(8)
42493 .kr(1)
42494 .sr(1)
42495 .m(m)
42496 .n(n)
42497 .k(k)
42498 .iterations(1)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080042499 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng16b734c2022-01-06 13:54:40 -080042500 }
42501 }
42502 }
Zhi An Ng16b734c2022-01-06 13:54:40 -080042503 }
42504
Zhi An Ng0afdfab2022-01-06 17:07:50 -080042505 TEST(GENERATE_QC8_GEMM_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_PRFM_LD64, n_gt_8) {
Zhi An Ng16b734c2022-01-06 13:54:40 -080042506 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng16b734c2022-01-06 13:54:40 -080042507 for (uint32_t n = 9; n < 16; n++) {
42508 for (size_t k = 1; k <= 40; k += 9) {
42509 GemmMicrokernelTester()
42510 .mr(4)
42511 .nr(8)
42512 .kr(1)
42513 .sr(1)
42514 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080042515 .n(n)
Zhi An Ng16b734c2022-01-06 13:54:40 -080042516 .k(k)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080042517 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng16b734c2022-01-06 13:54:40 -080042518 }
42519 }
Zhi An Ng16b734c2022-01-06 13:54:40 -080042520 }
42521
Zhi An Ng0afdfab2022-01-06 17:07:50 -080042522 TEST(GENERATE_QC8_GEMM_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_PRFM_LD64, n_gt_8_strided_cn) {
Zhi An Ng16b734c2022-01-06 13:54:40 -080042523 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng16b734c2022-01-06 13:54:40 -080042524 for (uint32_t n = 9; n < 16; n++) {
42525 for (size_t k = 1; k <= 40; k += 9) {
42526 GemmMicrokernelTester()
42527 .mr(4)
42528 .nr(8)
42529 .kr(1)
42530 .sr(1)
42531 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080042532 .n(n)
Zhi An Ng16b734c2022-01-06 13:54:40 -080042533 .k(k)
42534 .cn_stride(11)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080042535 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng16b734c2022-01-06 13:54:40 -080042536 }
42537 }
Zhi An Ng16b734c2022-01-06 13:54:40 -080042538 }
42539
Zhi An Ng0afdfab2022-01-06 17:07:50 -080042540 TEST(GENERATE_QC8_GEMM_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_PRFM_LD64, n_gt_8_strided_a) {
Zhi An Ng16b734c2022-01-06 13:54:40 -080042541 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng0afdfab2022-01-06 17:07:50 -080042542 for (uint32_t n = 9; n < 16; n++) {
42543 for (size_t k = 1; k <= 40; k += 9) {
42544 GemmMicrokernelTester()
42545 .mr(4)
42546 .nr(8)
42547 .kr(1)
42548 .sr(1)
42549 .m(4)
42550 .n(n)
42551 .k(k)
42552 .a_stride(43)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080042553 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng0afdfab2022-01-06 17:07:50 -080042554 }
42555 }
Zhi An Ng0afdfab2022-01-06 17:07:50 -080042556 }
42557
42558 TEST(GENERATE_QC8_GEMM_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_PRFM_LD64, n_gt_8_subtile) {
42559 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng16b734c2022-01-06 13:54:40 -080042560 for (uint32_t n = 9; n < 16; n++) {
42561 for (size_t k = 1; k <= 40; k += 9) {
42562 for (uint32_t m = 1; m <= 4; m++) {
42563 GemmMicrokernelTester()
42564 .mr(4)
42565 .nr(8)
42566 .kr(1)
42567 .sr(1)
42568 .m(m)
42569 .n(n)
42570 .k(k)
42571 .iterations(1)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080042572 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng16b734c2022-01-06 13:54:40 -080042573 }
42574 }
42575 }
Zhi An Ng16b734c2022-01-06 13:54:40 -080042576 }
42577
Zhi An Ng0afdfab2022-01-06 17:07:50 -080042578 TEST(GENERATE_QC8_GEMM_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_PRFM_LD64, n_div_8) {
Zhi An Ng16b734c2022-01-06 13:54:40 -080042579 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng16b734c2022-01-06 13:54:40 -080042580 for (uint32_t n = 16; n <= 24; n += 8) {
42581 for (size_t k = 1; k <= 40; k += 9) {
42582 GemmMicrokernelTester()
42583 .mr(4)
42584 .nr(8)
42585 .kr(1)
42586 .sr(1)
42587 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080042588 .n(n)
Zhi An Ng16b734c2022-01-06 13:54:40 -080042589 .k(k)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080042590 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng16b734c2022-01-06 13:54:40 -080042591 }
42592 }
Zhi An Ng16b734c2022-01-06 13:54:40 -080042593 }
42594
Zhi An Ng0afdfab2022-01-06 17:07:50 -080042595 TEST(GENERATE_QC8_GEMM_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_PRFM_LD64, n_div_8_strided_cn) {
Zhi An Ng16b734c2022-01-06 13:54:40 -080042596 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng16b734c2022-01-06 13:54:40 -080042597 for (uint32_t n = 16; n <= 24; n += 8) {
42598 for (size_t k = 1; k <= 40; k += 9) {
42599 GemmMicrokernelTester()
42600 .mr(4)
42601 .nr(8)
42602 .kr(1)
42603 .sr(1)
42604 .m(4)
42605 .n(n)
42606 .k(k)
42607 .cn_stride(11)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080042608 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng16b734c2022-01-06 13:54:40 -080042609 }
42610 }
Zhi An Ng16b734c2022-01-06 13:54:40 -080042611 }
42612
Zhi An Ng0afdfab2022-01-06 17:07:50 -080042613 TEST(GENERATE_QC8_GEMM_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_PRFM_LD64, n_div_8_strided_a) {
Zhi An Ng16b734c2022-01-06 13:54:40 -080042614 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng0afdfab2022-01-06 17:07:50 -080042615 for (uint32_t n = 16; n <= 24; n += 8) {
42616 for (size_t k = 1; k <= 40; k += 9) {
42617 GemmMicrokernelTester()
42618 .mr(4)
42619 .nr(8)
42620 .kr(1)
42621 .sr(1)
42622 .m(4)
42623 .n(n)
42624 .k(k)
42625 .a_stride(43)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080042626 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng0afdfab2022-01-06 17:07:50 -080042627 }
42628 }
Zhi An Ng0afdfab2022-01-06 17:07:50 -080042629 }
42630
42631 TEST(GENERATE_QC8_GEMM_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_PRFM_LD64, n_div_8_subtile) {
42632 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng16b734c2022-01-06 13:54:40 -080042633 for (uint32_t n = 16; n <= 24; n += 8) {
42634 for (size_t k = 1; k <= 40; k += 9) {
42635 for (uint32_t m = 1; m <= 4; m++) {
42636 GemmMicrokernelTester()
42637 .mr(4)
42638 .nr(8)
42639 .kr(1)
42640 .sr(1)
42641 .m(m)
42642 .n(n)
42643 .k(k)
42644 .iterations(1)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080042645 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng16b734c2022-01-06 13:54:40 -080042646 }
42647 }
42648 }
Zhi An Ng16b734c2022-01-06 13:54:40 -080042649 }
42650
Zhi An Ng0afdfab2022-01-06 17:07:50 -080042651 TEST(GENERATE_QC8_GEMM_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_PRFM_LD64, strided_cm_subtile) {
Zhi An Ng16b734c2022-01-06 13:54:40 -080042652 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng16b734c2022-01-06 13:54:40 -080042653 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080042654 for (uint32_t n = 1; n <= 8; n++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080042655 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng16b734c2022-01-06 13:54:40 -080042656 GemmMicrokernelTester()
42657 .mr(4)
42658 .nr(8)
42659 .kr(1)
42660 .sr(1)
42661 .m(m)
42662 .n(n)
42663 .k(k)
42664 .cm_stride(11)
42665 .iterations(1)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080042666 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng16b734c2022-01-06 13:54:40 -080042667 }
42668 }
42669 }
Zhi An Ng16b734c2022-01-06 13:54:40 -080042670 }
42671
Zhi An Ng0afdfab2022-01-06 17:07:50 -080042672 TEST(GENERATE_QC8_GEMM_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_PRFM_LD64, qmin) {
Zhi An Ng16b734c2022-01-06 13:54:40 -080042673 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng16b734c2022-01-06 13:54:40 -080042674 GemmMicrokernelTester()
42675 .mr(4)
42676 .nr(8)
42677 .kr(1)
42678 .sr(1)
42679 .m(4)
42680 .n(8)
42681 .k(8)
42682 .qmin(128)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080042683 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng16b734c2022-01-06 13:54:40 -080042684 }
42685
Zhi An Ng0afdfab2022-01-06 17:07:50 -080042686 TEST(GENERATE_QC8_GEMM_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_PRFM_LD64, qmax) {
Zhi An Ng16b734c2022-01-06 13:54:40 -080042687 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng16b734c2022-01-06 13:54:40 -080042688 GemmMicrokernelTester()
42689 .mr(4)
42690 .nr(8)
42691 .kr(1)
42692 .sr(1)
42693 .m(4)
42694 .n(8)
42695 .k(8)
42696 .qmax(128)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080042697 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng16b734c2022-01-06 13:54:40 -080042698 }
42699
Zhi An Ng0afdfab2022-01-06 17:07:50 -080042700 TEST(GENERATE_QC8_GEMM_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_PRFM_LD64, strided_cm) {
Zhi An Ng16b734c2022-01-06 13:54:40 -080042701 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng16b734c2022-01-06 13:54:40 -080042702 GemmMicrokernelTester()
42703 .mr(4)
42704 .nr(8)
42705 .kr(1)
42706 .sr(1)
42707 .m(4)
42708 .n(8)
42709 .k(8)
42710 .cm_stride(11)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080042711 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng16b734c2022-01-06 13:54:40 -080042712 }
42713#endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY && XNN_PLATFORM_JIT