blob: a112b720105bee13d5b499e522e2bf849bd0d370 [file] [log] [blame]
// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
//
// Auto-generated file. Do not edit!
//   Specification: test/qc8-gemm-minmax-fp32.yaml
//   Generator: tools/generate-gemm-test.py
12
13
14#include <gtest/gtest.h>
15
16#include <xnnpack/allocator.h>
17#include <xnnpack/common.h>
18#include <xnnpack/isa-checks.h>
19
20#include <xnnpack/gemm.h>
21#include <xnnpack/igemm.h>
22#include <xnnpack/ppmm.h>
23#include "gemm-microkernel-tester.h"
24
25
Frank Barchardac654f12022-01-24 23:51:04 -080026#if XNN_ARCH_ARM && !XNN_PLATFORM_IOS && XNN_ENABLE_ASSEMBLY
27 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__AARCH32_NEONDOT_LD64, k_eq_8) {
28 TEST_REQUIRES_ARM_NEON_DOT;
29 GemmMicrokernelTester()
30 .mr(4)
31 .nr(8)
32 .kr(4)
33 .sr(1)
34 .m(4)
35 .n(8)
36 .k(8)
37 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
38 }
39
40 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__AARCH32_NEONDOT_LD64, strided_cn) {
41 TEST_REQUIRES_ARM_NEON_DOT;
42 GemmMicrokernelTester()
43 .mr(4)
44 .nr(8)
45 .kr(4)
46 .sr(1)
47 .m(4)
48 .n(8)
49 .k(8)
50 .cn_stride(11)
51 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
52 }
53
54 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__AARCH32_NEONDOT_LD64, k_eq_8_strided_a) {
55 TEST_REQUIRES_ARM_NEON_DOT;
56 GemmMicrokernelTester()
57 .mr(4)
58 .nr(8)
59 .kr(4)
60 .sr(1)
61 .m(4)
62 .n(8)
63 .k(8)
64 .a_stride(11)
65 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
66 }
67
68 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__AARCH32_NEONDOT_LD64, k_eq_8_subtile) {
69 TEST_REQUIRES_ARM_NEON_DOT;
70 for (uint32_t n = 1; n <= 8; n++) {
71 for (uint32_t m = 1; m <= 4; m++) {
72 GemmMicrokernelTester()
73 .mr(4)
74 .nr(8)
75 .kr(4)
76 .sr(1)
77 .m(m)
78 .n(n)
79 .k(8)
80 .iterations(1)
81 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
82 }
83 }
84 }
85
86 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__AARCH32_NEONDOT_LD64, k_eq_8_subtile_m) {
87 TEST_REQUIRES_ARM_NEON_DOT;
88 for (uint32_t m = 1; m <= 4; m++) {
89 GemmMicrokernelTester()
90 .mr(4)
91 .nr(8)
92 .kr(4)
93 .sr(1)
94 .m(m)
95 .n(8)
96 .k(8)
97 .iterations(1)
98 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
99 }
100 }
101
102 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__AARCH32_NEONDOT_LD64, k_eq_8_subtile_n) {
103 TEST_REQUIRES_ARM_NEON_DOT;
104 for (uint32_t n = 1; n <= 8; n++) {
105 GemmMicrokernelTester()
106 .mr(4)
107 .nr(8)
108 .kr(4)
109 .sr(1)
110 .m(4)
111 .n(n)
112 .k(8)
113 .iterations(1)
114 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
115 }
116 }
117
118 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__AARCH32_NEONDOT_LD64, k_lt_8) {
119 TEST_REQUIRES_ARM_NEON_DOT;
120 for (size_t k = 1; k < 8; k++) {
121 GemmMicrokernelTester()
122 .mr(4)
123 .nr(8)
124 .kr(4)
125 .sr(1)
126 .m(4)
127 .n(8)
128 .k(k)
129 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
130 }
131 }
132
133 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__AARCH32_NEONDOT_LD64, k_lt_8_strided_a) {
134 TEST_REQUIRES_ARM_NEON_DOT;
135 for (size_t k = 1; k < 8; k++) {
136 GemmMicrokernelTester()
137 .mr(4)
138 .nr(8)
139 .kr(4)
140 .sr(1)
141 .m(4)
142 .n(8)
143 .k(k)
144 .a_stride(11)
145 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
146 }
147 }
148
149 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__AARCH32_NEONDOT_LD64, k_lt_8_subtile) {
150 TEST_REQUIRES_ARM_NEON_DOT;
151 for (size_t k = 1; k < 8; k++) {
152 for (uint32_t n = 1; n <= 8; n++) {
153 for (uint32_t m = 1; m <= 4; m++) {
154 GemmMicrokernelTester()
155 .mr(4)
156 .nr(8)
157 .kr(4)
158 .sr(1)
159 .m(m)
160 .n(n)
161 .k(k)
162 .iterations(1)
163 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
164 }
165 }
166 }
167 }
168
169 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__AARCH32_NEONDOT_LD64, k_gt_8) {
170 TEST_REQUIRES_ARM_NEON_DOT;
171 for (size_t k = 9; k < 16; k++) {
172 GemmMicrokernelTester()
173 .mr(4)
174 .nr(8)
175 .kr(4)
176 .sr(1)
177 .m(4)
178 .n(8)
179 .k(k)
180 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
181 }
182 }
183
184 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__AARCH32_NEONDOT_LD64, k_gt_8_strided_a) {
185 TEST_REQUIRES_ARM_NEON_DOT;
186 for (size_t k = 9; k < 16; k++) {
187 GemmMicrokernelTester()
188 .mr(4)
189 .nr(8)
190 .kr(4)
191 .sr(1)
192 .m(4)
193 .n(8)
194 .k(k)
195 .a_stride(19)
196 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
197 }
198 }
199
200 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__AARCH32_NEONDOT_LD64, k_gt_8_subtile) {
201 TEST_REQUIRES_ARM_NEON_DOT;
202 for (size_t k = 9; k < 16; k++) {
203 for (uint32_t n = 1; n <= 8; n++) {
204 for (uint32_t m = 1; m <= 4; m++) {
205 GemmMicrokernelTester()
206 .mr(4)
207 .nr(8)
208 .kr(4)
209 .sr(1)
210 .m(m)
211 .n(n)
212 .k(k)
213 .iterations(1)
214 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
215 }
216 }
217 }
218 }
219
220 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__AARCH32_NEONDOT_LD64, k_div_8) {
221 TEST_REQUIRES_ARM_NEON_DOT;
222 for (size_t k = 16; k <= 80; k += 8) {
223 GemmMicrokernelTester()
224 .mr(4)
225 .nr(8)
226 .kr(4)
227 .sr(1)
228 .m(4)
229 .n(8)
230 .k(k)
231 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
232 }
233 }
234
235 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__AARCH32_NEONDOT_LD64, k_div_8_strided_a) {
236 TEST_REQUIRES_ARM_NEON_DOT;
237 for (size_t k = 16; k <= 80; k += 8) {
238 GemmMicrokernelTester()
239 .mr(4)
240 .nr(8)
241 .kr(4)
242 .sr(1)
243 .m(4)
244 .n(8)
245 .k(k)
246 .a_stride(83)
247 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
248 }
249 }
250
251 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__AARCH32_NEONDOT_LD64, k_div_8_subtile) {
252 TEST_REQUIRES_ARM_NEON_DOT;
253 for (size_t k = 16; k <= 80; k += 8) {
254 for (uint32_t n = 1; n <= 8; n++) {
255 for (uint32_t m = 1; m <= 4; m++) {
256 GemmMicrokernelTester()
257 .mr(4)
258 .nr(8)
259 .kr(4)
260 .sr(1)
261 .m(m)
262 .n(n)
263 .k(k)
264 .iterations(1)
265 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
266 }
267 }
268 }
269 }
270
271 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__AARCH32_NEONDOT_LD64, n_gt_8) {
272 TEST_REQUIRES_ARM_NEON_DOT;
273 for (uint32_t n = 9; n < 16; n++) {
274 for (size_t k = 1; k <= 40; k += 9) {
275 GemmMicrokernelTester()
276 .mr(4)
277 .nr(8)
278 .kr(4)
279 .sr(1)
280 .m(4)
281 .n(n)
282 .k(k)
283 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
284 }
285 }
286 }
287
288 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__AARCH32_NEONDOT_LD64, n_gt_8_strided_cn) {
289 TEST_REQUIRES_ARM_NEON_DOT;
290 for (uint32_t n = 9; n < 16; n++) {
291 for (size_t k = 1; k <= 40; k += 9) {
292 GemmMicrokernelTester()
293 .mr(4)
294 .nr(8)
295 .kr(4)
296 .sr(1)
297 .m(4)
298 .n(n)
299 .k(k)
300 .cn_stride(11)
301 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
302 }
303 }
304 }
305
306 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__AARCH32_NEONDOT_LD64, n_gt_8_strided_a) {
307 TEST_REQUIRES_ARM_NEON_DOT;
308 for (uint32_t n = 9; n < 16; n++) {
309 for (size_t k = 1; k <= 40; k += 9) {
310 GemmMicrokernelTester()
311 .mr(4)
312 .nr(8)
313 .kr(4)
314 .sr(1)
315 .m(4)
316 .n(n)
317 .k(k)
318 .a_stride(43)
319 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
320 }
321 }
322 }
323
324 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__AARCH32_NEONDOT_LD64, n_gt_8_subtile) {
325 TEST_REQUIRES_ARM_NEON_DOT;
326 for (uint32_t n = 9; n < 16; n++) {
327 for (size_t k = 1; k <= 40; k += 9) {
328 for (uint32_t m = 1; m <= 4; m++) {
329 GemmMicrokernelTester()
330 .mr(4)
331 .nr(8)
332 .kr(4)
333 .sr(1)
334 .m(m)
335 .n(n)
336 .k(k)
337 .iterations(1)
338 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
339 }
340 }
341 }
342 }
343
344 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__AARCH32_NEONDOT_LD64, n_div_8) {
345 TEST_REQUIRES_ARM_NEON_DOT;
346 for (uint32_t n = 16; n <= 24; n += 8) {
347 for (size_t k = 1; k <= 40; k += 9) {
348 GemmMicrokernelTester()
349 .mr(4)
350 .nr(8)
351 .kr(4)
352 .sr(1)
353 .m(4)
354 .n(n)
355 .k(k)
356 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
357 }
358 }
359 }
360
361 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__AARCH32_NEONDOT_LD64, n_div_8_strided_cn) {
362 TEST_REQUIRES_ARM_NEON_DOT;
363 for (uint32_t n = 16; n <= 24; n += 8) {
364 for (size_t k = 1; k <= 40; k += 9) {
365 GemmMicrokernelTester()
366 .mr(4)
367 .nr(8)
368 .kr(4)
369 .sr(1)
370 .m(4)
371 .n(n)
372 .k(k)
373 .cn_stride(11)
374 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
375 }
376 }
377 }
378
379 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__AARCH32_NEONDOT_LD64, n_div_8_strided_a) {
380 TEST_REQUIRES_ARM_NEON_DOT;
381 for (uint32_t n = 16; n <= 24; n += 8) {
382 for (size_t k = 1; k <= 40; k += 9) {
383 GemmMicrokernelTester()
384 .mr(4)
385 .nr(8)
386 .kr(4)
387 .sr(1)
388 .m(4)
389 .n(n)
390 .k(k)
391 .a_stride(43)
392 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
393 }
394 }
395 }
396
397 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__AARCH32_NEONDOT_LD64, n_div_8_subtile) {
398 TEST_REQUIRES_ARM_NEON_DOT;
399 for (uint32_t n = 16; n <= 24; n += 8) {
400 for (size_t k = 1; k <= 40; k += 9) {
401 for (uint32_t m = 1; m <= 4; m++) {
402 GemmMicrokernelTester()
403 .mr(4)
404 .nr(8)
405 .kr(4)
406 .sr(1)
407 .m(m)
408 .n(n)
409 .k(k)
410 .iterations(1)
411 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
412 }
413 }
414 }
415 }
416
417 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__AARCH32_NEONDOT_LD64, strided_cm_subtile) {
418 TEST_REQUIRES_ARM_NEON_DOT;
419 for (size_t k = 1; k <= 40; k += 9) {
420 for (uint32_t n = 1; n <= 8; n++) {
421 for (uint32_t m = 1; m <= 4; m++) {
422 GemmMicrokernelTester()
423 .mr(4)
424 .nr(8)
425 .kr(4)
426 .sr(1)
427 .m(m)
428 .n(n)
429 .k(k)
430 .cm_stride(11)
431 .iterations(1)
432 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
433 }
434 }
435 }
436 }
437
438 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__AARCH32_NEONDOT_LD64, qmin) {
439 TEST_REQUIRES_ARM_NEON_DOT;
440 GemmMicrokernelTester()
441 .mr(4)
442 .nr(8)
443 .kr(4)
444 .sr(1)
445 .m(4)
446 .n(8)
447 .k(8)
448 .qmin(128)
449 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
450 }
451
452 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__AARCH32_NEONDOT_LD64, qmax) {
453 TEST_REQUIRES_ARM_NEON_DOT;
454 GemmMicrokernelTester()
455 .mr(4)
456 .nr(8)
457 .kr(4)
458 .sr(1)
459 .m(4)
460 .n(8)
461 .k(8)
462 .qmax(128)
463 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
464 }
465
466 TEST(QC8_GEMM_MINMAX_FP32_4X8C4__AARCH32_NEONDOT_LD64, strided_cm) {
467 TEST_REQUIRES_ARM_NEON_DOT;
468 GemmMicrokernelTester()
469 .mr(4)
470 .nr(8)
471 .kr(4)
472 .sr(1)
473 .m(4)
474 .n(8)
475 .k(8)
476 .cm_stride(11)
477 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
478 }
479#endif // XNN_ARCH_ARM && !XNN_PLATFORM_IOS && XNN_ENABLE_ASSEMBLY
480
481
Frank Barchard5e1a3032022-01-14 13:12:41 -0800482#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
Frank Barchard9e4d2aa2022-02-02 00:31:21 -0800483 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_CORTEX_A53, k_eq_8) {
484 TEST_REQUIRES_ARM_NEON_V8;
485 GemmMicrokernelTester()
486 .mr(4)
487 .nr(8)
488 .kr(1)
489 .sr(1)
490 .m(4)
491 .n(8)
492 .k(8)
493 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
494 }
495
496 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_CORTEX_A53, strided_cn) {
497 TEST_REQUIRES_ARM_NEON_V8;
498 GemmMicrokernelTester()
499 .mr(4)
500 .nr(8)
501 .kr(1)
502 .sr(1)
503 .m(4)
504 .n(8)
505 .k(8)
506 .cn_stride(11)
507 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
508 }
509
510 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_CORTEX_A53, k_eq_8_strided_a) {
511 TEST_REQUIRES_ARM_NEON_V8;
512 GemmMicrokernelTester()
513 .mr(4)
514 .nr(8)
515 .kr(1)
516 .sr(1)
517 .m(4)
518 .n(8)
519 .k(8)
520 .a_stride(11)
521 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
522 }
523
524 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_CORTEX_A53, k_eq_8_subtile) {
525 TEST_REQUIRES_ARM_NEON_V8;
526 for (uint32_t n = 1; n <= 8; n++) {
527 for (uint32_t m = 1; m <= 4; m++) {
528 GemmMicrokernelTester()
529 .mr(4)
530 .nr(8)
531 .kr(1)
532 .sr(1)
533 .m(m)
534 .n(n)
535 .k(8)
536 .iterations(1)
537 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
538 }
539 }
540 }
541
542 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_CORTEX_A53, k_eq_8_subtile_m) {
543 TEST_REQUIRES_ARM_NEON_V8;
544 for (uint32_t m = 1; m <= 4; m++) {
545 GemmMicrokernelTester()
546 .mr(4)
547 .nr(8)
548 .kr(1)
549 .sr(1)
550 .m(m)
551 .n(8)
552 .k(8)
553 .iterations(1)
554 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
555 }
556 }
557
558 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_CORTEX_A53, k_eq_8_subtile_n) {
559 TEST_REQUIRES_ARM_NEON_V8;
560 for (uint32_t n = 1; n <= 8; n++) {
561 GemmMicrokernelTester()
562 .mr(4)
563 .nr(8)
564 .kr(1)
565 .sr(1)
566 .m(4)
567 .n(n)
568 .k(8)
569 .iterations(1)
570 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
571 }
572 }
573
574 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_CORTEX_A53, k_lt_8) {
575 TEST_REQUIRES_ARM_NEON_V8;
576 for (size_t k = 1; k < 8; k++) {
577 GemmMicrokernelTester()
578 .mr(4)
579 .nr(8)
580 .kr(1)
581 .sr(1)
582 .m(4)
583 .n(8)
584 .k(k)
585 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
586 }
587 }
588
589 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_CORTEX_A53, k_lt_8_strided_a) {
590 TEST_REQUIRES_ARM_NEON_V8;
591 for (size_t k = 1; k < 8; k++) {
592 GemmMicrokernelTester()
593 .mr(4)
594 .nr(8)
595 .kr(1)
596 .sr(1)
597 .m(4)
598 .n(8)
599 .k(k)
600 .a_stride(11)
601 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
602 }
603 }
604
605 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_CORTEX_A53, k_lt_8_subtile) {
606 TEST_REQUIRES_ARM_NEON_V8;
607 for (size_t k = 1; k < 8; k++) {
608 for (uint32_t n = 1; n <= 8; n++) {
609 for (uint32_t m = 1; m <= 4; m++) {
610 GemmMicrokernelTester()
611 .mr(4)
612 .nr(8)
613 .kr(1)
614 .sr(1)
615 .m(m)
616 .n(n)
617 .k(k)
618 .iterations(1)
619 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
620 }
621 }
622 }
623 }
624
625 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_CORTEX_A53, k_gt_8) {
626 TEST_REQUIRES_ARM_NEON_V8;
627 for (size_t k = 9; k < 16; k++) {
628 GemmMicrokernelTester()
629 .mr(4)
630 .nr(8)
631 .kr(1)
632 .sr(1)
633 .m(4)
634 .n(8)
635 .k(k)
636 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
637 }
638 }
639
640 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_CORTEX_A53, k_gt_8_strided_a) {
641 TEST_REQUIRES_ARM_NEON_V8;
642 for (size_t k = 9; k < 16; k++) {
643 GemmMicrokernelTester()
644 .mr(4)
645 .nr(8)
646 .kr(1)
647 .sr(1)
648 .m(4)
649 .n(8)
650 .k(k)
651 .a_stride(19)
652 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
653 }
654 }
655
656 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_CORTEX_A53, k_gt_8_subtile) {
657 TEST_REQUIRES_ARM_NEON_V8;
658 for (size_t k = 9; k < 16; k++) {
659 for (uint32_t n = 1; n <= 8; n++) {
660 for (uint32_t m = 1; m <= 4; m++) {
661 GemmMicrokernelTester()
662 .mr(4)
663 .nr(8)
664 .kr(1)
665 .sr(1)
666 .m(m)
667 .n(n)
668 .k(k)
669 .iterations(1)
670 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
671 }
672 }
673 }
674 }
675
676 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_CORTEX_A53, k_div_8) {
677 TEST_REQUIRES_ARM_NEON_V8;
678 for (size_t k = 16; k <= 80; k += 8) {
679 GemmMicrokernelTester()
680 .mr(4)
681 .nr(8)
682 .kr(1)
683 .sr(1)
684 .m(4)
685 .n(8)
686 .k(k)
687 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
688 }
689 }
690
691 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_CORTEX_A53, k_div_8_strided_a) {
692 TEST_REQUIRES_ARM_NEON_V8;
693 for (size_t k = 16; k <= 80; k += 8) {
694 GemmMicrokernelTester()
695 .mr(4)
696 .nr(8)
697 .kr(1)
698 .sr(1)
699 .m(4)
700 .n(8)
701 .k(k)
702 .a_stride(83)
703 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
704 }
705 }
706
707 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_CORTEX_A53, k_div_8_subtile) {
708 TEST_REQUIRES_ARM_NEON_V8;
709 for (size_t k = 16; k <= 80; k += 8) {
710 for (uint32_t n = 1; n <= 8; n++) {
711 for (uint32_t m = 1; m <= 4; m++) {
712 GemmMicrokernelTester()
713 .mr(4)
714 .nr(8)
715 .kr(1)
716 .sr(1)
717 .m(m)
718 .n(n)
719 .k(k)
720 .iterations(1)
721 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
722 }
723 }
724 }
725 }
726
727 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_CORTEX_A53, n_gt_8) {
728 TEST_REQUIRES_ARM_NEON_V8;
729 for (uint32_t n = 9; n < 16; n++) {
730 for (size_t k = 1; k <= 40; k += 9) {
731 GemmMicrokernelTester()
732 .mr(4)
733 .nr(8)
734 .kr(1)
735 .sr(1)
736 .m(4)
737 .n(n)
738 .k(k)
739 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
740 }
741 }
742 }
743
744 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_CORTEX_A53, n_gt_8_strided_cn) {
745 TEST_REQUIRES_ARM_NEON_V8;
746 for (uint32_t n = 9; n < 16; n++) {
747 for (size_t k = 1; k <= 40; k += 9) {
748 GemmMicrokernelTester()
749 .mr(4)
750 .nr(8)
751 .kr(1)
752 .sr(1)
753 .m(4)
754 .n(n)
755 .k(k)
756 .cn_stride(11)
757 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
758 }
759 }
760 }
761
762 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_CORTEX_A53, n_gt_8_strided_a) {
763 TEST_REQUIRES_ARM_NEON_V8;
764 for (uint32_t n = 9; n < 16; n++) {
765 for (size_t k = 1; k <= 40; k += 9) {
766 GemmMicrokernelTester()
767 .mr(4)
768 .nr(8)
769 .kr(1)
770 .sr(1)
771 .m(4)
772 .n(n)
773 .k(k)
774 .a_stride(43)
775 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
776 }
777 }
778 }
779
780 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_CORTEX_A53, n_gt_8_subtile) {
781 TEST_REQUIRES_ARM_NEON_V8;
782 for (uint32_t n = 9; n < 16; n++) {
783 for (size_t k = 1; k <= 40; k += 9) {
784 for (uint32_t m = 1; m <= 4; m++) {
785 GemmMicrokernelTester()
786 .mr(4)
787 .nr(8)
788 .kr(1)
789 .sr(1)
790 .m(m)
791 .n(n)
792 .k(k)
793 .iterations(1)
794 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
795 }
796 }
797 }
798 }
799
800 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_CORTEX_A53, n_div_8) {
801 TEST_REQUIRES_ARM_NEON_V8;
802 for (uint32_t n = 16; n <= 24; n += 8) {
803 for (size_t k = 1; k <= 40; k += 9) {
804 GemmMicrokernelTester()
805 .mr(4)
806 .nr(8)
807 .kr(1)
808 .sr(1)
809 .m(4)
810 .n(n)
811 .k(k)
812 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
813 }
814 }
815 }
816
817 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_CORTEX_A53, n_div_8_strided_cn) {
818 TEST_REQUIRES_ARM_NEON_V8;
819 for (uint32_t n = 16; n <= 24; n += 8) {
820 for (size_t k = 1; k <= 40; k += 9) {
821 GemmMicrokernelTester()
822 .mr(4)
823 .nr(8)
824 .kr(1)
825 .sr(1)
826 .m(4)
827 .n(n)
828 .k(k)
829 .cn_stride(11)
830 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
831 }
832 }
833 }
834
835 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_CORTEX_A53, n_div_8_strided_a) {
836 TEST_REQUIRES_ARM_NEON_V8;
837 for (uint32_t n = 16; n <= 24; n += 8) {
838 for (size_t k = 1; k <= 40; k += 9) {
839 GemmMicrokernelTester()
840 .mr(4)
841 .nr(8)
842 .kr(1)
843 .sr(1)
844 .m(4)
845 .n(n)
846 .k(k)
847 .a_stride(43)
848 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
849 }
850 }
851 }
852
853 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_CORTEX_A53, n_div_8_subtile) {
854 TEST_REQUIRES_ARM_NEON_V8;
855 for (uint32_t n = 16; n <= 24; n += 8) {
856 for (size_t k = 1; k <= 40; k += 9) {
857 for (uint32_t m = 1; m <= 4; m++) {
858 GemmMicrokernelTester()
859 .mr(4)
860 .nr(8)
861 .kr(1)
862 .sr(1)
863 .m(m)
864 .n(n)
865 .k(k)
866 .iterations(1)
867 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
868 }
869 }
870 }
871 }
872
873 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_CORTEX_A53, strided_cm_subtile) {
874 TEST_REQUIRES_ARM_NEON_V8;
875 for (size_t k = 1; k <= 40; k += 9) {
876 for (uint32_t n = 1; n <= 8; n++) {
877 for (uint32_t m = 1; m <= 4; m++) {
878 GemmMicrokernelTester()
879 .mr(4)
880 .nr(8)
881 .kr(1)
882 .sr(1)
883 .m(m)
884 .n(n)
885 .k(k)
886 .cm_stride(11)
887 .iterations(1)
888 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
889 }
890 }
891 }
892 }
893
894 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_CORTEX_A53, qmin) {
895 TEST_REQUIRES_ARM_NEON_V8;
896 GemmMicrokernelTester()
897 .mr(4)
898 .nr(8)
899 .kr(1)
900 .sr(1)
901 .m(4)
902 .n(8)
903 .k(8)
904 .qmin(128)
905 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
906 }
907
908 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_CORTEX_A53, qmax) {
909 TEST_REQUIRES_ARM_NEON_V8;
910 GemmMicrokernelTester()
911 .mr(4)
912 .nr(8)
913 .kr(1)
914 .sr(1)
915 .m(4)
916 .n(8)
917 .k(8)
918 .qmax(128)
919 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
920 }
921
922 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_CORTEX_A53, strided_cm) {
923 TEST_REQUIRES_ARM_NEON_V8;
924 GemmMicrokernelTester()
925 .mr(4)
926 .nr(8)
927 .kr(1)
928 .sr(1)
929 .m(4)
930 .n(8)
931 .k(8)
932 .cm_stride(11)
933 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
934 }
935#endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
936
937
938#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
Frank Barchard34251d82022-02-02 11:57:11 -0800939 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7, k_eq_8) {
940 TEST_REQUIRES_ARM_NEON;
941 GemmMicrokernelTester()
942 .mr(4)
943 .nr(8)
944 .kr(1)
945 .sr(1)
946 .m(4)
947 .n(8)
948 .k(8)
949 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
950 }
951
952 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7, strided_cn) {
953 TEST_REQUIRES_ARM_NEON;
954 GemmMicrokernelTester()
955 .mr(4)
956 .nr(8)
957 .kr(1)
958 .sr(1)
959 .m(4)
960 .n(8)
961 .k(8)
962 .cn_stride(11)
963 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
964 }
965
966 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7, k_eq_8_strided_a) {
967 TEST_REQUIRES_ARM_NEON;
968 GemmMicrokernelTester()
969 .mr(4)
970 .nr(8)
971 .kr(1)
972 .sr(1)
973 .m(4)
974 .n(8)
975 .k(8)
976 .a_stride(11)
977 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
978 }
979
980 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7, k_eq_8_subtile) {
981 TEST_REQUIRES_ARM_NEON;
982 for (uint32_t n = 1; n <= 8; n++) {
983 for (uint32_t m = 1; m <= 4; m++) {
984 GemmMicrokernelTester()
985 .mr(4)
986 .nr(8)
987 .kr(1)
988 .sr(1)
989 .m(m)
990 .n(n)
991 .k(8)
992 .iterations(1)
993 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
994 }
995 }
996 }
997
998 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7, k_eq_8_subtile_m) {
999 TEST_REQUIRES_ARM_NEON;
1000 for (uint32_t m = 1; m <= 4; m++) {
1001 GemmMicrokernelTester()
1002 .mr(4)
1003 .nr(8)
1004 .kr(1)
1005 .sr(1)
1006 .m(m)
1007 .n(8)
1008 .k(8)
1009 .iterations(1)
1010 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
1011 }
1012 }
1013
1014 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7, k_eq_8_subtile_n) {
1015 TEST_REQUIRES_ARM_NEON;
1016 for (uint32_t n = 1; n <= 8; n++) {
1017 GemmMicrokernelTester()
1018 .mr(4)
1019 .nr(8)
1020 .kr(1)
1021 .sr(1)
1022 .m(4)
1023 .n(n)
1024 .k(8)
1025 .iterations(1)
1026 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
1027 }
1028 }
1029
1030 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7, k_lt_8) {
1031 TEST_REQUIRES_ARM_NEON;
1032 for (size_t k = 1; k < 8; k++) {
1033 GemmMicrokernelTester()
1034 .mr(4)
1035 .nr(8)
1036 .kr(1)
1037 .sr(1)
1038 .m(4)
1039 .n(8)
1040 .k(k)
1041 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
1042 }
1043 }
1044
1045 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7, k_lt_8_strided_a) {
1046 TEST_REQUIRES_ARM_NEON;
1047 for (size_t k = 1; k < 8; k++) {
1048 GemmMicrokernelTester()
1049 .mr(4)
1050 .nr(8)
1051 .kr(1)
1052 .sr(1)
1053 .m(4)
1054 .n(8)
1055 .k(k)
1056 .a_stride(11)
1057 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
1058 }
1059 }
1060
1061 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7, k_lt_8_subtile) {
1062 TEST_REQUIRES_ARM_NEON;
1063 for (size_t k = 1; k < 8; k++) {
1064 for (uint32_t n = 1; n <= 8; n++) {
1065 for (uint32_t m = 1; m <= 4; m++) {
1066 GemmMicrokernelTester()
1067 .mr(4)
1068 .nr(8)
1069 .kr(1)
1070 .sr(1)
1071 .m(m)
1072 .n(n)
1073 .k(k)
1074 .iterations(1)
1075 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
1076 }
1077 }
1078 }
1079 }
1080
1081 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7, k_gt_8) {
1082 TEST_REQUIRES_ARM_NEON;
1083 for (size_t k = 9; k < 16; k++) {
1084 GemmMicrokernelTester()
1085 .mr(4)
1086 .nr(8)
1087 .kr(1)
1088 .sr(1)
1089 .m(4)
1090 .n(8)
1091 .k(k)
1092 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
1093 }
1094 }
1095
1096 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7, k_gt_8_strided_a) {
1097 TEST_REQUIRES_ARM_NEON;
1098 for (size_t k = 9; k < 16; k++) {
1099 GemmMicrokernelTester()
1100 .mr(4)
1101 .nr(8)
1102 .kr(1)
1103 .sr(1)
1104 .m(4)
1105 .n(8)
1106 .k(k)
1107 .a_stride(19)
1108 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
1109 }
1110 }
1111
1112 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7, k_gt_8_subtile) {
1113 TEST_REQUIRES_ARM_NEON;
1114 for (size_t k = 9; k < 16; k++) {
1115 for (uint32_t n = 1; n <= 8; n++) {
1116 for (uint32_t m = 1; m <= 4; m++) {
1117 GemmMicrokernelTester()
1118 .mr(4)
1119 .nr(8)
1120 .kr(1)
1121 .sr(1)
1122 .m(m)
1123 .n(n)
1124 .k(k)
1125 .iterations(1)
1126 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
1127 }
1128 }
1129 }
1130 }
1131
1132 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7, k_div_8) {
1133 TEST_REQUIRES_ARM_NEON;
1134 for (size_t k = 16; k <= 80; k += 8) {
1135 GemmMicrokernelTester()
1136 .mr(4)
1137 .nr(8)
1138 .kr(1)
1139 .sr(1)
1140 .m(4)
1141 .n(8)
1142 .k(k)
1143 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
1144 }
1145 }
1146
1147 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7, k_div_8_strided_a) {
1148 TEST_REQUIRES_ARM_NEON;
1149 for (size_t k = 16; k <= 80; k += 8) {
1150 GemmMicrokernelTester()
1151 .mr(4)
1152 .nr(8)
1153 .kr(1)
1154 .sr(1)
1155 .m(4)
1156 .n(8)
1157 .k(k)
1158 .a_stride(83)
1159 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
1160 }
1161 }
1162
1163 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7, k_div_8_subtile) {
1164 TEST_REQUIRES_ARM_NEON;
1165 for (size_t k = 16; k <= 80; k += 8) {
1166 for (uint32_t n = 1; n <= 8; n++) {
1167 for (uint32_t m = 1; m <= 4; m++) {
1168 GemmMicrokernelTester()
1169 .mr(4)
1170 .nr(8)
1171 .kr(1)
1172 .sr(1)
1173 .m(m)
1174 .n(n)
1175 .k(k)
1176 .iterations(1)
1177 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
1178 }
1179 }
1180 }
1181 }
1182
1183 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7, n_gt_8) {
1184 TEST_REQUIRES_ARM_NEON;
1185 for (uint32_t n = 9; n < 16; n++) {
1186 for (size_t k = 1; k <= 40; k += 9) {
1187 GemmMicrokernelTester()
1188 .mr(4)
1189 .nr(8)
1190 .kr(1)
1191 .sr(1)
1192 .m(4)
1193 .n(n)
1194 .k(k)
1195 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
1196 }
1197 }
1198 }
1199
1200 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7, n_gt_8_strided_cn) {
1201 TEST_REQUIRES_ARM_NEON;
1202 for (uint32_t n = 9; n < 16; n++) {
1203 for (size_t k = 1; k <= 40; k += 9) {
1204 GemmMicrokernelTester()
1205 .mr(4)
1206 .nr(8)
1207 .kr(1)
1208 .sr(1)
1209 .m(4)
1210 .n(n)
1211 .k(k)
1212 .cn_stride(11)
1213 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
1214 }
1215 }
1216 }
1217
1218 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7, n_gt_8_strided_a) {
1219 TEST_REQUIRES_ARM_NEON;
1220 for (uint32_t n = 9; n < 16; n++) {
1221 for (size_t k = 1; k <= 40; k += 9) {
1222 GemmMicrokernelTester()
1223 .mr(4)
1224 .nr(8)
1225 .kr(1)
1226 .sr(1)
1227 .m(4)
1228 .n(n)
1229 .k(k)
1230 .a_stride(43)
1231 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
1232 }
1233 }
1234 }
1235
1236 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7, n_gt_8_subtile) {
1237 TEST_REQUIRES_ARM_NEON;
1238 for (uint32_t n = 9; n < 16; n++) {
1239 for (size_t k = 1; k <= 40; k += 9) {
1240 for (uint32_t m = 1; m <= 4; m++) {
1241 GemmMicrokernelTester()
1242 .mr(4)
1243 .nr(8)
1244 .kr(1)
1245 .sr(1)
1246 .m(m)
1247 .n(n)
1248 .k(k)
1249 .iterations(1)
1250 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
1251 }
1252 }
1253 }
1254 }
1255
1256 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7, n_div_8) {
1257 TEST_REQUIRES_ARM_NEON;
1258 for (uint32_t n = 16; n <= 24; n += 8) {
1259 for (size_t k = 1; k <= 40; k += 9) {
1260 GemmMicrokernelTester()
1261 .mr(4)
1262 .nr(8)
1263 .kr(1)
1264 .sr(1)
1265 .m(4)
1266 .n(n)
1267 .k(k)
1268 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
1269 }
1270 }
1271 }
1272
1273 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7, n_div_8_strided_cn) {
1274 TEST_REQUIRES_ARM_NEON;
1275 for (uint32_t n = 16; n <= 24; n += 8) {
1276 for (size_t k = 1; k <= 40; k += 9) {
1277 GemmMicrokernelTester()
1278 .mr(4)
1279 .nr(8)
1280 .kr(1)
1281 .sr(1)
1282 .m(4)
1283 .n(n)
1284 .k(k)
1285 .cn_stride(11)
1286 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
1287 }
1288 }
1289 }
1290
1291 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7, n_div_8_strided_a) {
1292 TEST_REQUIRES_ARM_NEON;
1293 for (uint32_t n = 16; n <= 24; n += 8) {
1294 for (size_t k = 1; k <= 40; k += 9) {
1295 GemmMicrokernelTester()
1296 .mr(4)
1297 .nr(8)
1298 .kr(1)
1299 .sr(1)
1300 .m(4)
1301 .n(n)
1302 .k(k)
1303 .a_stride(43)
1304 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
1305 }
1306 }
1307 }
1308
1309 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7, n_div_8_subtile) {
1310 TEST_REQUIRES_ARM_NEON;
1311 for (uint32_t n = 16; n <= 24; n += 8) {
1312 for (size_t k = 1; k <= 40; k += 9) {
1313 for (uint32_t m = 1; m <= 4; m++) {
1314 GemmMicrokernelTester()
1315 .mr(4)
1316 .nr(8)
1317 .kr(1)
1318 .sr(1)
1319 .m(m)
1320 .n(n)
1321 .k(k)
1322 .iterations(1)
1323 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
1324 }
1325 }
1326 }
1327 }
1328
1329 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7, strided_cm_subtile) {
1330 TEST_REQUIRES_ARM_NEON;
1331 for (size_t k = 1; k <= 40; k += 9) {
1332 for (uint32_t n = 1; n <= 8; n++) {
1333 for (uint32_t m = 1; m <= 4; m++) {
1334 GemmMicrokernelTester()
1335 .mr(4)
1336 .nr(8)
1337 .kr(1)
1338 .sr(1)
1339 .m(m)
1340 .n(n)
1341 .k(k)
1342 .cm_stride(11)
1343 .iterations(1)
1344 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
1345 }
1346 }
1347 }
1348 }
1349
1350 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7, qmin) {
1351 TEST_REQUIRES_ARM_NEON;
1352 GemmMicrokernelTester()
1353 .mr(4)
1354 .nr(8)
1355 .kr(1)
1356 .sr(1)
1357 .m(4)
1358 .n(8)
1359 .k(8)
1360 .qmin(128)
1361 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
1362 }
1363
1364 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7, qmax) {
1365 TEST_REQUIRES_ARM_NEON;
1366 GemmMicrokernelTester()
1367 .mr(4)
1368 .nr(8)
1369 .kr(1)
1370 .sr(1)
1371 .m(4)
1372 .n(8)
1373 .k(8)
1374 .qmax(128)
1375 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
1376 }
1377
1378 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7, strided_cm) {
1379 TEST_REQUIRES_ARM_NEON;
1380 GemmMicrokernelTester()
1381 .mr(4)
1382 .nr(8)
1383 .kr(1)
1384 .sr(1)
1385 .m(4)
1386 .n(8)
1387 .k(8)
1388 .cm_stride(11)
1389 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
1390 }
1391#endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
1392
1393
1394#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
Frank Barchard5e1a3032022-01-14 13:12:41 -08001395 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_PRFM_LD64, k_eq_8) {
1396 TEST_REQUIRES_ARM_NEON_V8;
1397 GemmMicrokernelTester()
1398 .mr(4)
1399 .nr(8)
1400 .kr(1)
1401 .sr(1)
1402 .m(4)
1403 .n(8)
1404 .k(8)
1405 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
1406 }
1407
1408 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_PRFM_LD64, strided_cn) {
1409 TEST_REQUIRES_ARM_NEON_V8;
1410 GemmMicrokernelTester()
1411 .mr(4)
1412 .nr(8)
1413 .kr(1)
1414 .sr(1)
1415 .m(4)
1416 .n(8)
1417 .k(8)
1418 .cn_stride(11)
1419 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
1420 }
1421
1422 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_PRFM_LD64, k_eq_8_strided_a) {
1423 TEST_REQUIRES_ARM_NEON_V8;
1424 GemmMicrokernelTester()
1425 .mr(4)
1426 .nr(8)
1427 .kr(1)
1428 .sr(1)
1429 .m(4)
1430 .n(8)
1431 .k(8)
1432 .a_stride(11)
1433 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
1434 }
1435
1436 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_PRFM_LD64, k_eq_8_subtile) {
1437 TEST_REQUIRES_ARM_NEON_V8;
1438 for (uint32_t n = 1; n <= 8; n++) {
1439 for (uint32_t m = 1; m <= 4; m++) {
1440 GemmMicrokernelTester()
1441 .mr(4)
1442 .nr(8)
1443 .kr(1)
1444 .sr(1)
1445 .m(m)
1446 .n(n)
1447 .k(8)
1448 .iterations(1)
1449 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
1450 }
1451 }
1452 }
1453
1454 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_PRFM_LD64, k_eq_8_subtile_m) {
1455 TEST_REQUIRES_ARM_NEON_V8;
1456 for (uint32_t m = 1; m <= 4; m++) {
1457 GemmMicrokernelTester()
1458 .mr(4)
1459 .nr(8)
1460 .kr(1)
1461 .sr(1)
1462 .m(m)
1463 .n(8)
1464 .k(8)
1465 .iterations(1)
1466 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
1467 }
1468 }
1469
1470 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_PRFM_LD64, k_eq_8_subtile_n) {
1471 TEST_REQUIRES_ARM_NEON_V8;
1472 for (uint32_t n = 1; n <= 8; n++) {
1473 GemmMicrokernelTester()
1474 .mr(4)
1475 .nr(8)
1476 .kr(1)
1477 .sr(1)
1478 .m(4)
1479 .n(n)
1480 .k(8)
1481 .iterations(1)
1482 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
1483 }
1484 }
1485
1486 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_PRFM_LD64, k_lt_8) {
1487 TEST_REQUIRES_ARM_NEON_V8;
1488 for (size_t k = 1; k < 8; k++) {
1489 GemmMicrokernelTester()
1490 .mr(4)
1491 .nr(8)
1492 .kr(1)
1493 .sr(1)
1494 .m(4)
1495 .n(8)
1496 .k(k)
1497 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
1498 }
1499 }
1500
1501 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_PRFM_LD64, k_lt_8_strided_a) {
1502 TEST_REQUIRES_ARM_NEON_V8;
1503 for (size_t k = 1; k < 8; k++) {
1504 GemmMicrokernelTester()
1505 .mr(4)
1506 .nr(8)
1507 .kr(1)
1508 .sr(1)
1509 .m(4)
1510 .n(8)
1511 .k(k)
1512 .a_stride(11)
1513 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
1514 }
1515 }
1516
1517 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_PRFM_LD64, k_lt_8_subtile) {
1518 TEST_REQUIRES_ARM_NEON_V8;
1519 for (size_t k = 1; k < 8; k++) {
1520 for (uint32_t n = 1; n <= 8; n++) {
1521 for (uint32_t m = 1; m <= 4; m++) {
1522 GemmMicrokernelTester()
1523 .mr(4)
1524 .nr(8)
1525 .kr(1)
1526 .sr(1)
1527 .m(m)
1528 .n(n)
1529 .k(k)
1530 .iterations(1)
1531 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
1532 }
1533 }
1534 }
1535 }
1536
1537 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_PRFM_LD64, k_gt_8) {
1538 TEST_REQUIRES_ARM_NEON_V8;
1539 for (size_t k = 9; k < 16; k++) {
1540 GemmMicrokernelTester()
1541 .mr(4)
1542 .nr(8)
1543 .kr(1)
1544 .sr(1)
1545 .m(4)
1546 .n(8)
1547 .k(k)
1548 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
1549 }
1550 }
1551
1552 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_PRFM_LD64, k_gt_8_strided_a) {
1553 TEST_REQUIRES_ARM_NEON_V8;
1554 for (size_t k = 9; k < 16; k++) {
1555 GemmMicrokernelTester()
1556 .mr(4)
1557 .nr(8)
1558 .kr(1)
1559 .sr(1)
1560 .m(4)
1561 .n(8)
1562 .k(k)
1563 .a_stride(19)
1564 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
1565 }
1566 }
1567
1568 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_PRFM_LD64, k_gt_8_subtile) {
1569 TEST_REQUIRES_ARM_NEON_V8;
1570 for (size_t k = 9; k < 16; k++) {
1571 for (uint32_t n = 1; n <= 8; n++) {
1572 for (uint32_t m = 1; m <= 4; m++) {
1573 GemmMicrokernelTester()
1574 .mr(4)
1575 .nr(8)
1576 .kr(1)
1577 .sr(1)
1578 .m(m)
1579 .n(n)
1580 .k(k)
1581 .iterations(1)
1582 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
1583 }
1584 }
1585 }
1586 }
1587
1588 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_PRFM_LD64, k_div_8) {
1589 TEST_REQUIRES_ARM_NEON_V8;
1590 for (size_t k = 16; k <= 80; k += 8) {
1591 GemmMicrokernelTester()
1592 .mr(4)
1593 .nr(8)
1594 .kr(1)
1595 .sr(1)
1596 .m(4)
1597 .n(8)
1598 .k(k)
1599 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
1600 }
1601 }
1602
1603 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_PRFM_LD64, k_div_8_strided_a) {
1604 TEST_REQUIRES_ARM_NEON_V8;
1605 for (size_t k = 16; k <= 80; k += 8) {
1606 GemmMicrokernelTester()
1607 .mr(4)
1608 .nr(8)
1609 .kr(1)
1610 .sr(1)
1611 .m(4)
1612 .n(8)
1613 .k(k)
1614 .a_stride(83)
1615 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
1616 }
1617 }
1618
1619 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_PRFM_LD64, k_div_8_subtile) {
1620 TEST_REQUIRES_ARM_NEON_V8;
1621 for (size_t k = 16; k <= 80; k += 8) {
1622 for (uint32_t n = 1; n <= 8; n++) {
1623 for (uint32_t m = 1; m <= 4; m++) {
1624 GemmMicrokernelTester()
1625 .mr(4)
1626 .nr(8)
1627 .kr(1)
1628 .sr(1)
1629 .m(m)
1630 .n(n)
1631 .k(k)
1632 .iterations(1)
1633 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
1634 }
1635 }
1636 }
1637 }
1638
1639 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_PRFM_LD64, n_gt_8) {
1640 TEST_REQUIRES_ARM_NEON_V8;
1641 for (uint32_t n = 9; n < 16; n++) {
1642 for (size_t k = 1; k <= 40; k += 9) {
1643 GemmMicrokernelTester()
1644 .mr(4)
1645 .nr(8)
1646 .kr(1)
1647 .sr(1)
1648 .m(4)
1649 .n(n)
1650 .k(k)
1651 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
1652 }
1653 }
1654 }
1655
1656 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_PRFM_LD64, n_gt_8_strided_cn) {
1657 TEST_REQUIRES_ARM_NEON_V8;
1658 for (uint32_t n = 9; n < 16; n++) {
1659 for (size_t k = 1; k <= 40; k += 9) {
1660 GemmMicrokernelTester()
1661 .mr(4)
1662 .nr(8)
1663 .kr(1)
1664 .sr(1)
1665 .m(4)
1666 .n(n)
1667 .k(k)
1668 .cn_stride(11)
1669 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
1670 }
1671 }
1672 }
1673
1674 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_PRFM_LD64, n_gt_8_strided_a) {
1675 TEST_REQUIRES_ARM_NEON_V8;
1676 for (uint32_t n = 9; n < 16; n++) {
1677 for (size_t k = 1; k <= 40; k += 9) {
1678 GemmMicrokernelTester()
1679 .mr(4)
1680 .nr(8)
1681 .kr(1)
1682 .sr(1)
1683 .m(4)
1684 .n(n)
1685 .k(k)
1686 .a_stride(43)
1687 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
1688 }
1689 }
1690 }
1691
1692 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_PRFM_LD64, n_gt_8_subtile) {
1693 TEST_REQUIRES_ARM_NEON_V8;
1694 for (uint32_t n = 9; n < 16; n++) {
1695 for (size_t k = 1; k <= 40; k += 9) {
1696 for (uint32_t m = 1; m <= 4; m++) {
1697 GemmMicrokernelTester()
1698 .mr(4)
1699 .nr(8)
1700 .kr(1)
1701 .sr(1)
1702 .m(m)
1703 .n(n)
1704 .k(k)
1705 .iterations(1)
1706 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
1707 }
1708 }
1709 }
1710 }
1711
1712 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_PRFM_LD64, n_div_8) {
1713 TEST_REQUIRES_ARM_NEON_V8;
1714 for (uint32_t n = 16; n <= 24; n += 8) {
1715 for (size_t k = 1; k <= 40; k += 9) {
1716 GemmMicrokernelTester()
1717 .mr(4)
1718 .nr(8)
1719 .kr(1)
1720 .sr(1)
1721 .m(4)
1722 .n(n)
1723 .k(k)
1724 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
1725 }
1726 }
1727 }
1728
1729 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_PRFM_LD64, n_div_8_strided_cn) {
1730 TEST_REQUIRES_ARM_NEON_V8;
1731 for (uint32_t n = 16; n <= 24; n += 8) {
1732 for (size_t k = 1; k <= 40; k += 9) {
1733 GemmMicrokernelTester()
1734 .mr(4)
1735 .nr(8)
1736 .kr(1)
1737 .sr(1)
1738 .m(4)
1739 .n(n)
1740 .k(k)
1741 .cn_stride(11)
1742 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
1743 }
1744 }
1745 }
1746
1747 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_PRFM_LD64, n_div_8_strided_a) {
1748 TEST_REQUIRES_ARM_NEON_V8;
1749 for (uint32_t n = 16; n <= 24; n += 8) {
1750 for (size_t k = 1; k <= 40; k += 9) {
1751 GemmMicrokernelTester()
1752 .mr(4)
1753 .nr(8)
1754 .kr(1)
1755 .sr(1)
1756 .m(4)
1757 .n(n)
1758 .k(k)
1759 .a_stride(43)
1760 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
1761 }
1762 }
1763 }
1764
1765 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_PRFM_LD64, n_div_8_subtile) {
1766 TEST_REQUIRES_ARM_NEON_V8;
1767 for (uint32_t n = 16; n <= 24; n += 8) {
1768 for (size_t k = 1; k <= 40; k += 9) {
1769 for (uint32_t m = 1; m <= 4; m++) {
1770 GemmMicrokernelTester()
1771 .mr(4)
1772 .nr(8)
1773 .kr(1)
1774 .sr(1)
1775 .m(m)
1776 .n(n)
1777 .k(k)
1778 .iterations(1)
1779 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
1780 }
1781 }
1782 }
1783 }
1784
1785 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_PRFM_LD64, strided_cm_subtile) {
1786 TEST_REQUIRES_ARM_NEON_V8;
1787 for (size_t k = 1; k <= 40; k += 9) {
1788 for (uint32_t n = 1; n <= 8; n++) {
1789 for (uint32_t m = 1; m <= 4; m++) {
1790 GemmMicrokernelTester()
1791 .mr(4)
1792 .nr(8)
1793 .kr(1)
1794 .sr(1)
1795 .m(m)
1796 .n(n)
1797 .k(k)
1798 .cm_stride(11)
1799 .iterations(1)
1800 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
1801 }
1802 }
1803 }
1804 }
1805
1806 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_PRFM_LD64, qmin) {
1807 TEST_REQUIRES_ARM_NEON_V8;
1808 GemmMicrokernelTester()
1809 .mr(4)
1810 .nr(8)
1811 .kr(1)
1812 .sr(1)
1813 .m(4)
1814 .n(8)
1815 .k(8)
1816 .qmin(128)
1817 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
1818 }
1819
1820 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_PRFM_LD64, qmax) {
1821 TEST_REQUIRES_ARM_NEON_V8;
1822 GemmMicrokernelTester()
1823 .mr(4)
1824 .nr(8)
1825 .kr(1)
1826 .sr(1)
1827 .m(4)
1828 .n(8)
1829 .k(8)
1830 .qmax(128)
1831 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
1832 }
1833
1834 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_PRFM_LD64, strided_cm) {
1835 TEST_REQUIRES_ARM_NEON_V8;
1836 GemmMicrokernelTester()
1837 .mr(4)
1838 .nr(8)
1839 .kr(1)
1840 .sr(1)
1841 .m(4)
1842 .n(8)
1843 .k(8)
1844 .cm_stride(11)
1845 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
1846 }
1847#endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
1848
1849
1850#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
1851 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_LD64, k_eq_8) {
1852 TEST_REQUIRES_ARM_NEON_V8;
1853 GemmMicrokernelTester()
1854 .mr(4)
1855 .nr(8)
1856 .kr(1)
1857 .sr(1)
1858 .m(4)
1859 .n(8)
1860 .k(8)
1861 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
1862 }
1863
1864 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_LD64, strided_cn) {
1865 TEST_REQUIRES_ARM_NEON_V8;
1866 GemmMicrokernelTester()
1867 .mr(4)
1868 .nr(8)
1869 .kr(1)
1870 .sr(1)
1871 .m(4)
1872 .n(8)
1873 .k(8)
1874 .cn_stride(11)
1875 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
1876 }
1877
1878 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_LD64, k_eq_8_strided_a) {
1879 TEST_REQUIRES_ARM_NEON_V8;
1880 GemmMicrokernelTester()
1881 .mr(4)
1882 .nr(8)
1883 .kr(1)
1884 .sr(1)
1885 .m(4)
1886 .n(8)
1887 .k(8)
1888 .a_stride(11)
1889 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
1890 }
1891
1892 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_LD64, k_eq_8_subtile) {
1893 TEST_REQUIRES_ARM_NEON_V8;
1894 for (uint32_t n = 1; n <= 8; n++) {
1895 for (uint32_t m = 1; m <= 4; m++) {
1896 GemmMicrokernelTester()
1897 .mr(4)
1898 .nr(8)
1899 .kr(1)
1900 .sr(1)
1901 .m(m)
1902 .n(n)
1903 .k(8)
1904 .iterations(1)
1905 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
1906 }
1907 }
1908 }
1909
1910 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_LD64, k_eq_8_subtile_m) {
1911 TEST_REQUIRES_ARM_NEON_V8;
1912 for (uint32_t m = 1; m <= 4; m++) {
1913 GemmMicrokernelTester()
1914 .mr(4)
1915 .nr(8)
1916 .kr(1)
1917 .sr(1)
1918 .m(m)
1919 .n(8)
1920 .k(8)
1921 .iterations(1)
1922 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
1923 }
1924 }
1925
1926 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_LD64, k_eq_8_subtile_n) {
1927 TEST_REQUIRES_ARM_NEON_V8;
1928 for (uint32_t n = 1; n <= 8; n++) {
1929 GemmMicrokernelTester()
1930 .mr(4)
1931 .nr(8)
1932 .kr(1)
1933 .sr(1)
1934 .m(4)
1935 .n(n)
1936 .k(8)
1937 .iterations(1)
1938 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
1939 }
1940 }
1941
1942 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_LD64, k_lt_8) {
1943 TEST_REQUIRES_ARM_NEON_V8;
1944 for (size_t k = 1; k < 8; k++) {
1945 GemmMicrokernelTester()
1946 .mr(4)
1947 .nr(8)
1948 .kr(1)
1949 .sr(1)
1950 .m(4)
1951 .n(8)
1952 .k(k)
1953 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
1954 }
1955 }
1956
1957 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_LD64, k_lt_8_strided_a) {
1958 TEST_REQUIRES_ARM_NEON_V8;
1959 for (size_t k = 1; k < 8; k++) {
1960 GemmMicrokernelTester()
1961 .mr(4)
1962 .nr(8)
1963 .kr(1)
1964 .sr(1)
1965 .m(4)
1966 .n(8)
1967 .k(k)
1968 .a_stride(11)
1969 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
1970 }
1971 }
1972
1973 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_LD64, k_lt_8_subtile) {
1974 TEST_REQUIRES_ARM_NEON_V8;
1975 for (size_t k = 1; k < 8; k++) {
1976 for (uint32_t n = 1; n <= 8; n++) {
1977 for (uint32_t m = 1; m <= 4; m++) {
1978 GemmMicrokernelTester()
1979 .mr(4)
1980 .nr(8)
1981 .kr(1)
1982 .sr(1)
1983 .m(m)
1984 .n(n)
1985 .k(k)
1986 .iterations(1)
1987 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
1988 }
1989 }
1990 }
1991 }
1992
1993 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_LD64, k_gt_8) {
1994 TEST_REQUIRES_ARM_NEON_V8;
1995 for (size_t k = 9; k < 16; k++) {
1996 GemmMicrokernelTester()
1997 .mr(4)
1998 .nr(8)
1999 .kr(1)
2000 .sr(1)
2001 .m(4)
2002 .n(8)
2003 .k(k)
2004 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
2005 }
2006 }
2007
2008 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_LD64, k_gt_8_strided_a) {
2009 TEST_REQUIRES_ARM_NEON_V8;
2010 for (size_t k = 9; k < 16; k++) {
2011 GemmMicrokernelTester()
2012 .mr(4)
2013 .nr(8)
2014 .kr(1)
2015 .sr(1)
2016 .m(4)
2017 .n(8)
2018 .k(k)
2019 .a_stride(19)
2020 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
2021 }
2022 }
2023
2024 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_LD64, k_gt_8_subtile) {
2025 TEST_REQUIRES_ARM_NEON_V8;
2026 for (size_t k = 9; k < 16; k++) {
2027 for (uint32_t n = 1; n <= 8; n++) {
2028 for (uint32_t m = 1; m <= 4; m++) {
2029 GemmMicrokernelTester()
2030 .mr(4)
2031 .nr(8)
2032 .kr(1)
2033 .sr(1)
2034 .m(m)
2035 .n(n)
2036 .k(k)
2037 .iterations(1)
2038 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
2039 }
2040 }
2041 }
2042 }
2043
2044 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_LD64, k_div_8) {
2045 TEST_REQUIRES_ARM_NEON_V8;
2046 for (size_t k = 16; k <= 80; k += 8) {
2047 GemmMicrokernelTester()
2048 .mr(4)
2049 .nr(8)
2050 .kr(1)
2051 .sr(1)
2052 .m(4)
2053 .n(8)
2054 .k(k)
2055 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
2056 }
2057 }
2058
2059 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_LD64, k_div_8_strided_a) {
2060 TEST_REQUIRES_ARM_NEON_V8;
2061 for (size_t k = 16; k <= 80; k += 8) {
2062 GemmMicrokernelTester()
2063 .mr(4)
2064 .nr(8)
2065 .kr(1)
2066 .sr(1)
2067 .m(4)
2068 .n(8)
2069 .k(k)
2070 .a_stride(83)
2071 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
2072 }
2073 }
2074
2075 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_LD64, k_div_8_subtile) {
2076 TEST_REQUIRES_ARM_NEON_V8;
2077 for (size_t k = 16; k <= 80; k += 8) {
2078 for (uint32_t n = 1; n <= 8; n++) {
2079 for (uint32_t m = 1; m <= 4; m++) {
2080 GemmMicrokernelTester()
2081 .mr(4)
2082 .nr(8)
2083 .kr(1)
2084 .sr(1)
2085 .m(m)
2086 .n(n)
2087 .k(k)
2088 .iterations(1)
2089 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
2090 }
2091 }
2092 }
2093 }
2094
2095 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_LD64, n_gt_8) {
2096 TEST_REQUIRES_ARM_NEON_V8;
2097 for (uint32_t n = 9; n < 16; n++) {
2098 for (size_t k = 1; k <= 40; k += 9) {
2099 GemmMicrokernelTester()
2100 .mr(4)
2101 .nr(8)
2102 .kr(1)
2103 .sr(1)
2104 .m(4)
2105 .n(n)
2106 .k(k)
2107 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
2108 }
2109 }
2110 }
2111
2112 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_LD64, n_gt_8_strided_cn) {
2113 TEST_REQUIRES_ARM_NEON_V8;
2114 for (uint32_t n = 9; n < 16; n++) {
2115 for (size_t k = 1; k <= 40; k += 9) {
2116 GemmMicrokernelTester()
2117 .mr(4)
2118 .nr(8)
2119 .kr(1)
2120 .sr(1)
2121 .m(4)
2122 .n(n)
2123 .k(k)
2124 .cn_stride(11)
2125 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
2126 }
2127 }
2128 }
2129
2130 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_LD64, n_gt_8_strided_a) {
2131 TEST_REQUIRES_ARM_NEON_V8;
2132 for (uint32_t n = 9; n < 16; n++) {
2133 for (size_t k = 1; k <= 40; k += 9) {
2134 GemmMicrokernelTester()
2135 .mr(4)
2136 .nr(8)
2137 .kr(1)
2138 .sr(1)
2139 .m(4)
2140 .n(n)
2141 .k(k)
2142 .a_stride(43)
2143 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
2144 }
2145 }
2146 }
2147
2148 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_LD64, n_gt_8_subtile) {
2149 TEST_REQUIRES_ARM_NEON_V8;
2150 for (uint32_t n = 9; n < 16; n++) {
2151 for (size_t k = 1; k <= 40; k += 9) {
2152 for (uint32_t m = 1; m <= 4; m++) {
2153 GemmMicrokernelTester()
2154 .mr(4)
2155 .nr(8)
2156 .kr(1)
2157 .sr(1)
2158 .m(m)
2159 .n(n)
2160 .k(k)
2161 .iterations(1)
2162 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
2163 }
2164 }
2165 }
2166 }
2167
2168 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_LD64, n_div_8) {
2169 TEST_REQUIRES_ARM_NEON_V8;
2170 for (uint32_t n = 16; n <= 24; n += 8) {
2171 for (size_t k = 1; k <= 40; k += 9) {
2172 GemmMicrokernelTester()
2173 .mr(4)
2174 .nr(8)
2175 .kr(1)
2176 .sr(1)
2177 .m(4)
2178 .n(n)
2179 .k(k)
2180 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
2181 }
2182 }
2183 }
2184
2185 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_LD64, n_div_8_strided_cn) {
2186 TEST_REQUIRES_ARM_NEON_V8;
2187 for (uint32_t n = 16; n <= 24; n += 8) {
2188 for (size_t k = 1; k <= 40; k += 9) {
2189 GemmMicrokernelTester()
2190 .mr(4)
2191 .nr(8)
2192 .kr(1)
2193 .sr(1)
2194 .m(4)
2195 .n(n)
2196 .k(k)
2197 .cn_stride(11)
2198 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
2199 }
2200 }
2201 }
2202
2203 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_LD64, n_div_8_strided_a) {
2204 TEST_REQUIRES_ARM_NEON_V8;
2205 for (uint32_t n = 16; n <= 24; n += 8) {
2206 for (size_t k = 1; k <= 40; k += 9) {
2207 GemmMicrokernelTester()
2208 .mr(4)
2209 .nr(8)
2210 .kr(1)
2211 .sr(1)
2212 .m(4)
2213 .n(n)
2214 .k(k)
2215 .a_stride(43)
2216 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
2217 }
2218 }
2219 }
2220
2221 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_LD64, n_div_8_subtile) {
2222 TEST_REQUIRES_ARM_NEON_V8;
2223 for (uint32_t n = 16; n <= 24; n += 8) {
2224 for (size_t k = 1; k <= 40; k += 9) {
2225 for (uint32_t m = 1; m <= 4; m++) {
2226 GemmMicrokernelTester()
2227 .mr(4)
2228 .nr(8)
2229 .kr(1)
2230 .sr(1)
2231 .m(m)
2232 .n(n)
2233 .k(k)
2234 .iterations(1)
2235 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
2236 }
2237 }
2238 }
2239 }
2240
2241 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_LD64, strided_cm_subtile) {
2242 TEST_REQUIRES_ARM_NEON_V8;
2243 for (size_t k = 1; k <= 40; k += 9) {
2244 for (uint32_t n = 1; n <= 8; n++) {
2245 for (uint32_t m = 1; m <= 4; m++) {
2246 GemmMicrokernelTester()
2247 .mr(4)
2248 .nr(8)
2249 .kr(1)
2250 .sr(1)
2251 .m(m)
2252 .n(n)
2253 .k(k)
2254 .cm_stride(11)
2255 .iterations(1)
2256 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
2257 }
2258 }
2259 }
2260 }
2261
2262 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_LD64, qmin) {
2263 TEST_REQUIRES_ARM_NEON_V8;
2264 GemmMicrokernelTester()
2265 .mr(4)
2266 .nr(8)
2267 .kr(1)
2268 .sr(1)
2269 .m(4)
2270 .n(8)
2271 .k(8)
2272 .qmin(128)
2273 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
2274 }
2275
2276 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_LD64, qmax) {
2277 TEST_REQUIRES_ARM_NEON_V8;
2278 GemmMicrokernelTester()
2279 .mr(4)
2280 .nr(8)
2281 .kr(1)
2282 .sr(1)
2283 .m(4)
2284 .n(8)
2285 .k(8)
2286 .qmax(128)
2287 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
2288 }
2289
2290 TEST(QC8_GEMM_MINMAX_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_LD64, strided_cm) {
2291 TEST_REQUIRES_ARM_NEON_V8;
2292 GemmMicrokernelTester()
2293 .mr(4)
2294 .nr(8)
2295 .kr(1)
2296 .sr(1)
2297 .m(4)
2298 .n(8)
2299 .k(8)
2300 .cm_stride(11)
2301 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
2302 }
2303#endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
2304
2305
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08002306#if XNN_ARCH_ARM || XNN_ARCH_ARM64
2307 TEST(QC8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, k_eq_8) {
2308 TEST_REQUIRES_ARM_NEON;
2309 GemmMicrokernelTester()
2310 .mr(4)
2311 .nr(8)
2312 .kr(1)
2313 .sr(1)
2314 .m(4)
2315 .n(8)
2316 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -08002317 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08002318 }
2319
2320 TEST(QC8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, strided_cn) {
2321 TEST_REQUIRES_ARM_NEON;
2322 GemmMicrokernelTester()
2323 .mr(4)
2324 .nr(8)
2325 .kr(1)
2326 .sr(1)
2327 .m(4)
2328 .n(8)
2329 .k(8)
2330 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08002331 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08002332 }
2333
2334 TEST(QC8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, k_eq_8_strided_a) {
2335 TEST_REQUIRES_ARM_NEON;
2336 GemmMicrokernelTester()
2337 .mr(4)
2338 .nr(8)
2339 .kr(1)
2340 .sr(1)
2341 .m(4)
2342 .n(8)
2343 .k(8)
2344 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08002345 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08002346 }
2347
2348 TEST(QC8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, k_eq_8_subtile) {
2349 TEST_REQUIRES_ARM_NEON;
Zhi An Ng83844ae2022-01-14 09:52:25 -08002350 for (uint32_t n = 1; n <= 8; n++) {
2351 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08002352 GemmMicrokernelTester()
2353 .mr(4)
2354 .nr(8)
2355 .kr(1)
2356 .sr(1)
2357 .m(m)
2358 .n(n)
2359 .k(8)
2360 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08002361 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08002362 }
2363 }
2364 }
2365
2366 TEST(QC8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, k_eq_8_subtile_m) {
2367 TEST_REQUIRES_ARM_NEON;
2368 for (uint32_t m = 1; m <= 4; m++) {
2369 GemmMicrokernelTester()
2370 .mr(4)
2371 .nr(8)
2372 .kr(1)
2373 .sr(1)
2374 .m(m)
2375 .n(8)
2376 .k(8)
2377 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08002378 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08002379 }
2380 }
2381
2382 TEST(QC8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, k_eq_8_subtile_n) {
2383 TEST_REQUIRES_ARM_NEON;
2384 for (uint32_t n = 1; n <= 8; n++) {
2385 GemmMicrokernelTester()
2386 .mr(4)
2387 .nr(8)
2388 .kr(1)
2389 .sr(1)
2390 .m(4)
2391 .n(n)
2392 .k(8)
2393 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08002394 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08002395 }
2396 }
2397
2398 TEST(QC8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, k_lt_8) {
2399 TEST_REQUIRES_ARM_NEON;
2400 for (size_t k = 1; k < 8; k++) {
2401 GemmMicrokernelTester()
2402 .mr(4)
2403 .nr(8)
2404 .kr(1)
2405 .sr(1)
2406 .m(4)
2407 .n(8)
2408 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08002409 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08002410 }
2411 }
2412
2413 TEST(QC8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, k_lt_8_strided_a) {
2414 TEST_REQUIRES_ARM_NEON;
2415 for (size_t k = 1; k < 8; k++) {
2416 GemmMicrokernelTester()
2417 .mr(4)
2418 .nr(8)
2419 .kr(1)
2420 .sr(1)
2421 .m(4)
2422 .n(8)
2423 .k(k)
2424 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08002425 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08002426 }
2427 }
2428
2429 TEST(QC8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, k_lt_8_subtile) {
2430 TEST_REQUIRES_ARM_NEON;
2431 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08002432 for (uint32_t n = 1; n <= 8; n++) {
2433 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08002434 GemmMicrokernelTester()
2435 .mr(4)
2436 .nr(8)
2437 .kr(1)
2438 .sr(1)
2439 .m(m)
2440 .n(n)
2441 .k(k)
2442 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08002443 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08002444 }
2445 }
2446 }
2447 }
2448
2449 TEST(QC8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, k_gt_8) {
2450 TEST_REQUIRES_ARM_NEON;
2451 for (size_t k = 9; k < 16; k++) {
2452 GemmMicrokernelTester()
2453 .mr(4)
2454 .nr(8)
2455 .kr(1)
2456 .sr(1)
2457 .m(4)
2458 .n(8)
2459 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08002460 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08002461 }
2462 }
2463
2464 TEST(QC8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, k_gt_8_strided_a) {
2465 TEST_REQUIRES_ARM_NEON;
2466 for (size_t k = 9; k < 16; k++) {
2467 GemmMicrokernelTester()
2468 .mr(4)
2469 .nr(8)
2470 .kr(1)
2471 .sr(1)
2472 .m(4)
2473 .n(8)
2474 .k(k)
2475 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08002476 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08002477 }
2478 }
2479
2480 TEST(QC8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, k_gt_8_subtile) {
2481 TEST_REQUIRES_ARM_NEON;
2482 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08002483 for (uint32_t n = 1; n <= 8; n++) {
2484 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08002485 GemmMicrokernelTester()
2486 .mr(4)
2487 .nr(8)
2488 .kr(1)
2489 .sr(1)
2490 .m(m)
2491 .n(n)
2492 .k(k)
2493 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08002494 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08002495 }
2496 }
2497 }
2498 }
2499
2500 TEST(QC8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, k_div_8) {
2501 TEST_REQUIRES_ARM_NEON;
2502 for (size_t k = 16; k <= 80; k += 8) {
2503 GemmMicrokernelTester()
2504 .mr(4)
2505 .nr(8)
2506 .kr(1)
2507 .sr(1)
2508 .m(4)
2509 .n(8)
2510 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08002511 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08002512 }
2513 }
2514
2515 TEST(QC8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, k_div_8_strided_a) {
2516 TEST_REQUIRES_ARM_NEON;
2517 for (size_t k = 16; k <= 80; k += 8) {
2518 GemmMicrokernelTester()
2519 .mr(4)
2520 .nr(8)
2521 .kr(1)
2522 .sr(1)
2523 .m(4)
2524 .n(8)
2525 .k(k)
2526 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -08002527 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08002528 }
2529 }
2530
2531 TEST(QC8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, k_div_8_subtile) {
2532 TEST_REQUIRES_ARM_NEON;
2533 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08002534 for (uint32_t n = 1; n <= 8; n++) {
2535 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08002536 GemmMicrokernelTester()
2537 .mr(4)
2538 .nr(8)
2539 .kr(1)
2540 .sr(1)
2541 .m(m)
2542 .n(n)
2543 .k(k)
2544 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08002545 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08002546 }
2547 }
2548 }
2549 }
2550
2551 TEST(QC8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, n_gt_8) {
2552 TEST_REQUIRES_ARM_NEON;
2553 for (uint32_t n = 9; n < 16; n++) {
2554 for (size_t k = 1; k <= 40; k += 9) {
2555 GemmMicrokernelTester()
2556 .mr(4)
2557 .nr(8)
2558 .kr(1)
2559 .sr(1)
2560 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08002561 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08002562 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08002563 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08002564 }
2565 }
2566 }
2567
2568 TEST(QC8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, n_gt_8_strided_cn) {
2569 TEST_REQUIRES_ARM_NEON;
2570 for (uint32_t n = 9; n < 16; n++) {
2571 for (size_t k = 1; k <= 40; k += 9) {
2572 GemmMicrokernelTester()
2573 .mr(4)
2574 .nr(8)
2575 .kr(1)
2576 .sr(1)
2577 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08002578 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08002579 .k(k)
2580 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08002581 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08002582 }
2583 }
2584 }
2585
2586 TEST(QC8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, n_gt_8_strided_a) {
2587 TEST_REQUIRES_ARM_NEON;
2588 for (uint32_t n = 9; n < 16; n++) {
2589 for (size_t k = 1; k <= 40; k += 9) {
2590 GemmMicrokernelTester()
2591 .mr(4)
2592 .nr(8)
2593 .kr(1)
2594 .sr(1)
2595 .m(4)
2596 .n(n)
2597 .k(k)
2598 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -08002599 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08002600 }
2601 }
2602 }
2603
2604 TEST(QC8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, n_gt_8_subtile) {
2605 TEST_REQUIRES_ARM_NEON;
2606 for (uint32_t n = 9; n < 16; n++) {
2607 for (size_t k = 1; k <= 40; k += 9) {
2608 for (uint32_t m = 1; m <= 4; m++) {
2609 GemmMicrokernelTester()
2610 .mr(4)
2611 .nr(8)
2612 .kr(1)
2613 .sr(1)
2614 .m(m)
2615 .n(n)
2616 .k(k)
2617 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08002618 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08002619 }
2620 }
2621 }
2622 }
2623
2624 TEST(QC8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, n_div_8) {
2625 TEST_REQUIRES_ARM_NEON;
2626 for (uint32_t n = 16; n <= 24; n += 8) {
2627 for (size_t k = 1; k <= 40; k += 9) {
2628 GemmMicrokernelTester()
2629 .mr(4)
2630 .nr(8)
2631 .kr(1)
2632 .sr(1)
2633 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08002634 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08002635 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08002636 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08002637 }
2638 }
2639 }
2640
2641 TEST(QC8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, n_div_8_strided_cn) {
2642 TEST_REQUIRES_ARM_NEON;
2643 for (uint32_t n = 16; n <= 24; n += 8) {
2644 for (size_t k = 1; k <= 40; k += 9) {
2645 GemmMicrokernelTester()
2646 .mr(4)
2647 .nr(8)
2648 .kr(1)
2649 .sr(1)
2650 .m(4)
2651 .n(n)
2652 .k(k)
2653 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08002654 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08002655 }
2656 }
2657 }
2658
2659 TEST(QC8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, n_div_8_strided_a) {
2660 TEST_REQUIRES_ARM_NEON;
2661 for (uint32_t n = 16; n <= 24; n += 8) {
2662 for (size_t k = 1; k <= 40; k += 9) {
2663 GemmMicrokernelTester()
2664 .mr(4)
2665 .nr(8)
2666 .kr(1)
2667 .sr(1)
2668 .m(4)
2669 .n(n)
2670 .k(k)
2671 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -08002672 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08002673 }
2674 }
2675 }
2676
2677 TEST(QC8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, n_div_8_subtile) {
2678 TEST_REQUIRES_ARM_NEON;
2679 for (uint32_t n = 16; n <= 24; n += 8) {
2680 for (size_t k = 1; k <= 40; k += 9) {
2681 for (uint32_t m = 1; m <= 4; m++) {
2682 GemmMicrokernelTester()
2683 .mr(4)
2684 .nr(8)
2685 .kr(1)
2686 .sr(1)
2687 .m(m)
2688 .n(n)
2689 .k(k)
2690 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08002691 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08002692 }
2693 }
2694 }
2695 }
2696
2697 TEST(QC8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, strided_cm_subtile) {
2698 TEST_REQUIRES_ARM_NEON;
2699 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08002700 for (uint32_t n = 1; n <= 8; n++) {
2701 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08002702 GemmMicrokernelTester()
2703 .mr(4)
2704 .nr(8)
2705 .kr(1)
2706 .sr(1)
2707 .m(m)
2708 .n(n)
2709 .k(k)
2710 .cm_stride(11)
2711 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08002712 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08002713 }
2714 }
2715 }
2716 }
2717
2718 TEST(QC8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, qmin) {
2719 TEST_REQUIRES_ARM_NEON;
2720 GemmMicrokernelTester()
2721 .mr(4)
2722 .nr(8)
2723 .kr(1)
2724 .sr(1)
2725 .m(4)
2726 .n(8)
2727 .k(8)
2728 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08002729 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08002730 }
2731
2732 TEST(QC8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, qmax) {
2733 TEST_REQUIRES_ARM_NEON;
2734 GemmMicrokernelTester()
2735 .mr(4)
2736 .nr(8)
2737 .kr(1)
2738 .sr(1)
2739 .m(4)
2740 .n(8)
2741 .k(8)
2742 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08002743 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08002744 }
2745
2746 TEST(QC8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, strided_cm) {
2747 TEST_REQUIRES_ARM_NEON;
2748 GemmMicrokernelTester()
2749 .mr(4)
2750 .nr(8)
2751 .kr(1)
2752 .sr(1)
2753 .m(4)
2754 .n(8)
2755 .k(8)
2756 .cm_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08002757 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08002758 }
2759#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
2760
2761
2762#if XNN_ARCH_ARM || XNN_ARCH_ARM64
2763 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEON_MLAL_LANE, k_eq_8) {
2764 TEST_REQUIRES_ARM_NEON;
2765 GemmMicrokernelTester()
2766 .mr(6)
2767 .nr(8)
2768 .kr(1)
2769 .sr(1)
2770 .m(6)
2771 .n(8)
2772 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -08002773 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08002774 }
2775
2776 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEON_MLAL_LANE, strided_cn) {
2777 TEST_REQUIRES_ARM_NEON;
2778 GemmMicrokernelTester()
2779 .mr(6)
2780 .nr(8)
2781 .kr(1)
2782 .sr(1)
2783 .m(6)
2784 .n(8)
2785 .k(8)
2786 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08002787 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08002788 }
2789
2790 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEON_MLAL_LANE, k_eq_8_strided_a) {
2791 TEST_REQUIRES_ARM_NEON;
2792 GemmMicrokernelTester()
2793 .mr(6)
2794 .nr(8)
2795 .kr(1)
2796 .sr(1)
2797 .m(6)
2798 .n(8)
2799 .k(8)
2800 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08002801 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08002802 }
2803
2804 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEON_MLAL_LANE, k_eq_8_subtile) {
2805 TEST_REQUIRES_ARM_NEON;
Zhi An Ng83844ae2022-01-14 09:52:25 -08002806 for (uint32_t n = 1; n <= 8; n++) {
2807 for (uint32_t m = 1; m <= 6; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08002808 GemmMicrokernelTester()
2809 .mr(6)
2810 .nr(8)
2811 .kr(1)
2812 .sr(1)
2813 .m(m)
2814 .n(n)
2815 .k(8)
2816 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08002817 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08002818 }
2819 }
2820 }
2821
2822 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEON_MLAL_LANE, k_eq_8_subtile_m) {
2823 TEST_REQUIRES_ARM_NEON;
2824 for (uint32_t m = 1; m <= 6; m++) {
2825 GemmMicrokernelTester()
2826 .mr(6)
2827 .nr(8)
2828 .kr(1)
2829 .sr(1)
2830 .m(m)
2831 .n(8)
2832 .k(8)
2833 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08002834 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08002835 }
2836 }
2837
2838 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEON_MLAL_LANE, k_eq_8_subtile_n) {
2839 TEST_REQUIRES_ARM_NEON;
2840 for (uint32_t n = 1; n <= 8; n++) {
2841 GemmMicrokernelTester()
2842 .mr(6)
2843 .nr(8)
2844 .kr(1)
2845 .sr(1)
2846 .m(6)
2847 .n(n)
2848 .k(8)
2849 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08002850 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08002851 }
2852 }
2853
2854 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEON_MLAL_LANE, k_lt_8) {
2855 TEST_REQUIRES_ARM_NEON;
2856 for (size_t k = 1; k < 8; k++) {
2857 GemmMicrokernelTester()
2858 .mr(6)
2859 .nr(8)
2860 .kr(1)
2861 .sr(1)
2862 .m(6)
2863 .n(8)
2864 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08002865 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08002866 }
2867 }
2868
2869 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEON_MLAL_LANE, k_lt_8_strided_a) {
2870 TEST_REQUIRES_ARM_NEON;
2871 for (size_t k = 1; k < 8; k++) {
2872 GemmMicrokernelTester()
2873 .mr(6)
2874 .nr(8)
2875 .kr(1)
2876 .sr(1)
2877 .m(6)
2878 .n(8)
2879 .k(k)
2880 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08002881 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08002882 }
2883 }
2884
2885 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEON_MLAL_LANE, k_lt_8_subtile) {
2886 TEST_REQUIRES_ARM_NEON;
2887 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08002888 for (uint32_t n = 1; n <= 8; n++) {
2889 for (uint32_t m = 1; m <= 6; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08002890 GemmMicrokernelTester()
2891 .mr(6)
2892 .nr(8)
2893 .kr(1)
2894 .sr(1)
2895 .m(m)
2896 .n(n)
2897 .k(k)
2898 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08002899 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08002900 }
2901 }
2902 }
2903 }
2904
2905 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEON_MLAL_LANE, k_gt_8) {
2906 TEST_REQUIRES_ARM_NEON;
2907 for (size_t k = 9; k < 16; k++) {
2908 GemmMicrokernelTester()
2909 .mr(6)
2910 .nr(8)
2911 .kr(1)
2912 .sr(1)
2913 .m(6)
2914 .n(8)
2915 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08002916 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08002917 }
2918 }
2919
2920 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEON_MLAL_LANE, k_gt_8_strided_a) {
2921 TEST_REQUIRES_ARM_NEON;
2922 for (size_t k = 9; k < 16; k++) {
2923 GemmMicrokernelTester()
2924 .mr(6)
2925 .nr(8)
2926 .kr(1)
2927 .sr(1)
2928 .m(6)
2929 .n(8)
2930 .k(k)
2931 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08002932 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08002933 }
2934 }
2935
2936 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEON_MLAL_LANE, k_gt_8_subtile) {
2937 TEST_REQUIRES_ARM_NEON;
2938 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08002939 for (uint32_t n = 1; n <= 8; n++) {
2940 for (uint32_t m = 1; m <= 6; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08002941 GemmMicrokernelTester()
2942 .mr(6)
2943 .nr(8)
2944 .kr(1)
2945 .sr(1)
2946 .m(m)
2947 .n(n)
2948 .k(k)
2949 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08002950 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08002951 }
2952 }
2953 }
2954 }
2955
2956 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEON_MLAL_LANE, k_div_8) {
2957 TEST_REQUIRES_ARM_NEON;
2958 for (size_t k = 16; k <= 80; k += 8) {
2959 GemmMicrokernelTester()
2960 .mr(6)
2961 .nr(8)
2962 .kr(1)
2963 .sr(1)
2964 .m(6)
2965 .n(8)
2966 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08002967 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08002968 }
2969 }
2970
2971 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEON_MLAL_LANE, k_div_8_strided_a) {
2972 TEST_REQUIRES_ARM_NEON;
2973 for (size_t k = 16; k <= 80; k += 8) {
2974 GemmMicrokernelTester()
2975 .mr(6)
2976 .nr(8)
2977 .kr(1)
2978 .sr(1)
2979 .m(6)
2980 .n(8)
2981 .k(k)
2982 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -08002983 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08002984 }
2985 }
2986
2987 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEON_MLAL_LANE, k_div_8_subtile) {
2988 TEST_REQUIRES_ARM_NEON;
2989 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08002990 for (uint32_t n = 1; n <= 8; n++) {
2991 for (uint32_t m = 1; m <= 6; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08002992 GemmMicrokernelTester()
2993 .mr(6)
2994 .nr(8)
2995 .kr(1)
2996 .sr(1)
2997 .m(m)
2998 .n(n)
2999 .k(k)
3000 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08003001 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003002 }
3003 }
3004 }
3005 }
3006
3007 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEON_MLAL_LANE, n_gt_8) {
3008 TEST_REQUIRES_ARM_NEON;
3009 for (uint32_t n = 9; n < 16; n++) {
3010 for (size_t k = 1; k <= 40; k += 9) {
3011 GemmMicrokernelTester()
3012 .mr(6)
3013 .nr(8)
3014 .kr(1)
3015 .sr(1)
3016 .m(6)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08003017 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003018 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08003019 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003020 }
3021 }
3022 }
3023
3024 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEON_MLAL_LANE, n_gt_8_strided_cn) {
3025 TEST_REQUIRES_ARM_NEON;
3026 for (uint32_t n = 9; n < 16; n++) {
3027 for (size_t k = 1; k <= 40; k += 9) {
3028 GemmMicrokernelTester()
3029 .mr(6)
3030 .nr(8)
3031 .kr(1)
3032 .sr(1)
3033 .m(6)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08003034 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003035 .k(k)
3036 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08003037 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003038 }
3039 }
3040 }
3041
3042 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEON_MLAL_LANE, n_gt_8_strided_a) {
3043 TEST_REQUIRES_ARM_NEON;
3044 for (uint32_t n = 9; n < 16; n++) {
3045 for (size_t k = 1; k <= 40; k += 9) {
3046 GemmMicrokernelTester()
3047 .mr(6)
3048 .nr(8)
3049 .kr(1)
3050 .sr(1)
3051 .m(6)
3052 .n(n)
3053 .k(k)
3054 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -08003055 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003056 }
3057 }
3058 }
3059
3060 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEON_MLAL_LANE, n_gt_8_subtile) {
3061 TEST_REQUIRES_ARM_NEON;
3062 for (uint32_t n = 9; n < 16; n++) {
3063 for (size_t k = 1; k <= 40; k += 9) {
3064 for (uint32_t m = 1; m <= 6; m++) {
3065 GemmMicrokernelTester()
3066 .mr(6)
3067 .nr(8)
3068 .kr(1)
3069 .sr(1)
3070 .m(m)
3071 .n(n)
3072 .k(k)
3073 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08003074 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003075 }
3076 }
3077 }
3078 }
3079
3080 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEON_MLAL_LANE, n_div_8) {
3081 TEST_REQUIRES_ARM_NEON;
3082 for (uint32_t n = 16; n <= 24; n += 8) {
3083 for (size_t k = 1; k <= 40; k += 9) {
3084 GemmMicrokernelTester()
3085 .mr(6)
3086 .nr(8)
3087 .kr(1)
3088 .sr(1)
3089 .m(6)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08003090 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003091 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08003092 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003093 }
3094 }
3095 }
3096
3097 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEON_MLAL_LANE, n_div_8_strided_cn) {
3098 TEST_REQUIRES_ARM_NEON;
3099 for (uint32_t n = 16; n <= 24; n += 8) {
3100 for (size_t k = 1; k <= 40; k += 9) {
3101 GemmMicrokernelTester()
3102 .mr(6)
3103 .nr(8)
3104 .kr(1)
3105 .sr(1)
3106 .m(6)
3107 .n(n)
3108 .k(k)
3109 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08003110 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003111 }
3112 }
3113 }
3114
3115 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEON_MLAL_LANE, n_div_8_strided_a) {
3116 TEST_REQUIRES_ARM_NEON;
3117 for (uint32_t n = 16; n <= 24; n += 8) {
3118 for (size_t k = 1; k <= 40; k += 9) {
3119 GemmMicrokernelTester()
3120 .mr(6)
3121 .nr(8)
3122 .kr(1)
3123 .sr(1)
3124 .m(6)
3125 .n(n)
3126 .k(k)
3127 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -08003128 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003129 }
3130 }
3131 }
3132
3133 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEON_MLAL_LANE, n_div_8_subtile) {
3134 TEST_REQUIRES_ARM_NEON;
3135 for (uint32_t n = 16; n <= 24; n += 8) {
3136 for (size_t k = 1; k <= 40; k += 9) {
3137 for (uint32_t m = 1; m <= 6; m++) {
3138 GemmMicrokernelTester()
3139 .mr(6)
3140 .nr(8)
3141 .kr(1)
3142 .sr(1)
3143 .m(m)
3144 .n(n)
3145 .k(k)
3146 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08003147 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003148 }
3149 }
3150 }
3151 }
3152
3153 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEON_MLAL_LANE, strided_cm_subtile) {
3154 TEST_REQUIRES_ARM_NEON;
3155 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08003156 for (uint32_t n = 1; n <= 8; n++) {
3157 for (uint32_t m = 1; m <= 6; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003158 GemmMicrokernelTester()
3159 .mr(6)
3160 .nr(8)
3161 .kr(1)
3162 .sr(1)
3163 .m(m)
3164 .n(n)
3165 .k(k)
3166 .cm_stride(11)
3167 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08003168 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003169 }
3170 }
3171 }
3172 }
3173
3174 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEON_MLAL_LANE, qmin) {
3175 TEST_REQUIRES_ARM_NEON;
3176 GemmMicrokernelTester()
3177 .mr(6)
3178 .nr(8)
3179 .kr(1)
3180 .sr(1)
3181 .m(6)
3182 .n(8)
3183 .k(8)
3184 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08003185 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003186 }
3187
3188 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEON_MLAL_LANE, qmax) {
3189 TEST_REQUIRES_ARM_NEON;
3190 GemmMicrokernelTester()
3191 .mr(6)
3192 .nr(8)
3193 .kr(1)
3194 .sr(1)
3195 .m(6)
3196 .n(8)
3197 .k(8)
3198 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08003199 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003200 }
3201
3202 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEON_MLAL_LANE, strided_cm) {
3203 TEST_REQUIRES_ARM_NEON;
3204 GemmMicrokernelTester()
3205 .mr(6)
3206 .nr(8)
3207 .kr(1)
3208 .sr(1)
3209 .m(6)
3210 .n(8)
3211 .k(8)
3212 .cm_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08003213 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003214 }
3215#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
3216
3217
3218#if XNN_ARCH_ARM || XNN_ARCH_ARM64
3219 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEON_MLAL_LANE, k_eq_8) {
3220 TEST_REQUIRES_ARM_NEON;
3221 GemmMicrokernelTester()
3222 .mr(2)
3223 .nr(16)
3224 .kr(1)
3225 .sr(1)
3226 .m(2)
3227 .n(16)
3228 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -08003229 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003230 }
3231
3232 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEON_MLAL_LANE, strided_cn) {
3233 TEST_REQUIRES_ARM_NEON;
3234 GemmMicrokernelTester()
3235 .mr(2)
3236 .nr(16)
3237 .kr(1)
3238 .sr(1)
3239 .m(2)
3240 .n(16)
3241 .k(8)
3242 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08003243 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003244 }
3245
3246 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEON_MLAL_LANE, k_eq_8_strided_a) {
3247 TEST_REQUIRES_ARM_NEON;
3248 GemmMicrokernelTester()
3249 .mr(2)
3250 .nr(16)
3251 .kr(1)
3252 .sr(1)
3253 .m(2)
3254 .n(16)
3255 .k(8)
3256 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08003257 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003258 }
3259
3260 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEON_MLAL_LANE, k_eq_8_subtile) {
3261 TEST_REQUIRES_ARM_NEON;
Zhi An Ng83844ae2022-01-14 09:52:25 -08003262 for (uint32_t n = 1; n <= 16; n++) {
3263 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003264 GemmMicrokernelTester()
3265 .mr(2)
3266 .nr(16)
3267 .kr(1)
3268 .sr(1)
3269 .m(m)
3270 .n(n)
3271 .k(8)
3272 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08003273 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003274 }
3275 }
3276 }
3277
3278 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEON_MLAL_LANE, k_eq_8_subtile_m) {
3279 TEST_REQUIRES_ARM_NEON;
3280 for (uint32_t m = 1; m <= 2; m++) {
3281 GemmMicrokernelTester()
3282 .mr(2)
3283 .nr(16)
3284 .kr(1)
3285 .sr(1)
3286 .m(m)
3287 .n(16)
3288 .k(8)
3289 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08003290 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003291 }
3292 }
3293
3294 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEON_MLAL_LANE, k_eq_8_subtile_n) {
3295 TEST_REQUIRES_ARM_NEON;
3296 for (uint32_t n = 1; n <= 16; n++) {
3297 GemmMicrokernelTester()
3298 .mr(2)
3299 .nr(16)
3300 .kr(1)
3301 .sr(1)
3302 .m(2)
3303 .n(n)
3304 .k(8)
3305 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08003306 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003307 }
3308 }
3309
3310 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEON_MLAL_LANE, k_lt_8) {
3311 TEST_REQUIRES_ARM_NEON;
3312 for (size_t k = 1; k < 8; k++) {
3313 GemmMicrokernelTester()
3314 .mr(2)
3315 .nr(16)
3316 .kr(1)
3317 .sr(1)
3318 .m(2)
3319 .n(16)
3320 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08003321 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003322 }
3323 }
3324
3325 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEON_MLAL_LANE, k_lt_8_strided_a) {
3326 TEST_REQUIRES_ARM_NEON;
3327 for (size_t k = 1; k < 8; k++) {
3328 GemmMicrokernelTester()
3329 .mr(2)
3330 .nr(16)
3331 .kr(1)
3332 .sr(1)
3333 .m(2)
3334 .n(16)
3335 .k(k)
3336 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08003337 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003338 }
3339 }
3340
3341 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEON_MLAL_LANE, k_lt_8_subtile) {
3342 TEST_REQUIRES_ARM_NEON;
3343 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08003344 for (uint32_t n = 1; n <= 16; n++) {
3345 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003346 GemmMicrokernelTester()
3347 .mr(2)
3348 .nr(16)
3349 .kr(1)
3350 .sr(1)
3351 .m(m)
3352 .n(n)
3353 .k(k)
3354 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08003355 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003356 }
3357 }
3358 }
3359 }
3360
3361 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEON_MLAL_LANE, k_gt_8) {
3362 TEST_REQUIRES_ARM_NEON;
3363 for (size_t k = 9; k < 16; k++) {
3364 GemmMicrokernelTester()
3365 .mr(2)
3366 .nr(16)
3367 .kr(1)
3368 .sr(1)
3369 .m(2)
3370 .n(16)
3371 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08003372 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003373 }
3374 }
3375
3376 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEON_MLAL_LANE, k_gt_8_strided_a) {
3377 TEST_REQUIRES_ARM_NEON;
3378 for (size_t k = 9; k < 16; k++) {
3379 GemmMicrokernelTester()
3380 .mr(2)
3381 .nr(16)
3382 .kr(1)
3383 .sr(1)
3384 .m(2)
3385 .n(16)
3386 .k(k)
3387 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08003388 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003389 }
3390 }
3391
3392 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEON_MLAL_LANE, k_gt_8_subtile) {
3393 TEST_REQUIRES_ARM_NEON;
3394 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08003395 for (uint32_t n = 1; n <= 16; n++) {
3396 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003397 GemmMicrokernelTester()
3398 .mr(2)
3399 .nr(16)
3400 .kr(1)
3401 .sr(1)
3402 .m(m)
3403 .n(n)
3404 .k(k)
3405 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08003406 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003407 }
3408 }
3409 }
3410 }
3411
3412 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEON_MLAL_LANE, k_div_8) {
3413 TEST_REQUIRES_ARM_NEON;
3414 for (size_t k = 16; k <= 80; k += 8) {
3415 GemmMicrokernelTester()
3416 .mr(2)
3417 .nr(16)
3418 .kr(1)
3419 .sr(1)
3420 .m(2)
3421 .n(16)
3422 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08003423 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003424 }
3425 }
3426
3427 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEON_MLAL_LANE, k_div_8_strided_a) {
3428 TEST_REQUIRES_ARM_NEON;
3429 for (size_t k = 16; k <= 80; k += 8) {
3430 GemmMicrokernelTester()
3431 .mr(2)
3432 .nr(16)
3433 .kr(1)
3434 .sr(1)
3435 .m(2)
3436 .n(16)
3437 .k(k)
3438 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -08003439 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003440 }
3441 }
3442
3443 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEON_MLAL_LANE, k_div_8_subtile) {
3444 TEST_REQUIRES_ARM_NEON;
3445 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08003446 for (uint32_t n = 1; n <= 16; n++) {
3447 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003448 GemmMicrokernelTester()
3449 .mr(2)
3450 .nr(16)
3451 .kr(1)
3452 .sr(1)
3453 .m(m)
3454 .n(n)
3455 .k(k)
3456 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08003457 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003458 }
3459 }
3460 }
3461 }
3462
3463 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEON_MLAL_LANE, n_gt_16) {
3464 TEST_REQUIRES_ARM_NEON;
3465 for (uint32_t n = 17; n < 32; n++) {
3466 for (size_t k = 1; k <= 40; k += 9) {
3467 GemmMicrokernelTester()
3468 .mr(2)
3469 .nr(16)
3470 .kr(1)
3471 .sr(1)
3472 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08003473 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003474 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08003475 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003476 }
3477 }
3478 }
3479
3480 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEON_MLAL_LANE, n_gt_16_strided_cn) {
3481 TEST_REQUIRES_ARM_NEON;
3482 for (uint32_t n = 17; n < 32; n++) {
3483 for (size_t k = 1; k <= 40; k += 9) {
3484 GemmMicrokernelTester()
3485 .mr(2)
3486 .nr(16)
3487 .kr(1)
3488 .sr(1)
3489 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08003490 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003491 .k(k)
3492 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08003493 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003494 }
3495 }
3496 }
3497
3498 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEON_MLAL_LANE, n_gt_16_strided_a) {
3499 TEST_REQUIRES_ARM_NEON;
3500 for (uint32_t n = 17; n < 32; n++) {
3501 for (size_t k = 1; k <= 40; k += 9) {
3502 GemmMicrokernelTester()
3503 .mr(2)
3504 .nr(16)
3505 .kr(1)
3506 .sr(1)
3507 .m(2)
3508 .n(n)
3509 .k(k)
3510 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -08003511 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003512 }
3513 }
3514 }
3515
3516 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEON_MLAL_LANE, n_gt_16_subtile) {
3517 TEST_REQUIRES_ARM_NEON;
3518 for (uint32_t n = 17; n < 32; n++) {
3519 for (size_t k = 1; k <= 40; k += 9) {
3520 for (uint32_t m = 1; m <= 2; m++) {
3521 GemmMicrokernelTester()
3522 .mr(2)
3523 .nr(16)
3524 .kr(1)
3525 .sr(1)
3526 .m(m)
3527 .n(n)
3528 .k(k)
3529 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08003530 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003531 }
3532 }
3533 }
3534 }
3535
3536 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEON_MLAL_LANE, n_div_16) {
3537 TEST_REQUIRES_ARM_NEON;
3538 for (uint32_t n = 32; n <= 48; n += 16) {
3539 for (size_t k = 1; k <= 40; k += 9) {
3540 GemmMicrokernelTester()
3541 .mr(2)
3542 .nr(16)
3543 .kr(1)
3544 .sr(1)
3545 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08003546 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003547 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08003548 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003549 }
3550 }
3551 }
3552
3553 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEON_MLAL_LANE, n_div_16_strided_cn) {
3554 TEST_REQUIRES_ARM_NEON;
3555 for (uint32_t n = 32; n <= 48; n += 16) {
3556 for (size_t k = 1; k <= 40; k += 9) {
3557 GemmMicrokernelTester()
3558 .mr(2)
3559 .nr(16)
3560 .kr(1)
3561 .sr(1)
3562 .m(2)
3563 .n(n)
3564 .k(k)
3565 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08003566 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003567 }
3568 }
3569 }
3570
3571 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEON_MLAL_LANE, n_div_16_strided_a) {
3572 TEST_REQUIRES_ARM_NEON;
3573 for (uint32_t n = 32; n <= 48; n += 16) {
3574 for (size_t k = 1; k <= 40; k += 9) {
3575 GemmMicrokernelTester()
3576 .mr(2)
3577 .nr(16)
3578 .kr(1)
3579 .sr(1)
3580 .m(2)
3581 .n(n)
3582 .k(k)
3583 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -08003584 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003585 }
3586 }
3587 }
3588
3589 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEON_MLAL_LANE, n_div_16_subtile) {
3590 TEST_REQUIRES_ARM_NEON;
3591 for (uint32_t n = 32; n <= 48; n += 16) {
3592 for (size_t k = 1; k <= 40; k += 9) {
3593 for (uint32_t m = 1; m <= 2; m++) {
3594 GemmMicrokernelTester()
3595 .mr(2)
3596 .nr(16)
3597 .kr(1)
3598 .sr(1)
3599 .m(m)
3600 .n(n)
3601 .k(k)
3602 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08003603 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003604 }
3605 }
3606 }
3607 }
3608
3609 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEON_MLAL_LANE, strided_cm_subtile) {
3610 TEST_REQUIRES_ARM_NEON;
3611 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08003612 for (uint32_t n = 1; n <= 16; n++) {
3613 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003614 GemmMicrokernelTester()
3615 .mr(2)
3616 .nr(16)
3617 .kr(1)
3618 .sr(1)
3619 .m(m)
3620 .n(n)
3621 .k(k)
3622 .cm_stride(19)
3623 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08003624 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003625 }
3626 }
3627 }
3628 }
3629
3630 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEON_MLAL_LANE, qmin) {
3631 TEST_REQUIRES_ARM_NEON;
3632 GemmMicrokernelTester()
3633 .mr(2)
3634 .nr(16)
3635 .kr(1)
3636 .sr(1)
3637 .m(2)
3638 .n(16)
3639 .k(8)
3640 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08003641 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003642 }
3643
3644 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEON_MLAL_LANE, qmax) {
3645 TEST_REQUIRES_ARM_NEON;
3646 GemmMicrokernelTester()
3647 .mr(2)
3648 .nr(16)
3649 .kr(1)
3650 .sr(1)
3651 .m(2)
3652 .n(16)
3653 .k(8)
3654 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08003655 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003656 }
3657
3658 TEST(QC8_GEMM_MINMAX_FP32_2X16__NEON_MLAL_LANE, strided_cm) {
3659 TEST_REQUIRES_ARM_NEON;
3660 GemmMicrokernelTester()
3661 .mr(2)
3662 .nr(16)
3663 .kr(1)
3664 .sr(1)
3665 .m(2)
3666 .n(16)
3667 .k(8)
3668 .cm_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08003669 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16__neon_mlal_lane, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003670 }
3671#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
3672
3673
3674#if XNN_ARCH_ARM || XNN_ARCH_ARM64
3675 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEON_MLAL_LANE_PRFM, k_eq_8) {
3676 TEST_REQUIRES_ARM_NEON;
3677 GemmMicrokernelTester()
3678 .mr(3)
3679 .nr(8)
3680 .kr(1)
3681 .sr(1)
3682 .m(3)
3683 .n(8)
3684 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -08003685 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003686 }
3687
3688 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEON_MLAL_LANE_PRFM, strided_cn) {
3689 TEST_REQUIRES_ARM_NEON;
3690 GemmMicrokernelTester()
3691 .mr(3)
3692 .nr(8)
3693 .kr(1)
3694 .sr(1)
3695 .m(3)
3696 .n(8)
3697 .k(8)
3698 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08003699 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003700 }
3701
3702 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEON_MLAL_LANE_PRFM, k_eq_8_strided_a) {
3703 TEST_REQUIRES_ARM_NEON;
3704 GemmMicrokernelTester()
3705 .mr(3)
3706 .nr(8)
3707 .kr(1)
3708 .sr(1)
3709 .m(3)
3710 .n(8)
3711 .k(8)
3712 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08003713 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003714 }
3715
3716 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEON_MLAL_LANE_PRFM, k_eq_8_subtile) {
3717 TEST_REQUIRES_ARM_NEON;
Zhi An Ng83844ae2022-01-14 09:52:25 -08003718 for (uint32_t n = 1; n <= 8; n++) {
3719 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003720 GemmMicrokernelTester()
3721 .mr(3)
3722 .nr(8)
3723 .kr(1)
3724 .sr(1)
3725 .m(m)
3726 .n(n)
3727 .k(8)
3728 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08003729 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003730 }
3731 }
3732 }
3733
3734 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEON_MLAL_LANE_PRFM, k_eq_8_subtile_m) {
3735 TEST_REQUIRES_ARM_NEON;
3736 for (uint32_t m = 1; m <= 3; m++) {
3737 GemmMicrokernelTester()
3738 .mr(3)
3739 .nr(8)
3740 .kr(1)
3741 .sr(1)
3742 .m(m)
3743 .n(8)
3744 .k(8)
3745 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08003746 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003747 }
3748 }
3749
3750 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEON_MLAL_LANE_PRFM, k_eq_8_subtile_n) {
3751 TEST_REQUIRES_ARM_NEON;
3752 for (uint32_t n = 1; n <= 8; n++) {
3753 GemmMicrokernelTester()
3754 .mr(3)
3755 .nr(8)
3756 .kr(1)
3757 .sr(1)
3758 .m(3)
3759 .n(n)
3760 .k(8)
3761 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08003762 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003763 }
3764 }
3765
3766 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEON_MLAL_LANE_PRFM, k_lt_8) {
3767 TEST_REQUIRES_ARM_NEON;
3768 for (size_t k = 1; k < 8; k++) {
3769 GemmMicrokernelTester()
3770 .mr(3)
3771 .nr(8)
3772 .kr(1)
3773 .sr(1)
3774 .m(3)
3775 .n(8)
3776 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08003777 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003778 }
3779 }
3780
3781 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEON_MLAL_LANE_PRFM, k_lt_8_strided_a) {
3782 TEST_REQUIRES_ARM_NEON;
3783 for (size_t k = 1; k < 8; k++) {
3784 GemmMicrokernelTester()
3785 .mr(3)
3786 .nr(8)
3787 .kr(1)
3788 .sr(1)
3789 .m(3)
3790 .n(8)
3791 .k(k)
3792 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08003793 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003794 }
3795 }
3796
3797 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEON_MLAL_LANE_PRFM, k_lt_8_subtile) {
3798 TEST_REQUIRES_ARM_NEON;
3799 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08003800 for (uint32_t n = 1; n <= 8; n++) {
3801 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003802 GemmMicrokernelTester()
3803 .mr(3)
3804 .nr(8)
3805 .kr(1)
3806 .sr(1)
3807 .m(m)
3808 .n(n)
3809 .k(k)
3810 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08003811 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003812 }
3813 }
3814 }
3815 }
3816
3817 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEON_MLAL_LANE_PRFM, k_gt_8) {
3818 TEST_REQUIRES_ARM_NEON;
3819 for (size_t k = 9; k < 16; k++) {
3820 GemmMicrokernelTester()
3821 .mr(3)
3822 .nr(8)
3823 .kr(1)
3824 .sr(1)
3825 .m(3)
3826 .n(8)
3827 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08003828 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003829 }
3830 }
3831
3832 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEON_MLAL_LANE_PRFM, k_gt_8_strided_a) {
3833 TEST_REQUIRES_ARM_NEON;
3834 for (size_t k = 9; k < 16; k++) {
3835 GemmMicrokernelTester()
3836 .mr(3)
3837 .nr(8)
3838 .kr(1)
3839 .sr(1)
3840 .m(3)
3841 .n(8)
3842 .k(k)
3843 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08003844 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003845 }
3846 }
3847
3848 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEON_MLAL_LANE_PRFM, k_gt_8_subtile) {
3849 TEST_REQUIRES_ARM_NEON;
3850 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08003851 for (uint32_t n = 1; n <= 8; n++) {
3852 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003853 GemmMicrokernelTester()
3854 .mr(3)
3855 .nr(8)
3856 .kr(1)
3857 .sr(1)
3858 .m(m)
3859 .n(n)
3860 .k(k)
3861 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08003862 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003863 }
3864 }
3865 }
3866 }
3867
3868 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEON_MLAL_LANE_PRFM, k_div_8) {
3869 TEST_REQUIRES_ARM_NEON;
3870 for (size_t k = 16; k <= 80; k += 8) {
3871 GemmMicrokernelTester()
3872 .mr(3)
3873 .nr(8)
3874 .kr(1)
3875 .sr(1)
3876 .m(3)
3877 .n(8)
3878 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08003879 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003880 }
3881 }
3882
3883 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEON_MLAL_LANE_PRFM, k_div_8_strided_a) {
3884 TEST_REQUIRES_ARM_NEON;
3885 for (size_t k = 16; k <= 80; k += 8) {
3886 GemmMicrokernelTester()
3887 .mr(3)
3888 .nr(8)
3889 .kr(1)
3890 .sr(1)
3891 .m(3)
3892 .n(8)
3893 .k(k)
3894 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -08003895 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003896 }
3897 }
3898
3899 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEON_MLAL_LANE_PRFM, k_div_8_subtile) {
3900 TEST_REQUIRES_ARM_NEON;
3901 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08003902 for (uint32_t n = 1; n <= 8; n++) {
3903 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003904 GemmMicrokernelTester()
3905 .mr(3)
3906 .nr(8)
3907 .kr(1)
3908 .sr(1)
3909 .m(m)
3910 .n(n)
3911 .k(k)
3912 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08003913 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003914 }
3915 }
3916 }
3917 }
3918
3919 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEON_MLAL_LANE_PRFM, n_gt_8) {
3920 TEST_REQUIRES_ARM_NEON;
3921 for (uint32_t n = 9; n < 16; n++) {
3922 for (size_t k = 1; k <= 40; k += 9) {
3923 GemmMicrokernelTester()
3924 .mr(3)
3925 .nr(8)
3926 .kr(1)
3927 .sr(1)
3928 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08003929 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003930 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08003931 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003932 }
3933 }
3934 }
3935
3936 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEON_MLAL_LANE_PRFM, n_gt_8_strided_cn) {
3937 TEST_REQUIRES_ARM_NEON;
3938 for (uint32_t n = 9; n < 16; n++) {
3939 for (size_t k = 1; k <= 40; k += 9) {
3940 GemmMicrokernelTester()
3941 .mr(3)
3942 .nr(8)
3943 .kr(1)
3944 .sr(1)
3945 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08003946 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003947 .k(k)
3948 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08003949 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003950 }
3951 }
3952 }
3953
3954 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEON_MLAL_LANE_PRFM, n_gt_8_strided_a) {
3955 TEST_REQUIRES_ARM_NEON;
3956 for (uint32_t n = 9; n < 16; n++) {
3957 for (size_t k = 1; k <= 40; k += 9) {
3958 GemmMicrokernelTester()
3959 .mr(3)
3960 .nr(8)
3961 .kr(1)
3962 .sr(1)
3963 .m(3)
3964 .n(n)
3965 .k(k)
3966 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -08003967 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003968 }
3969 }
3970 }
3971
3972 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEON_MLAL_LANE_PRFM, n_gt_8_subtile) {
3973 TEST_REQUIRES_ARM_NEON;
3974 for (uint32_t n = 9; n < 16; n++) {
3975 for (size_t k = 1; k <= 40; k += 9) {
3976 for (uint32_t m = 1; m <= 3; m++) {
3977 GemmMicrokernelTester()
3978 .mr(3)
3979 .nr(8)
3980 .kr(1)
3981 .sr(1)
3982 .m(m)
3983 .n(n)
3984 .k(k)
3985 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08003986 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08003987 }
3988 }
3989 }
3990 }
3991
3992 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEON_MLAL_LANE_PRFM, n_div_8) {
3993 TEST_REQUIRES_ARM_NEON;
3994 for (uint32_t n = 16; n <= 24; n += 8) {
3995 for (size_t k = 1; k <= 40; k += 9) {
3996 GemmMicrokernelTester()
3997 .mr(3)
3998 .nr(8)
3999 .kr(1)
4000 .sr(1)
4001 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08004002 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004003 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08004004 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004005 }
4006 }
4007 }
4008
4009 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEON_MLAL_LANE_PRFM, n_div_8_strided_cn) {
4010 TEST_REQUIRES_ARM_NEON;
4011 for (uint32_t n = 16; n <= 24; n += 8) {
4012 for (size_t k = 1; k <= 40; k += 9) {
4013 GemmMicrokernelTester()
4014 .mr(3)
4015 .nr(8)
4016 .kr(1)
4017 .sr(1)
4018 .m(3)
4019 .n(n)
4020 .k(k)
4021 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08004022 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004023 }
4024 }
4025 }
4026
4027 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEON_MLAL_LANE_PRFM, n_div_8_strided_a) {
4028 TEST_REQUIRES_ARM_NEON;
4029 for (uint32_t n = 16; n <= 24; n += 8) {
4030 for (size_t k = 1; k <= 40; k += 9) {
4031 GemmMicrokernelTester()
4032 .mr(3)
4033 .nr(8)
4034 .kr(1)
4035 .sr(1)
4036 .m(3)
4037 .n(n)
4038 .k(k)
4039 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -08004040 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004041 }
4042 }
4043 }
4044
4045 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEON_MLAL_LANE_PRFM, n_div_8_subtile) {
4046 TEST_REQUIRES_ARM_NEON;
4047 for (uint32_t n = 16; n <= 24; n += 8) {
4048 for (size_t k = 1; k <= 40; k += 9) {
4049 for (uint32_t m = 1; m <= 3; m++) {
4050 GemmMicrokernelTester()
4051 .mr(3)
4052 .nr(8)
4053 .kr(1)
4054 .sr(1)
4055 .m(m)
4056 .n(n)
4057 .k(k)
4058 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08004059 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004060 }
4061 }
4062 }
4063 }
4064
4065 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEON_MLAL_LANE_PRFM, strided_cm_subtile) {
4066 TEST_REQUIRES_ARM_NEON;
4067 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08004068 for (uint32_t n = 1; n <= 8; n++) {
4069 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004070 GemmMicrokernelTester()
4071 .mr(3)
4072 .nr(8)
4073 .kr(1)
4074 .sr(1)
4075 .m(m)
4076 .n(n)
4077 .k(k)
4078 .cm_stride(11)
4079 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08004080 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004081 }
4082 }
4083 }
4084 }
4085
4086 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEON_MLAL_LANE_PRFM, qmin) {
4087 TEST_REQUIRES_ARM_NEON;
4088 GemmMicrokernelTester()
4089 .mr(3)
4090 .nr(8)
4091 .kr(1)
4092 .sr(1)
4093 .m(3)
4094 .n(8)
4095 .k(8)
4096 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08004097 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004098 }
4099
4100 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEON_MLAL_LANE_PRFM, qmax) {
4101 TEST_REQUIRES_ARM_NEON;
4102 GemmMicrokernelTester()
4103 .mr(3)
4104 .nr(8)
4105 .kr(1)
4106 .sr(1)
4107 .m(3)
4108 .n(8)
4109 .k(8)
4110 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08004111 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004112 }
4113
4114 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEON_MLAL_LANE_PRFM, strided_cm) {
4115 TEST_REQUIRES_ARM_NEON;
4116 GemmMicrokernelTester()
4117 .mr(3)
4118 .nr(8)
4119 .kr(1)
4120 .sr(1)
4121 .m(3)
4122 .n(8)
4123 .k(8)
4124 .cm_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08004125 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004126 }
4127#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
4128
4129
4130#if XNN_ARCH_ARM || XNN_ARCH_ARM64
4131 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEON_MLAL_LANE_PRFM, k_eq_8) {
4132 TEST_REQUIRES_ARM_NEON;
4133 GemmMicrokernelTester()
4134 .mr(6)
4135 .nr(8)
4136 .kr(1)
4137 .sr(1)
4138 .m(6)
4139 .n(8)
4140 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -08004141 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004142 }
4143
4144 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEON_MLAL_LANE_PRFM, strided_cn) {
4145 TEST_REQUIRES_ARM_NEON;
4146 GemmMicrokernelTester()
4147 .mr(6)
4148 .nr(8)
4149 .kr(1)
4150 .sr(1)
4151 .m(6)
4152 .n(8)
4153 .k(8)
4154 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08004155 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004156 }
4157
4158 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEON_MLAL_LANE_PRFM, k_eq_8_strided_a) {
4159 TEST_REQUIRES_ARM_NEON;
4160 GemmMicrokernelTester()
4161 .mr(6)
4162 .nr(8)
4163 .kr(1)
4164 .sr(1)
4165 .m(6)
4166 .n(8)
4167 .k(8)
4168 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08004169 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004170 }
4171
4172 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEON_MLAL_LANE_PRFM, k_eq_8_subtile) {
4173 TEST_REQUIRES_ARM_NEON;
Zhi An Ng83844ae2022-01-14 09:52:25 -08004174 for (uint32_t n = 1; n <= 8; n++) {
4175 for (uint32_t m = 1; m <= 6; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004176 GemmMicrokernelTester()
4177 .mr(6)
4178 .nr(8)
4179 .kr(1)
4180 .sr(1)
4181 .m(m)
4182 .n(n)
4183 .k(8)
4184 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08004185 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004186 }
4187 }
4188 }
4189
4190 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEON_MLAL_LANE_PRFM, k_eq_8_subtile_m) {
4191 TEST_REQUIRES_ARM_NEON;
4192 for (uint32_t m = 1; m <= 6; m++) {
4193 GemmMicrokernelTester()
4194 .mr(6)
4195 .nr(8)
4196 .kr(1)
4197 .sr(1)
4198 .m(m)
4199 .n(8)
4200 .k(8)
4201 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08004202 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004203 }
4204 }
4205
4206 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEON_MLAL_LANE_PRFM, k_eq_8_subtile_n) {
4207 TEST_REQUIRES_ARM_NEON;
4208 for (uint32_t n = 1; n <= 8; n++) {
4209 GemmMicrokernelTester()
4210 .mr(6)
4211 .nr(8)
4212 .kr(1)
4213 .sr(1)
4214 .m(6)
4215 .n(n)
4216 .k(8)
4217 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08004218 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004219 }
4220 }
4221
4222 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEON_MLAL_LANE_PRFM, k_lt_8) {
4223 TEST_REQUIRES_ARM_NEON;
4224 for (size_t k = 1; k < 8; k++) {
4225 GemmMicrokernelTester()
4226 .mr(6)
4227 .nr(8)
4228 .kr(1)
4229 .sr(1)
4230 .m(6)
4231 .n(8)
4232 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08004233 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004234 }
4235 }
4236
4237 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEON_MLAL_LANE_PRFM, k_lt_8_strided_a) {
4238 TEST_REQUIRES_ARM_NEON;
4239 for (size_t k = 1; k < 8; k++) {
4240 GemmMicrokernelTester()
4241 .mr(6)
4242 .nr(8)
4243 .kr(1)
4244 .sr(1)
4245 .m(6)
4246 .n(8)
4247 .k(k)
4248 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08004249 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004250 }
4251 }
4252
4253 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEON_MLAL_LANE_PRFM, k_lt_8_subtile) {
4254 TEST_REQUIRES_ARM_NEON;
4255 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08004256 for (uint32_t n = 1; n <= 8; n++) {
4257 for (uint32_t m = 1; m <= 6; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004258 GemmMicrokernelTester()
4259 .mr(6)
4260 .nr(8)
4261 .kr(1)
4262 .sr(1)
4263 .m(m)
4264 .n(n)
4265 .k(k)
4266 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08004267 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004268 }
4269 }
4270 }
4271 }
4272
4273 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEON_MLAL_LANE_PRFM, k_gt_8) {
4274 TEST_REQUIRES_ARM_NEON;
4275 for (size_t k = 9; k < 16; k++) {
4276 GemmMicrokernelTester()
4277 .mr(6)
4278 .nr(8)
4279 .kr(1)
4280 .sr(1)
4281 .m(6)
4282 .n(8)
4283 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08004284 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004285 }
4286 }
4287
4288 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEON_MLAL_LANE_PRFM, k_gt_8_strided_a) {
4289 TEST_REQUIRES_ARM_NEON;
4290 for (size_t k = 9; k < 16; k++) {
4291 GemmMicrokernelTester()
4292 .mr(6)
4293 .nr(8)
4294 .kr(1)
4295 .sr(1)
4296 .m(6)
4297 .n(8)
4298 .k(k)
4299 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08004300 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004301 }
4302 }
4303
4304 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEON_MLAL_LANE_PRFM, k_gt_8_subtile) {
4305 TEST_REQUIRES_ARM_NEON;
4306 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08004307 for (uint32_t n = 1; n <= 8; n++) {
4308 for (uint32_t m = 1; m <= 6; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004309 GemmMicrokernelTester()
4310 .mr(6)
4311 .nr(8)
4312 .kr(1)
4313 .sr(1)
4314 .m(m)
4315 .n(n)
4316 .k(k)
4317 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08004318 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004319 }
4320 }
4321 }
4322 }
4323
4324 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEON_MLAL_LANE_PRFM, k_div_8) {
4325 TEST_REQUIRES_ARM_NEON;
4326 for (size_t k = 16; k <= 80; k += 8) {
4327 GemmMicrokernelTester()
4328 .mr(6)
4329 .nr(8)
4330 .kr(1)
4331 .sr(1)
4332 .m(6)
4333 .n(8)
4334 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08004335 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004336 }
4337 }
4338
4339 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEON_MLAL_LANE_PRFM, k_div_8_strided_a) {
4340 TEST_REQUIRES_ARM_NEON;
4341 for (size_t k = 16; k <= 80; k += 8) {
4342 GemmMicrokernelTester()
4343 .mr(6)
4344 .nr(8)
4345 .kr(1)
4346 .sr(1)
4347 .m(6)
4348 .n(8)
4349 .k(k)
4350 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -08004351 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004352 }
4353 }
4354
4355 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEON_MLAL_LANE_PRFM, k_div_8_subtile) {
4356 TEST_REQUIRES_ARM_NEON;
4357 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08004358 for (uint32_t n = 1; n <= 8; n++) {
4359 for (uint32_t m = 1; m <= 6; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004360 GemmMicrokernelTester()
4361 .mr(6)
4362 .nr(8)
4363 .kr(1)
4364 .sr(1)
4365 .m(m)
4366 .n(n)
4367 .k(k)
4368 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08004369 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004370 }
4371 }
4372 }
4373 }
4374
4375 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEON_MLAL_LANE_PRFM, n_gt_8) {
4376 TEST_REQUIRES_ARM_NEON;
4377 for (uint32_t n = 9; n < 16; n++) {
4378 for (size_t k = 1; k <= 40; k += 9) {
4379 GemmMicrokernelTester()
4380 .mr(6)
4381 .nr(8)
4382 .kr(1)
4383 .sr(1)
4384 .m(6)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08004385 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004386 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08004387 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004388 }
4389 }
4390 }
4391
4392 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEON_MLAL_LANE_PRFM, n_gt_8_strided_cn) {
4393 TEST_REQUIRES_ARM_NEON;
4394 for (uint32_t n = 9; n < 16; n++) {
4395 for (size_t k = 1; k <= 40; k += 9) {
4396 GemmMicrokernelTester()
4397 .mr(6)
4398 .nr(8)
4399 .kr(1)
4400 .sr(1)
4401 .m(6)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08004402 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004403 .k(k)
4404 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08004405 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004406 }
4407 }
4408 }
4409
4410 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEON_MLAL_LANE_PRFM, n_gt_8_strided_a) {
4411 TEST_REQUIRES_ARM_NEON;
4412 for (uint32_t n = 9; n < 16; n++) {
4413 for (size_t k = 1; k <= 40; k += 9) {
4414 GemmMicrokernelTester()
4415 .mr(6)
4416 .nr(8)
4417 .kr(1)
4418 .sr(1)
4419 .m(6)
4420 .n(n)
4421 .k(k)
4422 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -08004423 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004424 }
4425 }
4426 }
4427
4428 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEON_MLAL_LANE_PRFM, n_gt_8_subtile) {
4429 TEST_REQUIRES_ARM_NEON;
4430 for (uint32_t n = 9; n < 16; n++) {
4431 for (size_t k = 1; k <= 40; k += 9) {
4432 for (uint32_t m = 1; m <= 6; m++) {
4433 GemmMicrokernelTester()
4434 .mr(6)
4435 .nr(8)
4436 .kr(1)
4437 .sr(1)
4438 .m(m)
4439 .n(n)
4440 .k(k)
4441 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08004442 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004443 }
4444 }
4445 }
4446 }
4447
4448 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEON_MLAL_LANE_PRFM, n_div_8) {
4449 TEST_REQUIRES_ARM_NEON;
4450 for (uint32_t n = 16; n <= 24; n += 8) {
4451 for (size_t k = 1; k <= 40; k += 9) {
4452 GemmMicrokernelTester()
4453 .mr(6)
4454 .nr(8)
4455 .kr(1)
4456 .sr(1)
4457 .m(6)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08004458 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004459 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08004460 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004461 }
4462 }
4463 }
4464
4465 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEON_MLAL_LANE_PRFM, n_div_8_strided_cn) {
4466 TEST_REQUIRES_ARM_NEON;
4467 for (uint32_t n = 16; n <= 24; n += 8) {
4468 for (size_t k = 1; k <= 40; k += 9) {
4469 GemmMicrokernelTester()
4470 .mr(6)
4471 .nr(8)
4472 .kr(1)
4473 .sr(1)
4474 .m(6)
4475 .n(n)
4476 .k(k)
4477 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08004478 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004479 }
4480 }
4481 }
4482
4483 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEON_MLAL_LANE_PRFM, n_div_8_strided_a) {
4484 TEST_REQUIRES_ARM_NEON;
4485 for (uint32_t n = 16; n <= 24; n += 8) {
4486 for (size_t k = 1; k <= 40; k += 9) {
4487 GemmMicrokernelTester()
4488 .mr(6)
4489 .nr(8)
4490 .kr(1)
4491 .sr(1)
4492 .m(6)
4493 .n(n)
4494 .k(k)
4495 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -08004496 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004497 }
4498 }
4499 }
4500
4501 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEON_MLAL_LANE_PRFM, n_div_8_subtile) {
4502 TEST_REQUIRES_ARM_NEON;
4503 for (uint32_t n = 16; n <= 24; n += 8) {
4504 for (size_t k = 1; k <= 40; k += 9) {
4505 for (uint32_t m = 1; m <= 6; m++) {
4506 GemmMicrokernelTester()
4507 .mr(6)
4508 .nr(8)
4509 .kr(1)
4510 .sr(1)
4511 .m(m)
4512 .n(n)
4513 .k(k)
4514 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08004515 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004516 }
4517 }
4518 }
4519 }
4520
4521 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEON_MLAL_LANE_PRFM, strided_cm_subtile) {
4522 TEST_REQUIRES_ARM_NEON;
4523 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08004524 for (uint32_t n = 1; n <= 8; n++) {
4525 for (uint32_t m = 1; m <= 6; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004526 GemmMicrokernelTester()
4527 .mr(6)
4528 .nr(8)
4529 .kr(1)
4530 .sr(1)
4531 .m(m)
4532 .n(n)
4533 .k(k)
4534 .cm_stride(11)
4535 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08004536 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004537 }
4538 }
4539 }
4540 }
4541
4542 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEON_MLAL_LANE_PRFM, qmin) {
4543 TEST_REQUIRES_ARM_NEON;
4544 GemmMicrokernelTester()
4545 .mr(6)
4546 .nr(8)
4547 .kr(1)
4548 .sr(1)
4549 .m(6)
4550 .n(8)
4551 .k(8)
4552 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08004553 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004554 }
4555
4556 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEON_MLAL_LANE_PRFM, qmax) {
4557 TEST_REQUIRES_ARM_NEON;
4558 GemmMicrokernelTester()
4559 .mr(6)
4560 .nr(8)
4561 .kr(1)
4562 .sr(1)
4563 .m(6)
4564 .n(8)
4565 .k(8)
4566 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08004567 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004568 }
4569
4570 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEON_MLAL_LANE_PRFM, strided_cm) {
4571 TEST_REQUIRES_ARM_NEON;
4572 GemmMicrokernelTester()
4573 .mr(6)
4574 .nr(8)
4575 .kr(1)
4576 .sr(1)
4577 .m(6)
4578 .n(8)
4579 .k(8)
4580 .cm_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08004581 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane_prfm, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004582 }
4583#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
4584
4585
4586#if XNN_ARCH_ARM || XNN_ARCH_ARM64
4587 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEONV8_MLAL_LANE, k_eq_8) {
4588 TEST_REQUIRES_ARM_NEON_V8;
4589 GemmMicrokernelTester()
4590 .mr(1)
4591 .nr(8)
4592 .kr(1)
4593 .sr(1)
4594 .m(1)
4595 .n(8)
4596 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -08004597 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004598 }
4599
4600 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEONV8_MLAL_LANE, strided_cn) {
4601 TEST_REQUIRES_ARM_NEON_V8;
4602 GemmMicrokernelTester()
4603 .mr(1)
4604 .nr(8)
4605 .kr(1)
4606 .sr(1)
4607 .m(1)
4608 .n(8)
4609 .k(8)
4610 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08004611 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004612 }
4613
4614 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEONV8_MLAL_LANE, k_eq_8_strided_a) {
4615 TEST_REQUIRES_ARM_NEON_V8;
4616 GemmMicrokernelTester()
4617 .mr(1)
4618 .nr(8)
4619 .kr(1)
4620 .sr(1)
4621 .m(1)
4622 .n(8)
4623 .k(8)
4624 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08004625 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004626 }
4627
4628 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEONV8_MLAL_LANE, k_eq_8_subtile) {
4629 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng83844ae2022-01-14 09:52:25 -08004630 for (uint32_t n = 1; n <= 8; n++) {
4631 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004632 GemmMicrokernelTester()
4633 .mr(1)
4634 .nr(8)
4635 .kr(1)
4636 .sr(1)
4637 .m(m)
4638 .n(n)
4639 .k(8)
4640 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08004641 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004642 }
4643 }
4644 }
4645
4646 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEONV8_MLAL_LANE, k_eq_8_subtile_m) {
4647 TEST_REQUIRES_ARM_NEON_V8;
4648 for (uint32_t m = 1; m <= 1; m++) {
4649 GemmMicrokernelTester()
4650 .mr(1)
4651 .nr(8)
4652 .kr(1)
4653 .sr(1)
4654 .m(m)
4655 .n(8)
4656 .k(8)
4657 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08004658 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004659 }
4660 }
4661
4662 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEONV8_MLAL_LANE, k_eq_8_subtile_n) {
4663 TEST_REQUIRES_ARM_NEON_V8;
4664 for (uint32_t n = 1; n <= 8; n++) {
4665 GemmMicrokernelTester()
4666 .mr(1)
4667 .nr(8)
4668 .kr(1)
4669 .sr(1)
4670 .m(1)
4671 .n(n)
4672 .k(8)
4673 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08004674 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004675 }
4676 }
4677
4678 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEONV8_MLAL_LANE, k_lt_8) {
4679 TEST_REQUIRES_ARM_NEON_V8;
4680 for (size_t k = 1; k < 8; k++) {
4681 GemmMicrokernelTester()
4682 .mr(1)
4683 .nr(8)
4684 .kr(1)
4685 .sr(1)
4686 .m(1)
4687 .n(8)
4688 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08004689 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004690 }
4691 }
4692
4693 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEONV8_MLAL_LANE, k_lt_8_strided_a) {
4694 TEST_REQUIRES_ARM_NEON_V8;
4695 for (size_t k = 1; k < 8; k++) {
4696 GemmMicrokernelTester()
4697 .mr(1)
4698 .nr(8)
4699 .kr(1)
4700 .sr(1)
4701 .m(1)
4702 .n(8)
4703 .k(k)
4704 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08004705 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004706 }
4707 }
4708
4709 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEONV8_MLAL_LANE, k_lt_8_subtile) {
4710 TEST_REQUIRES_ARM_NEON_V8;
4711 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08004712 for (uint32_t n = 1; n <= 8; n++) {
4713 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004714 GemmMicrokernelTester()
4715 .mr(1)
4716 .nr(8)
4717 .kr(1)
4718 .sr(1)
4719 .m(m)
4720 .n(n)
4721 .k(k)
4722 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08004723 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004724 }
4725 }
4726 }
4727 }
4728
4729 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEONV8_MLAL_LANE, k_gt_8) {
4730 TEST_REQUIRES_ARM_NEON_V8;
4731 for (size_t k = 9; k < 16; k++) {
4732 GemmMicrokernelTester()
4733 .mr(1)
4734 .nr(8)
4735 .kr(1)
4736 .sr(1)
4737 .m(1)
4738 .n(8)
4739 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08004740 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004741 }
4742 }
4743
4744 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEONV8_MLAL_LANE, k_gt_8_strided_a) {
4745 TEST_REQUIRES_ARM_NEON_V8;
4746 for (size_t k = 9; k < 16; k++) {
4747 GemmMicrokernelTester()
4748 .mr(1)
4749 .nr(8)
4750 .kr(1)
4751 .sr(1)
4752 .m(1)
4753 .n(8)
4754 .k(k)
4755 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08004756 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004757 }
4758 }
4759
4760 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEONV8_MLAL_LANE, k_gt_8_subtile) {
4761 TEST_REQUIRES_ARM_NEON_V8;
4762 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08004763 for (uint32_t n = 1; n <= 8; n++) {
4764 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004765 GemmMicrokernelTester()
4766 .mr(1)
4767 .nr(8)
4768 .kr(1)
4769 .sr(1)
4770 .m(m)
4771 .n(n)
4772 .k(k)
4773 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08004774 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004775 }
4776 }
4777 }
4778 }
4779
4780 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEONV8_MLAL_LANE, k_div_8) {
4781 TEST_REQUIRES_ARM_NEON_V8;
4782 for (size_t k = 16; k <= 80; k += 8) {
4783 GemmMicrokernelTester()
4784 .mr(1)
4785 .nr(8)
4786 .kr(1)
4787 .sr(1)
4788 .m(1)
4789 .n(8)
4790 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08004791 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004792 }
4793 }
4794
4795 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEONV8_MLAL_LANE, k_div_8_strided_a) {
4796 TEST_REQUIRES_ARM_NEON_V8;
4797 for (size_t k = 16; k <= 80; k += 8) {
4798 GemmMicrokernelTester()
4799 .mr(1)
4800 .nr(8)
4801 .kr(1)
4802 .sr(1)
4803 .m(1)
4804 .n(8)
4805 .k(k)
4806 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -08004807 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004808 }
4809 }
4810
4811 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEONV8_MLAL_LANE, k_div_8_subtile) {
4812 TEST_REQUIRES_ARM_NEON_V8;
4813 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08004814 for (uint32_t n = 1; n <= 8; n++) {
4815 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004816 GemmMicrokernelTester()
4817 .mr(1)
4818 .nr(8)
4819 .kr(1)
4820 .sr(1)
4821 .m(m)
4822 .n(n)
4823 .k(k)
4824 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08004825 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004826 }
4827 }
4828 }
4829 }
4830
4831 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEONV8_MLAL_LANE, n_gt_8) {
4832 TEST_REQUIRES_ARM_NEON_V8;
4833 for (uint32_t n = 9; n < 16; n++) {
4834 for (size_t k = 1; k <= 40; k += 9) {
4835 GemmMicrokernelTester()
4836 .mr(1)
4837 .nr(8)
4838 .kr(1)
4839 .sr(1)
4840 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08004841 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004842 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08004843 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004844 }
4845 }
4846 }
4847
4848 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEONV8_MLAL_LANE, n_gt_8_strided_cn) {
4849 TEST_REQUIRES_ARM_NEON_V8;
4850 for (uint32_t n = 9; n < 16; n++) {
4851 for (size_t k = 1; k <= 40; k += 9) {
4852 GemmMicrokernelTester()
4853 .mr(1)
4854 .nr(8)
4855 .kr(1)
4856 .sr(1)
4857 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08004858 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004859 .k(k)
4860 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08004861 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004862 }
4863 }
4864 }
4865
4866 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEONV8_MLAL_LANE, n_gt_8_strided_a) {
4867 TEST_REQUIRES_ARM_NEON_V8;
4868 for (uint32_t n = 9; n < 16; n++) {
4869 for (size_t k = 1; k <= 40; k += 9) {
4870 GemmMicrokernelTester()
4871 .mr(1)
4872 .nr(8)
4873 .kr(1)
4874 .sr(1)
4875 .m(1)
4876 .n(n)
4877 .k(k)
4878 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -08004879 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004880 }
4881 }
4882 }
4883
4884 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEONV8_MLAL_LANE, n_gt_8_subtile) {
4885 TEST_REQUIRES_ARM_NEON_V8;
4886 for (uint32_t n = 9; n < 16; n++) {
4887 for (size_t k = 1; k <= 40; k += 9) {
4888 for (uint32_t m = 1; m <= 1; m++) {
4889 GemmMicrokernelTester()
4890 .mr(1)
4891 .nr(8)
4892 .kr(1)
4893 .sr(1)
4894 .m(m)
4895 .n(n)
4896 .k(k)
4897 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08004898 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004899 }
4900 }
4901 }
4902 }
4903
4904 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEONV8_MLAL_LANE, n_div_8) {
4905 TEST_REQUIRES_ARM_NEON_V8;
4906 for (uint32_t n = 16; n <= 24; n += 8) {
4907 for (size_t k = 1; k <= 40; k += 9) {
4908 GemmMicrokernelTester()
4909 .mr(1)
4910 .nr(8)
4911 .kr(1)
4912 .sr(1)
4913 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08004914 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004915 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08004916 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004917 }
4918 }
4919 }
4920
4921 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEONV8_MLAL_LANE, n_div_8_strided_cn) {
4922 TEST_REQUIRES_ARM_NEON_V8;
4923 for (uint32_t n = 16; n <= 24; n += 8) {
4924 for (size_t k = 1; k <= 40; k += 9) {
4925 GemmMicrokernelTester()
4926 .mr(1)
4927 .nr(8)
4928 .kr(1)
4929 .sr(1)
4930 .m(1)
4931 .n(n)
4932 .k(k)
4933 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08004934 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004935 }
4936 }
4937 }
4938
4939 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEONV8_MLAL_LANE, n_div_8_strided_a) {
4940 TEST_REQUIRES_ARM_NEON_V8;
4941 for (uint32_t n = 16; n <= 24; n += 8) {
4942 for (size_t k = 1; k <= 40; k += 9) {
4943 GemmMicrokernelTester()
4944 .mr(1)
4945 .nr(8)
4946 .kr(1)
4947 .sr(1)
4948 .m(1)
4949 .n(n)
4950 .k(k)
4951 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -08004952 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004953 }
4954 }
4955 }
4956
4957 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEONV8_MLAL_LANE, n_div_8_subtile) {
4958 TEST_REQUIRES_ARM_NEON_V8;
4959 for (uint32_t n = 16; n <= 24; n += 8) {
4960 for (size_t k = 1; k <= 40; k += 9) {
4961 for (uint32_t m = 1; m <= 1; m++) {
4962 GemmMicrokernelTester()
4963 .mr(1)
4964 .nr(8)
4965 .kr(1)
4966 .sr(1)
4967 .m(m)
4968 .n(n)
4969 .k(k)
4970 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08004971 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004972 }
4973 }
4974 }
4975 }
4976
4977 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEONV8_MLAL_LANE, strided_cm_subtile) {
4978 TEST_REQUIRES_ARM_NEON_V8;
4979 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08004980 for (uint32_t n = 1; n <= 8; n++) {
4981 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004982 GemmMicrokernelTester()
4983 .mr(1)
4984 .nr(8)
4985 .kr(1)
4986 .sr(1)
4987 .m(m)
4988 .n(n)
4989 .k(k)
4990 .cm_stride(11)
4991 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08004992 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08004993 }
4994 }
4995 }
4996 }
4997
4998 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEONV8_MLAL_LANE, qmin) {
4999 TEST_REQUIRES_ARM_NEON_V8;
5000 GemmMicrokernelTester()
5001 .mr(1)
5002 .nr(8)
5003 .kr(1)
5004 .sr(1)
5005 .m(1)
5006 .n(8)
5007 .k(8)
5008 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08005009 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005010 }
5011
5012 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEONV8_MLAL_LANE, qmax) {
5013 TEST_REQUIRES_ARM_NEON_V8;
5014 GemmMicrokernelTester()
5015 .mr(1)
5016 .nr(8)
5017 .kr(1)
5018 .sr(1)
5019 .m(1)
5020 .n(8)
5021 .k(8)
5022 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08005023 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005024 }
5025
5026 TEST(QC8_GEMM_MINMAX_FP32_1X8__NEONV8_MLAL_LANE, strided_cm) {
5027 TEST_REQUIRES_ARM_NEON_V8;
5028 GemmMicrokernelTester()
5029 .mr(1)
5030 .nr(8)
5031 .kr(1)
5032 .sr(1)
5033 .m(1)
5034 .n(8)
5035 .k(8)
5036 .cm_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08005037 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005038 }
5039#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
5040
5041
5042#if XNN_ARCH_ARM || XNN_ARCH_ARM64
5043 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEONV8_MLAL_LANE, k_eq_8) {
5044 TEST_REQUIRES_ARM_NEON_V8;
5045 GemmMicrokernelTester()
5046 .mr(3)
5047 .nr(8)
5048 .kr(1)
5049 .sr(1)
5050 .m(3)
5051 .n(8)
5052 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -08005053 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005054 }
5055
5056 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEONV8_MLAL_LANE, strided_cn) {
5057 TEST_REQUIRES_ARM_NEON_V8;
5058 GemmMicrokernelTester()
5059 .mr(3)
5060 .nr(8)
5061 .kr(1)
5062 .sr(1)
5063 .m(3)
5064 .n(8)
5065 .k(8)
5066 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08005067 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005068 }
5069
5070 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEONV8_MLAL_LANE, k_eq_8_strided_a) {
5071 TEST_REQUIRES_ARM_NEON_V8;
5072 GemmMicrokernelTester()
5073 .mr(3)
5074 .nr(8)
5075 .kr(1)
5076 .sr(1)
5077 .m(3)
5078 .n(8)
5079 .k(8)
5080 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08005081 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005082 }
5083
5084 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEONV8_MLAL_LANE, k_eq_8_subtile) {
5085 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng83844ae2022-01-14 09:52:25 -08005086 for (uint32_t n = 1; n <= 8; n++) {
5087 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005088 GemmMicrokernelTester()
5089 .mr(3)
5090 .nr(8)
5091 .kr(1)
5092 .sr(1)
5093 .m(m)
5094 .n(n)
5095 .k(8)
5096 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08005097 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005098 }
5099 }
5100 }
5101
5102 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEONV8_MLAL_LANE, k_eq_8_subtile_m) {
5103 TEST_REQUIRES_ARM_NEON_V8;
5104 for (uint32_t m = 1; m <= 3; m++) {
5105 GemmMicrokernelTester()
5106 .mr(3)
5107 .nr(8)
5108 .kr(1)
5109 .sr(1)
5110 .m(m)
5111 .n(8)
5112 .k(8)
5113 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08005114 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005115 }
5116 }
5117
5118 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEONV8_MLAL_LANE, k_eq_8_subtile_n) {
5119 TEST_REQUIRES_ARM_NEON_V8;
5120 for (uint32_t n = 1; n <= 8; n++) {
5121 GemmMicrokernelTester()
5122 .mr(3)
5123 .nr(8)
5124 .kr(1)
5125 .sr(1)
5126 .m(3)
5127 .n(n)
5128 .k(8)
5129 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08005130 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005131 }
5132 }
5133
5134 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEONV8_MLAL_LANE, k_lt_8) {
5135 TEST_REQUIRES_ARM_NEON_V8;
5136 for (size_t k = 1; k < 8; k++) {
5137 GemmMicrokernelTester()
5138 .mr(3)
5139 .nr(8)
5140 .kr(1)
5141 .sr(1)
5142 .m(3)
5143 .n(8)
5144 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08005145 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005146 }
5147 }
5148
5149 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEONV8_MLAL_LANE, k_lt_8_strided_a) {
5150 TEST_REQUIRES_ARM_NEON_V8;
5151 for (size_t k = 1; k < 8; k++) {
5152 GemmMicrokernelTester()
5153 .mr(3)
5154 .nr(8)
5155 .kr(1)
5156 .sr(1)
5157 .m(3)
5158 .n(8)
5159 .k(k)
5160 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08005161 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005162 }
5163 }
5164
5165 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEONV8_MLAL_LANE, k_lt_8_subtile) {
5166 TEST_REQUIRES_ARM_NEON_V8;
5167 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08005168 for (uint32_t n = 1; n <= 8; n++) {
5169 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005170 GemmMicrokernelTester()
5171 .mr(3)
5172 .nr(8)
5173 .kr(1)
5174 .sr(1)
5175 .m(m)
5176 .n(n)
5177 .k(k)
5178 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08005179 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005180 }
5181 }
5182 }
5183 }
5184
5185 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEONV8_MLAL_LANE, k_gt_8) {
5186 TEST_REQUIRES_ARM_NEON_V8;
5187 for (size_t k = 9; k < 16; k++) {
5188 GemmMicrokernelTester()
5189 .mr(3)
5190 .nr(8)
5191 .kr(1)
5192 .sr(1)
5193 .m(3)
5194 .n(8)
5195 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08005196 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005197 }
5198 }
5199
5200 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEONV8_MLAL_LANE, k_gt_8_strided_a) {
5201 TEST_REQUIRES_ARM_NEON_V8;
5202 for (size_t k = 9; k < 16; k++) {
5203 GemmMicrokernelTester()
5204 .mr(3)
5205 .nr(8)
5206 .kr(1)
5207 .sr(1)
5208 .m(3)
5209 .n(8)
5210 .k(k)
5211 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08005212 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005213 }
5214 }
5215
5216 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEONV8_MLAL_LANE, k_gt_8_subtile) {
5217 TEST_REQUIRES_ARM_NEON_V8;
5218 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08005219 for (uint32_t n = 1; n <= 8; n++) {
5220 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005221 GemmMicrokernelTester()
5222 .mr(3)
5223 .nr(8)
5224 .kr(1)
5225 .sr(1)
5226 .m(m)
5227 .n(n)
5228 .k(k)
5229 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08005230 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005231 }
5232 }
5233 }
5234 }
5235
5236 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEONV8_MLAL_LANE, k_div_8) {
5237 TEST_REQUIRES_ARM_NEON_V8;
5238 for (size_t k = 16; k <= 80; k += 8) {
5239 GemmMicrokernelTester()
5240 .mr(3)
5241 .nr(8)
5242 .kr(1)
5243 .sr(1)
5244 .m(3)
5245 .n(8)
5246 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08005247 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005248 }
5249 }
5250
5251 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEONV8_MLAL_LANE, k_div_8_strided_a) {
5252 TEST_REQUIRES_ARM_NEON_V8;
5253 for (size_t k = 16; k <= 80; k += 8) {
5254 GemmMicrokernelTester()
5255 .mr(3)
5256 .nr(8)
5257 .kr(1)
5258 .sr(1)
5259 .m(3)
5260 .n(8)
5261 .k(k)
5262 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -08005263 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005264 }
5265 }
5266
5267 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEONV8_MLAL_LANE, k_div_8_subtile) {
5268 TEST_REQUIRES_ARM_NEON_V8;
5269 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08005270 for (uint32_t n = 1; n <= 8; n++) {
5271 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005272 GemmMicrokernelTester()
5273 .mr(3)
5274 .nr(8)
5275 .kr(1)
5276 .sr(1)
5277 .m(m)
5278 .n(n)
5279 .k(k)
5280 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08005281 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005282 }
5283 }
5284 }
5285 }
5286
5287 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEONV8_MLAL_LANE, n_gt_8) {
5288 TEST_REQUIRES_ARM_NEON_V8;
5289 for (uint32_t n = 9; n < 16; n++) {
5290 for (size_t k = 1; k <= 40; k += 9) {
5291 GemmMicrokernelTester()
5292 .mr(3)
5293 .nr(8)
5294 .kr(1)
5295 .sr(1)
5296 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08005297 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005298 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08005299 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005300 }
5301 }
5302 }
5303
5304 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEONV8_MLAL_LANE, n_gt_8_strided_cn) {
5305 TEST_REQUIRES_ARM_NEON_V8;
5306 for (uint32_t n = 9; n < 16; n++) {
5307 for (size_t k = 1; k <= 40; k += 9) {
5308 GemmMicrokernelTester()
5309 .mr(3)
5310 .nr(8)
5311 .kr(1)
5312 .sr(1)
5313 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08005314 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005315 .k(k)
5316 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08005317 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005318 }
5319 }
5320 }
5321
5322 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEONV8_MLAL_LANE, n_gt_8_strided_a) {
5323 TEST_REQUIRES_ARM_NEON_V8;
5324 for (uint32_t n = 9; n < 16; n++) {
5325 for (size_t k = 1; k <= 40; k += 9) {
5326 GemmMicrokernelTester()
5327 .mr(3)
5328 .nr(8)
5329 .kr(1)
5330 .sr(1)
5331 .m(3)
5332 .n(n)
5333 .k(k)
5334 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -08005335 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005336 }
5337 }
5338 }
5339
5340 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEONV8_MLAL_LANE, n_gt_8_subtile) {
5341 TEST_REQUIRES_ARM_NEON_V8;
5342 for (uint32_t n = 9; n < 16; n++) {
5343 for (size_t k = 1; k <= 40; k += 9) {
5344 for (uint32_t m = 1; m <= 3; m++) {
5345 GemmMicrokernelTester()
5346 .mr(3)
5347 .nr(8)
5348 .kr(1)
5349 .sr(1)
5350 .m(m)
5351 .n(n)
5352 .k(k)
5353 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08005354 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005355 }
5356 }
5357 }
5358 }
5359
5360 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEONV8_MLAL_LANE, n_div_8) {
5361 TEST_REQUIRES_ARM_NEON_V8;
5362 for (uint32_t n = 16; n <= 24; n += 8) {
5363 for (size_t k = 1; k <= 40; k += 9) {
5364 GemmMicrokernelTester()
5365 .mr(3)
5366 .nr(8)
5367 .kr(1)
5368 .sr(1)
5369 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08005370 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005371 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08005372 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005373 }
5374 }
5375 }
5376
5377 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEONV8_MLAL_LANE, n_div_8_strided_cn) {
5378 TEST_REQUIRES_ARM_NEON_V8;
5379 for (uint32_t n = 16; n <= 24; n += 8) {
5380 for (size_t k = 1; k <= 40; k += 9) {
5381 GemmMicrokernelTester()
5382 .mr(3)
5383 .nr(8)
5384 .kr(1)
5385 .sr(1)
5386 .m(3)
5387 .n(n)
5388 .k(k)
5389 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08005390 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005391 }
5392 }
5393 }
5394
5395 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEONV8_MLAL_LANE, n_div_8_strided_a) {
5396 TEST_REQUIRES_ARM_NEON_V8;
5397 for (uint32_t n = 16; n <= 24; n += 8) {
5398 for (size_t k = 1; k <= 40; k += 9) {
5399 GemmMicrokernelTester()
5400 .mr(3)
5401 .nr(8)
5402 .kr(1)
5403 .sr(1)
5404 .m(3)
5405 .n(n)
5406 .k(k)
5407 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -08005408 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005409 }
5410 }
5411 }
5412
5413 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEONV8_MLAL_LANE, n_div_8_subtile) {
5414 TEST_REQUIRES_ARM_NEON_V8;
5415 for (uint32_t n = 16; n <= 24; n += 8) {
5416 for (size_t k = 1; k <= 40; k += 9) {
5417 for (uint32_t m = 1; m <= 3; m++) {
5418 GemmMicrokernelTester()
5419 .mr(3)
5420 .nr(8)
5421 .kr(1)
5422 .sr(1)
5423 .m(m)
5424 .n(n)
5425 .k(k)
5426 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08005427 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005428 }
5429 }
5430 }
5431 }
5432
5433 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEONV8_MLAL_LANE, strided_cm_subtile) {
5434 TEST_REQUIRES_ARM_NEON_V8;
5435 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08005436 for (uint32_t n = 1; n <= 8; n++) {
5437 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005438 GemmMicrokernelTester()
5439 .mr(3)
5440 .nr(8)
5441 .kr(1)
5442 .sr(1)
5443 .m(m)
5444 .n(n)
5445 .k(k)
5446 .cm_stride(11)
5447 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08005448 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005449 }
5450 }
5451 }
5452 }
5453
5454 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEONV8_MLAL_LANE, qmin) {
5455 TEST_REQUIRES_ARM_NEON_V8;
5456 GemmMicrokernelTester()
5457 .mr(3)
5458 .nr(8)
5459 .kr(1)
5460 .sr(1)
5461 .m(3)
5462 .n(8)
5463 .k(8)
5464 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08005465 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005466 }
5467
5468 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEONV8_MLAL_LANE, qmax) {
5469 TEST_REQUIRES_ARM_NEON_V8;
5470 GemmMicrokernelTester()
5471 .mr(3)
5472 .nr(8)
5473 .kr(1)
5474 .sr(1)
5475 .m(3)
5476 .n(8)
5477 .k(8)
5478 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08005479 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005480 }
5481
5482 TEST(QC8_GEMM_MINMAX_FP32_3X8__NEONV8_MLAL_LANE, strided_cm) {
5483 TEST_REQUIRES_ARM_NEON_V8;
5484 GemmMicrokernelTester()
5485 .mr(3)
5486 .nr(8)
5487 .kr(1)
5488 .sr(1)
5489 .m(3)
5490 .n(8)
5491 .k(8)
5492 .cm_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08005493 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005494 }
5495#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
5496
5497
5498#if XNN_ARCH_ARM || XNN_ARCH_ARM64
5499 TEST(QC8_GEMM_MINMAX_FP32_6X16__NEONV8_MLAL_LANE, k_eq_8) {
5500 TEST_REQUIRES_ARM_NEON_V8;
5501 GemmMicrokernelTester()
5502 .mr(6)
5503 .nr(16)
5504 .kr(1)
5505 .sr(1)
5506 .m(6)
5507 .n(16)
5508 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -08005509 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005510 }
5511
5512 TEST(QC8_GEMM_MINMAX_FP32_6X16__NEONV8_MLAL_LANE, strided_cn) {
5513 TEST_REQUIRES_ARM_NEON_V8;
5514 GemmMicrokernelTester()
5515 .mr(6)
5516 .nr(16)
5517 .kr(1)
5518 .sr(1)
5519 .m(6)
5520 .n(16)
5521 .k(8)
5522 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08005523 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005524 }
5525
5526 TEST(QC8_GEMM_MINMAX_FP32_6X16__NEONV8_MLAL_LANE, k_eq_8_strided_a) {
5527 TEST_REQUIRES_ARM_NEON_V8;
5528 GemmMicrokernelTester()
5529 .mr(6)
5530 .nr(16)
5531 .kr(1)
5532 .sr(1)
5533 .m(6)
5534 .n(16)
5535 .k(8)
5536 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08005537 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005538 }
5539
5540 TEST(QC8_GEMM_MINMAX_FP32_6X16__NEONV8_MLAL_LANE, k_eq_8_subtile) {
5541 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng83844ae2022-01-14 09:52:25 -08005542 for (uint32_t n = 1; n <= 16; n++) {
5543 for (uint32_t m = 1; m <= 6; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005544 GemmMicrokernelTester()
5545 .mr(6)
5546 .nr(16)
5547 .kr(1)
5548 .sr(1)
5549 .m(m)
5550 .n(n)
5551 .k(8)
5552 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08005553 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005554 }
5555 }
5556 }
5557
5558 TEST(QC8_GEMM_MINMAX_FP32_6X16__NEONV8_MLAL_LANE, k_eq_8_subtile_m) {
5559 TEST_REQUIRES_ARM_NEON_V8;
5560 for (uint32_t m = 1; m <= 6; m++) {
5561 GemmMicrokernelTester()
5562 .mr(6)
5563 .nr(16)
5564 .kr(1)
5565 .sr(1)
5566 .m(m)
5567 .n(16)
5568 .k(8)
5569 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08005570 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005571 }
5572 }
5573
5574 TEST(QC8_GEMM_MINMAX_FP32_6X16__NEONV8_MLAL_LANE, k_eq_8_subtile_n) {
5575 TEST_REQUIRES_ARM_NEON_V8;
5576 for (uint32_t n = 1; n <= 16; n++) {
5577 GemmMicrokernelTester()
5578 .mr(6)
5579 .nr(16)
5580 .kr(1)
5581 .sr(1)
5582 .m(6)
5583 .n(n)
5584 .k(8)
5585 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08005586 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005587 }
5588 }
5589
5590 TEST(QC8_GEMM_MINMAX_FP32_6X16__NEONV8_MLAL_LANE, k_lt_8) {
5591 TEST_REQUIRES_ARM_NEON_V8;
5592 for (size_t k = 1; k < 8; k++) {
5593 GemmMicrokernelTester()
5594 .mr(6)
5595 .nr(16)
5596 .kr(1)
5597 .sr(1)
5598 .m(6)
5599 .n(16)
5600 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08005601 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005602 }
5603 }
5604
5605 TEST(QC8_GEMM_MINMAX_FP32_6X16__NEONV8_MLAL_LANE, k_lt_8_strided_a) {
5606 TEST_REQUIRES_ARM_NEON_V8;
5607 for (size_t k = 1; k < 8; k++) {
5608 GemmMicrokernelTester()
5609 .mr(6)
5610 .nr(16)
5611 .kr(1)
5612 .sr(1)
5613 .m(6)
5614 .n(16)
5615 .k(k)
5616 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08005617 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005618 }
5619 }
5620
5621 TEST(QC8_GEMM_MINMAX_FP32_6X16__NEONV8_MLAL_LANE, k_lt_8_subtile) {
5622 TEST_REQUIRES_ARM_NEON_V8;
5623 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08005624 for (uint32_t n = 1; n <= 16; n++) {
5625 for (uint32_t m = 1; m <= 6; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005626 GemmMicrokernelTester()
5627 .mr(6)
5628 .nr(16)
5629 .kr(1)
5630 .sr(1)
5631 .m(m)
5632 .n(n)
5633 .k(k)
5634 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08005635 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005636 }
5637 }
5638 }
5639 }
5640
5641 TEST(QC8_GEMM_MINMAX_FP32_6X16__NEONV8_MLAL_LANE, k_gt_8) {
5642 TEST_REQUIRES_ARM_NEON_V8;
5643 for (size_t k = 9; k < 16; k++) {
5644 GemmMicrokernelTester()
5645 .mr(6)
5646 .nr(16)
5647 .kr(1)
5648 .sr(1)
5649 .m(6)
5650 .n(16)
5651 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08005652 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005653 }
5654 }
5655
5656 TEST(QC8_GEMM_MINMAX_FP32_6X16__NEONV8_MLAL_LANE, k_gt_8_strided_a) {
5657 TEST_REQUIRES_ARM_NEON_V8;
5658 for (size_t k = 9; k < 16; k++) {
5659 GemmMicrokernelTester()
5660 .mr(6)
5661 .nr(16)
5662 .kr(1)
5663 .sr(1)
5664 .m(6)
5665 .n(16)
5666 .k(k)
5667 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08005668 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005669 }
5670 }
5671
5672 TEST(QC8_GEMM_MINMAX_FP32_6X16__NEONV8_MLAL_LANE, k_gt_8_subtile) {
5673 TEST_REQUIRES_ARM_NEON_V8;
5674 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08005675 for (uint32_t n = 1; n <= 16; n++) {
5676 for (uint32_t m = 1; m <= 6; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005677 GemmMicrokernelTester()
5678 .mr(6)
5679 .nr(16)
5680 .kr(1)
5681 .sr(1)
5682 .m(m)
5683 .n(n)
5684 .k(k)
5685 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08005686 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005687 }
5688 }
5689 }
5690 }
5691
5692 TEST(QC8_GEMM_MINMAX_FP32_6X16__NEONV8_MLAL_LANE, k_div_8) {
5693 TEST_REQUIRES_ARM_NEON_V8;
5694 for (size_t k = 16; k <= 80; k += 8) {
5695 GemmMicrokernelTester()
5696 .mr(6)
5697 .nr(16)
5698 .kr(1)
5699 .sr(1)
5700 .m(6)
5701 .n(16)
5702 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08005703 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005704 }
5705 }
5706
5707 TEST(QC8_GEMM_MINMAX_FP32_6X16__NEONV8_MLAL_LANE, k_div_8_strided_a) {
5708 TEST_REQUIRES_ARM_NEON_V8;
5709 for (size_t k = 16; k <= 80; k += 8) {
5710 GemmMicrokernelTester()
5711 .mr(6)
5712 .nr(16)
5713 .kr(1)
5714 .sr(1)
5715 .m(6)
5716 .n(16)
5717 .k(k)
5718 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -08005719 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005720 }
5721 }
5722
5723 TEST(QC8_GEMM_MINMAX_FP32_6X16__NEONV8_MLAL_LANE, k_div_8_subtile) {
5724 TEST_REQUIRES_ARM_NEON_V8;
5725 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08005726 for (uint32_t n = 1; n <= 16; n++) {
5727 for (uint32_t m = 1; m <= 6; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005728 GemmMicrokernelTester()
5729 .mr(6)
5730 .nr(16)
5731 .kr(1)
5732 .sr(1)
5733 .m(m)
5734 .n(n)
5735 .k(k)
5736 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08005737 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005738 }
5739 }
5740 }
5741 }
5742
5743 TEST(QC8_GEMM_MINMAX_FP32_6X16__NEONV8_MLAL_LANE, n_gt_16) {
5744 TEST_REQUIRES_ARM_NEON_V8;
5745 for (uint32_t n = 17; n < 32; n++) {
5746 for (size_t k = 1; k <= 40; k += 9) {
5747 GemmMicrokernelTester()
5748 .mr(6)
5749 .nr(16)
5750 .kr(1)
5751 .sr(1)
5752 .m(6)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08005753 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005754 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08005755 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005756 }
5757 }
5758 }
5759
5760 TEST(QC8_GEMM_MINMAX_FP32_6X16__NEONV8_MLAL_LANE, n_gt_16_strided_cn) {
5761 TEST_REQUIRES_ARM_NEON_V8;
5762 for (uint32_t n = 17; n < 32; n++) {
5763 for (size_t k = 1; k <= 40; k += 9) {
5764 GemmMicrokernelTester()
5765 .mr(6)
5766 .nr(16)
5767 .kr(1)
5768 .sr(1)
5769 .m(6)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08005770 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005771 .k(k)
5772 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08005773 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005774 }
5775 }
5776 }
5777
5778 TEST(QC8_GEMM_MINMAX_FP32_6X16__NEONV8_MLAL_LANE, n_gt_16_strided_a) {
5779 TEST_REQUIRES_ARM_NEON_V8;
5780 for (uint32_t n = 17; n < 32; n++) {
5781 for (size_t k = 1; k <= 40; k += 9) {
5782 GemmMicrokernelTester()
5783 .mr(6)
5784 .nr(16)
5785 .kr(1)
5786 .sr(1)
5787 .m(6)
5788 .n(n)
5789 .k(k)
5790 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -08005791 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005792 }
5793 }
5794 }
5795
5796 TEST(QC8_GEMM_MINMAX_FP32_6X16__NEONV8_MLAL_LANE, n_gt_16_subtile) {
5797 TEST_REQUIRES_ARM_NEON_V8;
5798 for (uint32_t n = 17; n < 32; n++) {
5799 for (size_t k = 1; k <= 40; k += 9) {
5800 for (uint32_t m = 1; m <= 6; m++) {
5801 GemmMicrokernelTester()
5802 .mr(6)
5803 .nr(16)
5804 .kr(1)
5805 .sr(1)
5806 .m(m)
5807 .n(n)
5808 .k(k)
5809 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08005810 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005811 }
5812 }
5813 }
5814 }
5815
5816 TEST(QC8_GEMM_MINMAX_FP32_6X16__NEONV8_MLAL_LANE, n_div_16) {
5817 TEST_REQUIRES_ARM_NEON_V8;
5818 for (uint32_t n = 32; n <= 48; n += 16) {
5819 for (size_t k = 1; k <= 40; k += 9) {
5820 GemmMicrokernelTester()
5821 .mr(6)
5822 .nr(16)
5823 .kr(1)
5824 .sr(1)
5825 .m(6)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08005826 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005827 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08005828 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005829 }
5830 }
5831 }
5832
5833 TEST(QC8_GEMM_MINMAX_FP32_6X16__NEONV8_MLAL_LANE, n_div_16_strided_cn) {
5834 TEST_REQUIRES_ARM_NEON_V8;
5835 for (uint32_t n = 32; n <= 48; n += 16) {
5836 for (size_t k = 1; k <= 40; k += 9) {
5837 GemmMicrokernelTester()
5838 .mr(6)
5839 .nr(16)
5840 .kr(1)
5841 .sr(1)
5842 .m(6)
5843 .n(n)
5844 .k(k)
5845 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08005846 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005847 }
5848 }
5849 }
5850
5851 TEST(QC8_GEMM_MINMAX_FP32_6X16__NEONV8_MLAL_LANE, n_div_16_strided_a) {
5852 TEST_REQUIRES_ARM_NEON_V8;
5853 for (uint32_t n = 32; n <= 48; n += 16) {
5854 for (size_t k = 1; k <= 40; k += 9) {
5855 GemmMicrokernelTester()
5856 .mr(6)
5857 .nr(16)
5858 .kr(1)
5859 .sr(1)
5860 .m(6)
5861 .n(n)
5862 .k(k)
5863 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -08005864 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005865 }
5866 }
5867 }
5868
5869 TEST(QC8_GEMM_MINMAX_FP32_6X16__NEONV8_MLAL_LANE, n_div_16_subtile) {
5870 TEST_REQUIRES_ARM_NEON_V8;
5871 for (uint32_t n = 32; n <= 48; n += 16) {
5872 for (size_t k = 1; k <= 40; k += 9) {
5873 for (uint32_t m = 1; m <= 6; m++) {
5874 GemmMicrokernelTester()
5875 .mr(6)
5876 .nr(16)
5877 .kr(1)
5878 .sr(1)
5879 .m(m)
5880 .n(n)
5881 .k(k)
5882 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08005883 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005884 }
5885 }
5886 }
5887 }
5888
5889 TEST(QC8_GEMM_MINMAX_FP32_6X16__NEONV8_MLAL_LANE, strided_cm_subtile) {
5890 TEST_REQUIRES_ARM_NEON_V8;
5891 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08005892 for (uint32_t n = 1; n <= 16; n++) {
5893 for (uint32_t m = 1; m <= 6; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005894 GemmMicrokernelTester()
5895 .mr(6)
5896 .nr(16)
5897 .kr(1)
5898 .sr(1)
5899 .m(m)
5900 .n(n)
5901 .k(k)
5902 .cm_stride(19)
5903 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08005904 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005905 }
5906 }
5907 }
5908 }
5909
5910 TEST(QC8_GEMM_MINMAX_FP32_6X16__NEONV8_MLAL_LANE, qmin) {
5911 TEST_REQUIRES_ARM_NEON_V8;
5912 GemmMicrokernelTester()
5913 .mr(6)
5914 .nr(16)
5915 .kr(1)
5916 .sr(1)
5917 .m(6)
5918 .n(16)
5919 .k(8)
5920 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08005921 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005922 }
5923
5924 TEST(QC8_GEMM_MINMAX_FP32_6X16__NEONV8_MLAL_LANE, qmax) {
5925 TEST_REQUIRES_ARM_NEON_V8;
5926 GemmMicrokernelTester()
5927 .mr(6)
5928 .nr(16)
5929 .kr(1)
5930 .sr(1)
5931 .m(6)
5932 .n(16)
5933 .k(8)
5934 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08005935 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005936 }
5937
5938 TEST(QC8_GEMM_MINMAX_FP32_6X16__NEONV8_MLAL_LANE, strided_cm) {
5939 TEST_REQUIRES_ARM_NEON_V8;
5940 GemmMicrokernelTester()
5941 .mr(6)
5942 .nr(16)
5943 .kr(1)
5944 .sr(1)
5945 .m(6)
5946 .n(16)
5947 .k(8)
5948 .cm_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08005949 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005950 }
5951#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
5952
5953
5954#if XNN_ARCH_ARM || XNN_ARCH_ARM64
5955 TEST(QC8_GEMM_MINMAX_FP32_4X8__NEONV8_MLAL_LANE_PRFM, k_eq_8) {
5956 TEST_REQUIRES_ARM_NEON_V8;
5957 GemmMicrokernelTester()
5958 .mr(4)
5959 .nr(8)
5960 .kr(1)
5961 .sr(1)
5962 .m(4)
5963 .n(8)
5964 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -08005965 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005966 }
5967
5968 TEST(QC8_GEMM_MINMAX_FP32_4X8__NEONV8_MLAL_LANE_PRFM, strided_cn) {
5969 TEST_REQUIRES_ARM_NEON_V8;
5970 GemmMicrokernelTester()
5971 .mr(4)
5972 .nr(8)
5973 .kr(1)
5974 .sr(1)
5975 .m(4)
5976 .n(8)
5977 .k(8)
5978 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08005979 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005980 }
5981
5982 TEST(QC8_GEMM_MINMAX_FP32_4X8__NEONV8_MLAL_LANE_PRFM, k_eq_8_strided_a) {
5983 TEST_REQUIRES_ARM_NEON_V8;
5984 GemmMicrokernelTester()
5985 .mr(4)
5986 .nr(8)
5987 .kr(1)
5988 .sr(1)
5989 .m(4)
5990 .n(8)
5991 .k(8)
5992 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08005993 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08005994 }
5995
5996 TEST(QC8_GEMM_MINMAX_FP32_4X8__NEONV8_MLAL_LANE_PRFM, k_eq_8_subtile) {
5997 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng83844ae2022-01-14 09:52:25 -08005998 for (uint32_t n = 1; n <= 8; n++) {
5999 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006000 GemmMicrokernelTester()
6001 .mr(4)
6002 .nr(8)
6003 .kr(1)
6004 .sr(1)
6005 .m(m)
6006 .n(n)
6007 .k(8)
6008 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08006009 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006010 }
6011 }
6012 }
6013
6014 TEST(QC8_GEMM_MINMAX_FP32_4X8__NEONV8_MLAL_LANE_PRFM, k_eq_8_subtile_m) {
6015 TEST_REQUIRES_ARM_NEON_V8;
6016 for (uint32_t m = 1; m <= 4; m++) {
6017 GemmMicrokernelTester()
6018 .mr(4)
6019 .nr(8)
6020 .kr(1)
6021 .sr(1)
6022 .m(m)
6023 .n(8)
6024 .k(8)
6025 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08006026 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006027 }
6028 }
6029
6030 TEST(QC8_GEMM_MINMAX_FP32_4X8__NEONV8_MLAL_LANE_PRFM, k_eq_8_subtile_n) {
6031 TEST_REQUIRES_ARM_NEON_V8;
6032 for (uint32_t n = 1; n <= 8; n++) {
6033 GemmMicrokernelTester()
6034 .mr(4)
6035 .nr(8)
6036 .kr(1)
6037 .sr(1)
6038 .m(4)
6039 .n(n)
6040 .k(8)
6041 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08006042 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006043 }
6044 }
6045
6046 TEST(QC8_GEMM_MINMAX_FP32_4X8__NEONV8_MLAL_LANE_PRFM, k_lt_8) {
6047 TEST_REQUIRES_ARM_NEON_V8;
6048 for (size_t k = 1; k < 8; k++) {
6049 GemmMicrokernelTester()
6050 .mr(4)
6051 .nr(8)
6052 .kr(1)
6053 .sr(1)
6054 .m(4)
6055 .n(8)
6056 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08006057 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006058 }
6059 }
6060
6061 TEST(QC8_GEMM_MINMAX_FP32_4X8__NEONV8_MLAL_LANE_PRFM, k_lt_8_strided_a) {
6062 TEST_REQUIRES_ARM_NEON_V8;
6063 for (size_t k = 1; k < 8; k++) {
6064 GemmMicrokernelTester()
6065 .mr(4)
6066 .nr(8)
6067 .kr(1)
6068 .sr(1)
6069 .m(4)
6070 .n(8)
6071 .k(k)
6072 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08006073 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006074 }
6075 }
6076
6077 TEST(QC8_GEMM_MINMAX_FP32_4X8__NEONV8_MLAL_LANE_PRFM, k_lt_8_subtile) {
6078 TEST_REQUIRES_ARM_NEON_V8;
6079 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08006080 for (uint32_t n = 1; n <= 8; n++) {
6081 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006082 GemmMicrokernelTester()
6083 .mr(4)
6084 .nr(8)
6085 .kr(1)
6086 .sr(1)
6087 .m(m)
6088 .n(n)
6089 .k(k)
6090 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08006091 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006092 }
6093 }
6094 }
6095 }
6096
6097 TEST(QC8_GEMM_MINMAX_FP32_4X8__NEONV8_MLAL_LANE_PRFM, k_gt_8) {
6098 TEST_REQUIRES_ARM_NEON_V8;
6099 for (size_t k = 9; k < 16; k++) {
6100 GemmMicrokernelTester()
6101 .mr(4)
6102 .nr(8)
6103 .kr(1)
6104 .sr(1)
6105 .m(4)
6106 .n(8)
6107 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08006108 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006109 }
6110 }
6111
6112 TEST(QC8_GEMM_MINMAX_FP32_4X8__NEONV8_MLAL_LANE_PRFM, k_gt_8_strided_a) {
6113 TEST_REQUIRES_ARM_NEON_V8;
6114 for (size_t k = 9; k < 16; k++) {
6115 GemmMicrokernelTester()
6116 .mr(4)
6117 .nr(8)
6118 .kr(1)
6119 .sr(1)
6120 .m(4)
6121 .n(8)
6122 .k(k)
6123 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08006124 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006125 }
6126 }
6127
6128 TEST(QC8_GEMM_MINMAX_FP32_4X8__NEONV8_MLAL_LANE_PRFM, k_gt_8_subtile) {
6129 TEST_REQUIRES_ARM_NEON_V8;
6130 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08006131 for (uint32_t n = 1; n <= 8; n++) {
6132 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006133 GemmMicrokernelTester()
6134 .mr(4)
6135 .nr(8)
6136 .kr(1)
6137 .sr(1)
6138 .m(m)
6139 .n(n)
6140 .k(k)
6141 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08006142 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006143 }
6144 }
6145 }
6146 }
6147
6148 TEST(QC8_GEMM_MINMAX_FP32_4X8__NEONV8_MLAL_LANE_PRFM, k_div_8) {
6149 TEST_REQUIRES_ARM_NEON_V8;
6150 for (size_t k = 16; k <= 80; k += 8) {
6151 GemmMicrokernelTester()
6152 .mr(4)
6153 .nr(8)
6154 .kr(1)
6155 .sr(1)
6156 .m(4)
6157 .n(8)
6158 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08006159 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006160 }
6161 }
6162
6163 TEST(QC8_GEMM_MINMAX_FP32_4X8__NEONV8_MLAL_LANE_PRFM, k_div_8_strided_a) {
6164 TEST_REQUIRES_ARM_NEON_V8;
6165 for (size_t k = 16; k <= 80; k += 8) {
6166 GemmMicrokernelTester()
6167 .mr(4)
6168 .nr(8)
6169 .kr(1)
6170 .sr(1)
6171 .m(4)
6172 .n(8)
6173 .k(k)
6174 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -08006175 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006176 }
6177 }
6178
6179 TEST(QC8_GEMM_MINMAX_FP32_4X8__NEONV8_MLAL_LANE_PRFM, k_div_8_subtile) {
6180 TEST_REQUIRES_ARM_NEON_V8;
6181 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08006182 for (uint32_t n = 1; n <= 8; n++) {
6183 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006184 GemmMicrokernelTester()
6185 .mr(4)
6186 .nr(8)
6187 .kr(1)
6188 .sr(1)
6189 .m(m)
6190 .n(n)
6191 .k(k)
6192 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08006193 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006194 }
6195 }
6196 }
6197 }
6198
6199 TEST(QC8_GEMM_MINMAX_FP32_4X8__NEONV8_MLAL_LANE_PRFM, n_gt_8) {
6200 TEST_REQUIRES_ARM_NEON_V8;
6201 for (uint32_t n = 9; n < 16; n++) {
6202 for (size_t k = 1; k <= 40; k += 9) {
6203 GemmMicrokernelTester()
6204 .mr(4)
6205 .nr(8)
6206 .kr(1)
6207 .sr(1)
6208 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08006209 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006210 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08006211 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006212 }
6213 }
6214 }
6215
6216 TEST(QC8_GEMM_MINMAX_FP32_4X8__NEONV8_MLAL_LANE_PRFM, n_gt_8_strided_cn) {
6217 TEST_REQUIRES_ARM_NEON_V8;
6218 for (uint32_t n = 9; n < 16; n++) {
6219 for (size_t k = 1; k <= 40; k += 9) {
6220 GemmMicrokernelTester()
6221 .mr(4)
6222 .nr(8)
6223 .kr(1)
6224 .sr(1)
6225 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08006226 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006227 .k(k)
6228 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08006229 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006230 }
6231 }
6232 }
6233
6234 TEST(QC8_GEMM_MINMAX_FP32_4X8__NEONV8_MLAL_LANE_PRFM, n_gt_8_strided_a) {
6235 TEST_REQUIRES_ARM_NEON_V8;
6236 for (uint32_t n = 9; n < 16; n++) {
6237 for (size_t k = 1; k <= 40; k += 9) {
6238 GemmMicrokernelTester()
6239 .mr(4)
6240 .nr(8)
6241 .kr(1)
6242 .sr(1)
6243 .m(4)
6244 .n(n)
6245 .k(k)
6246 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -08006247 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006248 }
6249 }
6250 }
6251
6252 TEST(QC8_GEMM_MINMAX_FP32_4X8__NEONV8_MLAL_LANE_PRFM, n_gt_8_subtile) {
6253 TEST_REQUIRES_ARM_NEON_V8;
6254 for (uint32_t n = 9; n < 16; n++) {
6255 for (size_t k = 1; k <= 40; k += 9) {
6256 for (uint32_t m = 1; m <= 4; m++) {
6257 GemmMicrokernelTester()
6258 .mr(4)
6259 .nr(8)
6260 .kr(1)
6261 .sr(1)
6262 .m(m)
6263 .n(n)
6264 .k(k)
6265 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08006266 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006267 }
6268 }
6269 }
6270 }
6271
6272 TEST(QC8_GEMM_MINMAX_FP32_4X8__NEONV8_MLAL_LANE_PRFM, n_div_8) {
6273 TEST_REQUIRES_ARM_NEON_V8;
6274 for (uint32_t n = 16; n <= 24; n += 8) {
6275 for (size_t k = 1; k <= 40; k += 9) {
6276 GemmMicrokernelTester()
6277 .mr(4)
6278 .nr(8)
6279 .kr(1)
6280 .sr(1)
6281 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08006282 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006283 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08006284 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006285 }
6286 }
6287 }
6288
6289 TEST(QC8_GEMM_MINMAX_FP32_4X8__NEONV8_MLAL_LANE_PRFM, n_div_8_strided_cn) {
6290 TEST_REQUIRES_ARM_NEON_V8;
6291 for (uint32_t n = 16; n <= 24; n += 8) {
6292 for (size_t k = 1; k <= 40; k += 9) {
6293 GemmMicrokernelTester()
6294 .mr(4)
6295 .nr(8)
6296 .kr(1)
6297 .sr(1)
6298 .m(4)
6299 .n(n)
6300 .k(k)
6301 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08006302 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006303 }
6304 }
6305 }
6306
6307 TEST(QC8_GEMM_MINMAX_FP32_4X8__NEONV8_MLAL_LANE_PRFM, n_div_8_strided_a) {
6308 TEST_REQUIRES_ARM_NEON_V8;
6309 for (uint32_t n = 16; n <= 24; n += 8) {
6310 for (size_t k = 1; k <= 40; k += 9) {
6311 GemmMicrokernelTester()
6312 .mr(4)
6313 .nr(8)
6314 .kr(1)
6315 .sr(1)
6316 .m(4)
6317 .n(n)
6318 .k(k)
6319 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -08006320 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006321 }
6322 }
6323 }
6324
6325 TEST(QC8_GEMM_MINMAX_FP32_4X8__NEONV8_MLAL_LANE_PRFM, n_div_8_subtile) {
6326 TEST_REQUIRES_ARM_NEON_V8;
6327 for (uint32_t n = 16; n <= 24; n += 8) {
6328 for (size_t k = 1; k <= 40; k += 9) {
6329 for (uint32_t m = 1; m <= 4; m++) {
6330 GemmMicrokernelTester()
6331 .mr(4)
6332 .nr(8)
6333 .kr(1)
6334 .sr(1)
6335 .m(m)
6336 .n(n)
6337 .k(k)
6338 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08006339 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006340 }
6341 }
6342 }
6343 }
6344
6345 TEST(QC8_GEMM_MINMAX_FP32_4X8__NEONV8_MLAL_LANE_PRFM, strided_cm_subtile) {
6346 TEST_REQUIRES_ARM_NEON_V8;
6347 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08006348 for (uint32_t n = 1; n <= 8; n++) {
6349 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006350 GemmMicrokernelTester()
6351 .mr(4)
6352 .nr(8)
6353 .kr(1)
6354 .sr(1)
6355 .m(m)
6356 .n(n)
6357 .k(k)
6358 .cm_stride(11)
6359 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08006360 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006361 }
6362 }
6363 }
6364 }
6365
6366 TEST(QC8_GEMM_MINMAX_FP32_4X8__NEONV8_MLAL_LANE_PRFM, qmin) {
6367 TEST_REQUIRES_ARM_NEON_V8;
6368 GemmMicrokernelTester()
6369 .mr(4)
6370 .nr(8)
6371 .kr(1)
6372 .sr(1)
6373 .m(4)
6374 .n(8)
6375 .k(8)
6376 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08006377 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006378 }
6379
6380 TEST(QC8_GEMM_MINMAX_FP32_4X8__NEONV8_MLAL_LANE_PRFM, qmax) {
6381 TEST_REQUIRES_ARM_NEON_V8;
6382 GemmMicrokernelTester()
6383 .mr(4)
6384 .nr(8)
6385 .kr(1)
6386 .sr(1)
6387 .m(4)
6388 .n(8)
6389 .k(8)
6390 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08006391 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006392 }
6393
6394 TEST(QC8_GEMM_MINMAX_FP32_4X8__NEONV8_MLAL_LANE_PRFM, strided_cm) {
6395 TEST_REQUIRES_ARM_NEON_V8;
6396 GemmMicrokernelTester()
6397 .mr(4)
6398 .nr(8)
6399 .kr(1)
6400 .sr(1)
6401 .m(4)
6402 .n(8)
6403 .k(8)
6404 .cm_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08006405 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006406 }
6407#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
6408
6409
6410#if XNN_ARCH_ARM || XNN_ARCH_ARM64
6411 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEONV8_MLAL_LANE_PRFM, k_eq_8) {
6412 TEST_REQUIRES_ARM_NEON_V8;
6413 GemmMicrokernelTester()
6414 .mr(6)
6415 .nr(8)
6416 .kr(1)
6417 .sr(1)
6418 .m(6)
6419 .n(8)
6420 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -08006421 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006422 }
6423
6424 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEONV8_MLAL_LANE_PRFM, strided_cn) {
6425 TEST_REQUIRES_ARM_NEON_V8;
6426 GemmMicrokernelTester()
6427 .mr(6)
6428 .nr(8)
6429 .kr(1)
6430 .sr(1)
6431 .m(6)
6432 .n(8)
6433 .k(8)
6434 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08006435 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006436 }
6437
6438 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEONV8_MLAL_LANE_PRFM, k_eq_8_strided_a) {
6439 TEST_REQUIRES_ARM_NEON_V8;
6440 GemmMicrokernelTester()
6441 .mr(6)
6442 .nr(8)
6443 .kr(1)
6444 .sr(1)
6445 .m(6)
6446 .n(8)
6447 .k(8)
6448 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08006449 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006450 }
6451
6452 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEONV8_MLAL_LANE_PRFM, k_eq_8_subtile) {
6453 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng83844ae2022-01-14 09:52:25 -08006454 for (uint32_t n = 1; n <= 8; n++) {
6455 for (uint32_t m = 1; m <= 6; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006456 GemmMicrokernelTester()
6457 .mr(6)
6458 .nr(8)
6459 .kr(1)
6460 .sr(1)
6461 .m(m)
6462 .n(n)
6463 .k(8)
6464 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08006465 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006466 }
6467 }
6468 }
6469
6470 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEONV8_MLAL_LANE_PRFM, k_eq_8_subtile_m) {
6471 TEST_REQUIRES_ARM_NEON_V8;
6472 for (uint32_t m = 1; m <= 6; m++) {
6473 GemmMicrokernelTester()
6474 .mr(6)
6475 .nr(8)
6476 .kr(1)
6477 .sr(1)
6478 .m(m)
6479 .n(8)
6480 .k(8)
6481 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08006482 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006483 }
6484 }
6485
6486 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEONV8_MLAL_LANE_PRFM, k_eq_8_subtile_n) {
6487 TEST_REQUIRES_ARM_NEON_V8;
6488 for (uint32_t n = 1; n <= 8; n++) {
6489 GemmMicrokernelTester()
6490 .mr(6)
6491 .nr(8)
6492 .kr(1)
6493 .sr(1)
6494 .m(6)
6495 .n(n)
6496 .k(8)
6497 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08006498 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006499 }
6500 }
6501
6502 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEONV8_MLAL_LANE_PRFM, k_lt_8) {
6503 TEST_REQUIRES_ARM_NEON_V8;
6504 for (size_t k = 1; k < 8; k++) {
6505 GemmMicrokernelTester()
6506 .mr(6)
6507 .nr(8)
6508 .kr(1)
6509 .sr(1)
6510 .m(6)
6511 .n(8)
6512 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08006513 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006514 }
6515 }
6516
6517 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEONV8_MLAL_LANE_PRFM, k_lt_8_strided_a) {
6518 TEST_REQUIRES_ARM_NEON_V8;
6519 for (size_t k = 1; k < 8; k++) {
6520 GemmMicrokernelTester()
6521 .mr(6)
6522 .nr(8)
6523 .kr(1)
6524 .sr(1)
6525 .m(6)
6526 .n(8)
6527 .k(k)
6528 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08006529 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006530 }
6531 }
6532
6533 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEONV8_MLAL_LANE_PRFM, k_lt_8_subtile) {
6534 TEST_REQUIRES_ARM_NEON_V8;
6535 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08006536 for (uint32_t n = 1; n <= 8; n++) {
6537 for (uint32_t m = 1; m <= 6; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006538 GemmMicrokernelTester()
6539 .mr(6)
6540 .nr(8)
6541 .kr(1)
6542 .sr(1)
6543 .m(m)
6544 .n(n)
6545 .k(k)
6546 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08006547 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006548 }
6549 }
6550 }
6551 }
6552
6553 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEONV8_MLAL_LANE_PRFM, k_gt_8) {
6554 TEST_REQUIRES_ARM_NEON_V8;
6555 for (size_t k = 9; k < 16; k++) {
6556 GemmMicrokernelTester()
6557 .mr(6)
6558 .nr(8)
6559 .kr(1)
6560 .sr(1)
6561 .m(6)
6562 .n(8)
6563 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08006564 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006565 }
6566 }
6567
6568 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEONV8_MLAL_LANE_PRFM, k_gt_8_strided_a) {
6569 TEST_REQUIRES_ARM_NEON_V8;
6570 for (size_t k = 9; k < 16; k++) {
6571 GemmMicrokernelTester()
6572 .mr(6)
6573 .nr(8)
6574 .kr(1)
6575 .sr(1)
6576 .m(6)
6577 .n(8)
6578 .k(k)
6579 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08006580 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006581 }
6582 }
6583
6584 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEONV8_MLAL_LANE_PRFM, k_gt_8_subtile) {
6585 TEST_REQUIRES_ARM_NEON_V8;
6586 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08006587 for (uint32_t n = 1; n <= 8; n++) {
6588 for (uint32_t m = 1; m <= 6; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006589 GemmMicrokernelTester()
6590 .mr(6)
6591 .nr(8)
6592 .kr(1)
6593 .sr(1)
6594 .m(m)
6595 .n(n)
6596 .k(k)
6597 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08006598 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006599 }
6600 }
6601 }
6602 }
6603
6604 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEONV8_MLAL_LANE_PRFM, k_div_8) {
6605 TEST_REQUIRES_ARM_NEON_V8;
6606 for (size_t k = 16; k <= 80; k += 8) {
6607 GemmMicrokernelTester()
6608 .mr(6)
6609 .nr(8)
6610 .kr(1)
6611 .sr(1)
6612 .m(6)
6613 .n(8)
6614 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08006615 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006616 }
6617 }
6618
6619 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEONV8_MLAL_LANE_PRFM, k_div_8_strided_a) {
6620 TEST_REQUIRES_ARM_NEON_V8;
6621 for (size_t k = 16; k <= 80; k += 8) {
6622 GemmMicrokernelTester()
6623 .mr(6)
6624 .nr(8)
6625 .kr(1)
6626 .sr(1)
6627 .m(6)
6628 .n(8)
6629 .k(k)
6630 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -08006631 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006632 }
6633 }
6634
6635 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEONV8_MLAL_LANE_PRFM, k_div_8_subtile) {
6636 TEST_REQUIRES_ARM_NEON_V8;
6637 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08006638 for (uint32_t n = 1; n <= 8; n++) {
6639 for (uint32_t m = 1; m <= 6; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006640 GemmMicrokernelTester()
6641 .mr(6)
6642 .nr(8)
6643 .kr(1)
6644 .sr(1)
6645 .m(m)
6646 .n(n)
6647 .k(k)
6648 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08006649 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006650 }
6651 }
6652 }
6653 }
6654
6655 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEONV8_MLAL_LANE_PRFM, n_gt_8) {
6656 TEST_REQUIRES_ARM_NEON_V8;
6657 for (uint32_t n = 9; n < 16; n++) {
6658 for (size_t k = 1; k <= 40; k += 9) {
6659 GemmMicrokernelTester()
6660 .mr(6)
6661 .nr(8)
6662 .kr(1)
6663 .sr(1)
6664 .m(6)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08006665 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006666 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08006667 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006668 }
6669 }
6670 }
6671
6672 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEONV8_MLAL_LANE_PRFM, n_gt_8_strided_cn) {
6673 TEST_REQUIRES_ARM_NEON_V8;
6674 for (uint32_t n = 9; n < 16; n++) {
6675 for (size_t k = 1; k <= 40; k += 9) {
6676 GemmMicrokernelTester()
6677 .mr(6)
6678 .nr(8)
6679 .kr(1)
6680 .sr(1)
6681 .m(6)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08006682 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006683 .k(k)
6684 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08006685 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006686 }
6687 }
6688 }
6689
6690 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEONV8_MLAL_LANE_PRFM, n_gt_8_strided_a) {
6691 TEST_REQUIRES_ARM_NEON_V8;
6692 for (uint32_t n = 9; n < 16; n++) {
6693 for (size_t k = 1; k <= 40; k += 9) {
6694 GemmMicrokernelTester()
6695 .mr(6)
6696 .nr(8)
6697 .kr(1)
6698 .sr(1)
6699 .m(6)
6700 .n(n)
6701 .k(k)
6702 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -08006703 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006704 }
6705 }
6706 }
6707
6708 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEONV8_MLAL_LANE_PRFM, n_gt_8_subtile) {
6709 TEST_REQUIRES_ARM_NEON_V8;
6710 for (uint32_t n = 9; n < 16; n++) {
6711 for (size_t k = 1; k <= 40; k += 9) {
6712 for (uint32_t m = 1; m <= 6; m++) {
6713 GemmMicrokernelTester()
6714 .mr(6)
6715 .nr(8)
6716 .kr(1)
6717 .sr(1)
6718 .m(m)
6719 .n(n)
6720 .k(k)
6721 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08006722 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006723 }
6724 }
6725 }
6726 }
6727
6728 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEONV8_MLAL_LANE_PRFM, n_div_8) {
6729 TEST_REQUIRES_ARM_NEON_V8;
6730 for (uint32_t n = 16; n <= 24; n += 8) {
6731 for (size_t k = 1; k <= 40; k += 9) {
6732 GemmMicrokernelTester()
6733 .mr(6)
6734 .nr(8)
6735 .kr(1)
6736 .sr(1)
6737 .m(6)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08006738 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006739 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08006740 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006741 }
6742 }
6743 }
6744
6745 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEONV8_MLAL_LANE_PRFM, n_div_8_strided_cn) {
6746 TEST_REQUIRES_ARM_NEON_V8;
6747 for (uint32_t n = 16; n <= 24; n += 8) {
6748 for (size_t k = 1; k <= 40; k += 9) {
6749 GemmMicrokernelTester()
6750 .mr(6)
6751 .nr(8)
6752 .kr(1)
6753 .sr(1)
6754 .m(6)
6755 .n(n)
6756 .k(k)
6757 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08006758 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006759 }
6760 }
6761 }
6762
6763 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEONV8_MLAL_LANE_PRFM, n_div_8_strided_a) {
6764 TEST_REQUIRES_ARM_NEON_V8;
6765 for (uint32_t n = 16; n <= 24; n += 8) {
6766 for (size_t k = 1; k <= 40; k += 9) {
6767 GemmMicrokernelTester()
6768 .mr(6)
6769 .nr(8)
6770 .kr(1)
6771 .sr(1)
6772 .m(6)
6773 .n(n)
6774 .k(k)
6775 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -08006776 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006777 }
6778 }
6779 }
6780
6781 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEONV8_MLAL_LANE_PRFM, n_div_8_subtile) {
6782 TEST_REQUIRES_ARM_NEON_V8;
6783 for (uint32_t n = 16; n <= 24; n += 8) {
6784 for (size_t k = 1; k <= 40; k += 9) {
6785 for (uint32_t m = 1; m <= 6; m++) {
6786 GemmMicrokernelTester()
6787 .mr(6)
6788 .nr(8)
6789 .kr(1)
6790 .sr(1)
6791 .m(m)
6792 .n(n)
6793 .k(k)
6794 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08006795 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006796 }
6797 }
6798 }
6799 }
6800
6801 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEONV8_MLAL_LANE_PRFM, strided_cm_subtile) {
6802 TEST_REQUIRES_ARM_NEON_V8;
6803 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08006804 for (uint32_t n = 1; n <= 8; n++) {
6805 for (uint32_t m = 1; m <= 6; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006806 GemmMicrokernelTester()
6807 .mr(6)
6808 .nr(8)
6809 .kr(1)
6810 .sr(1)
6811 .m(m)
6812 .n(n)
6813 .k(k)
6814 .cm_stride(11)
6815 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08006816 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006817 }
6818 }
6819 }
6820 }
6821
6822 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEONV8_MLAL_LANE_PRFM, qmin) {
6823 TEST_REQUIRES_ARM_NEON_V8;
6824 GemmMicrokernelTester()
6825 .mr(6)
6826 .nr(8)
6827 .kr(1)
6828 .sr(1)
6829 .m(6)
6830 .n(8)
6831 .k(8)
6832 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08006833 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006834 }
6835
6836 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEONV8_MLAL_LANE_PRFM, qmax) {
6837 TEST_REQUIRES_ARM_NEON_V8;
6838 GemmMicrokernelTester()
6839 .mr(6)
6840 .nr(8)
6841 .kr(1)
6842 .sr(1)
6843 .m(6)
6844 .n(8)
6845 .k(8)
6846 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08006847 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006848 }
6849
6850 TEST(QC8_GEMM_MINMAX_FP32_6X8__NEONV8_MLAL_LANE_PRFM, strided_cm) {
6851 TEST_REQUIRES_ARM_NEON_V8;
6852 GemmMicrokernelTester()
6853 .mr(6)
6854 .nr(8)
6855 .kr(1)
6856 .sr(1)
6857 .m(6)
6858 .n(8)
6859 .k(8)
6860 .cm_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08006861 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006862 }
6863#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
6864
6865
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006866#if XNN_ARCH_ARM || XNN_ARCH_ARM64
6867 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD1R, k_eq_16) {
6868 TEST_REQUIRES_ARM_NEON;
6869 GemmMicrokernelTester()
6870 .mr(2)
6871 .nr(8)
6872 .kr(2)
6873 .sr(1)
6874 .m(2)
6875 .n(8)
6876 .k(16)
Marat Dukhan50323b82022-01-11 00:12:01 -08006877 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006878 }
6879
6880 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD1R, strided_cn) {
6881 TEST_REQUIRES_ARM_NEON;
6882 GemmMicrokernelTester()
6883 .mr(2)
6884 .nr(8)
6885 .kr(2)
6886 .sr(1)
6887 .m(2)
6888 .n(8)
6889 .k(16)
6890 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08006891 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006892 }
6893
6894 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD1R, k_eq_16_strided_a) {
6895 TEST_REQUIRES_ARM_NEON;
6896 GemmMicrokernelTester()
6897 .mr(2)
6898 .nr(8)
6899 .kr(2)
6900 .sr(1)
6901 .m(2)
6902 .n(8)
6903 .k(16)
6904 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08006905 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006906 }
6907
6908 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD1R, k_eq_16_subtile) {
6909 TEST_REQUIRES_ARM_NEON;
Zhi An Ng83844ae2022-01-14 09:52:25 -08006910 for (uint32_t n = 1; n <= 8; n++) {
6911 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006912 GemmMicrokernelTester()
6913 .mr(2)
6914 .nr(8)
6915 .kr(2)
6916 .sr(1)
6917 .m(m)
6918 .n(n)
6919 .k(16)
6920 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08006921 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006922 }
6923 }
6924 }
6925
6926 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD1R, k_eq_16_subtile_m) {
6927 TEST_REQUIRES_ARM_NEON;
6928 for (uint32_t m = 1; m <= 2; m++) {
6929 GemmMicrokernelTester()
6930 .mr(2)
6931 .nr(8)
6932 .kr(2)
6933 .sr(1)
6934 .m(m)
6935 .n(8)
6936 .k(16)
6937 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08006938 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006939 }
6940 }
6941
6942 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD1R, k_eq_16_subtile_n) {
6943 TEST_REQUIRES_ARM_NEON;
6944 for (uint32_t n = 1; n <= 8; n++) {
6945 GemmMicrokernelTester()
6946 .mr(2)
6947 .nr(8)
6948 .kr(2)
6949 .sr(1)
6950 .m(2)
6951 .n(n)
6952 .k(16)
6953 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08006954 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006955 }
6956 }
6957
6958 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD1R, k_lt_16) {
6959 TEST_REQUIRES_ARM_NEON;
6960 for (size_t k = 1; k < 16; k++) {
6961 GemmMicrokernelTester()
6962 .mr(2)
6963 .nr(8)
6964 .kr(2)
6965 .sr(1)
6966 .m(2)
6967 .n(8)
6968 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08006969 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006970 }
6971 }
6972
6973 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD1R, k_lt_16_strided_a) {
6974 TEST_REQUIRES_ARM_NEON;
6975 for (size_t k = 1; k < 16; k++) {
6976 GemmMicrokernelTester()
6977 .mr(2)
6978 .nr(8)
6979 .kr(2)
6980 .sr(1)
6981 .m(2)
6982 .n(8)
6983 .k(k)
6984 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08006985 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006986 }
6987 }
6988
6989 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD1R, k_lt_16_subtile) {
6990 TEST_REQUIRES_ARM_NEON;
6991 for (size_t k = 1; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08006992 for (uint32_t n = 1; n <= 8; n++) {
6993 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08006994 GemmMicrokernelTester()
6995 .mr(2)
6996 .nr(8)
6997 .kr(2)
6998 .sr(1)
6999 .m(m)
7000 .n(n)
7001 .k(k)
7002 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08007003 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007004 }
7005 }
7006 }
7007 }
7008
7009 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD1R, k_gt_16) {
7010 TEST_REQUIRES_ARM_NEON;
7011 for (size_t k = 17; k < 32; k++) {
7012 GemmMicrokernelTester()
7013 .mr(2)
7014 .nr(8)
7015 .kr(2)
7016 .sr(1)
7017 .m(2)
7018 .n(8)
7019 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08007020 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007021 }
7022 }
7023
7024 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD1R, k_gt_16_strided_a) {
7025 TEST_REQUIRES_ARM_NEON;
7026 for (size_t k = 17; k < 32; k++) {
7027 GemmMicrokernelTester()
7028 .mr(2)
7029 .nr(8)
7030 .kr(2)
7031 .sr(1)
7032 .m(2)
7033 .n(8)
7034 .k(k)
7035 .a_stride(37)
Marat Dukhan50323b82022-01-11 00:12:01 -08007036 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007037 }
7038 }
7039
7040 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD1R, k_gt_16_subtile) {
7041 TEST_REQUIRES_ARM_NEON;
7042 for (size_t k = 17; k < 32; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08007043 for (uint32_t n = 1; n <= 8; n++) {
7044 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007045 GemmMicrokernelTester()
7046 .mr(2)
7047 .nr(8)
7048 .kr(2)
7049 .sr(1)
7050 .m(m)
7051 .n(n)
7052 .k(k)
7053 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08007054 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007055 }
7056 }
7057 }
7058 }
7059
7060 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD1R, k_div_16) {
7061 TEST_REQUIRES_ARM_NEON;
7062 for (size_t k = 32; k <= 160; k += 16) {
7063 GemmMicrokernelTester()
7064 .mr(2)
7065 .nr(8)
7066 .kr(2)
7067 .sr(1)
7068 .m(2)
7069 .n(8)
7070 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08007071 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007072 }
7073 }
7074
7075 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD1R, k_div_16_strided_a) {
7076 TEST_REQUIRES_ARM_NEON;
7077 for (size_t k = 32; k <= 160; k += 16) {
7078 GemmMicrokernelTester()
7079 .mr(2)
7080 .nr(8)
7081 .kr(2)
7082 .sr(1)
7083 .m(2)
7084 .n(8)
7085 .k(k)
7086 .a_stride(163)
Marat Dukhan50323b82022-01-11 00:12:01 -08007087 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007088 }
7089 }
7090
7091 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD1R, k_div_16_subtile) {
7092 TEST_REQUIRES_ARM_NEON;
7093 for (size_t k = 32; k <= 160; k += 16) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08007094 for (uint32_t n = 1; n <= 8; n++) {
7095 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007096 GemmMicrokernelTester()
7097 .mr(2)
7098 .nr(8)
7099 .kr(2)
7100 .sr(1)
7101 .m(m)
7102 .n(n)
7103 .k(k)
7104 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08007105 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007106 }
7107 }
7108 }
7109 }
7110
7111 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD1R, n_gt_8) {
7112 TEST_REQUIRES_ARM_NEON;
7113 for (uint32_t n = 9; n < 16; n++) {
7114 for (size_t k = 1; k <= 80; k += 17) {
7115 GemmMicrokernelTester()
7116 .mr(2)
7117 .nr(8)
7118 .kr(2)
7119 .sr(1)
7120 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08007121 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007122 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08007123 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007124 }
7125 }
7126 }
7127
7128 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD1R, n_gt_8_strided_cn) {
7129 TEST_REQUIRES_ARM_NEON;
7130 for (uint32_t n = 9; n < 16; n++) {
7131 for (size_t k = 1; k <= 80; k += 17) {
7132 GemmMicrokernelTester()
7133 .mr(2)
7134 .nr(8)
7135 .kr(2)
7136 .sr(1)
7137 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08007138 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007139 .k(k)
7140 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08007141 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007142 }
7143 }
7144 }
7145
7146 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD1R, n_gt_8_strided_a) {
7147 TEST_REQUIRES_ARM_NEON;
7148 for (uint32_t n = 9; n < 16; n++) {
7149 for (size_t k = 1; k <= 80; k += 17) {
7150 GemmMicrokernelTester()
7151 .mr(2)
7152 .nr(8)
7153 .kr(2)
7154 .sr(1)
7155 .m(2)
7156 .n(n)
7157 .k(k)
7158 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -08007159 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007160 }
7161 }
7162 }
7163
7164 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD1R, n_gt_8_subtile) {
7165 TEST_REQUIRES_ARM_NEON;
7166 for (uint32_t n = 9; n < 16; n++) {
7167 for (size_t k = 1; k <= 80; k += 17) {
7168 for (uint32_t m = 1; m <= 2; m++) {
7169 GemmMicrokernelTester()
7170 .mr(2)
7171 .nr(8)
7172 .kr(2)
7173 .sr(1)
7174 .m(m)
7175 .n(n)
7176 .k(k)
7177 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08007178 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007179 }
7180 }
7181 }
7182 }
7183
7184 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD1R, n_div_8) {
7185 TEST_REQUIRES_ARM_NEON;
7186 for (uint32_t n = 16; n <= 24; n += 8) {
7187 for (size_t k = 1; k <= 80; k += 17) {
7188 GemmMicrokernelTester()
7189 .mr(2)
7190 .nr(8)
7191 .kr(2)
7192 .sr(1)
7193 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08007194 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007195 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08007196 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007197 }
7198 }
7199 }
7200
7201 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD1R, n_div_8_strided_cn) {
7202 TEST_REQUIRES_ARM_NEON;
7203 for (uint32_t n = 16; n <= 24; n += 8) {
7204 for (size_t k = 1; k <= 80; k += 17) {
7205 GemmMicrokernelTester()
7206 .mr(2)
7207 .nr(8)
7208 .kr(2)
7209 .sr(1)
7210 .m(2)
7211 .n(n)
7212 .k(k)
7213 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08007214 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007215 }
7216 }
7217 }
7218
7219 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD1R, n_div_8_strided_a) {
7220 TEST_REQUIRES_ARM_NEON;
7221 for (uint32_t n = 16; n <= 24; n += 8) {
7222 for (size_t k = 1; k <= 80; k += 17) {
7223 GemmMicrokernelTester()
7224 .mr(2)
7225 .nr(8)
7226 .kr(2)
7227 .sr(1)
7228 .m(2)
7229 .n(n)
7230 .k(k)
7231 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -08007232 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007233 }
7234 }
7235 }
7236
7237 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD1R, n_div_8_subtile) {
7238 TEST_REQUIRES_ARM_NEON;
7239 for (uint32_t n = 16; n <= 24; n += 8) {
7240 for (size_t k = 1; k <= 80; k += 17) {
7241 for (uint32_t m = 1; m <= 2; m++) {
7242 GemmMicrokernelTester()
7243 .mr(2)
7244 .nr(8)
7245 .kr(2)
7246 .sr(1)
7247 .m(m)
7248 .n(n)
7249 .k(k)
7250 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08007251 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007252 }
7253 }
7254 }
7255 }
7256
7257 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD1R, strided_cm_subtile) {
7258 TEST_REQUIRES_ARM_NEON;
7259 for (size_t k = 1; k <= 80; k += 17) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08007260 for (uint32_t n = 1; n <= 8; n++) {
7261 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007262 GemmMicrokernelTester()
7263 .mr(2)
7264 .nr(8)
7265 .kr(2)
7266 .sr(1)
7267 .m(m)
7268 .n(n)
7269 .k(k)
7270 .cm_stride(11)
7271 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08007272 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007273 }
7274 }
7275 }
7276 }
7277
7278 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD1R, qmin) {
7279 TEST_REQUIRES_ARM_NEON;
7280 GemmMicrokernelTester()
7281 .mr(2)
7282 .nr(8)
7283 .kr(2)
7284 .sr(1)
7285 .m(2)
7286 .n(8)
7287 .k(16)
7288 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08007289 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007290 }
7291
7292 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD1R, qmax) {
7293 TEST_REQUIRES_ARM_NEON;
7294 GemmMicrokernelTester()
7295 .mr(2)
7296 .nr(8)
7297 .kr(2)
7298 .sr(1)
7299 .m(2)
7300 .n(8)
7301 .k(16)
7302 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08007303 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007304 }
7305
7306 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD1R, strided_cm) {
7307 TEST_REQUIRES_ARM_NEON;
7308 GemmMicrokernelTester()
7309 .mr(2)
7310 .nr(8)
7311 .kr(2)
7312 .sr(1)
7313 .m(2)
7314 .n(8)
7315 .k(16)
7316 .cm_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08007317 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007318 }
7319#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
7320
7321
7322#if XNN_ARCH_ARM || XNN_ARCH_ARM64
7323 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_LD1R, k_eq_16) {
7324 TEST_REQUIRES_ARM_NEON_V8;
7325 GemmMicrokernelTester()
7326 .mr(2)
7327 .nr(8)
7328 .kr(2)
7329 .sr(1)
7330 .m(2)
7331 .n(8)
7332 .k(16)
Marat Dukhan50323b82022-01-11 00:12:01 -08007333 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007334 }
7335
7336 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_LD1R, strided_cn) {
7337 TEST_REQUIRES_ARM_NEON_V8;
7338 GemmMicrokernelTester()
7339 .mr(2)
7340 .nr(8)
7341 .kr(2)
7342 .sr(1)
7343 .m(2)
7344 .n(8)
7345 .k(16)
7346 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08007347 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007348 }
7349
7350 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_LD1R, k_eq_16_strided_a) {
7351 TEST_REQUIRES_ARM_NEON_V8;
7352 GemmMicrokernelTester()
7353 .mr(2)
7354 .nr(8)
7355 .kr(2)
7356 .sr(1)
7357 .m(2)
7358 .n(8)
7359 .k(16)
7360 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08007361 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007362 }
7363
7364 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_LD1R, k_eq_16_subtile) {
7365 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng83844ae2022-01-14 09:52:25 -08007366 for (uint32_t n = 1; n <= 8; n++) {
7367 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007368 GemmMicrokernelTester()
7369 .mr(2)
7370 .nr(8)
7371 .kr(2)
7372 .sr(1)
7373 .m(m)
7374 .n(n)
7375 .k(16)
7376 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08007377 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007378 }
7379 }
7380 }
7381
7382 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_LD1R, k_eq_16_subtile_m) {
7383 TEST_REQUIRES_ARM_NEON_V8;
7384 for (uint32_t m = 1; m <= 2; m++) {
7385 GemmMicrokernelTester()
7386 .mr(2)
7387 .nr(8)
7388 .kr(2)
7389 .sr(1)
7390 .m(m)
7391 .n(8)
7392 .k(16)
7393 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08007394 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007395 }
7396 }
7397
7398 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_LD1R, k_eq_16_subtile_n) {
7399 TEST_REQUIRES_ARM_NEON_V8;
7400 for (uint32_t n = 1; n <= 8; n++) {
7401 GemmMicrokernelTester()
7402 .mr(2)
7403 .nr(8)
7404 .kr(2)
7405 .sr(1)
7406 .m(2)
7407 .n(n)
7408 .k(16)
7409 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08007410 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007411 }
7412 }
7413
7414 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_LD1R, k_lt_16) {
7415 TEST_REQUIRES_ARM_NEON_V8;
7416 for (size_t k = 1; k < 16; k++) {
7417 GemmMicrokernelTester()
7418 .mr(2)
7419 .nr(8)
7420 .kr(2)
7421 .sr(1)
7422 .m(2)
7423 .n(8)
7424 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08007425 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007426 }
7427 }
7428
7429 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_LD1R, k_lt_16_strided_a) {
7430 TEST_REQUIRES_ARM_NEON_V8;
7431 for (size_t k = 1; k < 16; k++) {
7432 GemmMicrokernelTester()
7433 .mr(2)
7434 .nr(8)
7435 .kr(2)
7436 .sr(1)
7437 .m(2)
7438 .n(8)
7439 .k(k)
7440 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08007441 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007442 }
7443 }
7444
7445 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_LD1R, k_lt_16_subtile) {
7446 TEST_REQUIRES_ARM_NEON_V8;
7447 for (size_t k = 1; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08007448 for (uint32_t n = 1; n <= 8; n++) {
7449 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007450 GemmMicrokernelTester()
7451 .mr(2)
7452 .nr(8)
7453 .kr(2)
7454 .sr(1)
7455 .m(m)
7456 .n(n)
7457 .k(k)
7458 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08007459 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007460 }
7461 }
7462 }
7463 }
7464
7465 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_LD1R, k_gt_16) {
7466 TEST_REQUIRES_ARM_NEON_V8;
7467 for (size_t k = 17; k < 32; k++) {
7468 GemmMicrokernelTester()
7469 .mr(2)
7470 .nr(8)
7471 .kr(2)
7472 .sr(1)
7473 .m(2)
7474 .n(8)
7475 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08007476 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007477 }
7478 }
7479
7480 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_LD1R, k_gt_16_strided_a) {
7481 TEST_REQUIRES_ARM_NEON_V8;
7482 for (size_t k = 17; k < 32; k++) {
7483 GemmMicrokernelTester()
7484 .mr(2)
7485 .nr(8)
7486 .kr(2)
7487 .sr(1)
7488 .m(2)
7489 .n(8)
7490 .k(k)
7491 .a_stride(37)
Marat Dukhan50323b82022-01-11 00:12:01 -08007492 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007493 }
7494 }
7495
7496 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_LD1R, k_gt_16_subtile) {
7497 TEST_REQUIRES_ARM_NEON_V8;
7498 for (size_t k = 17; k < 32; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08007499 for (uint32_t n = 1; n <= 8; n++) {
7500 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007501 GemmMicrokernelTester()
7502 .mr(2)
7503 .nr(8)
7504 .kr(2)
7505 .sr(1)
7506 .m(m)
7507 .n(n)
7508 .k(k)
7509 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08007510 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007511 }
7512 }
7513 }
7514 }
7515
7516 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_LD1R, k_div_16) {
7517 TEST_REQUIRES_ARM_NEON_V8;
7518 for (size_t k = 32; k <= 160; k += 16) {
7519 GemmMicrokernelTester()
7520 .mr(2)
7521 .nr(8)
7522 .kr(2)
7523 .sr(1)
7524 .m(2)
7525 .n(8)
7526 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08007527 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007528 }
7529 }
7530
7531 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_LD1R, k_div_16_strided_a) {
7532 TEST_REQUIRES_ARM_NEON_V8;
7533 for (size_t k = 32; k <= 160; k += 16) {
7534 GemmMicrokernelTester()
7535 .mr(2)
7536 .nr(8)
7537 .kr(2)
7538 .sr(1)
7539 .m(2)
7540 .n(8)
7541 .k(k)
7542 .a_stride(163)
Marat Dukhan50323b82022-01-11 00:12:01 -08007543 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007544 }
7545 }
7546
7547 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_LD1R, k_div_16_subtile) {
7548 TEST_REQUIRES_ARM_NEON_V8;
7549 for (size_t k = 32; k <= 160; k += 16) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08007550 for (uint32_t n = 1; n <= 8; n++) {
7551 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007552 GemmMicrokernelTester()
7553 .mr(2)
7554 .nr(8)
7555 .kr(2)
7556 .sr(1)
7557 .m(m)
7558 .n(n)
7559 .k(k)
7560 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08007561 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007562 }
7563 }
7564 }
7565 }
7566
7567 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_LD1R, n_gt_8) {
7568 TEST_REQUIRES_ARM_NEON_V8;
7569 for (uint32_t n = 9; n < 16; n++) {
7570 for (size_t k = 1; k <= 80; k += 17) {
7571 GemmMicrokernelTester()
7572 .mr(2)
7573 .nr(8)
7574 .kr(2)
7575 .sr(1)
7576 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08007577 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007578 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08007579 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007580 }
7581 }
7582 }
7583
7584 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_LD1R, n_gt_8_strided_cn) {
7585 TEST_REQUIRES_ARM_NEON_V8;
7586 for (uint32_t n = 9; n < 16; n++) {
7587 for (size_t k = 1; k <= 80; k += 17) {
7588 GemmMicrokernelTester()
7589 .mr(2)
7590 .nr(8)
7591 .kr(2)
7592 .sr(1)
7593 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08007594 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007595 .k(k)
7596 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08007597 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007598 }
7599 }
7600 }
7601
7602 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_LD1R, n_gt_8_strided_a) {
7603 TEST_REQUIRES_ARM_NEON_V8;
7604 for (uint32_t n = 9; n < 16; n++) {
7605 for (size_t k = 1; k <= 80; k += 17) {
7606 GemmMicrokernelTester()
7607 .mr(2)
7608 .nr(8)
7609 .kr(2)
7610 .sr(1)
7611 .m(2)
7612 .n(n)
7613 .k(k)
7614 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -08007615 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007616 }
7617 }
7618 }
7619
7620 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_LD1R, n_gt_8_subtile) {
7621 TEST_REQUIRES_ARM_NEON_V8;
7622 for (uint32_t n = 9; n < 16; n++) {
7623 for (size_t k = 1; k <= 80; k += 17) {
7624 for (uint32_t m = 1; m <= 2; m++) {
7625 GemmMicrokernelTester()
7626 .mr(2)
7627 .nr(8)
7628 .kr(2)
7629 .sr(1)
7630 .m(m)
7631 .n(n)
7632 .k(k)
7633 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08007634 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007635 }
7636 }
7637 }
7638 }
7639
7640 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_LD1R, n_div_8) {
7641 TEST_REQUIRES_ARM_NEON_V8;
7642 for (uint32_t n = 16; n <= 24; n += 8) {
7643 for (size_t k = 1; k <= 80; k += 17) {
7644 GemmMicrokernelTester()
7645 .mr(2)
7646 .nr(8)
7647 .kr(2)
7648 .sr(1)
7649 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08007650 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007651 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08007652 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007653 }
7654 }
7655 }
7656
7657 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_LD1R, n_div_8_strided_cn) {
7658 TEST_REQUIRES_ARM_NEON_V8;
7659 for (uint32_t n = 16; n <= 24; n += 8) {
7660 for (size_t k = 1; k <= 80; k += 17) {
7661 GemmMicrokernelTester()
7662 .mr(2)
7663 .nr(8)
7664 .kr(2)
7665 .sr(1)
7666 .m(2)
7667 .n(n)
7668 .k(k)
7669 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08007670 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007671 }
7672 }
7673 }
7674
7675 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_LD1R, n_div_8_strided_a) {
7676 TEST_REQUIRES_ARM_NEON_V8;
7677 for (uint32_t n = 16; n <= 24; n += 8) {
7678 for (size_t k = 1; k <= 80; k += 17) {
7679 GemmMicrokernelTester()
7680 .mr(2)
7681 .nr(8)
7682 .kr(2)
7683 .sr(1)
7684 .m(2)
7685 .n(n)
7686 .k(k)
7687 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -08007688 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007689 }
7690 }
7691 }
7692
7693 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_LD1R, n_div_8_subtile) {
7694 TEST_REQUIRES_ARM_NEON_V8;
7695 for (uint32_t n = 16; n <= 24; n += 8) {
7696 for (size_t k = 1; k <= 80; k += 17) {
7697 for (uint32_t m = 1; m <= 2; m++) {
7698 GemmMicrokernelTester()
7699 .mr(2)
7700 .nr(8)
7701 .kr(2)
7702 .sr(1)
7703 .m(m)
7704 .n(n)
7705 .k(k)
7706 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08007707 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007708 }
7709 }
7710 }
7711 }
7712
7713 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_LD1R, strided_cm_subtile) {
7714 TEST_REQUIRES_ARM_NEON_V8;
7715 for (size_t k = 1; k <= 80; k += 17) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08007716 for (uint32_t n = 1; n <= 8; n++) {
7717 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007718 GemmMicrokernelTester()
7719 .mr(2)
7720 .nr(8)
7721 .kr(2)
7722 .sr(1)
7723 .m(m)
7724 .n(n)
7725 .k(k)
7726 .cm_stride(11)
7727 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08007728 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007729 }
7730 }
7731 }
7732 }
7733
7734 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_LD1R, qmin) {
7735 TEST_REQUIRES_ARM_NEON_V8;
7736 GemmMicrokernelTester()
7737 .mr(2)
7738 .nr(8)
7739 .kr(2)
7740 .sr(1)
7741 .m(2)
7742 .n(8)
7743 .k(16)
7744 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08007745 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007746 }
7747
7748 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_LD1R, qmax) {
7749 TEST_REQUIRES_ARM_NEON_V8;
7750 GemmMicrokernelTester()
7751 .mr(2)
7752 .nr(8)
7753 .kr(2)
7754 .sr(1)
7755 .m(2)
7756 .n(8)
7757 .k(16)
7758 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08007759 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007760 }
7761
7762 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEONV8_MLAL_LD1R, strided_cm) {
7763 TEST_REQUIRES_ARM_NEON_V8;
7764 GemmMicrokernelTester()
7765 .mr(2)
7766 .nr(8)
7767 .kr(2)
7768 .sr(1)
7769 .m(2)
7770 .n(8)
7771 .k(16)
7772 .cm_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08007773 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007774 }
7775#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
7776
7777
7778#if XNN_ARCH_ARM || XNN_ARCH_ARM64
7779 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_LD2R, k_eq_16) {
7780 TEST_REQUIRES_ARM_NEON;
7781 GemmMicrokernelTester()
7782 .mr(1)
7783 .nr(8)
7784 .kr(2)
7785 .sr(1)
7786 .m(1)
7787 .n(8)
7788 .k(16)
Marat Dukhan50323b82022-01-11 00:12:01 -08007789 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007790 }
7791
7792 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_LD2R, strided_cn) {
7793 TEST_REQUIRES_ARM_NEON;
7794 GemmMicrokernelTester()
7795 .mr(1)
7796 .nr(8)
7797 .kr(2)
7798 .sr(1)
7799 .m(1)
7800 .n(8)
7801 .k(16)
7802 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08007803 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007804 }
7805
7806 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_LD2R, k_eq_16_strided_a) {
7807 TEST_REQUIRES_ARM_NEON;
7808 GemmMicrokernelTester()
7809 .mr(1)
7810 .nr(8)
7811 .kr(2)
7812 .sr(1)
7813 .m(1)
7814 .n(8)
7815 .k(16)
7816 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08007817 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007818 }
7819
7820 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_LD2R, k_eq_16_subtile) {
7821 TEST_REQUIRES_ARM_NEON;
Zhi An Ng83844ae2022-01-14 09:52:25 -08007822 for (uint32_t n = 1; n <= 8; n++) {
7823 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007824 GemmMicrokernelTester()
7825 .mr(1)
7826 .nr(8)
7827 .kr(2)
7828 .sr(1)
7829 .m(m)
7830 .n(n)
7831 .k(16)
7832 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08007833 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007834 }
7835 }
7836 }
7837
7838 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_LD2R, k_eq_16_subtile_m) {
7839 TEST_REQUIRES_ARM_NEON;
7840 for (uint32_t m = 1; m <= 1; m++) {
7841 GemmMicrokernelTester()
7842 .mr(1)
7843 .nr(8)
7844 .kr(2)
7845 .sr(1)
7846 .m(m)
7847 .n(8)
7848 .k(16)
7849 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08007850 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007851 }
7852 }
7853
7854 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_LD2R, k_eq_16_subtile_n) {
7855 TEST_REQUIRES_ARM_NEON;
7856 for (uint32_t n = 1; n <= 8; n++) {
7857 GemmMicrokernelTester()
7858 .mr(1)
7859 .nr(8)
7860 .kr(2)
7861 .sr(1)
7862 .m(1)
7863 .n(n)
7864 .k(16)
7865 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08007866 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007867 }
7868 }
7869
7870 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_LD2R, k_lt_16) {
7871 TEST_REQUIRES_ARM_NEON;
7872 for (size_t k = 1; k < 16; k++) {
7873 GemmMicrokernelTester()
7874 .mr(1)
7875 .nr(8)
7876 .kr(2)
7877 .sr(1)
7878 .m(1)
7879 .n(8)
7880 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08007881 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007882 }
7883 }
7884
7885 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_LD2R, k_lt_16_strided_a) {
7886 TEST_REQUIRES_ARM_NEON;
7887 for (size_t k = 1; k < 16; k++) {
7888 GemmMicrokernelTester()
7889 .mr(1)
7890 .nr(8)
7891 .kr(2)
7892 .sr(1)
7893 .m(1)
7894 .n(8)
7895 .k(k)
7896 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08007897 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007898 }
7899 }
7900
7901 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_LD2R, k_lt_16_subtile) {
7902 TEST_REQUIRES_ARM_NEON;
7903 for (size_t k = 1; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08007904 for (uint32_t n = 1; n <= 8; n++) {
7905 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007906 GemmMicrokernelTester()
7907 .mr(1)
7908 .nr(8)
7909 .kr(2)
7910 .sr(1)
7911 .m(m)
7912 .n(n)
7913 .k(k)
7914 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08007915 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007916 }
7917 }
7918 }
7919 }
7920
7921 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_LD2R, k_gt_16) {
7922 TEST_REQUIRES_ARM_NEON;
7923 for (size_t k = 17; k < 32; k++) {
7924 GemmMicrokernelTester()
7925 .mr(1)
7926 .nr(8)
7927 .kr(2)
7928 .sr(1)
7929 .m(1)
7930 .n(8)
7931 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08007932 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007933 }
7934 }
7935
7936 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_LD2R, k_gt_16_strided_a) {
7937 TEST_REQUIRES_ARM_NEON;
7938 for (size_t k = 17; k < 32; k++) {
7939 GemmMicrokernelTester()
7940 .mr(1)
7941 .nr(8)
7942 .kr(2)
7943 .sr(1)
7944 .m(1)
7945 .n(8)
7946 .k(k)
7947 .a_stride(37)
Marat Dukhan50323b82022-01-11 00:12:01 -08007948 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007949 }
7950 }
7951
7952 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_LD2R, k_gt_16_subtile) {
7953 TEST_REQUIRES_ARM_NEON;
7954 for (size_t k = 17; k < 32; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08007955 for (uint32_t n = 1; n <= 8; n++) {
7956 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007957 GemmMicrokernelTester()
7958 .mr(1)
7959 .nr(8)
7960 .kr(2)
7961 .sr(1)
7962 .m(m)
7963 .n(n)
7964 .k(k)
7965 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08007966 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007967 }
7968 }
7969 }
7970 }
7971
7972 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_LD2R, k_div_16) {
7973 TEST_REQUIRES_ARM_NEON;
7974 for (size_t k = 32; k <= 160; k += 16) {
7975 GemmMicrokernelTester()
7976 .mr(1)
7977 .nr(8)
7978 .kr(2)
7979 .sr(1)
7980 .m(1)
7981 .n(8)
7982 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08007983 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08007984 }
7985 }
7986
7987 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_LD2R, k_div_16_strided_a) {
7988 TEST_REQUIRES_ARM_NEON;
7989 for (size_t k = 32; k <= 160; k += 16) {
7990 GemmMicrokernelTester()
7991 .mr(1)
7992 .nr(8)
7993 .kr(2)
7994 .sr(1)
7995 .m(1)
7996 .n(8)
7997 .k(k)
7998 .a_stride(163)
Marat Dukhan50323b82022-01-11 00:12:01 -08007999 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008000 }
8001 }
8002
8003 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_LD2R, k_div_16_subtile) {
8004 TEST_REQUIRES_ARM_NEON;
8005 for (size_t k = 32; k <= 160; k += 16) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08008006 for (uint32_t n = 1; n <= 8; n++) {
8007 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008008 GemmMicrokernelTester()
8009 .mr(1)
8010 .nr(8)
8011 .kr(2)
8012 .sr(1)
8013 .m(m)
8014 .n(n)
8015 .k(k)
8016 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08008017 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008018 }
8019 }
8020 }
8021 }
8022
8023 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_LD2R, n_gt_8) {
8024 TEST_REQUIRES_ARM_NEON;
8025 for (uint32_t n = 9; n < 16; n++) {
8026 for (size_t k = 1; k <= 80; k += 17) {
8027 GemmMicrokernelTester()
8028 .mr(1)
8029 .nr(8)
8030 .kr(2)
8031 .sr(1)
8032 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08008033 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008034 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08008035 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008036 }
8037 }
8038 }
8039
8040 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_LD2R, n_gt_8_strided_cn) {
8041 TEST_REQUIRES_ARM_NEON;
8042 for (uint32_t n = 9; n < 16; n++) {
8043 for (size_t k = 1; k <= 80; k += 17) {
8044 GemmMicrokernelTester()
8045 .mr(1)
8046 .nr(8)
8047 .kr(2)
8048 .sr(1)
8049 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08008050 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008051 .k(k)
8052 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08008053 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008054 }
8055 }
8056 }
8057
8058 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_LD2R, n_gt_8_strided_a) {
8059 TEST_REQUIRES_ARM_NEON;
8060 for (uint32_t n = 9; n < 16; n++) {
8061 for (size_t k = 1; k <= 80; k += 17) {
8062 GemmMicrokernelTester()
8063 .mr(1)
8064 .nr(8)
8065 .kr(2)
8066 .sr(1)
8067 .m(1)
8068 .n(n)
8069 .k(k)
8070 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -08008071 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008072 }
8073 }
8074 }
8075
8076 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_LD2R, n_gt_8_subtile) {
8077 TEST_REQUIRES_ARM_NEON;
8078 for (uint32_t n = 9; n < 16; n++) {
8079 for (size_t k = 1; k <= 80; k += 17) {
8080 for (uint32_t m = 1; m <= 1; m++) {
8081 GemmMicrokernelTester()
8082 .mr(1)
8083 .nr(8)
8084 .kr(2)
8085 .sr(1)
8086 .m(m)
8087 .n(n)
8088 .k(k)
8089 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08008090 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008091 }
8092 }
8093 }
8094 }
8095
8096 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_LD2R, n_div_8) {
8097 TEST_REQUIRES_ARM_NEON;
8098 for (uint32_t n = 16; n <= 24; n += 8) {
8099 for (size_t k = 1; k <= 80; k += 17) {
8100 GemmMicrokernelTester()
8101 .mr(1)
8102 .nr(8)
8103 .kr(2)
8104 .sr(1)
8105 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08008106 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008107 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08008108 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008109 }
8110 }
8111 }
8112
8113 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_LD2R, n_div_8_strided_cn) {
8114 TEST_REQUIRES_ARM_NEON;
8115 for (uint32_t n = 16; n <= 24; n += 8) {
8116 for (size_t k = 1; k <= 80; k += 17) {
8117 GemmMicrokernelTester()
8118 .mr(1)
8119 .nr(8)
8120 .kr(2)
8121 .sr(1)
8122 .m(1)
8123 .n(n)
8124 .k(k)
8125 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08008126 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008127 }
8128 }
8129 }
8130
8131 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_LD2R, n_div_8_strided_a) {
8132 TEST_REQUIRES_ARM_NEON;
8133 for (uint32_t n = 16; n <= 24; n += 8) {
8134 for (size_t k = 1; k <= 80; k += 17) {
8135 GemmMicrokernelTester()
8136 .mr(1)
8137 .nr(8)
8138 .kr(2)
8139 .sr(1)
8140 .m(1)
8141 .n(n)
8142 .k(k)
8143 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -08008144 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008145 }
8146 }
8147 }
8148
8149 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_LD2R, n_div_8_subtile) {
8150 TEST_REQUIRES_ARM_NEON;
8151 for (uint32_t n = 16; n <= 24; n += 8) {
8152 for (size_t k = 1; k <= 80; k += 17) {
8153 for (uint32_t m = 1; m <= 1; m++) {
8154 GemmMicrokernelTester()
8155 .mr(1)
8156 .nr(8)
8157 .kr(2)
8158 .sr(1)
8159 .m(m)
8160 .n(n)
8161 .k(k)
8162 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08008163 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008164 }
8165 }
8166 }
8167 }
8168
8169 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_LD2R, strided_cm_subtile) {
8170 TEST_REQUIRES_ARM_NEON;
8171 for (size_t k = 1; k <= 80; k += 17) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08008172 for (uint32_t n = 1; n <= 8; n++) {
8173 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008174 GemmMicrokernelTester()
8175 .mr(1)
8176 .nr(8)
8177 .kr(2)
8178 .sr(1)
8179 .m(m)
8180 .n(n)
8181 .k(k)
8182 .cm_stride(11)
8183 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08008184 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008185 }
8186 }
8187 }
8188 }
8189
8190 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_LD2R, qmin) {
8191 TEST_REQUIRES_ARM_NEON;
8192 GemmMicrokernelTester()
8193 .mr(1)
8194 .nr(8)
8195 .kr(2)
8196 .sr(1)
8197 .m(1)
8198 .n(8)
8199 .k(16)
8200 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08008201 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008202 }
8203
8204 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_LD2R, qmax) {
8205 TEST_REQUIRES_ARM_NEON;
8206 GemmMicrokernelTester()
8207 .mr(1)
8208 .nr(8)
8209 .kr(2)
8210 .sr(1)
8211 .m(1)
8212 .n(8)
8213 .k(16)
8214 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08008215 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008216 }
8217
8218 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_LD2R, strided_cm) {
8219 TEST_REQUIRES_ARM_NEON;
8220 GemmMicrokernelTester()
8221 .mr(1)
8222 .nr(8)
8223 .kr(2)
8224 .sr(1)
8225 .m(1)
8226 .n(8)
8227 .k(16)
8228 .cm_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08008229 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008230 }
8231#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
8232
8233
8234#if XNN_ARCH_ARM || XNN_ARCH_ARM64
8235 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD4R, k_eq_16) {
8236 TEST_REQUIRES_ARM_NEON;
8237 GemmMicrokernelTester()
8238 .mr(2)
8239 .nr(8)
8240 .kr(2)
8241 .sr(1)
8242 .m(2)
8243 .n(8)
8244 .k(16)
Marat Dukhan50323b82022-01-11 00:12:01 -08008245 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld4r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008246 }
8247
8248 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD4R, strided_cn) {
8249 TEST_REQUIRES_ARM_NEON;
8250 GemmMicrokernelTester()
8251 .mr(2)
8252 .nr(8)
8253 .kr(2)
8254 .sr(1)
8255 .m(2)
8256 .n(8)
8257 .k(16)
8258 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08008259 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld4r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008260 }
8261
8262 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD4R, k_eq_16_strided_a) {
8263 TEST_REQUIRES_ARM_NEON;
8264 GemmMicrokernelTester()
8265 .mr(2)
8266 .nr(8)
8267 .kr(2)
8268 .sr(1)
8269 .m(2)
8270 .n(8)
8271 .k(16)
8272 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08008273 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld4r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008274 }
8275
8276 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD4R, k_eq_16_subtile) {
8277 TEST_REQUIRES_ARM_NEON;
Zhi An Ng83844ae2022-01-14 09:52:25 -08008278 for (uint32_t n = 1; n <= 8; n++) {
8279 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008280 GemmMicrokernelTester()
8281 .mr(2)
8282 .nr(8)
8283 .kr(2)
8284 .sr(1)
8285 .m(m)
8286 .n(n)
8287 .k(16)
8288 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08008289 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld4r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008290 }
8291 }
8292 }
8293
8294 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD4R, k_eq_16_subtile_m) {
8295 TEST_REQUIRES_ARM_NEON;
8296 for (uint32_t m = 1; m <= 2; m++) {
8297 GemmMicrokernelTester()
8298 .mr(2)
8299 .nr(8)
8300 .kr(2)
8301 .sr(1)
8302 .m(m)
8303 .n(8)
8304 .k(16)
8305 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08008306 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld4r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008307 }
8308 }
8309
8310 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD4R, k_eq_16_subtile_n) {
8311 TEST_REQUIRES_ARM_NEON;
8312 for (uint32_t n = 1; n <= 8; n++) {
8313 GemmMicrokernelTester()
8314 .mr(2)
8315 .nr(8)
8316 .kr(2)
8317 .sr(1)
8318 .m(2)
8319 .n(n)
8320 .k(16)
8321 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08008322 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld4r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008323 }
8324 }
8325
8326 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD4R, k_lt_16) {
8327 TEST_REQUIRES_ARM_NEON;
8328 for (size_t k = 1; k < 16; k++) {
8329 GemmMicrokernelTester()
8330 .mr(2)
8331 .nr(8)
8332 .kr(2)
8333 .sr(1)
8334 .m(2)
8335 .n(8)
8336 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08008337 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld4r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008338 }
8339 }
8340
8341 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD4R, k_lt_16_strided_a) {
8342 TEST_REQUIRES_ARM_NEON;
8343 for (size_t k = 1; k < 16; k++) {
8344 GemmMicrokernelTester()
8345 .mr(2)
8346 .nr(8)
8347 .kr(2)
8348 .sr(1)
8349 .m(2)
8350 .n(8)
8351 .k(k)
8352 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08008353 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld4r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008354 }
8355 }
8356
8357 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD4R, k_lt_16_subtile) {
8358 TEST_REQUIRES_ARM_NEON;
8359 for (size_t k = 1; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08008360 for (uint32_t n = 1; n <= 8; n++) {
8361 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008362 GemmMicrokernelTester()
8363 .mr(2)
8364 .nr(8)
8365 .kr(2)
8366 .sr(1)
8367 .m(m)
8368 .n(n)
8369 .k(k)
8370 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08008371 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld4r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008372 }
8373 }
8374 }
8375 }
8376
8377 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD4R, k_gt_16) {
8378 TEST_REQUIRES_ARM_NEON;
8379 for (size_t k = 17; k < 32; k++) {
8380 GemmMicrokernelTester()
8381 .mr(2)
8382 .nr(8)
8383 .kr(2)
8384 .sr(1)
8385 .m(2)
8386 .n(8)
8387 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08008388 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld4r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008389 }
8390 }
8391
8392 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD4R, k_gt_16_strided_a) {
8393 TEST_REQUIRES_ARM_NEON;
8394 for (size_t k = 17; k < 32; k++) {
8395 GemmMicrokernelTester()
8396 .mr(2)
8397 .nr(8)
8398 .kr(2)
8399 .sr(1)
8400 .m(2)
8401 .n(8)
8402 .k(k)
8403 .a_stride(37)
Marat Dukhan50323b82022-01-11 00:12:01 -08008404 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld4r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008405 }
8406 }
8407
8408 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD4R, k_gt_16_subtile) {
8409 TEST_REQUIRES_ARM_NEON;
8410 for (size_t k = 17; k < 32; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08008411 for (uint32_t n = 1; n <= 8; n++) {
8412 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008413 GemmMicrokernelTester()
8414 .mr(2)
8415 .nr(8)
8416 .kr(2)
8417 .sr(1)
8418 .m(m)
8419 .n(n)
8420 .k(k)
8421 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08008422 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld4r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008423 }
8424 }
8425 }
8426 }
8427
8428 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD4R, k_div_16) {
8429 TEST_REQUIRES_ARM_NEON;
8430 for (size_t k = 32; k <= 160; k += 16) {
8431 GemmMicrokernelTester()
8432 .mr(2)
8433 .nr(8)
8434 .kr(2)
8435 .sr(1)
8436 .m(2)
8437 .n(8)
8438 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08008439 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld4r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008440 }
8441 }
8442
8443 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD4R, k_div_16_strided_a) {
8444 TEST_REQUIRES_ARM_NEON;
8445 for (size_t k = 32; k <= 160; k += 16) {
8446 GemmMicrokernelTester()
8447 .mr(2)
8448 .nr(8)
8449 .kr(2)
8450 .sr(1)
8451 .m(2)
8452 .n(8)
8453 .k(k)
8454 .a_stride(163)
Marat Dukhan50323b82022-01-11 00:12:01 -08008455 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld4r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008456 }
8457 }
8458
8459 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD4R, k_div_16_subtile) {
8460 TEST_REQUIRES_ARM_NEON;
8461 for (size_t k = 32; k <= 160; k += 16) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08008462 for (uint32_t n = 1; n <= 8; n++) {
8463 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008464 GemmMicrokernelTester()
8465 .mr(2)
8466 .nr(8)
8467 .kr(2)
8468 .sr(1)
8469 .m(m)
8470 .n(n)
8471 .k(k)
8472 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08008473 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld4r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008474 }
8475 }
8476 }
8477 }
8478
8479 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD4R, n_gt_8) {
8480 TEST_REQUIRES_ARM_NEON;
8481 for (uint32_t n = 9; n < 16; n++) {
8482 for (size_t k = 1; k <= 80; k += 17) {
8483 GemmMicrokernelTester()
8484 .mr(2)
8485 .nr(8)
8486 .kr(2)
8487 .sr(1)
8488 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08008489 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008490 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08008491 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld4r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008492 }
8493 }
8494 }
8495
8496 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD4R, n_gt_8_strided_cn) {
8497 TEST_REQUIRES_ARM_NEON;
8498 for (uint32_t n = 9; n < 16; n++) {
8499 for (size_t k = 1; k <= 80; k += 17) {
8500 GemmMicrokernelTester()
8501 .mr(2)
8502 .nr(8)
8503 .kr(2)
8504 .sr(1)
8505 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08008506 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008507 .k(k)
8508 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08008509 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld4r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008510 }
8511 }
8512 }
8513
8514 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD4R, n_gt_8_strided_a) {
8515 TEST_REQUIRES_ARM_NEON;
8516 for (uint32_t n = 9; n < 16; n++) {
8517 for (size_t k = 1; k <= 80; k += 17) {
8518 GemmMicrokernelTester()
8519 .mr(2)
8520 .nr(8)
8521 .kr(2)
8522 .sr(1)
8523 .m(2)
8524 .n(n)
8525 .k(k)
8526 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -08008527 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld4r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008528 }
8529 }
8530 }
8531
8532 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD4R, n_gt_8_subtile) {
8533 TEST_REQUIRES_ARM_NEON;
8534 for (uint32_t n = 9; n < 16; n++) {
8535 for (size_t k = 1; k <= 80; k += 17) {
8536 for (uint32_t m = 1; m <= 2; m++) {
8537 GemmMicrokernelTester()
8538 .mr(2)
8539 .nr(8)
8540 .kr(2)
8541 .sr(1)
8542 .m(m)
8543 .n(n)
8544 .k(k)
8545 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08008546 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld4r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008547 }
8548 }
8549 }
8550 }
8551
8552 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD4R, n_div_8) {
8553 TEST_REQUIRES_ARM_NEON;
8554 for (uint32_t n = 16; n <= 24; n += 8) {
8555 for (size_t k = 1; k <= 80; k += 17) {
8556 GemmMicrokernelTester()
8557 .mr(2)
8558 .nr(8)
8559 .kr(2)
8560 .sr(1)
8561 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08008562 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008563 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08008564 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld4r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008565 }
8566 }
8567 }
8568
8569 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD4R, n_div_8_strided_cn) {
8570 TEST_REQUIRES_ARM_NEON;
8571 for (uint32_t n = 16; n <= 24; n += 8) {
8572 for (size_t k = 1; k <= 80; k += 17) {
8573 GemmMicrokernelTester()
8574 .mr(2)
8575 .nr(8)
8576 .kr(2)
8577 .sr(1)
8578 .m(2)
8579 .n(n)
8580 .k(k)
8581 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08008582 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld4r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008583 }
8584 }
8585 }
8586
8587 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD4R, n_div_8_strided_a) {
8588 TEST_REQUIRES_ARM_NEON;
8589 for (uint32_t n = 16; n <= 24; n += 8) {
8590 for (size_t k = 1; k <= 80; k += 17) {
8591 GemmMicrokernelTester()
8592 .mr(2)
8593 .nr(8)
8594 .kr(2)
8595 .sr(1)
8596 .m(2)
8597 .n(n)
8598 .k(k)
8599 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -08008600 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld4r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008601 }
8602 }
8603 }
8604
8605 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD4R, n_div_8_subtile) {
8606 TEST_REQUIRES_ARM_NEON;
8607 for (uint32_t n = 16; n <= 24; n += 8) {
8608 for (size_t k = 1; k <= 80; k += 17) {
8609 for (uint32_t m = 1; m <= 2; m++) {
8610 GemmMicrokernelTester()
8611 .mr(2)
8612 .nr(8)
8613 .kr(2)
8614 .sr(1)
8615 .m(m)
8616 .n(n)
8617 .k(k)
8618 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08008619 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld4r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008620 }
8621 }
8622 }
8623 }
8624
8625 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD4R, strided_cm_subtile) {
8626 TEST_REQUIRES_ARM_NEON;
8627 for (size_t k = 1; k <= 80; k += 17) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08008628 for (uint32_t n = 1; n <= 8; n++) {
8629 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008630 GemmMicrokernelTester()
8631 .mr(2)
8632 .nr(8)
8633 .kr(2)
8634 .sr(1)
8635 .m(m)
8636 .n(n)
8637 .k(k)
8638 .cm_stride(11)
8639 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08008640 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld4r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008641 }
8642 }
8643 }
8644 }
8645
8646 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD4R, qmin) {
8647 TEST_REQUIRES_ARM_NEON;
8648 GemmMicrokernelTester()
8649 .mr(2)
8650 .nr(8)
8651 .kr(2)
8652 .sr(1)
8653 .m(2)
8654 .n(8)
8655 .k(16)
8656 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08008657 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld4r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008658 }
8659
8660 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD4R, qmax) {
8661 TEST_REQUIRES_ARM_NEON;
8662 GemmMicrokernelTester()
8663 .mr(2)
8664 .nr(8)
8665 .kr(2)
8666 .sr(1)
8667 .m(2)
8668 .n(8)
8669 .k(16)
8670 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08008671 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld4r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008672 }
8673
8674 TEST(QC8_GEMM_MINMAX_FP32_2X8C2__NEON_MLAL_LD4R, strided_cm) {
8675 TEST_REQUIRES_ARM_NEON;
8676 GemmMicrokernelTester()
8677 .mr(2)
8678 .nr(8)
8679 .kr(2)
8680 .sr(1)
8681 .m(2)
8682 .n(8)
8683 .k(16)
8684 .cm_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08008685 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2__neon_mlal_ld4r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008686 }
8687#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
8688
8689
8690#if XNN_ARCH_ARM || XNN_ARCH_ARM64
8691 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_eq_16) {
8692 TEST_REQUIRES_ARM_NEON;
8693 GemmMicrokernelTester()
8694 .mr(2)
8695 .nr(8)
8696 .kr(4)
8697 .sr(1)
8698 .m(2)
8699 .n(8)
8700 .k(16)
Marat Dukhan50323b82022-01-11 00:12:01 -08008701 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008702 }
8703
8704 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, strided_cn) {
8705 TEST_REQUIRES_ARM_NEON;
8706 GemmMicrokernelTester()
8707 .mr(2)
8708 .nr(8)
8709 .kr(4)
8710 .sr(1)
8711 .m(2)
8712 .n(8)
8713 .k(16)
8714 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08008715 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008716 }
8717
8718 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_eq_16_strided_a) {
8719 TEST_REQUIRES_ARM_NEON;
8720 GemmMicrokernelTester()
8721 .mr(2)
8722 .nr(8)
8723 .kr(4)
8724 .sr(1)
8725 .m(2)
8726 .n(8)
8727 .k(16)
8728 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08008729 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008730 }
8731
8732 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_eq_16_subtile) {
8733 TEST_REQUIRES_ARM_NEON;
Zhi An Ng83844ae2022-01-14 09:52:25 -08008734 for (uint32_t n = 1; n <= 8; n++) {
8735 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008736 GemmMicrokernelTester()
8737 .mr(2)
8738 .nr(8)
8739 .kr(4)
8740 .sr(1)
8741 .m(m)
8742 .n(n)
8743 .k(16)
8744 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08008745 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008746 }
8747 }
8748 }
8749
8750 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_eq_16_subtile_m) {
8751 TEST_REQUIRES_ARM_NEON;
8752 for (uint32_t m = 1; m <= 2; m++) {
8753 GemmMicrokernelTester()
8754 .mr(2)
8755 .nr(8)
8756 .kr(4)
8757 .sr(1)
8758 .m(m)
8759 .n(8)
8760 .k(16)
8761 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08008762 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008763 }
8764 }
8765
8766 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_eq_16_subtile_n) {
8767 TEST_REQUIRES_ARM_NEON;
8768 for (uint32_t n = 1; n <= 8; n++) {
8769 GemmMicrokernelTester()
8770 .mr(2)
8771 .nr(8)
8772 .kr(4)
8773 .sr(1)
8774 .m(2)
8775 .n(n)
8776 .k(16)
8777 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08008778 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008779 }
8780 }
8781
8782 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_lt_16) {
8783 TEST_REQUIRES_ARM_NEON;
8784 for (size_t k = 1; k < 16; k++) {
8785 GemmMicrokernelTester()
8786 .mr(2)
8787 .nr(8)
8788 .kr(4)
8789 .sr(1)
8790 .m(2)
8791 .n(8)
8792 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08008793 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008794 }
8795 }
8796
8797 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_lt_16_strided_a) {
8798 TEST_REQUIRES_ARM_NEON;
8799 for (size_t k = 1; k < 16; k++) {
8800 GemmMicrokernelTester()
8801 .mr(2)
8802 .nr(8)
8803 .kr(4)
8804 .sr(1)
8805 .m(2)
8806 .n(8)
8807 .k(k)
8808 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08008809 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008810 }
8811 }
8812
8813 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_lt_16_subtile) {
8814 TEST_REQUIRES_ARM_NEON;
8815 for (size_t k = 1; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08008816 for (uint32_t n = 1; n <= 8; n++) {
8817 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008818 GemmMicrokernelTester()
8819 .mr(2)
8820 .nr(8)
8821 .kr(4)
8822 .sr(1)
8823 .m(m)
8824 .n(n)
8825 .k(k)
8826 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08008827 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008828 }
8829 }
8830 }
8831 }
8832
8833 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_gt_16) {
8834 TEST_REQUIRES_ARM_NEON;
8835 for (size_t k = 17; k < 32; k++) {
8836 GemmMicrokernelTester()
8837 .mr(2)
8838 .nr(8)
8839 .kr(4)
8840 .sr(1)
8841 .m(2)
8842 .n(8)
8843 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08008844 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008845 }
8846 }
8847
8848 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_gt_16_strided_a) {
8849 TEST_REQUIRES_ARM_NEON;
8850 for (size_t k = 17; k < 32; k++) {
8851 GemmMicrokernelTester()
8852 .mr(2)
8853 .nr(8)
8854 .kr(4)
8855 .sr(1)
8856 .m(2)
8857 .n(8)
8858 .k(k)
8859 .a_stride(37)
Marat Dukhan50323b82022-01-11 00:12:01 -08008860 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008861 }
8862 }
8863
8864 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_gt_16_subtile) {
8865 TEST_REQUIRES_ARM_NEON;
8866 for (size_t k = 17; k < 32; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08008867 for (uint32_t n = 1; n <= 8; n++) {
8868 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008869 GemmMicrokernelTester()
8870 .mr(2)
8871 .nr(8)
8872 .kr(4)
8873 .sr(1)
8874 .m(m)
8875 .n(n)
8876 .k(k)
8877 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08008878 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008879 }
8880 }
8881 }
8882 }
8883
8884 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_div_16) {
8885 TEST_REQUIRES_ARM_NEON;
8886 for (size_t k = 32; k <= 160; k += 16) {
8887 GemmMicrokernelTester()
8888 .mr(2)
8889 .nr(8)
8890 .kr(4)
8891 .sr(1)
8892 .m(2)
8893 .n(8)
8894 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08008895 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008896 }
8897 }
8898
8899 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_div_16_strided_a) {
8900 TEST_REQUIRES_ARM_NEON;
8901 for (size_t k = 32; k <= 160; k += 16) {
8902 GemmMicrokernelTester()
8903 .mr(2)
8904 .nr(8)
8905 .kr(4)
8906 .sr(1)
8907 .m(2)
8908 .n(8)
8909 .k(k)
8910 .a_stride(163)
Marat Dukhan50323b82022-01-11 00:12:01 -08008911 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008912 }
8913 }
8914
8915 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, k_div_16_subtile) {
8916 TEST_REQUIRES_ARM_NEON;
8917 for (size_t k = 32; k <= 160; k += 16) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08008918 for (uint32_t n = 1; n <= 8; n++) {
8919 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008920 GemmMicrokernelTester()
8921 .mr(2)
8922 .nr(8)
8923 .kr(4)
8924 .sr(1)
8925 .m(m)
8926 .n(n)
8927 .k(k)
8928 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08008929 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008930 }
8931 }
8932 }
8933 }
8934
8935 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, n_gt_8) {
8936 TEST_REQUIRES_ARM_NEON;
8937 for (uint32_t n = 9; n < 16; n++) {
8938 for (size_t k = 1; k <= 80; k += 17) {
8939 GemmMicrokernelTester()
8940 .mr(2)
8941 .nr(8)
8942 .kr(4)
8943 .sr(1)
8944 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08008945 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008946 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08008947 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008948 }
8949 }
8950 }
8951
8952 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, n_gt_8_strided_cn) {
8953 TEST_REQUIRES_ARM_NEON;
8954 for (uint32_t n = 9; n < 16; n++) {
8955 for (size_t k = 1; k <= 80; k += 17) {
8956 GemmMicrokernelTester()
8957 .mr(2)
8958 .nr(8)
8959 .kr(4)
8960 .sr(1)
8961 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08008962 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008963 .k(k)
8964 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08008965 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008966 }
8967 }
8968 }
8969
8970 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, n_gt_8_strided_a) {
8971 TEST_REQUIRES_ARM_NEON;
8972 for (uint32_t n = 9; n < 16; n++) {
8973 for (size_t k = 1; k <= 80; k += 17) {
8974 GemmMicrokernelTester()
8975 .mr(2)
8976 .nr(8)
8977 .kr(4)
8978 .sr(1)
8979 .m(2)
8980 .n(n)
8981 .k(k)
8982 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -08008983 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08008984 }
8985 }
8986 }
8987
8988 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, n_gt_8_subtile) {
8989 TEST_REQUIRES_ARM_NEON;
8990 for (uint32_t n = 9; n < 16; n++) {
8991 for (size_t k = 1; k <= 80; k += 17) {
8992 for (uint32_t m = 1; m <= 2; m++) {
8993 GemmMicrokernelTester()
8994 .mr(2)
8995 .nr(8)
8996 .kr(4)
8997 .sr(1)
8998 .m(m)
8999 .n(n)
9000 .k(k)
9001 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08009002 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009003 }
9004 }
9005 }
9006 }
9007
9008 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, n_div_8) {
9009 TEST_REQUIRES_ARM_NEON;
9010 for (uint32_t n = 16; n <= 24; n += 8) {
9011 for (size_t k = 1; k <= 80; k += 17) {
9012 GemmMicrokernelTester()
9013 .mr(2)
9014 .nr(8)
9015 .kr(4)
9016 .sr(1)
9017 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08009018 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009019 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08009020 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009021 }
9022 }
9023 }
9024
9025 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, n_div_8_strided_cn) {
9026 TEST_REQUIRES_ARM_NEON;
9027 for (uint32_t n = 16; n <= 24; n += 8) {
9028 for (size_t k = 1; k <= 80; k += 17) {
9029 GemmMicrokernelTester()
9030 .mr(2)
9031 .nr(8)
9032 .kr(4)
9033 .sr(1)
9034 .m(2)
9035 .n(n)
9036 .k(k)
9037 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08009038 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009039 }
9040 }
9041 }
9042
9043 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, n_div_8_strided_a) {
9044 TEST_REQUIRES_ARM_NEON;
9045 for (uint32_t n = 16; n <= 24; n += 8) {
9046 for (size_t k = 1; k <= 80; k += 17) {
9047 GemmMicrokernelTester()
9048 .mr(2)
9049 .nr(8)
9050 .kr(4)
9051 .sr(1)
9052 .m(2)
9053 .n(n)
9054 .k(k)
9055 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -08009056 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009057 }
9058 }
9059 }
9060
9061 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, n_div_8_subtile) {
9062 TEST_REQUIRES_ARM_NEON;
9063 for (uint32_t n = 16; n <= 24; n += 8) {
9064 for (size_t k = 1; k <= 80; k += 17) {
9065 for (uint32_t m = 1; m <= 2; m++) {
9066 GemmMicrokernelTester()
9067 .mr(2)
9068 .nr(8)
9069 .kr(4)
9070 .sr(1)
9071 .m(m)
9072 .n(n)
9073 .k(k)
9074 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08009075 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009076 }
9077 }
9078 }
9079 }
9080
9081 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, strided_cm_subtile) {
9082 TEST_REQUIRES_ARM_NEON;
9083 for (size_t k = 1; k <= 80; k += 17) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08009084 for (uint32_t n = 1; n <= 8; n++) {
9085 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009086 GemmMicrokernelTester()
9087 .mr(2)
9088 .nr(8)
9089 .kr(4)
9090 .sr(1)
9091 .m(m)
9092 .n(n)
9093 .k(k)
9094 .cm_stride(11)
9095 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08009096 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009097 }
9098 }
9099 }
9100 }
9101
9102 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, qmin) {
9103 TEST_REQUIRES_ARM_NEON;
9104 GemmMicrokernelTester()
9105 .mr(2)
9106 .nr(8)
9107 .kr(4)
9108 .sr(1)
9109 .m(2)
9110 .n(8)
9111 .k(16)
9112 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08009113 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009114 }
9115
9116 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, qmax) {
9117 TEST_REQUIRES_ARM_NEON;
9118 GemmMicrokernelTester()
9119 .mr(2)
9120 .nr(8)
9121 .kr(4)
9122 .sr(1)
9123 .m(2)
9124 .n(8)
9125 .k(16)
9126 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08009127 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009128 }
9129
9130 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_DUP, strided_cm) {
9131 TEST_REQUIRES_ARM_NEON;
9132 GemmMicrokernelTester()
9133 .mr(2)
9134 .nr(8)
9135 .kr(4)
9136 .sr(1)
9137 .m(2)
9138 .n(8)
9139 .k(16)
9140 .cm_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08009141 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009142 }
9143#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
9144
9145
9146#if XNN_ARCH_ARM || XNN_ARCH_ARM64
9147 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_eq_16) {
9148 TEST_REQUIRES_ARM_NEON_V8;
9149 GemmMicrokernelTester()
9150 .mr(2)
9151 .nr(8)
9152 .kr(4)
9153 .sr(1)
9154 .m(2)
9155 .n(8)
9156 .k(16)
Marat Dukhan50323b82022-01-11 00:12:01 -08009157 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009158 }
9159
9160 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, strided_cn) {
9161 TEST_REQUIRES_ARM_NEON_V8;
9162 GemmMicrokernelTester()
9163 .mr(2)
9164 .nr(8)
9165 .kr(4)
9166 .sr(1)
9167 .m(2)
9168 .n(8)
9169 .k(16)
9170 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08009171 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009172 }
9173
9174 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_eq_16_strided_a) {
9175 TEST_REQUIRES_ARM_NEON_V8;
9176 GemmMicrokernelTester()
9177 .mr(2)
9178 .nr(8)
9179 .kr(4)
9180 .sr(1)
9181 .m(2)
9182 .n(8)
9183 .k(16)
9184 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08009185 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009186 }
9187
9188 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_eq_16_subtile) {
9189 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng83844ae2022-01-14 09:52:25 -08009190 for (uint32_t n = 1; n <= 8; n++) {
9191 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009192 GemmMicrokernelTester()
9193 .mr(2)
9194 .nr(8)
9195 .kr(4)
9196 .sr(1)
9197 .m(m)
9198 .n(n)
9199 .k(16)
9200 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08009201 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009202 }
9203 }
9204 }
9205
9206 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_eq_16_subtile_m) {
9207 TEST_REQUIRES_ARM_NEON_V8;
9208 for (uint32_t m = 1; m <= 2; m++) {
9209 GemmMicrokernelTester()
9210 .mr(2)
9211 .nr(8)
9212 .kr(4)
9213 .sr(1)
9214 .m(m)
9215 .n(8)
9216 .k(16)
9217 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08009218 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009219 }
9220 }
9221
9222 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_eq_16_subtile_n) {
9223 TEST_REQUIRES_ARM_NEON_V8;
9224 for (uint32_t n = 1; n <= 8; n++) {
9225 GemmMicrokernelTester()
9226 .mr(2)
9227 .nr(8)
9228 .kr(4)
9229 .sr(1)
9230 .m(2)
9231 .n(n)
9232 .k(16)
9233 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08009234 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009235 }
9236 }
9237
9238 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_lt_16) {
9239 TEST_REQUIRES_ARM_NEON_V8;
9240 for (size_t k = 1; k < 16; k++) {
9241 GemmMicrokernelTester()
9242 .mr(2)
9243 .nr(8)
9244 .kr(4)
9245 .sr(1)
9246 .m(2)
9247 .n(8)
9248 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08009249 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009250 }
9251 }
9252
9253 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_lt_16_strided_a) {
9254 TEST_REQUIRES_ARM_NEON_V8;
9255 for (size_t k = 1; k < 16; k++) {
9256 GemmMicrokernelTester()
9257 .mr(2)
9258 .nr(8)
9259 .kr(4)
9260 .sr(1)
9261 .m(2)
9262 .n(8)
9263 .k(k)
9264 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08009265 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009266 }
9267 }
9268
9269 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_lt_16_subtile) {
9270 TEST_REQUIRES_ARM_NEON_V8;
9271 for (size_t k = 1; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08009272 for (uint32_t n = 1; n <= 8; n++) {
9273 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009274 GemmMicrokernelTester()
9275 .mr(2)
9276 .nr(8)
9277 .kr(4)
9278 .sr(1)
9279 .m(m)
9280 .n(n)
9281 .k(k)
9282 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08009283 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009284 }
9285 }
9286 }
9287 }
9288
9289 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_gt_16) {
9290 TEST_REQUIRES_ARM_NEON_V8;
9291 for (size_t k = 17; k < 32; k++) {
9292 GemmMicrokernelTester()
9293 .mr(2)
9294 .nr(8)
9295 .kr(4)
9296 .sr(1)
9297 .m(2)
9298 .n(8)
9299 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08009300 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009301 }
9302 }
9303
9304 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_gt_16_strided_a) {
9305 TEST_REQUIRES_ARM_NEON_V8;
9306 for (size_t k = 17; k < 32; k++) {
9307 GemmMicrokernelTester()
9308 .mr(2)
9309 .nr(8)
9310 .kr(4)
9311 .sr(1)
9312 .m(2)
9313 .n(8)
9314 .k(k)
9315 .a_stride(37)
Marat Dukhan50323b82022-01-11 00:12:01 -08009316 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009317 }
9318 }
9319
9320 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_gt_16_subtile) {
9321 TEST_REQUIRES_ARM_NEON_V8;
9322 for (size_t k = 17; k < 32; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08009323 for (uint32_t n = 1; n <= 8; n++) {
9324 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009325 GemmMicrokernelTester()
9326 .mr(2)
9327 .nr(8)
9328 .kr(4)
9329 .sr(1)
9330 .m(m)
9331 .n(n)
9332 .k(k)
9333 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08009334 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009335 }
9336 }
9337 }
9338 }
9339
9340 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_div_16) {
9341 TEST_REQUIRES_ARM_NEON_V8;
9342 for (size_t k = 32; k <= 160; k += 16) {
9343 GemmMicrokernelTester()
9344 .mr(2)
9345 .nr(8)
9346 .kr(4)
9347 .sr(1)
9348 .m(2)
9349 .n(8)
9350 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08009351 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009352 }
9353 }
9354
9355 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_div_16_strided_a) {
9356 TEST_REQUIRES_ARM_NEON_V8;
9357 for (size_t k = 32; k <= 160; k += 16) {
9358 GemmMicrokernelTester()
9359 .mr(2)
9360 .nr(8)
9361 .kr(4)
9362 .sr(1)
9363 .m(2)
9364 .n(8)
9365 .k(k)
9366 .a_stride(163)
Marat Dukhan50323b82022-01-11 00:12:01 -08009367 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009368 }
9369 }
9370
9371 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, k_div_16_subtile) {
9372 TEST_REQUIRES_ARM_NEON_V8;
9373 for (size_t k = 32; k <= 160; k += 16) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08009374 for (uint32_t n = 1; n <= 8; n++) {
9375 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009376 GemmMicrokernelTester()
9377 .mr(2)
9378 .nr(8)
9379 .kr(4)
9380 .sr(1)
9381 .m(m)
9382 .n(n)
9383 .k(k)
9384 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08009385 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009386 }
9387 }
9388 }
9389 }
9390
9391 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, n_gt_8) {
9392 TEST_REQUIRES_ARM_NEON_V8;
9393 for (uint32_t n = 9; n < 16; n++) {
9394 for (size_t k = 1; k <= 80; k += 17) {
9395 GemmMicrokernelTester()
9396 .mr(2)
9397 .nr(8)
9398 .kr(4)
9399 .sr(1)
9400 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08009401 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009402 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08009403 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009404 }
9405 }
9406 }
9407
9408 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, n_gt_8_strided_cn) {
9409 TEST_REQUIRES_ARM_NEON_V8;
9410 for (uint32_t n = 9; n < 16; n++) {
9411 for (size_t k = 1; k <= 80; k += 17) {
9412 GemmMicrokernelTester()
9413 .mr(2)
9414 .nr(8)
9415 .kr(4)
9416 .sr(1)
9417 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08009418 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009419 .k(k)
9420 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08009421 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009422 }
9423 }
9424 }
9425
9426 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, n_gt_8_strided_a) {
9427 TEST_REQUIRES_ARM_NEON_V8;
9428 for (uint32_t n = 9; n < 16; n++) {
9429 for (size_t k = 1; k <= 80; k += 17) {
9430 GemmMicrokernelTester()
9431 .mr(2)
9432 .nr(8)
9433 .kr(4)
9434 .sr(1)
9435 .m(2)
9436 .n(n)
9437 .k(k)
9438 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -08009439 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009440 }
9441 }
9442 }
9443
9444 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, n_gt_8_subtile) {
9445 TEST_REQUIRES_ARM_NEON_V8;
9446 for (uint32_t n = 9; n < 16; n++) {
9447 for (size_t k = 1; k <= 80; k += 17) {
9448 for (uint32_t m = 1; m <= 2; m++) {
9449 GemmMicrokernelTester()
9450 .mr(2)
9451 .nr(8)
9452 .kr(4)
9453 .sr(1)
9454 .m(m)
9455 .n(n)
9456 .k(k)
9457 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08009458 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009459 }
9460 }
9461 }
9462 }
9463
9464 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, n_div_8) {
9465 TEST_REQUIRES_ARM_NEON_V8;
9466 for (uint32_t n = 16; n <= 24; n += 8) {
9467 for (size_t k = 1; k <= 80; k += 17) {
9468 GemmMicrokernelTester()
9469 .mr(2)
9470 .nr(8)
9471 .kr(4)
9472 .sr(1)
9473 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08009474 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009475 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08009476 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009477 }
9478 }
9479 }
9480
9481 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, n_div_8_strided_cn) {
9482 TEST_REQUIRES_ARM_NEON_V8;
9483 for (uint32_t n = 16; n <= 24; n += 8) {
9484 for (size_t k = 1; k <= 80; k += 17) {
9485 GemmMicrokernelTester()
9486 .mr(2)
9487 .nr(8)
9488 .kr(4)
9489 .sr(1)
9490 .m(2)
9491 .n(n)
9492 .k(k)
9493 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08009494 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009495 }
9496 }
9497 }
9498
9499 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, n_div_8_strided_a) {
9500 TEST_REQUIRES_ARM_NEON_V8;
9501 for (uint32_t n = 16; n <= 24; n += 8) {
9502 for (size_t k = 1; k <= 80; k += 17) {
9503 GemmMicrokernelTester()
9504 .mr(2)
9505 .nr(8)
9506 .kr(4)
9507 .sr(1)
9508 .m(2)
9509 .n(n)
9510 .k(k)
9511 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -08009512 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009513 }
9514 }
9515 }
9516
9517 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, n_div_8_subtile) {
9518 TEST_REQUIRES_ARM_NEON_V8;
9519 for (uint32_t n = 16; n <= 24; n += 8) {
9520 for (size_t k = 1; k <= 80; k += 17) {
9521 for (uint32_t m = 1; m <= 2; m++) {
9522 GemmMicrokernelTester()
9523 .mr(2)
9524 .nr(8)
9525 .kr(4)
9526 .sr(1)
9527 .m(m)
9528 .n(n)
9529 .k(k)
9530 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08009531 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009532 }
9533 }
9534 }
9535 }
9536
9537 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, strided_cm_subtile) {
9538 TEST_REQUIRES_ARM_NEON_V8;
9539 for (size_t k = 1; k <= 80; k += 17) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08009540 for (uint32_t n = 1; n <= 8; n++) {
9541 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009542 GemmMicrokernelTester()
9543 .mr(2)
9544 .nr(8)
9545 .kr(4)
9546 .sr(1)
9547 .m(m)
9548 .n(n)
9549 .k(k)
9550 .cm_stride(11)
9551 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08009552 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009553 }
9554 }
9555 }
9556 }
9557
9558 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, qmin) {
9559 TEST_REQUIRES_ARM_NEON_V8;
9560 GemmMicrokernelTester()
9561 .mr(2)
9562 .nr(8)
9563 .kr(4)
9564 .sr(1)
9565 .m(2)
9566 .n(8)
9567 .k(16)
9568 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08009569 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009570 }
9571
9572 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, qmax) {
9573 TEST_REQUIRES_ARM_NEON_V8;
9574 GemmMicrokernelTester()
9575 .mr(2)
9576 .nr(8)
9577 .kr(4)
9578 .sr(1)
9579 .m(2)
9580 .n(8)
9581 .k(16)
9582 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -08009583 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009584 }
9585
9586 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_DUP, strided_cm) {
9587 TEST_REQUIRES_ARM_NEON_V8;
9588 GemmMicrokernelTester()
9589 .mr(2)
9590 .nr(8)
9591 .kr(4)
9592 .sr(1)
9593 .m(2)
9594 .n(8)
9595 .k(16)
9596 .cm_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08009597 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_dup, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009598 }
9599#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
9600
9601
9602#if XNN_ARCH_ARM || XNN_ARCH_ARM64
9603 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_LD1R, k_eq_16) {
9604 TEST_REQUIRES_ARM_NEON;
9605 GemmMicrokernelTester()
9606 .mr(1)
9607 .nr(8)
9608 .kr(4)
9609 .sr(1)
9610 .m(1)
9611 .n(8)
9612 .k(16)
Marat Dukhan50323b82022-01-11 00:12:01 -08009613 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009614 }
9615
9616 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_LD1R, strided_cn) {
9617 TEST_REQUIRES_ARM_NEON;
9618 GemmMicrokernelTester()
9619 .mr(1)
9620 .nr(8)
9621 .kr(4)
9622 .sr(1)
9623 .m(1)
9624 .n(8)
9625 .k(16)
9626 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08009627 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009628 }
9629
9630 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_LD1R, k_eq_16_strided_a) {
9631 TEST_REQUIRES_ARM_NEON;
9632 GemmMicrokernelTester()
9633 .mr(1)
9634 .nr(8)
9635 .kr(4)
9636 .sr(1)
9637 .m(1)
9638 .n(8)
9639 .k(16)
9640 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08009641 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009642 }
9643
9644 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_LD1R, k_eq_16_subtile) {
9645 TEST_REQUIRES_ARM_NEON;
Zhi An Ng83844ae2022-01-14 09:52:25 -08009646 for (uint32_t n = 1; n <= 8; n++) {
9647 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009648 GemmMicrokernelTester()
9649 .mr(1)
9650 .nr(8)
9651 .kr(4)
9652 .sr(1)
9653 .m(m)
9654 .n(n)
9655 .k(16)
9656 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08009657 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009658 }
9659 }
9660 }
9661
9662 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_LD1R, k_eq_16_subtile_m) {
9663 TEST_REQUIRES_ARM_NEON;
9664 for (uint32_t m = 1; m <= 1; m++) {
9665 GemmMicrokernelTester()
9666 .mr(1)
9667 .nr(8)
9668 .kr(4)
9669 .sr(1)
9670 .m(m)
9671 .n(8)
9672 .k(16)
9673 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08009674 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009675 }
9676 }
9677
9678 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_LD1R, k_eq_16_subtile_n) {
9679 TEST_REQUIRES_ARM_NEON;
9680 for (uint32_t n = 1; n <= 8; n++) {
9681 GemmMicrokernelTester()
9682 .mr(1)
9683 .nr(8)
9684 .kr(4)
9685 .sr(1)
9686 .m(1)
9687 .n(n)
9688 .k(16)
9689 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08009690 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009691 }
9692 }
9693
9694 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_LD1R, k_lt_16) {
9695 TEST_REQUIRES_ARM_NEON;
9696 for (size_t k = 1; k < 16; k++) {
9697 GemmMicrokernelTester()
9698 .mr(1)
9699 .nr(8)
9700 .kr(4)
9701 .sr(1)
9702 .m(1)
9703 .n(8)
9704 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08009705 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009706 }
9707 }
9708
9709 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_LD1R, k_lt_16_strided_a) {
9710 TEST_REQUIRES_ARM_NEON;
9711 for (size_t k = 1; k < 16; k++) {
9712 GemmMicrokernelTester()
9713 .mr(1)
9714 .nr(8)
9715 .kr(4)
9716 .sr(1)
9717 .m(1)
9718 .n(8)
9719 .k(k)
9720 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -08009721 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009722 }
9723 }
9724
9725 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_LD1R, k_lt_16_subtile) {
9726 TEST_REQUIRES_ARM_NEON;
9727 for (size_t k = 1; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08009728 for (uint32_t n = 1; n <= 8; n++) {
9729 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009730 GemmMicrokernelTester()
9731 .mr(1)
9732 .nr(8)
9733 .kr(4)
9734 .sr(1)
9735 .m(m)
9736 .n(n)
9737 .k(k)
9738 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08009739 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009740 }
9741 }
9742 }
9743 }
9744
9745 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_LD1R, k_gt_16) {
9746 TEST_REQUIRES_ARM_NEON;
9747 for (size_t k = 17; k < 32; k++) {
9748 GemmMicrokernelTester()
9749 .mr(1)
9750 .nr(8)
9751 .kr(4)
9752 .sr(1)
9753 .m(1)
9754 .n(8)
9755 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08009756 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009757 }
9758 }
9759
9760 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_LD1R, k_gt_16_strided_a) {
9761 TEST_REQUIRES_ARM_NEON;
9762 for (size_t k = 17; k < 32; k++) {
9763 GemmMicrokernelTester()
9764 .mr(1)
9765 .nr(8)
9766 .kr(4)
9767 .sr(1)
9768 .m(1)
9769 .n(8)
9770 .k(k)
9771 .a_stride(37)
Marat Dukhan50323b82022-01-11 00:12:01 -08009772 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009773 }
9774 }
9775
9776 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_LD1R, k_gt_16_subtile) {
9777 TEST_REQUIRES_ARM_NEON;
9778 for (size_t k = 17; k < 32; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08009779 for (uint32_t n = 1; n <= 8; n++) {
9780 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009781 GemmMicrokernelTester()
9782 .mr(1)
9783 .nr(8)
9784 .kr(4)
9785 .sr(1)
9786 .m(m)
9787 .n(n)
9788 .k(k)
9789 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08009790 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009791 }
9792 }
9793 }
9794 }
9795
9796 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_LD1R, k_div_16) {
9797 TEST_REQUIRES_ARM_NEON;
9798 for (size_t k = 32; k <= 160; k += 16) {
9799 GemmMicrokernelTester()
9800 .mr(1)
9801 .nr(8)
9802 .kr(4)
9803 .sr(1)
9804 .m(1)
9805 .n(8)
9806 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08009807 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009808 }
9809 }
9810
9811 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_LD1R, k_div_16_strided_a) {
9812 TEST_REQUIRES_ARM_NEON;
9813 for (size_t k = 32; k <= 160; k += 16) {
9814 GemmMicrokernelTester()
9815 .mr(1)
9816 .nr(8)
9817 .kr(4)
9818 .sr(1)
9819 .m(1)
9820 .n(8)
9821 .k(k)
9822 .a_stride(163)
Marat Dukhan50323b82022-01-11 00:12:01 -08009823 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009824 }
9825 }
9826
9827 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_LD1R, k_div_16_subtile) {
9828 TEST_REQUIRES_ARM_NEON;
9829 for (size_t k = 32; k <= 160; k += 16) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08009830 for (uint32_t n = 1; n <= 8; n++) {
9831 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009832 GemmMicrokernelTester()
9833 .mr(1)
9834 .nr(8)
9835 .kr(4)
9836 .sr(1)
9837 .m(m)
9838 .n(n)
9839 .k(k)
9840 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08009841 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009842 }
9843 }
9844 }
9845 }
9846
9847 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_LD1R, n_gt_8) {
9848 TEST_REQUIRES_ARM_NEON;
9849 for (uint32_t n = 9; n < 16; n++) {
9850 for (size_t k = 1; k <= 80; k += 17) {
9851 GemmMicrokernelTester()
9852 .mr(1)
9853 .nr(8)
9854 .kr(4)
9855 .sr(1)
9856 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08009857 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009858 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08009859 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009860 }
9861 }
9862 }
9863
9864 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_LD1R, n_gt_8_strided_cn) {
9865 TEST_REQUIRES_ARM_NEON;
9866 for (uint32_t n = 9; n < 16; n++) {
9867 for (size_t k = 1; k <= 80; k += 17) {
9868 GemmMicrokernelTester()
9869 .mr(1)
9870 .nr(8)
9871 .kr(4)
9872 .sr(1)
9873 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08009874 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009875 .k(k)
9876 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08009877 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009878 }
9879 }
9880 }
9881
9882 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_LD1R, n_gt_8_strided_a) {
9883 TEST_REQUIRES_ARM_NEON;
9884 for (uint32_t n = 9; n < 16; n++) {
9885 for (size_t k = 1; k <= 80; k += 17) {
9886 GemmMicrokernelTester()
9887 .mr(1)
9888 .nr(8)
9889 .kr(4)
9890 .sr(1)
9891 .m(1)
9892 .n(n)
9893 .k(k)
9894 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -08009895 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009896 }
9897 }
9898 }
9899
9900 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_LD1R, n_gt_8_subtile) {
9901 TEST_REQUIRES_ARM_NEON;
9902 for (uint32_t n = 9; n < 16; n++) {
9903 for (size_t k = 1; k <= 80; k += 17) {
9904 for (uint32_t m = 1; m <= 1; m++) {
9905 GemmMicrokernelTester()
9906 .mr(1)
9907 .nr(8)
9908 .kr(4)
9909 .sr(1)
9910 .m(m)
9911 .n(n)
9912 .k(k)
9913 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08009914 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009915 }
9916 }
9917 }
9918 }
9919
9920 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_LD1R, n_div_8) {
9921 TEST_REQUIRES_ARM_NEON;
9922 for (uint32_t n = 16; n <= 24; n += 8) {
9923 for (size_t k = 1; k <= 80; k += 17) {
9924 GemmMicrokernelTester()
9925 .mr(1)
9926 .nr(8)
9927 .kr(4)
9928 .sr(1)
9929 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -08009930 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009931 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -08009932 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009933 }
9934 }
9935 }
9936
9937 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_LD1R, n_div_8_strided_cn) {
9938 TEST_REQUIRES_ARM_NEON;
9939 for (uint32_t n = 16; n <= 24; n += 8) {
9940 for (size_t k = 1; k <= 80; k += 17) {
9941 GemmMicrokernelTester()
9942 .mr(1)
9943 .nr(8)
9944 .kr(4)
9945 .sr(1)
9946 .m(1)
9947 .n(n)
9948 .k(k)
9949 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -08009950 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009951 }
9952 }
9953 }
9954
9955 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_LD1R, n_div_8_strided_a) {
9956 TEST_REQUIRES_ARM_NEON;
9957 for (uint32_t n = 16; n <= 24; n += 8) {
9958 for (size_t k = 1; k <= 80; k += 17) {
9959 GemmMicrokernelTester()
9960 .mr(1)
9961 .nr(8)
9962 .kr(4)
9963 .sr(1)
9964 .m(1)
9965 .n(n)
9966 .k(k)
9967 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -08009968 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009969 }
9970 }
9971 }
9972
9973 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_LD1R, n_div_8_subtile) {
9974 TEST_REQUIRES_ARM_NEON;
9975 for (uint32_t n = 16; n <= 24; n += 8) {
9976 for (size_t k = 1; k <= 80; k += 17) {
9977 for (uint32_t m = 1; m <= 1; m++) {
9978 GemmMicrokernelTester()
9979 .mr(1)
9980 .nr(8)
9981 .kr(4)
9982 .sr(1)
9983 .m(m)
9984 .n(n)
9985 .k(k)
9986 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -08009987 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009988 }
9989 }
9990 }
9991 }
9992
9993 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_LD1R, strided_cm_subtile) {
9994 TEST_REQUIRES_ARM_NEON;
9995 for (size_t k = 1; k <= 80; k += 17) {
Zhi An Ng83844ae2022-01-14 09:52:25 -08009996 for (uint32_t n = 1; n <= 8; n++) {
9997 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -08009998 GemmMicrokernelTester()
9999 .mr(1)
10000 .nr(8)
10001 .kr(4)
10002 .sr(1)
10003 .m(m)
10004 .n(n)
10005 .k(k)
10006 .cm_stride(11)
10007 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080010008 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010009 }
10010 }
10011 }
10012 }
10013
10014 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_LD1R, qmin) {
10015 TEST_REQUIRES_ARM_NEON;
10016 GemmMicrokernelTester()
10017 .mr(1)
10018 .nr(8)
10019 .kr(4)
10020 .sr(1)
10021 .m(1)
10022 .n(8)
10023 .k(16)
10024 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080010025 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010026 }
10027
10028 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_LD1R, qmax) {
10029 TEST_REQUIRES_ARM_NEON;
10030 GemmMicrokernelTester()
10031 .mr(1)
10032 .nr(8)
10033 .kr(4)
10034 .sr(1)
10035 .m(1)
10036 .n(8)
10037 .k(16)
10038 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080010039 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010040 }
10041
10042 TEST(QC8_GEMM_MINMAX_FP32_1X8C4__NEON_MLAL_LD1R, strided_cm) {
10043 TEST_REQUIRES_ARM_NEON;
10044 GemmMicrokernelTester()
10045 .mr(1)
10046 .nr(8)
10047 .kr(4)
10048 .sr(1)
10049 .m(1)
10050 .n(8)
10051 .k(16)
10052 .cm_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080010053 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010054 }
10055#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
10056
10057
10058#if XNN_ARCH_ARM || XNN_ARCH_ARM64
10059 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_LD1R, k_eq_16) {
10060 TEST_REQUIRES_ARM_NEON;
10061 GemmMicrokernelTester()
10062 .mr(2)
10063 .nr(8)
10064 .kr(4)
10065 .sr(1)
10066 .m(2)
10067 .n(8)
10068 .k(16)
Marat Dukhan50323b82022-01-11 00:12:01 -080010069 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010070 }
10071
10072 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_LD1R, strided_cn) {
10073 TEST_REQUIRES_ARM_NEON;
10074 GemmMicrokernelTester()
10075 .mr(2)
10076 .nr(8)
10077 .kr(4)
10078 .sr(1)
10079 .m(2)
10080 .n(8)
10081 .k(16)
10082 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080010083 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010084 }
10085
10086 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_LD1R, k_eq_16_strided_a) {
10087 TEST_REQUIRES_ARM_NEON;
10088 GemmMicrokernelTester()
10089 .mr(2)
10090 .nr(8)
10091 .kr(4)
10092 .sr(1)
10093 .m(2)
10094 .n(8)
10095 .k(16)
10096 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080010097 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010098 }
10099
10100 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_LD1R, k_eq_16_subtile) {
10101 TEST_REQUIRES_ARM_NEON;
Zhi An Ng83844ae2022-01-14 09:52:25 -080010102 for (uint32_t n = 1; n <= 8; n++) {
10103 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010104 GemmMicrokernelTester()
10105 .mr(2)
10106 .nr(8)
10107 .kr(4)
10108 .sr(1)
10109 .m(m)
10110 .n(n)
10111 .k(16)
10112 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080010113 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010114 }
10115 }
10116 }
10117
10118 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_LD1R, k_eq_16_subtile_m) {
10119 TEST_REQUIRES_ARM_NEON;
10120 for (uint32_t m = 1; m <= 2; m++) {
10121 GemmMicrokernelTester()
10122 .mr(2)
10123 .nr(8)
10124 .kr(4)
10125 .sr(1)
10126 .m(m)
10127 .n(8)
10128 .k(16)
10129 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080010130 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010131 }
10132 }
10133
10134 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_LD1R, k_eq_16_subtile_n) {
10135 TEST_REQUIRES_ARM_NEON;
10136 for (uint32_t n = 1; n <= 8; n++) {
10137 GemmMicrokernelTester()
10138 .mr(2)
10139 .nr(8)
10140 .kr(4)
10141 .sr(1)
10142 .m(2)
10143 .n(n)
10144 .k(16)
10145 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080010146 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010147 }
10148 }
10149
10150 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_LD1R, k_lt_16) {
10151 TEST_REQUIRES_ARM_NEON;
10152 for (size_t k = 1; k < 16; k++) {
10153 GemmMicrokernelTester()
10154 .mr(2)
10155 .nr(8)
10156 .kr(4)
10157 .sr(1)
10158 .m(2)
10159 .n(8)
10160 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080010161 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010162 }
10163 }
10164
10165 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_LD1R, k_lt_16_strided_a) {
10166 TEST_REQUIRES_ARM_NEON;
10167 for (size_t k = 1; k < 16; k++) {
10168 GemmMicrokernelTester()
10169 .mr(2)
10170 .nr(8)
10171 .kr(4)
10172 .sr(1)
10173 .m(2)
10174 .n(8)
10175 .k(k)
10176 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080010177 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010178 }
10179 }
10180
10181 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_LD1R, k_lt_16_subtile) {
10182 TEST_REQUIRES_ARM_NEON;
10183 for (size_t k = 1; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080010184 for (uint32_t n = 1; n <= 8; n++) {
10185 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010186 GemmMicrokernelTester()
10187 .mr(2)
10188 .nr(8)
10189 .kr(4)
10190 .sr(1)
10191 .m(m)
10192 .n(n)
10193 .k(k)
10194 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080010195 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010196 }
10197 }
10198 }
10199 }
10200
10201 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_LD1R, k_gt_16) {
10202 TEST_REQUIRES_ARM_NEON;
10203 for (size_t k = 17; k < 32; k++) {
10204 GemmMicrokernelTester()
10205 .mr(2)
10206 .nr(8)
10207 .kr(4)
10208 .sr(1)
10209 .m(2)
10210 .n(8)
10211 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080010212 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010213 }
10214 }
10215
10216 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_LD1R, k_gt_16_strided_a) {
10217 TEST_REQUIRES_ARM_NEON;
10218 for (size_t k = 17; k < 32; k++) {
10219 GemmMicrokernelTester()
10220 .mr(2)
10221 .nr(8)
10222 .kr(4)
10223 .sr(1)
10224 .m(2)
10225 .n(8)
10226 .k(k)
10227 .a_stride(37)
Marat Dukhan50323b82022-01-11 00:12:01 -080010228 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010229 }
10230 }
10231
10232 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_LD1R, k_gt_16_subtile) {
10233 TEST_REQUIRES_ARM_NEON;
10234 for (size_t k = 17; k < 32; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080010235 for (uint32_t n = 1; n <= 8; n++) {
10236 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010237 GemmMicrokernelTester()
10238 .mr(2)
10239 .nr(8)
10240 .kr(4)
10241 .sr(1)
10242 .m(m)
10243 .n(n)
10244 .k(k)
10245 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080010246 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010247 }
10248 }
10249 }
10250 }
10251
10252 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_LD1R, k_div_16) {
10253 TEST_REQUIRES_ARM_NEON;
10254 for (size_t k = 32; k <= 160; k += 16) {
10255 GemmMicrokernelTester()
10256 .mr(2)
10257 .nr(8)
10258 .kr(4)
10259 .sr(1)
10260 .m(2)
10261 .n(8)
10262 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080010263 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010264 }
10265 }
10266
10267 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_LD1R, k_div_16_strided_a) {
10268 TEST_REQUIRES_ARM_NEON;
10269 for (size_t k = 32; k <= 160; k += 16) {
10270 GemmMicrokernelTester()
10271 .mr(2)
10272 .nr(8)
10273 .kr(4)
10274 .sr(1)
10275 .m(2)
10276 .n(8)
10277 .k(k)
10278 .a_stride(163)
Marat Dukhan50323b82022-01-11 00:12:01 -080010279 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010280 }
10281 }
10282
10283 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_LD1R, k_div_16_subtile) {
10284 TEST_REQUIRES_ARM_NEON;
10285 for (size_t k = 32; k <= 160; k += 16) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080010286 for (uint32_t n = 1; n <= 8; n++) {
10287 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010288 GemmMicrokernelTester()
10289 .mr(2)
10290 .nr(8)
10291 .kr(4)
10292 .sr(1)
10293 .m(m)
10294 .n(n)
10295 .k(k)
10296 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080010297 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010298 }
10299 }
10300 }
10301 }
10302
10303 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_LD1R, n_gt_8) {
10304 TEST_REQUIRES_ARM_NEON;
10305 for (uint32_t n = 9; n < 16; n++) {
10306 for (size_t k = 1; k <= 80; k += 17) {
10307 GemmMicrokernelTester()
10308 .mr(2)
10309 .nr(8)
10310 .kr(4)
10311 .sr(1)
10312 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080010313 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010314 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080010315 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010316 }
10317 }
10318 }
10319
10320 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_LD1R, n_gt_8_strided_cn) {
10321 TEST_REQUIRES_ARM_NEON;
10322 for (uint32_t n = 9; n < 16; n++) {
10323 for (size_t k = 1; k <= 80; k += 17) {
10324 GemmMicrokernelTester()
10325 .mr(2)
10326 .nr(8)
10327 .kr(4)
10328 .sr(1)
10329 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080010330 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010331 .k(k)
10332 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080010333 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010334 }
10335 }
10336 }
10337
10338 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_LD1R, n_gt_8_strided_a) {
10339 TEST_REQUIRES_ARM_NEON;
10340 for (uint32_t n = 9; n < 16; n++) {
10341 for (size_t k = 1; k <= 80; k += 17) {
10342 GemmMicrokernelTester()
10343 .mr(2)
10344 .nr(8)
10345 .kr(4)
10346 .sr(1)
10347 .m(2)
10348 .n(n)
10349 .k(k)
10350 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080010351 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010352 }
10353 }
10354 }
10355
10356 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_LD1R, n_gt_8_subtile) {
10357 TEST_REQUIRES_ARM_NEON;
10358 for (uint32_t n = 9; n < 16; n++) {
10359 for (size_t k = 1; k <= 80; k += 17) {
10360 for (uint32_t m = 1; m <= 2; m++) {
10361 GemmMicrokernelTester()
10362 .mr(2)
10363 .nr(8)
10364 .kr(4)
10365 .sr(1)
10366 .m(m)
10367 .n(n)
10368 .k(k)
10369 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080010370 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010371 }
10372 }
10373 }
10374 }
10375
10376 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_LD1R, n_div_8) {
10377 TEST_REQUIRES_ARM_NEON;
10378 for (uint32_t n = 16; n <= 24; n += 8) {
10379 for (size_t k = 1; k <= 80; k += 17) {
10380 GemmMicrokernelTester()
10381 .mr(2)
10382 .nr(8)
10383 .kr(4)
10384 .sr(1)
10385 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080010386 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010387 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080010388 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010389 }
10390 }
10391 }
10392
10393 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_LD1R, n_div_8_strided_cn) {
10394 TEST_REQUIRES_ARM_NEON;
10395 for (uint32_t n = 16; n <= 24; n += 8) {
10396 for (size_t k = 1; k <= 80; k += 17) {
10397 GemmMicrokernelTester()
10398 .mr(2)
10399 .nr(8)
10400 .kr(4)
10401 .sr(1)
10402 .m(2)
10403 .n(n)
10404 .k(k)
10405 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080010406 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010407 }
10408 }
10409 }
10410
10411 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_LD1R, n_div_8_strided_a) {
10412 TEST_REQUIRES_ARM_NEON;
10413 for (uint32_t n = 16; n <= 24; n += 8) {
10414 for (size_t k = 1; k <= 80; k += 17) {
10415 GemmMicrokernelTester()
10416 .mr(2)
10417 .nr(8)
10418 .kr(4)
10419 .sr(1)
10420 .m(2)
10421 .n(n)
10422 .k(k)
10423 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080010424 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010425 }
10426 }
10427 }
10428
10429 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_LD1R, n_div_8_subtile) {
10430 TEST_REQUIRES_ARM_NEON;
10431 for (uint32_t n = 16; n <= 24; n += 8) {
10432 for (size_t k = 1; k <= 80; k += 17) {
10433 for (uint32_t m = 1; m <= 2; m++) {
10434 GemmMicrokernelTester()
10435 .mr(2)
10436 .nr(8)
10437 .kr(4)
10438 .sr(1)
10439 .m(m)
10440 .n(n)
10441 .k(k)
10442 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080010443 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010444 }
10445 }
10446 }
10447 }
10448
10449 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_LD1R, strided_cm_subtile) {
10450 TEST_REQUIRES_ARM_NEON;
10451 for (size_t k = 1; k <= 80; k += 17) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080010452 for (uint32_t n = 1; n <= 8; n++) {
10453 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010454 GemmMicrokernelTester()
10455 .mr(2)
10456 .nr(8)
10457 .kr(4)
10458 .sr(1)
10459 .m(m)
10460 .n(n)
10461 .k(k)
10462 .cm_stride(11)
10463 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080010464 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010465 }
10466 }
10467 }
10468 }
10469
10470 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_LD1R, qmin) {
10471 TEST_REQUIRES_ARM_NEON;
10472 GemmMicrokernelTester()
10473 .mr(2)
10474 .nr(8)
10475 .kr(4)
10476 .sr(1)
10477 .m(2)
10478 .n(8)
10479 .k(16)
10480 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080010481 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010482 }
10483
10484 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_LD1R, qmax) {
10485 TEST_REQUIRES_ARM_NEON;
10486 GemmMicrokernelTester()
10487 .mr(2)
10488 .nr(8)
10489 .kr(4)
10490 .sr(1)
10491 .m(2)
10492 .n(8)
10493 .k(16)
10494 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080010495 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010496 }
10497
10498 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEON_MLAL_LD1R, strided_cm) {
10499 TEST_REQUIRES_ARM_NEON;
10500 GemmMicrokernelTester()
10501 .mr(2)
10502 .nr(8)
10503 .kr(4)
10504 .sr(1)
10505 .m(2)
10506 .n(8)
10507 .k(16)
10508 .cm_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080010509 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010510 }
10511#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
10512
10513
10514#if XNN_ARCH_ARM || XNN_ARCH_ARM64
10515 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_LD1R, k_eq_16) {
10516 TEST_REQUIRES_ARM_NEON_V8;
10517 GemmMicrokernelTester()
10518 .mr(2)
10519 .nr(8)
10520 .kr(4)
10521 .sr(1)
10522 .m(2)
10523 .n(8)
10524 .k(16)
Marat Dukhan50323b82022-01-11 00:12:01 -080010525 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010526 }
10527
10528 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_LD1R, strided_cn) {
10529 TEST_REQUIRES_ARM_NEON_V8;
10530 GemmMicrokernelTester()
10531 .mr(2)
10532 .nr(8)
10533 .kr(4)
10534 .sr(1)
10535 .m(2)
10536 .n(8)
10537 .k(16)
10538 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080010539 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010540 }
10541
10542 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_LD1R, k_eq_16_strided_a) {
10543 TEST_REQUIRES_ARM_NEON_V8;
10544 GemmMicrokernelTester()
10545 .mr(2)
10546 .nr(8)
10547 .kr(4)
10548 .sr(1)
10549 .m(2)
10550 .n(8)
10551 .k(16)
10552 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080010553 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010554 }
10555
10556 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_LD1R, k_eq_16_subtile) {
10557 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng83844ae2022-01-14 09:52:25 -080010558 for (uint32_t n = 1; n <= 8; n++) {
10559 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010560 GemmMicrokernelTester()
10561 .mr(2)
10562 .nr(8)
10563 .kr(4)
10564 .sr(1)
10565 .m(m)
10566 .n(n)
10567 .k(16)
10568 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080010569 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010570 }
10571 }
10572 }
10573
10574 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_LD1R, k_eq_16_subtile_m) {
10575 TEST_REQUIRES_ARM_NEON_V8;
10576 for (uint32_t m = 1; m <= 2; m++) {
10577 GemmMicrokernelTester()
10578 .mr(2)
10579 .nr(8)
10580 .kr(4)
10581 .sr(1)
10582 .m(m)
10583 .n(8)
10584 .k(16)
10585 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080010586 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010587 }
10588 }
10589
10590 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_LD1R, k_eq_16_subtile_n) {
10591 TEST_REQUIRES_ARM_NEON_V8;
10592 for (uint32_t n = 1; n <= 8; n++) {
10593 GemmMicrokernelTester()
10594 .mr(2)
10595 .nr(8)
10596 .kr(4)
10597 .sr(1)
10598 .m(2)
10599 .n(n)
10600 .k(16)
10601 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080010602 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010603 }
10604 }
10605
10606 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_LD1R, k_lt_16) {
10607 TEST_REQUIRES_ARM_NEON_V8;
10608 for (size_t k = 1; k < 16; k++) {
10609 GemmMicrokernelTester()
10610 .mr(2)
10611 .nr(8)
10612 .kr(4)
10613 .sr(1)
10614 .m(2)
10615 .n(8)
10616 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080010617 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010618 }
10619 }
10620
10621 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_LD1R, k_lt_16_strided_a) {
10622 TEST_REQUIRES_ARM_NEON_V8;
10623 for (size_t k = 1; k < 16; k++) {
10624 GemmMicrokernelTester()
10625 .mr(2)
10626 .nr(8)
10627 .kr(4)
10628 .sr(1)
10629 .m(2)
10630 .n(8)
10631 .k(k)
10632 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080010633 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010634 }
10635 }
10636
10637 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_LD1R, k_lt_16_subtile) {
10638 TEST_REQUIRES_ARM_NEON_V8;
10639 for (size_t k = 1; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080010640 for (uint32_t n = 1; n <= 8; n++) {
10641 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010642 GemmMicrokernelTester()
10643 .mr(2)
10644 .nr(8)
10645 .kr(4)
10646 .sr(1)
10647 .m(m)
10648 .n(n)
10649 .k(k)
10650 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080010651 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010652 }
10653 }
10654 }
10655 }
10656
10657 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_LD1R, k_gt_16) {
10658 TEST_REQUIRES_ARM_NEON_V8;
10659 for (size_t k = 17; k < 32; k++) {
10660 GemmMicrokernelTester()
10661 .mr(2)
10662 .nr(8)
10663 .kr(4)
10664 .sr(1)
10665 .m(2)
10666 .n(8)
10667 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080010668 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010669 }
10670 }
10671
10672 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_LD1R, k_gt_16_strided_a) {
10673 TEST_REQUIRES_ARM_NEON_V8;
10674 for (size_t k = 17; k < 32; k++) {
10675 GemmMicrokernelTester()
10676 .mr(2)
10677 .nr(8)
10678 .kr(4)
10679 .sr(1)
10680 .m(2)
10681 .n(8)
10682 .k(k)
10683 .a_stride(37)
Marat Dukhan50323b82022-01-11 00:12:01 -080010684 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010685 }
10686 }
10687
10688 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_LD1R, k_gt_16_subtile) {
10689 TEST_REQUIRES_ARM_NEON_V8;
10690 for (size_t k = 17; k < 32; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080010691 for (uint32_t n = 1; n <= 8; n++) {
10692 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010693 GemmMicrokernelTester()
10694 .mr(2)
10695 .nr(8)
10696 .kr(4)
10697 .sr(1)
10698 .m(m)
10699 .n(n)
10700 .k(k)
10701 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080010702 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010703 }
10704 }
10705 }
10706 }
10707
10708 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_LD1R, k_div_16) {
10709 TEST_REQUIRES_ARM_NEON_V8;
10710 for (size_t k = 32; k <= 160; k += 16) {
10711 GemmMicrokernelTester()
10712 .mr(2)
10713 .nr(8)
10714 .kr(4)
10715 .sr(1)
10716 .m(2)
10717 .n(8)
10718 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080010719 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010720 }
10721 }
10722
10723 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_LD1R, k_div_16_strided_a) {
10724 TEST_REQUIRES_ARM_NEON_V8;
10725 for (size_t k = 32; k <= 160; k += 16) {
10726 GemmMicrokernelTester()
10727 .mr(2)
10728 .nr(8)
10729 .kr(4)
10730 .sr(1)
10731 .m(2)
10732 .n(8)
10733 .k(k)
10734 .a_stride(163)
Marat Dukhan50323b82022-01-11 00:12:01 -080010735 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010736 }
10737 }
10738
10739 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_LD1R, k_div_16_subtile) {
10740 TEST_REQUIRES_ARM_NEON_V8;
10741 for (size_t k = 32; k <= 160; k += 16) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080010742 for (uint32_t n = 1; n <= 8; n++) {
10743 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010744 GemmMicrokernelTester()
10745 .mr(2)
10746 .nr(8)
10747 .kr(4)
10748 .sr(1)
10749 .m(m)
10750 .n(n)
10751 .k(k)
10752 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080010753 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010754 }
10755 }
10756 }
10757 }
10758
10759 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_LD1R, n_gt_8) {
10760 TEST_REQUIRES_ARM_NEON_V8;
10761 for (uint32_t n = 9; n < 16; n++) {
10762 for (size_t k = 1; k <= 80; k += 17) {
10763 GemmMicrokernelTester()
10764 .mr(2)
10765 .nr(8)
10766 .kr(4)
10767 .sr(1)
10768 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080010769 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010770 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080010771 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010772 }
10773 }
10774 }
10775
10776 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_LD1R, n_gt_8_strided_cn) {
10777 TEST_REQUIRES_ARM_NEON_V8;
10778 for (uint32_t n = 9; n < 16; n++) {
10779 for (size_t k = 1; k <= 80; k += 17) {
10780 GemmMicrokernelTester()
10781 .mr(2)
10782 .nr(8)
10783 .kr(4)
10784 .sr(1)
10785 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080010786 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010787 .k(k)
10788 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080010789 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010790 }
10791 }
10792 }
10793
10794 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_LD1R, n_gt_8_strided_a) {
10795 TEST_REQUIRES_ARM_NEON_V8;
10796 for (uint32_t n = 9; n < 16; n++) {
10797 for (size_t k = 1; k <= 80; k += 17) {
10798 GemmMicrokernelTester()
10799 .mr(2)
10800 .nr(8)
10801 .kr(4)
10802 .sr(1)
10803 .m(2)
10804 .n(n)
10805 .k(k)
10806 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080010807 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010808 }
10809 }
10810 }
10811
10812 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_LD1R, n_gt_8_subtile) {
10813 TEST_REQUIRES_ARM_NEON_V8;
10814 for (uint32_t n = 9; n < 16; n++) {
10815 for (size_t k = 1; k <= 80; k += 17) {
10816 for (uint32_t m = 1; m <= 2; m++) {
10817 GemmMicrokernelTester()
10818 .mr(2)
10819 .nr(8)
10820 .kr(4)
10821 .sr(1)
10822 .m(m)
10823 .n(n)
10824 .k(k)
10825 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080010826 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010827 }
10828 }
10829 }
10830 }
10831
10832 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_LD1R, n_div_8) {
10833 TEST_REQUIRES_ARM_NEON_V8;
10834 for (uint32_t n = 16; n <= 24; n += 8) {
10835 for (size_t k = 1; k <= 80; k += 17) {
10836 GemmMicrokernelTester()
10837 .mr(2)
10838 .nr(8)
10839 .kr(4)
10840 .sr(1)
10841 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080010842 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010843 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080010844 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010845 }
10846 }
10847 }
10848
10849 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_LD1R, n_div_8_strided_cn) {
10850 TEST_REQUIRES_ARM_NEON_V8;
10851 for (uint32_t n = 16; n <= 24; n += 8) {
10852 for (size_t k = 1; k <= 80; k += 17) {
10853 GemmMicrokernelTester()
10854 .mr(2)
10855 .nr(8)
10856 .kr(4)
10857 .sr(1)
10858 .m(2)
10859 .n(n)
10860 .k(k)
10861 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080010862 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010863 }
10864 }
10865 }
10866
10867 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_LD1R, n_div_8_strided_a) {
10868 TEST_REQUIRES_ARM_NEON_V8;
10869 for (uint32_t n = 16; n <= 24; n += 8) {
10870 for (size_t k = 1; k <= 80; k += 17) {
10871 GemmMicrokernelTester()
10872 .mr(2)
10873 .nr(8)
10874 .kr(4)
10875 .sr(1)
10876 .m(2)
10877 .n(n)
10878 .k(k)
10879 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080010880 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010881 }
10882 }
10883 }
10884
10885 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_LD1R, n_div_8_subtile) {
10886 TEST_REQUIRES_ARM_NEON_V8;
10887 for (uint32_t n = 16; n <= 24; n += 8) {
10888 for (size_t k = 1; k <= 80; k += 17) {
10889 for (uint32_t m = 1; m <= 2; m++) {
10890 GemmMicrokernelTester()
10891 .mr(2)
10892 .nr(8)
10893 .kr(4)
10894 .sr(1)
10895 .m(m)
10896 .n(n)
10897 .k(k)
10898 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080010899 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010900 }
10901 }
10902 }
10903 }
10904
10905 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_LD1R, strided_cm_subtile) {
10906 TEST_REQUIRES_ARM_NEON_V8;
10907 for (size_t k = 1; k <= 80; k += 17) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080010908 for (uint32_t n = 1; n <= 8; n++) {
10909 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010910 GemmMicrokernelTester()
10911 .mr(2)
10912 .nr(8)
10913 .kr(4)
10914 .sr(1)
10915 .m(m)
10916 .n(n)
10917 .k(k)
10918 .cm_stride(11)
10919 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080010920 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010921 }
10922 }
10923 }
10924 }
10925
10926 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_LD1R, qmin) {
10927 TEST_REQUIRES_ARM_NEON_V8;
10928 GemmMicrokernelTester()
10929 .mr(2)
10930 .nr(8)
10931 .kr(4)
10932 .sr(1)
10933 .m(2)
10934 .n(8)
10935 .k(16)
10936 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080010937 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010938 }
10939
10940 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_LD1R, qmax) {
10941 TEST_REQUIRES_ARM_NEON_V8;
10942 GemmMicrokernelTester()
10943 .mr(2)
10944 .nr(8)
10945 .kr(4)
10946 .sr(1)
10947 .m(2)
10948 .n(8)
10949 .k(16)
10950 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080010951 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010952 }
10953
10954 TEST(QC8_GEMM_MINMAX_FP32_2X8C4__NEONV8_MLAL_LD1R, strided_cm) {
10955 TEST_REQUIRES_ARM_NEON_V8;
10956 GemmMicrokernelTester()
10957 .mr(2)
10958 .nr(8)
10959 .kr(4)
10960 .sr(1)
10961 .m(2)
10962 .n(8)
10963 .k(16)
10964 .cm_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080010965 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_ld1r, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010966 }
10967#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
10968
10969
10970#if XNN_ARCH_ARM || XNN_ARCH_ARM64
10971 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_eq_16) {
10972 TEST_REQUIRES_ARM_NEON;
10973 GemmMicrokernelTester()
10974 .mr(1)
10975 .nr(8)
10976 .kr(2)
10977 .sr(1)
10978 .m(1)
10979 .n(8)
10980 .k(16)
Marat Dukhan50323b82022-01-11 00:12:01 -080010981 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010982 }
10983
10984 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, strided_cn) {
10985 TEST_REQUIRES_ARM_NEON;
10986 GemmMicrokernelTester()
10987 .mr(1)
10988 .nr(8)
10989 .kr(2)
10990 .sr(1)
10991 .m(1)
10992 .n(8)
10993 .k(16)
10994 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080010995 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080010996 }
10997
10998 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_eq_16_strided_a) {
10999 TEST_REQUIRES_ARM_NEON;
11000 GemmMicrokernelTester()
11001 .mr(1)
11002 .nr(8)
11003 .kr(2)
11004 .sr(1)
11005 .m(1)
11006 .n(8)
11007 .k(16)
11008 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080011009 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011010 }
11011
11012 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_eq_16_subtile) {
11013 TEST_REQUIRES_ARM_NEON;
Zhi An Ng83844ae2022-01-14 09:52:25 -080011014 for (uint32_t n = 1; n <= 8; n++) {
11015 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011016 GemmMicrokernelTester()
11017 .mr(1)
11018 .nr(8)
11019 .kr(2)
11020 .sr(1)
11021 .m(m)
11022 .n(n)
11023 .k(16)
11024 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080011025 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011026 }
11027 }
11028 }
11029
11030 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_eq_16_subtile_m) {
11031 TEST_REQUIRES_ARM_NEON;
11032 for (uint32_t m = 1; m <= 1; m++) {
11033 GemmMicrokernelTester()
11034 .mr(1)
11035 .nr(8)
11036 .kr(2)
11037 .sr(1)
11038 .m(m)
11039 .n(8)
11040 .k(16)
11041 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080011042 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011043 }
11044 }
11045
11046 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_eq_16_subtile_n) {
11047 TEST_REQUIRES_ARM_NEON;
11048 for (uint32_t n = 1; n <= 8; n++) {
11049 GemmMicrokernelTester()
11050 .mr(1)
11051 .nr(8)
11052 .kr(2)
11053 .sr(1)
11054 .m(1)
11055 .n(n)
11056 .k(16)
11057 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080011058 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011059 }
11060 }
11061
11062 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_lt_16) {
11063 TEST_REQUIRES_ARM_NEON;
11064 for (size_t k = 1; k < 16; k++) {
11065 GemmMicrokernelTester()
11066 .mr(1)
11067 .nr(8)
11068 .kr(2)
11069 .sr(1)
11070 .m(1)
11071 .n(8)
11072 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080011073 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011074 }
11075 }
11076
11077 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_lt_16_strided_a) {
11078 TEST_REQUIRES_ARM_NEON;
11079 for (size_t k = 1; k < 16; k++) {
11080 GemmMicrokernelTester()
11081 .mr(1)
11082 .nr(8)
11083 .kr(2)
11084 .sr(1)
11085 .m(1)
11086 .n(8)
11087 .k(k)
11088 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080011089 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011090 }
11091 }
11092
11093 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_lt_16_subtile) {
11094 TEST_REQUIRES_ARM_NEON;
11095 for (size_t k = 1; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080011096 for (uint32_t n = 1; n <= 8; n++) {
11097 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011098 GemmMicrokernelTester()
11099 .mr(1)
11100 .nr(8)
11101 .kr(2)
11102 .sr(1)
11103 .m(m)
11104 .n(n)
11105 .k(k)
11106 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080011107 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011108 }
11109 }
11110 }
11111 }
11112
11113 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_gt_16) {
11114 TEST_REQUIRES_ARM_NEON;
11115 for (size_t k = 17; k < 32; k++) {
11116 GemmMicrokernelTester()
11117 .mr(1)
11118 .nr(8)
11119 .kr(2)
11120 .sr(1)
11121 .m(1)
11122 .n(8)
11123 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080011124 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011125 }
11126 }
11127
11128 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_gt_16_strided_a) {
11129 TEST_REQUIRES_ARM_NEON;
11130 for (size_t k = 17; k < 32; k++) {
11131 GemmMicrokernelTester()
11132 .mr(1)
11133 .nr(8)
11134 .kr(2)
11135 .sr(1)
11136 .m(1)
11137 .n(8)
11138 .k(k)
11139 .a_stride(37)
Marat Dukhan50323b82022-01-11 00:12:01 -080011140 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011141 }
11142 }
11143
11144 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_gt_16_subtile) {
11145 TEST_REQUIRES_ARM_NEON;
11146 for (size_t k = 17; k < 32; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080011147 for (uint32_t n = 1; n <= 8; n++) {
11148 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011149 GemmMicrokernelTester()
11150 .mr(1)
11151 .nr(8)
11152 .kr(2)
11153 .sr(1)
11154 .m(m)
11155 .n(n)
11156 .k(k)
11157 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080011158 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011159 }
11160 }
11161 }
11162 }
11163
11164 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_div_16) {
11165 TEST_REQUIRES_ARM_NEON;
11166 for (size_t k = 32; k <= 160; k += 16) {
11167 GemmMicrokernelTester()
11168 .mr(1)
11169 .nr(8)
11170 .kr(2)
11171 .sr(1)
11172 .m(1)
11173 .n(8)
11174 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080011175 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011176 }
11177 }
11178
11179 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_div_16_strided_a) {
11180 TEST_REQUIRES_ARM_NEON;
11181 for (size_t k = 32; k <= 160; k += 16) {
11182 GemmMicrokernelTester()
11183 .mr(1)
11184 .nr(8)
11185 .kr(2)
11186 .sr(1)
11187 .m(1)
11188 .n(8)
11189 .k(k)
11190 .a_stride(163)
Marat Dukhan50323b82022-01-11 00:12:01 -080011191 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011192 }
11193 }
11194
11195 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, k_div_16_subtile) {
11196 TEST_REQUIRES_ARM_NEON;
11197 for (size_t k = 32; k <= 160; k += 16) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080011198 for (uint32_t n = 1; n <= 8; n++) {
11199 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011200 GemmMicrokernelTester()
11201 .mr(1)
11202 .nr(8)
11203 .kr(2)
11204 .sr(1)
11205 .m(m)
11206 .n(n)
11207 .k(k)
11208 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080011209 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011210 }
11211 }
11212 }
11213 }
11214
11215 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, n_gt_8) {
11216 TEST_REQUIRES_ARM_NEON;
11217 for (uint32_t n = 9; n < 16; n++) {
11218 for (size_t k = 1; k <= 80; k += 17) {
11219 GemmMicrokernelTester()
11220 .mr(1)
11221 .nr(8)
11222 .kr(2)
11223 .sr(1)
11224 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080011225 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011226 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080011227 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011228 }
11229 }
11230 }
11231
11232 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, n_gt_8_strided_cn) {
11233 TEST_REQUIRES_ARM_NEON;
11234 for (uint32_t n = 9; n < 16; n++) {
11235 for (size_t k = 1; k <= 80; k += 17) {
11236 GemmMicrokernelTester()
11237 .mr(1)
11238 .nr(8)
11239 .kr(2)
11240 .sr(1)
11241 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080011242 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011243 .k(k)
11244 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080011245 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011246 }
11247 }
11248 }
11249
11250 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, n_gt_8_strided_a) {
11251 TEST_REQUIRES_ARM_NEON;
11252 for (uint32_t n = 9; n < 16; n++) {
11253 for (size_t k = 1; k <= 80; k += 17) {
11254 GemmMicrokernelTester()
11255 .mr(1)
11256 .nr(8)
11257 .kr(2)
11258 .sr(1)
11259 .m(1)
11260 .n(n)
11261 .k(k)
11262 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080011263 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011264 }
11265 }
11266 }
11267
11268 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, n_gt_8_subtile) {
11269 TEST_REQUIRES_ARM_NEON;
11270 for (uint32_t n = 9; n < 16; n++) {
11271 for (size_t k = 1; k <= 80; k += 17) {
11272 for (uint32_t m = 1; m <= 1; m++) {
11273 GemmMicrokernelTester()
11274 .mr(1)
11275 .nr(8)
11276 .kr(2)
11277 .sr(1)
11278 .m(m)
11279 .n(n)
11280 .k(k)
11281 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080011282 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011283 }
11284 }
11285 }
11286 }
11287
11288 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, n_div_8) {
11289 TEST_REQUIRES_ARM_NEON;
11290 for (uint32_t n = 16; n <= 24; n += 8) {
11291 for (size_t k = 1; k <= 80; k += 17) {
11292 GemmMicrokernelTester()
11293 .mr(1)
11294 .nr(8)
11295 .kr(2)
11296 .sr(1)
11297 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080011298 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011299 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080011300 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011301 }
11302 }
11303 }
11304
11305 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, n_div_8_strided_cn) {
11306 TEST_REQUIRES_ARM_NEON;
11307 for (uint32_t n = 16; n <= 24; n += 8) {
11308 for (size_t k = 1; k <= 80; k += 17) {
11309 GemmMicrokernelTester()
11310 .mr(1)
11311 .nr(8)
11312 .kr(2)
11313 .sr(1)
11314 .m(1)
11315 .n(n)
11316 .k(k)
11317 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080011318 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011319 }
11320 }
11321 }
11322
11323 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, n_div_8_strided_a) {
11324 TEST_REQUIRES_ARM_NEON;
11325 for (uint32_t n = 16; n <= 24; n += 8) {
11326 for (size_t k = 1; k <= 80; k += 17) {
11327 GemmMicrokernelTester()
11328 .mr(1)
11329 .nr(8)
11330 .kr(2)
11331 .sr(1)
11332 .m(1)
11333 .n(n)
11334 .k(k)
11335 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080011336 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011337 }
11338 }
11339 }
11340
11341 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, n_div_8_subtile) {
11342 TEST_REQUIRES_ARM_NEON;
11343 for (uint32_t n = 16; n <= 24; n += 8) {
11344 for (size_t k = 1; k <= 80; k += 17) {
11345 for (uint32_t m = 1; m <= 1; m++) {
11346 GemmMicrokernelTester()
11347 .mr(1)
11348 .nr(8)
11349 .kr(2)
11350 .sr(1)
11351 .m(m)
11352 .n(n)
11353 .k(k)
11354 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080011355 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011356 }
11357 }
11358 }
11359 }
11360
11361 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, strided_cm_subtile) {
11362 TEST_REQUIRES_ARM_NEON;
11363 for (size_t k = 1; k <= 80; k += 17) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080011364 for (uint32_t n = 1; n <= 8; n++) {
11365 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011366 GemmMicrokernelTester()
11367 .mr(1)
11368 .nr(8)
11369 .kr(2)
11370 .sr(1)
11371 .m(m)
11372 .n(n)
11373 .k(k)
11374 .cm_stride(11)
11375 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080011376 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011377 }
11378 }
11379 }
11380 }
11381
11382 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, qmin) {
11383 TEST_REQUIRES_ARM_NEON;
11384 GemmMicrokernelTester()
11385 .mr(1)
11386 .nr(8)
11387 .kr(2)
11388 .sr(1)
11389 .m(1)
11390 .n(8)
11391 .k(16)
11392 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080011393 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011394 }
11395
11396 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, qmax) {
11397 TEST_REQUIRES_ARM_NEON;
11398 GemmMicrokernelTester()
11399 .mr(1)
11400 .nr(8)
11401 .kr(2)
11402 .sr(1)
11403 .m(1)
11404 .n(8)
11405 .k(16)
11406 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080011407 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011408 }
11409
11410 TEST(QC8_GEMM_MINMAX_FP32_1X8C2__NEON_MLAL_DUP, strided_cm) {
11411 TEST_REQUIRES_ARM_NEON;
11412 GemmMicrokernelTester()
11413 .mr(1)
11414 .nr(8)
11415 .kr(2)
11416 .sr(1)
11417 .m(1)
11418 .n(8)
11419 .k(16)
11420 .cm_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080011421 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_minmax_neon_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011422 }
11423#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
11424
11425
11426#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
11427 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_eq_16) {
11428 TEST_REQUIRES_ARM_NEON;
11429 GemmMicrokernelTester()
11430 .mr(2)
11431 .nr(8)
11432 .kr(8)
11433 .sr(1)
11434 .m(2)
11435 .n(8)
11436 .k(16)
Marat Dukhan50323b82022-01-11 00:12:01 -080011437 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011438 }
11439
11440 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, strided_cn) {
11441 TEST_REQUIRES_ARM_NEON;
11442 GemmMicrokernelTester()
11443 .mr(2)
11444 .nr(8)
11445 .kr(8)
11446 .sr(1)
11447 .m(2)
11448 .n(8)
11449 .k(16)
11450 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080011451 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011452 }
11453
11454 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_eq_16_strided_a) {
11455 TEST_REQUIRES_ARM_NEON;
11456 GemmMicrokernelTester()
11457 .mr(2)
11458 .nr(8)
11459 .kr(8)
11460 .sr(1)
11461 .m(2)
11462 .n(8)
11463 .k(16)
11464 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080011465 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011466 }
11467
11468 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_eq_16_subtile) {
11469 TEST_REQUIRES_ARM_NEON;
Zhi An Ng83844ae2022-01-14 09:52:25 -080011470 for (uint32_t n = 1; n <= 8; n++) {
11471 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011472 GemmMicrokernelTester()
11473 .mr(2)
11474 .nr(8)
11475 .kr(8)
11476 .sr(1)
11477 .m(m)
11478 .n(n)
11479 .k(16)
11480 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080011481 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011482 }
11483 }
11484 }
11485
11486 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_eq_16_subtile_m) {
11487 TEST_REQUIRES_ARM_NEON;
11488 for (uint32_t m = 1; m <= 2; m++) {
11489 GemmMicrokernelTester()
11490 .mr(2)
11491 .nr(8)
11492 .kr(8)
11493 .sr(1)
11494 .m(m)
11495 .n(8)
11496 .k(16)
11497 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080011498 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011499 }
11500 }
11501
11502 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_eq_16_subtile_n) {
11503 TEST_REQUIRES_ARM_NEON;
11504 for (uint32_t n = 1; n <= 8; n++) {
11505 GemmMicrokernelTester()
11506 .mr(2)
11507 .nr(8)
11508 .kr(8)
11509 .sr(1)
11510 .m(2)
11511 .n(n)
11512 .k(16)
11513 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080011514 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011515 }
11516 }
11517
11518 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_lt_16) {
11519 TEST_REQUIRES_ARM_NEON;
11520 for (size_t k = 1; k < 16; k++) {
11521 GemmMicrokernelTester()
11522 .mr(2)
11523 .nr(8)
11524 .kr(8)
11525 .sr(1)
11526 .m(2)
11527 .n(8)
11528 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080011529 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011530 }
11531 }
11532
11533 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_lt_16_strided_a) {
11534 TEST_REQUIRES_ARM_NEON;
11535 for (size_t k = 1; k < 16; k++) {
11536 GemmMicrokernelTester()
11537 .mr(2)
11538 .nr(8)
11539 .kr(8)
11540 .sr(1)
11541 .m(2)
11542 .n(8)
11543 .k(k)
11544 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080011545 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011546 }
11547 }
11548
11549 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_lt_16_subtile) {
11550 TEST_REQUIRES_ARM_NEON;
11551 for (size_t k = 1; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080011552 for (uint32_t n = 1; n <= 8; n++) {
11553 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011554 GemmMicrokernelTester()
11555 .mr(2)
11556 .nr(8)
11557 .kr(8)
11558 .sr(1)
11559 .m(m)
11560 .n(n)
11561 .k(k)
11562 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080011563 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011564 }
11565 }
11566 }
11567 }
11568
11569 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_gt_16) {
11570 TEST_REQUIRES_ARM_NEON;
11571 for (size_t k = 17; k < 32; k++) {
11572 GemmMicrokernelTester()
11573 .mr(2)
11574 .nr(8)
11575 .kr(8)
11576 .sr(1)
11577 .m(2)
11578 .n(8)
11579 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080011580 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011581 }
11582 }
11583
11584 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_gt_16_strided_a) {
11585 TEST_REQUIRES_ARM_NEON;
11586 for (size_t k = 17; k < 32; k++) {
11587 GemmMicrokernelTester()
11588 .mr(2)
11589 .nr(8)
11590 .kr(8)
11591 .sr(1)
11592 .m(2)
11593 .n(8)
11594 .k(k)
11595 .a_stride(37)
Marat Dukhan50323b82022-01-11 00:12:01 -080011596 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011597 }
11598 }
11599
11600 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_gt_16_subtile) {
11601 TEST_REQUIRES_ARM_NEON;
11602 for (size_t k = 17; k < 32; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080011603 for (uint32_t n = 1; n <= 8; n++) {
11604 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011605 GemmMicrokernelTester()
11606 .mr(2)
11607 .nr(8)
11608 .kr(8)
11609 .sr(1)
11610 .m(m)
11611 .n(n)
11612 .k(k)
11613 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080011614 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011615 }
11616 }
11617 }
11618 }
11619
11620 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_div_16) {
11621 TEST_REQUIRES_ARM_NEON;
11622 for (size_t k = 32; k <= 160; k += 16) {
11623 GemmMicrokernelTester()
11624 .mr(2)
11625 .nr(8)
11626 .kr(8)
11627 .sr(1)
11628 .m(2)
11629 .n(8)
11630 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080011631 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011632 }
11633 }
11634
11635 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_div_16_strided_a) {
11636 TEST_REQUIRES_ARM_NEON;
11637 for (size_t k = 32; k <= 160; k += 16) {
11638 GemmMicrokernelTester()
11639 .mr(2)
11640 .nr(8)
11641 .kr(8)
11642 .sr(1)
11643 .m(2)
11644 .n(8)
11645 .k(k)
11646 .a_stride(163)
Marat Dukhan50323b82022-01-11 00:12:01 -080011647 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011648 }
11649 }
11650
11651 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, k_div_16_subtile) {
11652 TEST_REQUIRES_ARM_NEON;
11653 for (size_t k = 32; k <= 160; k += 16) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080011654 for (uint32_t n = 1; n <= 8; n++) {
11655 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011656 GemmMicrokernelTester()
11657 .mr(2)
11658 .nr(8)
11659 .kr(8)
11660 .sr(1)
11661 .m(m)
11662 .n(n)
11663 .k(k)
11664 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080011665 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011666 }
11667 }
11668 }
11669 }
11670
11671 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, n_gt_8) {
11672 TEST_REQUIRES_ARM_NEON;
11673 for (uint32_t n = 9; n < 16; n++) {
11674 for (size_t k = 1; k <= 80; k += 17) {
11675 GemmMicrokernelTester()
11676 .mr(2)
11677 .nr(8)
11678 .kr(8)
11679 .sr(1)
11680 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080011681 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011682 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080011683 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011684 }
11685 }
11686 }
11687
11688 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, n_gt_8_strided_cn) {
11689 TEST_REQUIRES_ARM_NEON;
11690 for (uint32_t n = 9; n < 16; n++) {
11691 for (size_t k = 1; k <= 80; k += 17) {
11692 GemmMicrokernelTester()
11693 .mr(2)
11694 .nr(8)
11695 .kr(8)
11696 .sr(1)
11697 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080011698 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011699 .k(k)
11700 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080011701 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011702 }
11703 }
11704 }
11705
11706 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, n_gt_8_strided_a) {
11707 TEST_REQUIRES_ARM_NEON;
11708 for (uint32_t n = 9; n < 16; n++) {
11709 for (size_t k = 1; k <= 80; k += 17) {
11710 GemmMicrokernelTester()
11711 .mr(2)
11712 .nr(8)
11713 .kr(8)
11714 .sr(1)
11715 .m(2)
11716 .n(n)
11717 .k(k)
11718 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080011719 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011720 }
11721 }
11722 }
11723
11724 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, n_gt_8_subtile) {
11725 TEST_REQUIRES_ARM_NEON;
11726 for (uint32_t n = 9; n < 16; n++) {
11727 for (size_t k = 1; k <= 80; k += 17) {
11728 for (uint32_t m = 1; m <= 2; m++) {
11729 GemmMicrokernelTester()
11730 .mr(2)
11731 .nr(8)
11732 .kr(8)
11733 .sr(1)
11734 .m(m)
11735 .n(n)
11736 .k(k)
11737 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080011738 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011739 }
11740 }
11741 }
11742 }
11743
11744 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, n_div_8) {
11745 TEST_REQUIRES_ARM_NEON;
11746 for (uint32_t n = 16; n <= 24; n += 8) {
11747 for (size_t k = 1; k <= 80; k += 17) {
11748 GemmMicrokernelTester()
11749 .mr(2)
11750 .nr(8)
11751 .kr(8)
11752 .sr(1)
11753 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080011754 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011755 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080011756 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011757 }
11758 }
11759 }
11760
11761 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, n_div_8_strided_cn) {
11762 TEST_REQUIRES_ARM_NEON;
11763 for (uint32_t n = 16; n <= 24; n += 8) {
11764 for (size_t k = 1; k <= 80; k += 17) {
11765 GemmMicrokernelTester()
11766 .mr(2)
11767 .nr(8)
11768 .kr(8)
11769 .sr(1)
11770 .m(2)
11771 .n(n)
11772 .k(k)
11773 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080011774 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011775 }
11776 }
11777 }
11778
11779 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, n_div_8_strided_a) {
11780 TEST_REQUIRES_ARM_NEON;
11781 for (uint32_t n = 16; n <= 24; n += 8) {
11782 for (size_t k = 1; k <= 80; k += 17) {
11783 GemmMicrokernelTester()
11784 .mr(2)
11785 .nr(8)
11786 .kr(8)
11787 .sr(1)
11788 .m(2)
11789 .n(n)
11790 .k(k)
11791 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080011792 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011793 }
11794 }
11795 }
11796
11797 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, n_div_8_subtile) {
11798 TEST_REQUIRES_ARM_NEON;
11799 for (uint32_t n = 16; n <= 24; n += 8) {
11800 for (size_t k = 1; k <= 80; k += 17) {
11801 for (uint32_t m = 1; m <= 2; m++) {
11802 GemmMicrokernelTester()
11803 .mr(2)
11804 .nr(8)
11805 .kr(8)
11806 .sr(1)
11807 .m(m)
11808 .n(n)
11809 .k(k)
11810 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080011811 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011812 }
11813 }
11814 }
11815 }
11816
11817 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, strided_cm_subtile) {
11818 TEST_REQUIRES_ARM_NEON;
11819 for (size_t k = 1; k <= 80; k += 17) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080011820 for (uint32_t n = 1; n <= 8; n++) {
11821 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011822 GemmMicrokernelTester()
11823 .mr(2)
11824 .nr(8)
11825 .kr(8)
11826 .sr(1)
11827 .m(m)
11828 .n(n)
11829 .k(k)
11830 .cm_stride(11)
11831 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080011832 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011833 }
11834 }
11835 }
11836 }
11837
11838 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, qmin) {
11839 TEST_REQUIRES_ARM_NEON;
11840 GemmMicrokernelTester()
11841 .mr(2)
11842 .nr(8)
11843 .kr(8)
11844 .sr(1)
11845 .m(2)
11846 .n(8)
11847 .k(16)
11848 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080011849 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011850 }
11851
11852 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, qmax) {
11853 TEST_REQUIRES_ARM_NEON;
11854 GemmMicrokernelTester()
11855 .mr(2)
11856 .nr(8)
11857 .kr(8)
11858 .sr(1)
11859 .m(2)
11860 .n(8)
11861 .k(16)
11862 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080011863 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011864 }
11865
11866 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL, strided_cm) {
11867 TEST_REQUIRES_ARM_NEON;
11868 GemmMicrokernelTester()
11869 .mr(2)
11870 .nr(8)
11871 .kr(8)
11872 .sr(1)
11873 .m(2)
11874 .n(8)
11875 .k(16)
11876 .cm_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080011877 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011878 }
11879#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
11880
11881
11882#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
11883 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, k_eq_8) {
11884 TEST_REQUIRES_ARM_NEON;
11885 GemmMicrokernelTester()
11886 .mr(2)
11887 .nr(8)
11888 .kr(8)
11889 .sr(1)
11890 .m(2)
11891 .n(8)
11892 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080011893 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011894 }
11895
11896 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, strided_cn) {
11897 TEST_REQUIRES_ARM_NEON;
11898 GemmMicrokernelTester()
11899 .mr(2)
11900 .nr(8)
11901 .kr(8)
11902 .sr(1)
11903 .m(2)
11904 .n(8)
11905 .k(8)
11906 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080011907 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011908 }
11909
11910 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, k_eq_8_strided_a) {
11911 TEST_REQUIRES_ARM_NEON;
11912 GemmMicrokernelTester()
11913 .mr(2)
11914 .nr(8)
11915 .kr(8)
11916 .sr(1)
11917 .m(2)
11918 .n(8)
11919 .k(8)
11920 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080011921 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011922 }
11923
11924 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, k_eq_8_subtile) {
11925 TEST_REQUIRES_ARM_NEON;
Zhi An Ng83844ae2022-01-14 09:52:25 -080011926 for (uint32_t n = 1; n <= 8; n++) {
11927 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011928 GemmMicrokernelTester()
11929 .mr(2)
11930 .nr(8)
11931 .kr(8)
11932 .sr(1)
11933 .m(m)
11934 .n(n)
11935 .k(8)
11936 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080011937 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011938 }
11939 }
11940 }
11941
11942 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, k_eq_8_subtile_m) {
11943 TEST_REQUIRES_ARM_NEON;
11944 for (uint32_t m = 1; m <= 2; m++) {
11945 GemmMicrokernelTester()
11946 .mr(2)
11947 .nr(8)
11948 .kr(8)
11949 .sr(1)
11950 .m(m)
11951 .n(8)
11952 .k(8)
11953 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080011954 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011955 }
11956 }
11957
11958 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, k_eq_8_subtile_n) {
11959 TEST_REQUIRES_ARM_NEON;
11960 for (uint32_t n = 1; n <= 8; n++) {
11961 GemmMicrokernelTester()
11962 .mr(2)
11963 .nr(8)
11964 .kr(8)
11965 .sr(1)
11966 .m(2)
11967 .n(n)
11968 .k(8)
11969 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080011970 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011971 }
11972 }
11973
11974 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, k_lt_8) {
11975 TEST_REQUIRES_ARM_NEON;
11976 for (size_t k = 1; k < 8; k++) {
11977 GemmMicrokernelTester()
11978 .mr(2)
11979 .nr(8)
11980 .kr(8)
11981 .sr(1)
11982 .m(2)
11983 .n(8)
11984 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080011985 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080011986 }
11987 }
11988
11989 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, k_lt_8_strided_a) {
11990 TEST_REQUIRES_ARM_NEON;
11991 for (size_t k = 1; k < 8; k++) {
11992 GemmMicrokernelTester()
11993 .mr(2)
11994 .nr(8)
11995 .kr(8)
11996 .sr(1)
11997 .m(2)
11998 .n(8)
11999 .k(k)
12000 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080012001 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012002 }
12003 }
12004
12005 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, k_lt_8_subtile) {
12006 TEST_REQUIRES_ARM_NEON;
12007 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080012008 for (uint32_t n = 1; n <= 8; n++) {
12009 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012010 GemmMicrokernelTester()
12011 .mr(2)
12012 .nr(8)
12013 .kr(8)
12014 .sr(1)
12015 .m(m)
12016 .n(n)
12017 .k(k)
12018 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080012019 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012020 }
12021 }
12022 }
12023 }
12024
12025 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, k_gt_8) {
12026 TEST_REQUIRES_ARM_NEON;
12027 for (size_t k = 9; k < 16; k++) {
12028 GemmMicrokernelTester()
12029 .mr(2)
12030 .nr(8)
12031 .kr(8)
12032 .sr(1)
12033 .m(2)
12034 .n(8)
12035 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080012036 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012037 }
12038 }
12039
12040 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, k_gt_8_strided_a) {
12041 TEST_REQUIRES_ARM_NEON;
12042 for (size_t k = 9; k < 16; k++) {
12043 GemmMicrokernelTester()
12044 .mr(2)
12045 .nr(8)
12046 .kr(8)
12047 .sr(1)
12048 .m(2)
12049 .n(8)
12050 .k(k)
12051 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080012052 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012053 }
12054 }
12055
12056 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, k_gt_8_subtile) {
12057 TEST_REQUIRES_ARM_NEON;
12058 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080012059 for (uint32_t n = 1; n <= 8; n++) {
12060 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012061 GemmMicrokernelTester()
12062 .mr(2)
12063 .nr(8)
12064 .kr(8)
12065 .sr(1)
12066 .m(m)
12067 .n(n)
12068 .k(k)
12069 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080012070 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012071 }
12072 }
12073 }
12074 }
12075
12076 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, k_div_8) {
12077 TEST_REQUIRES_ARM_NEON;
12078 for (size_t k = 16; k <= 80; k += 8) {
12079 GemmMicrokernelTester()
12080 .mr(2)
12081 .nr(8)
12082 .kr(8)
12083 .sr(1)
12084 .m(2)
12085 .n(8)
12086 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080012087 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012088 }
12089 }
12090
12091 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, k_div_8_strided_a) {
12092 TEST_REQUIRES_ARM_NEON;
12093 for (size_t k = 16; k <= 80; k += 8) {
12094 GemmMicrokernelTester()
12095 .mr(2)
12096 .nr(8)
12097 .kr(8)
12098 .sr(1)
12099 .m(2)
12100 .n(8)
12101 .k(k)
12102 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080012103 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012104 }
12105 }
12106
12107 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, k_div_8_subtile) {
12108 TEST_REQUIRES_ARM_NEON;
12109 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080012110 for (uint32_t n = 1; n <= 8; n++) {
12111 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012112 GemmMicrokernelTester()
12113 .mr(2)
12114 .nr(8)
12115 .kr(8)
12116 .sr(1)
12117 .m(m)
12118 .n(n)
12119 .k(k)
12120 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080012121 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012122 }
12123 }
12124 }
12125 }
12126
12127 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, n_gt_8) {
12128 TEST_REQUIRES_ARM_NEON;
12129 for (uint32_t n = 9; n < 16; n++) {
12130 for (size_t k = 1; k <= 40; k += 9) {
12131 GemmMicrokernelTester()
12132 .mr(2)
12133 .nr(8)
12134 .kr(8)
12135 .sr(1)
12136 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080012137 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012138 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080012139 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012140 }
12141 }
12142 }
12143
12144 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, n_gt_8_strided_cn) {
12145 TEST_REQUIRES_ARM_NEON;
12146 for (uint32_t n = 9; n < 16; n++) {
12147 for (size_t k = 1; k <= 40; k += 9) {
12148 GemmMicrokernelTester()
12149 .mr(2)
12150 .nr(8)
12151 .kr(8)
12152 .sr(1)
12153 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080012154 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012155 .k(k)
12156 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080012157 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012158 }
12159 }
12160 }
12161
12162 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, n_gt_8_strided_a) {
12163 TEST_REQUIRES_ARM_NEON;
12164 for (uint32_t n = 9; n < 16; n++) {
12165 for (size_t k = 1; k <= 40; k += 9) {
12166 GemmMicrokernelTester()
12167 .mr(2)
12168 .nr(8)
12169 .kr(8)
12170 .sr(1)
12171 .m(2)
12172 .n(n)
12173 .k(k)
12174 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080012175 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012176 }
12177 }
12178 }
12179
12180 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, n_gt_8_subtile) {
12181 TEST_REQUIRES_ARM_NEON;
12182 for (uint32_t n = 9; n < 16; n++) {
12183 for (size_t k = 1; k <= 40; k += 9) {
12184 for (uint32_t m = 1; m <= 2; m++) {
12185 GemmMicrokernelTester()
12186 .mr(2)
12187 .nr(8)
12188 .kr(8)
12189 .sr(1)
12190 .m(m)
12191 .n(n)
12192 .k(k)
12193 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080012194 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012195 }
12196 }
12197 }
12198 }
12199
12200 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, n_div_8) {
12201 TEST_REQUIRES_ARM_NEON;
12202 for (uint32_t n = 16; n <= 24; n += 8) {
12203 for (size_t k = 1; k <= 40; k += 9) {
12204 GemmMicrokernelTester()
12205 .mr(2)
12206 .nr(8)
12207 .kr(8)
12208 .sr(1)
12209 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080012210 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012211 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080012212 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012213 }
12214 }
12215 }
12216
12217 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, n_div_8_strided_cn) {
12218 TEST_REQUIRES_ARM_NEON;
12219 for (uint32_t n = 16; n <= 24; n += 8) {
12220 for (size_t k = 1; k <= 40; k += 9) {
12221 GemmMicrokernelTester()
12222 .mr(2)
12223 .nr(8)
12224 .kr(8)
12225 .sr(1)
12226 .m(2)
12227 .n(n)
12228 .k(k)
12229 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080012230 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012231 }
12232 }
12233 }
12234
12235 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, n_div_8_strided_a) {
12236 TEST_REQUIRES_ARM_NEON;
12237 for (uint32_t n = 16; n <= 24; n += 8) {
12238 for (size_t k = 1; k <= 40; k += 9) {
12239 GemmMicrokernelTester()
12240 .mr(2)
12241 .nr(8)
12242 .kr(8)
12243 .sr(1)
12244 .m(2)
12245 .n(n)
12246 .k(k)
12247 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080012248 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012249 }
12250 }
12251 }
12252
12253 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, n_div_8_subtile) {
12254 TEST_REQUIRES_ARM_NEON;
12255 for (uint32_t n = 16; n <= 24; n += 8) {
12256 for (size_t k = 1; k <= 40; k += 9) {
12257 for (uint32_t m = 1; m <= 2; m++) {
12258 GemmMicrokernelTester()
12259 .mr(2)
12260 .nr(8)
12261 .kr(8)
12262 .sr(1)
12263 .m(m)
12264 .n(n)
12265 .k(k)
12266 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080012267 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012268 }
12269 }
12270 }
12271 }
12272
12273 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, strided_cm_subtile) {
12274 TEST_REQUIRES_ARM_NEON;
12275 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080012276 for (uint32_t n = 1; n <= 8; n++) {
12277 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012278 GemmMicrokernelTester()
12279 .mr(2)
12280 .nr(8)
12281 .kr(8)
12282 .sr(1)
12283 .m(m)
12284 .n(n)
12285 .k(k)
12286 .cm_stride(11)
12287 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080012288 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012289 }
12290 }
12291 }
12292 }
12293
12294 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, qmin) {
12295 TEST_REQUIRES_ARM_NEON;
12296 GemmMicrokernelTester()
12297 .mr(2)
12298 .nr(8)
12299 .kr(8)
12300 .sr(1)
12301 .m(2)
12302 .n(8)
12303 .k(8)
12304 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080012305 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012306 }
12307
12308 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, qmax) {
12309 TEST_REQUIRES_ARM_NEON;
12310 GemmMicrokernelTester()
12311 .mr(2)
12312 .nr(8)
12313 .kr(8)
12314 .sr(1)
12315 .m(2)
12316 .n(8)
12317 .k(8)
12318 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080012319 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012320 }
12321
12322 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MULL, strided_cm) {
12323 TEST_REQUIRES_ARM_NEON;
12324 GemmMicrokernelTester()
12325 .mr(2)
12326 .nr(8)
12327 .kr(8)
12328 .sr(1)
12329 .m(2)
12330 .n(8)
12331 .k(8)
12332 .cm_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080012333 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mull, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012334 }
12335#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
12336
12337
12338#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
12339 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16) {
12340 TEST_REQUIRES_ARM_NEON;
12341 GemmMicrokernelTester()
12342 .mr(2)
12343 .nr(8)
12344 .kr(8)
12345 .sr(1)
12346 .m(2)
12347 .n(8)
12348 .k(16)
Marat Dukhan50323b82022-01-11 00:12:01 -080012349 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012350 }
12351
12352 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, strided_cn) {
12353 TEST_REQUIRES_ARM_NEON;
12354 GemmMicrokernelTester()
12355 .mr(2)
12356 .nr(8)
12357 .kr(8)
12358 .sr(1)
12359 .m(2)
12360 .n(8)
12361 .k(16)
12362 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080012363 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012364 }
12365
12366 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_strided_a) {
12367 TEST_REQUIRES_ARM_NEON;
12368 GemmMicrokernelTester()
12369 .mr(2)
12370 .nr(8)
12371 .kr(8)
12372 .sr(1)
12373 .m(2)
12374 .n(8)
12375 .k(16)
12376 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080012377 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012378 }
12379
12380 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_subtile) {
12381 TEST_REQUIRES_ARM_NEON;
Zhi An Ng83844ae2022-01-14 09:52:25 -080012382 for (uint32_t n = 1; n <= 8; n++) {
12383 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012384 GemmMicrokernelTester()
12385 .mr(2)
12386 .nr(8)
12387 .kr(8)
12388 .sr(1)
12389 .m(m)
12390 .n(n)
12391 .k(16)
12392 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080012393 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012394 }
12395 }
12396 }
12397
12398 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_subtile_m) {
12399 TEST_REQUIRES_ARM_NEON;
12400 for (uint32_t m = 1; m <= 2; m++) {
12401 GemmMicrokernelTester()
12402 .mr(2)
12403 .nr(8)
12404 .kr(8)
12405 .sr(1)
12406 .m(m)
12407 .n(8)
12408 .k(16)
12409 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080012410 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012411 }
12412 }
12413
12414 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_subtile_n) {
12415 TEST_REQUIRES_ARM_NEON;
12416 for (uint32_t n = 1; n <= 8; n++) {
12417 GemmMicrokernelTester()
12418 .mr(2)
12419 .nr(8)
12420 .kr(8)
12421 .sr(1)
12422 .m(2)
12423 .n(n)
12424 .k(16)
12425 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080012426 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012427 }
12428 }
12429
12430 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_lt_16) {
12431 TEST_REQUIRES_ARM_NEON;
12432 for (size_t k = 1; k < 16; k++) {
12433 GemmMicrokernelTester()
12434 .mr(2)
12435 .nr(8)
12436 .kr(8)
12437 .sr(1)
12438 .m(2)
12439 .n(8)
12440 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080012441 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012442 }
12443 }
12444
12445 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_lt_16_strided_a) {
12446 TEST_REQUIRES_ARM_NEON;
12447 for (size_t k = 1; k < 16; k++) {
12448 GemmMicrokernelTester()
12449 .mr(2)
12450 .nr(8)
12451 .kr(8)
12452 .sr(1)
12453 .m(2)
12454 .n(8)
12455 .k(k)
12456 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080012457 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012458 }
12459 }
12460
12461 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_lt_16_subtile) {
12462 TEST_REQUIRES_ARM_NEON;
12463 for (size_t k = 1; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080012464 for (uint32_t n = 1; n <= 8; n++) {
12465 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012466 GemmMicrokernelTester()
12467 .mr(2)
12468 .nr(8)
12469 .kr(8)
12470 .sr(1)
12471 .m(m)
12472 .n(n)
12473 .k(k)
12474 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080012475 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012476 }
12477 }
12478 }
12479 }
12480
12481 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_gt_16) {
12482 TEST_REQUIRES_ARM_NEON;
12483 for (size_t k = 17; k < 32; k++) {
12484 GemmMicrokernelTester()
12485 .mr(2)
12486 .nr(8)
12487 .kr(8)
12488 .sr(1)
12489 .m(2)
12490 .n(8)
12491 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080012492 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012493 }
12494 }
12495
12496 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_gt_16_strided_a) {
12497 TEST_REQUIRES_ARM_NEON;
12498 for (size_t k = 17; k < 32; k++) {
12499 GemmMicrokernelTester()
12500 .mr(2)
12501 .nr(8)
12502 .kr(8)
12503 .sr(1)
12504 .m(2)
12505 .n(8)
12506 .k(k)
12507 .a_stride(37)
Marat Dukhan50323b82022-01-11 00:12:01 -080012508 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012509 }
12510 }
12511
12512 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_gt_16_subtile) {
12513 TEST_REQUIRES_ARM_NEON;
12514 for (size_t k = 17; k < 32; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080012515 for (uint32_t n = 1; n <= 8; n++) {
12516 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012517 GemmMicrokernelTester()
12518 .mr(2)
12519 .nr(8)
12520 .kr(8)
12521 .sr(1)
12522 .m(m)
12523 .n(n)
12524 .k(k)
12525 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080012526 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012527 }
12528 }
12529 }
12530 }
12531
12532 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_div_16) {
12533 TEST_REQUIRES_ARM_NEON;
12534 for (size_t k = 32; k <= 160; k += 16) {
12535 GemmMicrokernelTester()
12536 .mr(2)
12537 .nr(8)
12538 .kr(8)
12539 .sr(1)
12540 .m(2)
12541 .n(8)
12542 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080012543 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012544 }
12545 }
12546
12547 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_div_16_strided_a) {
12548 TEST_REQUIRES_ARM_NEON;
12549 for (size_t k = 32; k <= 160; k += 16) {
12550 GemmMicrokernelTester()
12551 .mr(2)
12552 .nr(8)
12553 .kr(8)
12554 .sr(1)
12555 .m(2)
12556 .n(8)
12557 .k(k)
12558 .a_stride(163)
Marat Dukhan50323b82022-01-11 00:12:01 -080012559 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012560 }
12561 }
12562
12563 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_div_16_subtile) {
12564 TEST_REQUIRES_ARM_NEON;
12565 for (size_t k = 32; k <= 160; k += 16) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080012566 for (uint32_t n = 1; n <= 8; n++) {
12567 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012568 GemmMicrokernelTester()
12569 .mr(2)
12570 .nr(8)
12571 .kr(8)
12572 .sr(1)
12573 .m(m)
12574 .n(n)
12575 .k(k)
12576 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080012577 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012578 }
12579 }
12580 }
12581 }
12582
12583 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8) {
12584 TEST_REQUIRES_ARM_NEON;
12585 for (uint32_t n = 9; n < 16; n++) {
12586 for (size_t k = 1; k <= 80; k += 17) {
12587 GemmMicrokernelTester()
12588 .mr(2)
12589 .nr(8)
12590 .kr(8)
12591 .sr(1)
12592 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080012593 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012594 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080012595 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012596 }
12597 }
12598 }
12599
12600 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8_strided_cn) {
12601 TEST_REQUIRES_ARM_NEON;
12602 for (uint32_t n = 9; n < 16; n++) {
12603 for (size_t k = 1; k <= 80; k += 17) {
12604 GemmMicrokernelTester()
12605 .mr(2)
12606 .nr(8)
12607 .kr(8)
12608 .sr(1)
12609 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080012610 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012611 .k(k)
12612 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080012613 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012614 }
12615 }
12616 }
12617
12618 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8_strided_a) {
12619 TEST_REQUIRES_ARM_NEON;
12620 for (uint32_t n = 9; n < 16; n++) {
12621 for (size_t k = 1; k <= 80; k += 17) {
12622 GemmMicrokernelTester()
12623 .mr(2)
12624 .nr(8)
12625 .kr(8)
12626 .sr(1)
12627 .m(2)
12628 .n(n)
12629 .k(k)
12630 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080012631 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012632 }
12633 }
12634 }
12635
12636 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8_subtile) {
12637 TEST_REQUIRES_ARM_NEON;
12638 for (uint32_t n = 9; n < 16; n++) {
12639 for (size_t k = 1; k <= 80; k += 17) {
12640 for (uint32_t m = 1; m <= 2; m++) {
12641 GemmMicrokernelTester()
12642 .mr(2)
12643 .nr(8)
12644 .kr(8)
12645 .sr(1)
12646 .m(m)
12647 .n(n)
12648 .k(k)
12649 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080012650 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012651 }
12652 }
12653 }
12654 }
12655
12656 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8) {
12657 TEST_REQUIRES_ARM_NEON;
12658 for (uint32_t n = 16; n <= 24; n += 8) {
12659 for (size_t k = 1; k <= 80; k += 17) {
12660 GemmMicrokernelTester()
12661 .mr(2)
12662 .nr(8)
12663 .kr(8)
12664 .sr(1)
12665 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080012666 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012667 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080012668 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012669 }
12670 }
12671 }
12672
12673 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8_strided_cn) {
12674 TEST_REQUIRES_ARM_NEON;
12675 for (uint32_t n = 16; n <= 24; n += 8) {
12676 for (size_t k = 1; k <= 80; k += 17) {
12677 GemmMicrokernelTester()
12678 .mr(2)
12679 .nr(8)
12680 .kr(8)
12681 .sr(1)
12682 .m(2)
12683 .n(n)
12684 .k(k)
12685 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080012686 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012687 }
12688 }
12689 }
12690
12691 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8_strided_a) {
12692 TEST_REQUIRES_ARM_NEON;
12693 for (uint32_t n = 16; n <= 24; n += 8) {
12694 for (size_t k = 1; k <= 80; k += 17) {
12695 GemmMicrokernelTester()
12696 .mr(2)
12697 .nr(8)
12698 .kr(8)
12699 .sr(1)
12700 .m(2)
12701 .n(n)
12702 .k(k)
12703 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080012704 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012705 }
12706 }
12707 }
12708
12709 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8_subtile) {
12710 TEST_REQUIRES_ARM_NEON;
12711 for (uint32_t n = 16; n <= 24; n += 8) {
12712 for (size_t k = 1; k <= 80; k += 17) {
12713 for (uint32_t m = 1; m <= 2; m++) {
12714 GemmMicrokernelTester()
12715 .mr(2)
12716 .nr(8)
12717 .kr(8)
12718 .sr(1)
12719 .m(m)
12720 .n(n)
12721 .k(k)
12722 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080012723 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012724 }
12725 }
12726 }
12727 }
12728
12729 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, strided_cm_subtile) {
12730 TEST_REQUIRES_ARM_NEON;
12731 for (size_t k = 1; k <= 80; k += 17) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080012732 for (uint32_t n = 1; n <= 8; n++) {
12733 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012734 GemmMicrokernelTester()
12735 .mr(2)
12736 .nr(8)
12737 .kr(8)
12738 .sr(1)
12739 .m(m)
12740 .n(n)
12741 .k(k)
12742 .cm_stride(11)
12743 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080012744 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012745 }
12746 }
12747 }
12748 }
12749
12750 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, qmin) {
12751 TEST_REQUIRES_ARM_NEON;
12752 GemmMicrokernelTester()
12753 .mr(2)
12754 .nr(8)
12755 .kr(8)
12756 .sr(1)
12757 .m(2)
12758 .n(8)
12759 .k(16)
12760 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080012761 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012762 }
12763
12764 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, qmax) {
12765 TEST_REQUIRES_ARM_NEON;
12766 GemmMicrokernelTester()
12767 .mr(2)
12768 .nr(8)
12769 .kr(8)
12770 .sr(1)
12771 .m(2)
12772 .n(8)
12773 .k(16)
12774 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080012775 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012776 }
12777
12778 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, strided_cm) {
12779 TEST_REQUIRES_ARM_NEON;
12780 GemmMicrokernelTester()
12781 .mr(2)
12782 .nr(8)
12783 .kr(8)
12784 .sr(1)
12785 .m(2)
12786 .n(8)
12787 .k(16)
12788 .cm_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080012789 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012790 }
12791#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
12792
12793
12794#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
12795 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16) {
12796 TEST_REQUIRES_ARM_NEON;
12797 GemmMicrokernelTester()
12798 .mr(1)
12799 .nr(8)
12800 .kr(8)
12801 .sr(1)
12802 .m(1)
12803 .n(8)
12804 .k(16)
Marat Dukhan50323b82022-01-11 00:12:01 -080012805 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012806 }
12807
12808 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, strided_cn) {
12809 TEST_REQUIRES_ARM_NEON;
12810 GemmMicrokernelTester()
12811 .mr(1)
12812 .nr(8)
12813 .kr(8)
12814 .sr(1)
12815 .m(1)
12816 .n(8)
12817 .k(16)
12818 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080012819 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012820 }
12821
12822 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_strided_a) {
12823 TEST_REQUIRES_ARM_NEON;
12824 GemmMicrokernelTester()
12825 .mr(1)
12826 .nr(8)
12827 .kr(8)
12828 .sr(1)
12829 .m(1)
12830 .n(8)
12831 .k(16)
12832 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080012833 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012834 }
12835
12836 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_subtile) {
12837 TEST_REQUIRES_ARM_NEON;
Zhi An Ng83844ae2022-01-14 09:52:25 -080012838 for (uint32_t n = 1; n <= 8; n++) {
12839 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012840 GemmMicrokernelTester()
12841 .mr(1)
12842 .nr(8)
12843 .kr(8)
12844 .sr(1)
12845 .m(m)
12846 .n(n)
12847 .k(16)
12848 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080012849 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012850 }
12851 }
12852 }
12853
12854 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_subtile_m) {
12855 TEST_REQUIRES_ARM_NEON;
12856 for (uint32_t m = 1; m <= 1; m++) {
12857 GemmMicrokernelTester()
12858 .mr(1)
12859 .nr(8)
12860 .kr(8)
12861 .sr(1)
12862 .m(m)
12863 .n(8)
12864 .k(16)
12865 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080012866 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012867 }
12868 }
12869
12870 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_subtile_n) {
12871 TEST_REQUIRES_ARM_NEON;
12872 for (uint32_t n = 1; n <= 8; n++) {
12873 GemmMicrokernelTester()
12874 .mr(1)
12875 .nr(8)
12876 .kr(8)
12877 .sr(1)
12878 .m(1)
12879 .n(n)
12880 .k(16)
12881 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080012882 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012883 }
12884 }
12885
12886 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_lt_16) {
12887 TEST_REQUIRES_ARM_NEON;
12888 for (size_t k = 1; k < 16; k++) {
12889 GemmMicrokernelTester()
12890 .mr(1)
12891 .nr(8)
12892 .kr(8)
12893 .sr(1)
12894 .m(1)
12895 .n(8)
12896 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080012897 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012898 }
12899 }
12900
12901 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_lt_16_strided_a) {
12902 TEST_REQUIRES_ARM_NEON;
12903 for (size_t k = 1; k < 16; k++) {
12904 GemmMicrokernelTester()
12905 .mr(1)
12906 .nr(8)
12907 .kr(8)
12908 .sr(1)
12909 .m(1)
12910 .n(8)
12911 .k(k)
12912 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080012913 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012914 }
12915 }
12916
12917 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_lt_16_subtile) {
12918 TEST_REQUIRES_ARM_NEON;
12919 for (size_t k = 1; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080012920 for (uint32_t n = 1; n <= 8; n++) {
12921 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012922 GemmMicrokernelTester()
12923 .mr(1)
12924 .nr(8)
12925 .kr(8)
12926 .sr(1)
12927 .m(m)
12928 .n(n)
12929 .k(k)
12930 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080012931 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012932 }
12933 }
12934 }
12935 }
12936
12937 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_gt_16) {
12938 TEST_REQUIRES_ARM_NEON;
12939 for (size_t k = 17; k < 32; k++) {
12940 GemmMicrokernelTester()
12941 .mr(1)
12942 .nr(8)
12943 .kr(8)
12944 .sr(1)
12945 .m(1)
12946 .n(8)
12947 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080012948 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012949 }
12950 }
12951
12952 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_gt_16_strided_a) {
12953 TEST_REQUIRES_ARM_NEON;
12954 for (size_t k = 17; k < 32; k++) {
12955 GemmMicrokernelTester()
12956 .mr(1)
12957 .nr(8)
12958 .kr(8)
12959 .sr(1)
12960 .m(1)
12961 .n(8)
12962 .k(k)
12963 .a_stride(37)
Marat Dukhan50323b82022-01-11 00:12:01 -080012964 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012965 }
12966 }
12967
12968 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_gt_16_subtile) {
12969 TEST_REQUIRES_ARM_NEON;
12970 for (size_t k = 17; k < 32; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080012971 for (uint32_t n = 1; n <= 8; n++) {
12972 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012973 GemmMicrokernelTester()
12974 .mr(1)
12975 .nr(8)
12976 .kr(8)
12977 .sr(1)
12978 .m(m)
12979 .n(n)
12980 .k(k)
12981 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080012982 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080012983 }
12984 }
12985 }
12986 }
12987
12988 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_div_16) {
12989 TEST_REQUIRES_ARM_NEON;
12990 for (size_t k = 32; k <= 160; k += 16) {
12991 GemmMicrokernelTester()
12992 .mr(1)
12993 .nr(8)
12994 .kr(8)
12995 .sr(1)
12996 .m(1)
12997 .n(8)
12998 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080012999 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013000 }
13001 }
13002
13003 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_div_16_strided_a) {
13004 TEST_REQUIRES_ARM_NEON;
13005 for (size_t k = 32; k <= 160; k += 16) {
13006 GemmMicrokernelTester()
13007 .mr(1)
13008 .nr(8)
13009 .kr(8)
13010 .sr(1)
13011 .m(1)
13012 .n(8)
13013 .k(k)
13014 .a_stride(163)
Marat Dukhan50323b82022-01-11 00:12:01 -080013015 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013016 }
13017 }
13018
13019 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_div_16_subtile) {
13020 TEST_REQUIRES_ARM_NEON;
13021 for (size_t k = 32; k <= 160; k += 16) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080013022 for (uint32_t n = 1; n <= 8; n++) {
13023 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013024 GemmMicrokernelTester()
13025 .mr(1)
13026 .nr(8)
13027 .kr(8)
13028 .sr(1)
13029 .m(m)
13030 .n(n)
13031 .k(k)
13032 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080013033 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013034 }
13035 }
13036 }
13037 }
13038
13039 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8) {
13040 TEST_REQUIRES_ARM_NEON;
13041 for (uint32_t n = 9; n < 16; n++) {
13042 for (size_t k = 1; k <= 80; k += 17) {
13043 GemmMicrokernelTester()
13044 .mr(1)
13045 .nr(8)
13046 .kr(8)
13047 .sr(1)
13048 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080013049 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013050 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080013051 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013052 }
13053 }
13054 }
13055
13056 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8_strided_cn) {
13057 TEST_REQUIRES_ARM_NEON;
13058 for (uint32_t n = 9; n < 16; n++) {
13059 for (size_t k = 1; k <= 80; k += 17) {
13060 GemmMicrokernelTester()
13061 .mr(1)
13062 .nr(8)
13063 .kr(8)
13064 .sr(1)
13065 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080013066 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013067 .k(k)
13068 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080013069 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013070 }
13071 }
13072 }
13073
13074 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8_strided_a) {
13075 TEST_REQUIRES_ARM_NEON;
13076 for (uint32_t n = 9; n < 16; n++) {
13077 for (size_t k = 1; k <= 80; k += 17) {
13078 GemmMicrokernelTester()
13079 .mr(1)
13080 .nr(8)
13081 .kr(8)
13082 .sr(1)
13083 .m(1)
13084 .n(n)
13085 .k(k)
13086 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080013087 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013088 }
13089 }
13090 }
13091
13092 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8_subtile) {
13093 TEST_REQUIRES_ARM_NEON;
13094 for (uint32_t n = 9; n < 16; n++) {
13095 for (size_t k = 1; k <= 80; k += 17) {
13096 for (uint32_t m = 1; m <= 1; m++) {
13097 GemmMicrokernelTester()
13098 .mr(1)
13099 .nr(8)
13100 .kr(8)
13101 .sr(1)
13102 .m(m)
13103 .n(n)
13104 .k(k)
13105 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080013106 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013107 }
13108 }
13109 }
13110 }
13111
13112 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8) {
13113 TEST_REQUIRES_ARM_NEON;
13114 for (uint32_t n = 16; n <= 24; n += 8) {
13115 for (size_t k = 1; k <= 80; k += 17) {
13116 GemmMicrokernelTester()
13117 .mr(1)
13118 .nr(8)
13119 .kr(8)
13120 .sr(1)
13121 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080013122 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013123 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080013124 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013125 }
13126 }
13127 }
13128
13129 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8_strided_cn) {
13130 TEST_REQUIRES_ARM_NEON;
13131 for (uint32_t n = 16; n <= 24; n += 8) {
13132 for (size_t k = 1; k <= 80; k += 17) {
13133 GemmMicrokernelTester()
13134 .mr(1)
13135 .nr(8)
13136 .kr(8)
13137 .sr(1)
13138 .m(1)
13139 .n(n)
13140 .k(k)
13141 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080013142 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013143 }
13144 }
13145 }
13146
13147 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8_strided_a) {
13148 TEST_REQUIRES_ARM_NEON;
13149 for (uint32_t n = 16; n <= 24; n += 8) {
13150 for (size_t k = 1; k <= 80; k += 17) {
13151 GemmMicrokernelTester()
13152 .mr(1)
13153 .nr(8)
13154 .kr(8)
13155 .sr(1)
13156 .m(1)
13157 .n(n)
13158 .k(k)
13159 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080013160 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013161 }
13162 }
13163 }
13164
13165 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8_subtile) {
13166 TEST_REQUIRES_ARM_NEON;
13167 for (uint32_t n = 16; n <= 24; n += 8) {
13168 for (size_t k = 1; k <= 80; k += 17) {
13169 for (uint32_t m = 1; m <= 1; m++) {
13170 GemmMicrokernelTester()
13171 .mr(1)
13172 .nr(8)
13173 .kr(8)
13174 .sr(1)
13175 .m(m)
13176 .n(n)
13177 .k(k)
13178 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080013179 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013180 }
13181 }
13182 }
13183 }
13184
13185 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, strided_cm_subtile) {
13186 TEST_REQUIRES_ARM_NEON;
13187 for (size_t k = 1; k <= 80; k += 17) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080013188 for (uint32_t n = 1; n <= 8; n++) {
13189 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013190 GemmMicrokernelTester()
13191 .mr(1)
13192 .nr(8)
13193 .kr(8)
13194 .sr(1)
13195 .m(m)
13196 .n(n)
13197 .k(k)
13198 .cm_stride(11)
13199 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080013200 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013201 }
13202 }
13203 }
13204 }
13205
13206 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, qmin) {
13207 TEST_REQUIRES_ARM_NEON;
13208 GemmMicrokernelTester()
13209 .mr(1)
13210 .nr(8)
13211 .kr(8)
13212 .sr(1)
13213 .m(1)
13214 .n(8)
13215 .k(16)
13216 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080013217 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013218 }
13219
13220 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, qmax) {
13221 TEST_REQUIRES_ARM_NEON;
13222 GemmMicrokernelTester()
13223 .mr(1)
13224 .nr(8)
13225 .kr(8)
13226 .sr(1)
13227 .m(1)
13228 .n(8)
13229 .k(16)
13230 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080013231 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013232 }
13233
13234 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, strided_cm) {
13235 TEST_REQUIRES_ARM_NEON;
13236 GemmMicrokernelTester()
13237 .mr(1)
13238 .nr(8)
13239 .kr(8)
13240 .sr(1)
13241 .m(1)
13242 .n(8)
13243 .k(16)
13244 .cm_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080013245 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013246 }
13247#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
13248
13249
13250#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
13251 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, k_eq_4) {
13252 TEST_REQUIRES_ARM_NEON_DOT;
13253 GemmMicrokernelTester()
13254 .mr(4)
13255 .nr(16)
13256 .kr(4)
13257 .sr(1)
13258 .m(4)
13259 .n(16)
13260 .k(4)
Marat Dukhan50323b82022-01-11 00:12:01 -080013261 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013262 }
13263
13264 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, strided_cn) {
13265 TEST_REQUIRES_ARM_NEON_DOT;
13266 GemmMicrokernelTester()
13267 .mr(4)
13268 .nr(16)
13269 .kr(4)
13270 .sr(1)
13271 .m(4)
13272 .n(16)
13273 .k(4)
13274 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080013275 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013276 }
13277
13278 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, k_eq_4_strided_a) {
13279 TEST_REQUIRES_ARM_NEON_DOT;
13280 GemmMicrokernelTester()
13281 .mr(4)
13282 .nr(16)
13283 .kr(4)
13284 .sr(1)
13285 .m(4)
13286 .n(16)
13287 .k(4)
13288 .a_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080013289 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013290 }
13291
13292 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, k_eq_4_subtile) {
13293 TEST_REQUIRES_ARM_NEON_DOT;
Zhi An Ng83844ae2022-01-14 09:52:25 -080013294 for (uint32_t n = 1; n <= 16; n++) {
13295 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013296 GemmMicrokernelTester()
13297 .mr(4)
13298 .nr(16)
13299 .kr(4)
13300 .sr(1)
13301 .m(m)
13302 .n(n)
13303 .k(4)
13304 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080013305 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013306 }
13307 }
13308 }
13309
13310 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, k_eq_4_subtile_m) {
13311 TEST_REQUIRES_ARM_NEON_DOT;
13312 for (uint32_t m = 1; m <= 4; m++) {
13313 GemmMicrokernelTester()
13314 .mr(4)
13315 .nr(16)
13316 .kr(4)
13317 .sr(1)
13318 .m(m)
13319 .n(16)
13320 .k(4)
13321 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080013322 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013323 }
13324 }
13325
13326 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, k_eq_4_subtile_n) {
13327 TEST_REQUIRES_ARM_NEON_DOT;
13328 for (uint32_t n = 1; n <= 16; n++) {
13329 GemmMicrokernelTester()
13330 .mr(4)
13331 .nr(16)
13332 .kr(4)
13333 .sr(1)
13334 .m(4)
13335 .n(n)
13336 .k(4)
13337 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080013338 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013339 }
13340 }
13341
13342 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, k_lt_4) {
13343 TEST_REQUIRES_ARM_NEON_DOT;
13344 for (size_t k = 1; k < 4; k++) {
13345 GemmMicrokernelTester()
13346 .mr(4)
13347 .nr(16)
13348 .kr(4)
13349 .sr(1)
13350 .m(4)
13351 .n(16)
13352 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080013353 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013354 }
13355 }
13356
13357 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, k_lt_4_strided_a) {
13358 TEST_REQUIRES_ARM_NEON_DOT;
13359 for (size_t k = 1; k < 4; k++) {
13360 GemmMicrokernelTester()
13361 .mr(4)
13362 .nr(16)
13363 .kr(4)
13364 .sr(1)
13365 .m(4)
13366 .n(16)
13367 .k(k)
13368 .a_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080013369 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013370 }
13371 }
13372
13373 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, k_lt_4_subtile) {
13374 TEST_REQUIRES_ARM_NEON_DOT;
13375 for (size_t k = 1; k < 4; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080013376 for (uint32_t n = 1; n <= 16; n++) {
13377 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013378 GemmMicrokernelTester()
13379 .mr(4)
13380 .nr(16)
13381 .kr(4)
13382 .sr(1)
13383 .m(m)
13384 .n(n)
13385 .k(k)
13386 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080013387 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013388 }
13389 }
13390 }
13391 }
13392
13393 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, k_gt_4) {
13394 TEST_REQUIRES_ARM_NEON_DOT;
13395 for (size_t k = 5; k < 8; k++) {
13396 GemmMicrokernelTester()
13397 .mr(4)
13398 .nr(16)
13399 .kr(4)
13400 .sr(1)
13401 .m(4)
13402 .n(16)
13403 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080013404 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013405 }
13406 }
13407
13408 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, k_gt_4_strided_a) {
13409 TEST_REQUIRES_ARM_NEON_DOT;
13410 for (size_t k = 5; k < 8; k++) {
13411 GemmMicrokernelTester()
13412 .mr(4)
13413 .nr(16)
13414 .kr(4)
13415 .sr(1)
13416 .m(4)
13417 .n(16)
13418 .k(k)
13419 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080013420 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013421 }
13422 }
13423
13424 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, k_gt_4_subtile) {
13425 TEST_REQUIRES_ARM_NEON_DOT;
13426 for (size_t k = 5; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080013427 for (uint32_t n = 1; n <= 16; n++) {
13428 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013429 GemmMicrokernelTester()
13430 .mr(4)
13431 .nr(16)
13432 .kr(4)
13433 .sr(1)
13434 .m(m)
13435 .n(n)
13436 .k(k)
13437 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080013438 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013439 }
13440 }
13441 }
13442 }
13443
13444 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, k_div_4) {
13445 TEST_REQUIRES_ARM_NEON_DOT;
13446 for (size_t k = 8; k <= 40; k += 4) {
13447 GemmMicrokernelTester()
13448 .mr(4)
13449 .nr(16)
13450 .kr(4)
13451 .sr(1)
13452 .m(4)
13453 .n(16)
13454 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080013455 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013456 }
13457 }
13458
13459 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, k_div_4_strided_a) {
13460 TEST_REQUIRES_ARM_NEON_DOT;
13461 for (size_t k = 8; k <= 40; k += 4) {
13462 GemmMicrokernelTester()
13463 .mr(4)
13464 .nr(16)
13465 .kr(4)
13466 .sr(1)
13467 .m(4)
13468 .n(16)
13469 .k(k)
13470 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080013471 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013472 }
13473 }
13474
13475 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, k_div_4_subtile) {
13476 TEST_REQUIRES_ARM_NEON_DOT;
13477 for (size_t k = 8; k <= 40; k += 4) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080013478 for (uint32_t n = 1; n <= 16; n++) {
13479 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013480 GemmMicrokernelTester()
13481 .mr(4)
13482 .nr(16)
13483 .kr(4)
13484 .sr(1)
13485 .m(m)
13486 .n(n)
13487 .k(k)
13488 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080013489 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013490 }
13491 }
13492 }
13493 }
13494
13495 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, n_gt_16) {
13496 TEST_REQUIRES_ARM_NEON_DOT;
13497 for (uint32_t n = 17; n < 32; n++) {
13498 for (size_t k = 1; k <= 20; k += 5) {
13499 GemmMicrokernelTester()
13500 .mr(4)
13501 .nr(16)
13502 .kr(4)
13503 .sr(1)
13504 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080013505 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013506 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080013507 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013508 }
13509 }
13510 }
13511
13512 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, n_gt_16_strided_cn) {
13513 TEST_REQUIRES_ARM_NEON_DOT;
13514 for (uint32_t n = 17; n < 32; n++) {
13515 for (size_t k = 1; k <= 20; k += 5) {
13516 GemmMicrokernelTester()
13517 .mr(4)
13518 .nr(16)
13519 .kr(4)
13520 .sr(1)
13521 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080013522 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013523 .k(k)
13524 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080013525 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013526 }
13527 }
13528 }
13529
13530 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, n_gt_16_strided_a) {
13531 TEST_REQUIRES_ARM_NEON_DOT;
13532 for (uint32_t n = 17; n < 32; n++) {
13533 for (size_t k = 1; k <= 20; k += 5) {
13534 GemmMicrokernelTester()
13535 .mr(4)
13536 .nr(16)
13537 .kr(4)
13538 .sr(1)
13539 .m(4)
13540 .n(n)
13541 .k(k)
13542 .a_stride(23)
Marat Dukhan50323b82022-01-11 00:12:01 -080013543 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013544 }
13545 }
13546 }
13547
13548 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, n_gt_16_subtile) {
13549 TEST_REQUIRES_ARM_NEON_DOT;
13550 for (uint32_t n = 17; n < 32; n++) {
13551 for (size_t k = 1; k <= 20; k += 5) {
13552 for (uint32_t m = 1; m <= 4; m++) {
13553 GemmMicrokernelTester()
13554 .mr(4)
13555 .nr(16)
13556 .kr(4)
13557 .sr(1)
13558 .m(m)
13559 .n(n)
13560 .k(k)
13561 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080013562 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013563 }
13564 }
13565 }
13566 }
13567
13568 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, n_div_16) {
13569 TEST_REQUIRES_ARM_NEON_DOT;
13570 for (uint32_t n = 32; n <= 48; n += 16) {
13571 for (size_t k = 1; k <= 20; k += 5) {
13572 GemmMicrokernelTester()
13573 .mr(4)
13574 .nr(16)
13575 .kr(4)
13576 .sr(1)
13577 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080013578 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013579 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080013580 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013581 }
13582 }
13583 }
13584
13585 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, n_div_16_strided_cn) {
13586 TEST_REQUIRES_ARM_NEON_DOT;
13587 for (uint32_t n = 32; n <= 48; n += 16) {
13588 for (size_t k = 1; k <= 20; k += 5) {
13589 GemmMicrokernelTester()
13590 .mr(4)
13591 .nr(16)
13592 .kr(4)
13593 .sr(1)
13594 .m(4)
13595 .n(n)
13596 .k(k)
13597 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080013598 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013599 }
13600 }
13601 }
13602
13603 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, n_div_16_strided_a) {
13604 TEST_REQUIRES_ARM_NEON_DOT;
13605 for (uint32_t n = 32; n <= 48; n += 16) {
13606 for (size_t k = 1; k <= 20; k += 5) {
13607 GemmMicrokernelTester()
13608 .mr(4)
13609 .nr(16)
13610 .kr(4)
13611 .sr(1)
13612 .m(4)
13613 .n(n)
13614 .k(k)
13615 .a_stride(23)
Marat Dukhan50323b82022-01-11 00:12:01 -080013616 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013617 }
13618 }
13619 }
13620
13621 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, n_div_16_subtile) {
13622 TEST_REQUIRES_ARM_NEON_DOT;
13623 for (uint32_t n = 32; n <= 48; n += 16) {
13624 for (size_t k = 1; k <= 20; k += 5) {
13625 for (uint32_t m = 1; m <= 4; m++) {
13626 GemmMicrokernelTester()
13627 .mr(4)
13628 .nr(16)
13629 .kr(4)
13630 .sr(1)
13631 .m(m)
13632 .n(n)
13633 .k(k)
13634 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080013635 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013636 }
13637 }
13638 }
13639 }
13640
13641 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, strided_cm_subtile) {
13642 TEST_REQUIRES_ARM_NEON_DOT;
13643 for (size_t k = 1; k <= 20; k += 5) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080013644 for (uint32_t n = 1; n <= 16; n++) {
13645 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013646 GemmMicrokernelTester()
13647 .mr(4)
13648 .nr(16)
13649 .kr(4)
13650 .sr(1)
13651 .m(m)
13652 .n(n)
13653 .k(k)
13654 .cm_stride(19)
13655 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080013656 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013657 }
13658 }
13659 }
13660 }
13661
13662 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, qmin) {
13663 TEST_REQUIRES_ARM_NEON_DOT;
13664 GemmMicrokernelTester()
13665 .mr(4)
13666 .nr(16)
13667 .kr(4)
13668 .sr(1)
13669 .m(4)
13670 .n(16)
13671 .k(4)
13672 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080013673 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013674 }
13675
13676 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, qmax) {
13677 TEST_REQUIRES_ARM_NEON_DOT;
13678 GemmMicrokernelTester()
13679 .mr(4)
13680 .nr(16)
13681 .kr(4)
13682 .sr(1)
13683 .m(4)
13684 .n(16)
13685 .k(4)
13686 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080013687 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013688 }
13689
13690 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, strided_cm) {
13691 TEST_REQUIRES_ARM_NEON_DOT;
13692 GemmMicrokernelTester()
13693 .mr(4)
13694 .nr(16)
13695 .kr(4)
13696 .sr(1)
13697 .m(4)
13698 .n(16)
13699 .k(4)
13700 .cm_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080013701 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013702 }
13703#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
13704
13705
13706#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
13707 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16) {
13708 TEST_REQUIRES_ARM_NEON_DOT;
13709 GemmMicrokernelTester()
13710 .mr(4)
13711 .nr(16)
13712 .kr(4)
13713 .sr(1)
13714 .m(4)
13715 .n(16)
13716 .k(16)
Marat Dukhan50323b82022-01-11 00:12:01 -080013717 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013718 }
13719
13720 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, strided_cn) {
13721 TEST_REQUIRES_ARM_NEON_DOT;
13722 GemmMicrokernelTester()
13723 .mr(4)
13724 .nr(16)
13725 .kr(4)
13726 .sr(1)
13727 .m(4)
13728 .n(16)
13729 .k(16)
13730 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080013731 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013732 }
13733
13734 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16_strided_a) {
13735 TEST_REQUIRES_ARM_NEON_DOT;
13736 GemmMicrokernelTester()
13737 .mr(4)
13738 .nr(16)
13739 .kr(4)
13740 .sr(1)
13741 .m(4)
13742 .n(16)
13743 .k(16)
13744 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080013745 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013746 }
13747
13748 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16_subtile) {
13749 TEST_REQUIRES_ARM_NEON_DOT;
Zhi An Ng83844ae2022-01-14 09:52:25 -080013750 for (uint32_t n = 1; n <= 16; n++) {
13751 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013752 GemmMicrokernelTester()
13753 .mr(4)
13754 .nr(16)
13755 .kr(4)
13756 .sr(1)
13757 .m(m)
13758 .n(n)
13759 .k(16)
13760 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080013761 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013762 }
13763 }
13764 }
13765
13766 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16_subtile_m) {
13767 TEST_REQUIRES_ARM_NEON_DOT;
13768 for (uint32_t m = 1; m <= 4; m++) {
13769 GemmMicrokernelTester()
13770 .mr(4)
13771 .nr(16)
13772 .kr(4)
13773 .sr(1)
13774 .m(m)
13775 .n(16)
13776 .k(16)
13777 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080013778 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013779 }
13780 }
13781
13782 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16_subtile_n) {
13783 TEST_REQUIRES_ARM_NEON_DOT;
13784 for (uint32_t n = 1; n <= 16; n++) {
13785 GemmMicrokernelTester()
13786 .mr(4)
13787 .nr(16)
13788 .kr(4)
13789 .sr(1)
13790 .m(4)
13791 .n(n)
13792 .k(16)
13793 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080013794 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013795 }
13796 }
13797
13798 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, k_lt_16) {
13799 TEST_REQUIRES_ARM_NEON_DOT;
13800 for (size_t k = 1; k < 16; k++) {
13801 GemmMicrokernelTester()
13802 .mr(4)
13803 .nr(16)
13804 .kr(4)
13805 .sr(1)
13806 .m(4)
13807 .n(16)
13808 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080013809 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013810 }
13811 }
13812
13813 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, k_lt_16_strided_a) {
13814 TEST_REQUIRES_ARM_NEON_DOT;
13815 for (size_t k = 1; k < 16; k++) {
13816 GemmMicrokernelTester()
13817 .mr(4)
13818 .nr(16)
13819 .kr(4)
13820 .sr(1)
13821 .m(4)
13822 .n(16)
13823 .k(k)
13824 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080013825 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013826 }
13827 }
13828
13829 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, k_lt_16_subtile) {
13830 TEST_REQUIRES_ARM_NEON_DOT;
13831 for (size_t k = 1; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080013832 for (uint32_t n = 1; n <= 16; n++) {
13833 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013834 GemmMicrokernelTester()
13835 .mr(4)
13836 .nr(16)
13837 .kr(4)
13838 .sr(1)
13839 .m(m)
13840 .n(n)
13841 .k(k)
13842 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080013843 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013844 }
13845 }
13846 }
13847 }
13848
13849 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, k_gt_16) {
13850 TEST_REQUIRES_ARM_NEON_DOT;
13851 for (size_t k = 17; k < 32; k++) {
13852 GemmMicrokernelTester()
13853 .mr(4)
13854 .nr(16)
13855 .kr(4)
13856 .sr(1)
13857 .m(4)
13858 .n(16)
13859 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080013860 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013861 }
13862 }
13863
13864 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, k_gt_16_strided_a) {
13865 TEST_REQUIRES_ARM_NEON_DOT;
13866 for (size_t k = 17; k < 32; k++) {
13867 GemmMicrokernelTester()
13868 .mr(4)
13869 .nr(16)
13870 .kr(4)
13871 .sr(1)
13872 .m(4)
13873 .n(16)
13874 .k(k)
13875 .a_stride(37)
Marat Dukhan50323b82022-01-11 00:12:01 -080013876 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013877 }
13878 }
13879
13880 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, k_gt_16_subtile) {
13881 TEST_REQUIRES_ARM_NEON_DOT;
13882 for (size_t k = 17; k < 32; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080013883 for (uint32_t n = 1; n <= 16; n++) {
13884 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013885 GemmMicrokernelTester()
13886 .mr(4)
13887 .nr(16)
13888 .kr(4)
13889 .sr(1)
13890 .m(m)
13891 .n(n)
13892 .k(k)
13893 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080013894 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013895 }
13896 }
13897 }
13898 }
13899
13900 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, k_div_16) {
13901 TEST_REQUIRES_ARM_NEON_DOT;
13902 for (size_t k = 32; k <= 160; k += 16) {
13903 GemmMicrokernelTester()
13904 .mr(4)
13905 .nr(16)
13906 .kr(4)
13907 .sr(1)
13908 .m(4)
13909 .n(16)
13910 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080013911 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013912 }
13913 }
13914
13915 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, k_div_16_strided_a) {
13916 TEST_REQUIRES_ARM_NEON_DOT;
13917 for (size_t k = 32; k <= 160; k += 16) {
13918 GemmMicrokernelTester()
13919 .mr(4)
13920 .nr(16)
13921 .kr(4)
13922 .sr(1)
13923 .m(4)
13924 .n(16)
13925 .k(k)
13926 .a_stride(163)
Marat Dukhan50323b82022-01-11 00:12:01 -080013927 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013928 }
13929 }
13930
13931 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, k_div_16_subtile) {
13932 TEST_REQUIRES_ARM_NEON_DOT;
13933 for (size_t k = 32; k <= 160; k += 16) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080013934 for (uint32_t n = 1; n <= 16; n++) {
13935 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013936 GemmMicrokernelTester()
13937 .mr(4)
13938 .nr(16)
13939 .kr(4)
13940 .sr(1)
13941 .m(m)
13942 .n(n)
13943 .k(k)
13944 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080013945 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013946 }
13947 }
13948 }
13949 }
13950
13951 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, n_gt_16) {
13952 TEST_REQUIRES_ARM_NEON_DOT;
13953 for (uint32_t n = 17; n < 32; n++) {
13954 for (size_t k = 1; k <= 80; k += 17) {
13955 GemmMicrokernelTester()
13956 .mr(4)
13957 .nr(16)
13958 .kr(4)
13959 .sr(1)
13960 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080013961 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013962 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080013963 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013964 }
13965 }
13966 }
13967
13968 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, n_gt_16_strided_cn) {
13969 TEST_REQUIRES_ARM_NEON_DOT;
13970 for (uint32_t n = 17; n < 32; n++) {
13971 for (size_t k = 1; k <= 80; k += 17) {
13972 GemmMicrokernelTester()
13973 .mr(4)
13974 .nr(16)
13975 .kr(4)
13976 .sr(1)
13977 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080013978 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013979 .k(k)
13980 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080013981 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080013982 }
13983 }
13984 }
13985
13986 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, n_gt_16_strided_a) {
13987 TEST_REQUIRES_ARM_NEON_DOT;
13988 for (uint32_t n = 17; n < 32; n++) {
13989 for (size_t k = 1; k <= 80; k += 17) {
13990 GemmMicrokernelTester()
13991 .mr(4)
13992 .nr(16)
13993 .kr(4)
13994 .sr(1)
13995 .m(4)
13996 .n(n)
13997 .k(k)
13998 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080013999 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014000 }
14001 }
14002 }
14003
14004 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, n_gt_16_subtile) {
14005 TEST_REQUIRES_ARM_NEON_DOT;
14006 for (uint32_t n = 17; n < 32; n++) {
14007 for (size_t k = 1; k <= 80; k += 17) {
14008 for (uint32_t m = 1; m <= 4; m++) {
14009 GemmMicrokernelTester()
14010 .mr(4)
14011 .nr(16)
14012 .kr(4)
14013 .sr(1)
14014 .m(m)
14015 .n(n)
14016 .k(k)
14017 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080014018 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014019 }
14020 }
14021 }
14022 }
14023
14024 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, n_div_16) {
14025 TEST_REQUIRES_ARM_NEON_DOT;
14026 for (uint32_t n = 32; n <= 48; n += 16) {
14027 for (size_t k = 1; k <= 80; k += 17) {
14028 GemmMicrokernelTester()
14029 .mr(4)
14030 .nr(16)
14031 .kr(4)
14032 .sr(1)
14033 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080014034 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014035 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080014036 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014037 }
14038 }
14039 }
14040
14041 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, n_div_16_strided_cn) {
14042 TEST_REQUIRES_ARM_NEON_DOT;
14043 for (uint32_t n = 32; n <= 48; n += 16) {
14044 for (size_t k = 1; k <= 80; k += 17) {
14045 GemmMicrokernelTester()
14046 .mr(4)
14047 .nr(16)
14048 .kr(4)
14049 .sr(1)
14050 .m(4)
14051 .n(n)
14052 .k(k)
14053 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080014054 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014055 }
14056 }
14057 }
14058
14059 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, n_div_16_strided_a) {
14060 TEST_REQUIRES_ARM_NEON_DOT;
14061 for (uint32_t n = 32; n <= 48; n += 16) {
14062 for (size_t k = 1; k <= 80; k += 17) {
14063 GemmMicrokernelTester()
14064 .mr(4)
14065 .nr(16)
14066 .kr(4)
14067 .sr(1)
14068 .m(4)
14069 .n(n)
14070 .k(k)
14071 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080014072 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014073 }
14074 }
14075 }
14076
14077 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, n_div_16_subtile) {
14078 TEST_REQUIRES_ARM_NEON_DOT;
14079 for (uint32_t n = 32; n <= 48; n += 16) {
14080 for (size_t k = 1; k <= 80; k += 17) {
14081 for (uint32_t m = 1; m <= 4; m++) {
14082 GemmMicrokernelTester()
14083 .mr(4)
14084 .nr(16)
14085 .kr(4)
14086 .sr(1)
14087 .m(m)
14088 .n(n)
14089 .k(k)
14090 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080014091 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014092 }
14093 }
14094 }
14095 }
14096
14097 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, strided_cm_subtile) {
14098 TEST_REQUIRES_ARM_NEON_DOT;
14099 for (size_t k = 1; k <= 80; k += 17) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080014100 for (uint32_t n = 1; n <= 16; n++) {
14101 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014102 GemmMicrokernelTester()
14103 .mr(4)
14104 .nr(16)
14105 .kr(4)
14106 .sr(1)
14107 .m(m)
14108 .n(n)
14109 .k(k)
14110 .cm_stride(19)
14111 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080014112 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014113 }
14114 }
14115 }
14116 }
14117
14118 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, qmin) {
14119 TEST_REQUIRES_ARM_NEON_DOT;
14120 GemmMicrokernelTester()
14121 .mr(4)
14122 .nr(16)
14123 .kr(4)
14124 .sr(1)
14125 .m(4)
14126 .n(16)
14127 .k(16)
14128 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080014129 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014130 }
14131
14132 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, qmax) {
14133 TEST_REQUIRES_ARM_NEON_DOT;
14134 GemmMicrokernelTester()
14135 .mr(4)
14136 .nr(16)
14137 .kr(4)
14138 .sr(1)
14139 .m(4)
14140 .n(16)
14141 .k(16)
14142 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080014143 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014144 }
14145
14146 TEST(QC8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, strided_cm) {
14147 TEST_REQUIRES_ARM_NEON_DOT;
14148 GemmMicrokernelTester()
14149 .mr(4)
14150 .nr(16)
14151 .kr(4)
14152 .sr(1)
14153 .m(4)
14154 .n(16)
14155 .k(16)
14156 .cm_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080014157 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014158 }
14159#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
14160
14161
14162#if XNN_ARCH_ARM || XNN_ARCH_ARM64
14163 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_eq_16) {
14164 TEST_REQUIRES_ARM_NEON_V8;
14165 GemmMicrokernelTester()
14166 .mr(1)
14167 .nr(8)
14168 .kr(8)
14169 .sr(1)
14170 .m(1)
14171 .n(8)
14172 .k(16)
Marat Dukhan50323b82022-01-11 00:12:01 -080014173 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014174 }
14175
14176 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, strided_cn) {
14177 TEST_REQUIRES_ARM_NEON_V8;
14178 GemmMicrokernelTester()
14179 .mr(1)
14180 .nr(8)
14181 .kr(8)
14182 .sr(1)
14183 .m(1)
14184 .n(8)
14185 .k(16)
14186 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080014187 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014188 }
14189
14190 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_eq_16_strided_a) {
14191 TEST_REQUIRES_ARM_NEON_V8;
14192 GemmMicrokernelTester()
14193 .mr(1)
14194 .nr(8)
14195 .kr(8)
14196 .sr(1)
14197 .m(1)
14198 .n(8)
14199 .k(16)
14200 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080014201 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014202 }
14203
14204 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_eq_16_subtile) {
14205 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng83844ae2022-01-14 09:52:25 -080014206 for (uint32_t n = 1; n <= 8; n++) {
14207 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014208 GemmMicrokernelTester()
14209 .mr(1)
14210 .nr(8)
14211 .kr(8)
14212 .sr(1)
14213 .m(m)
14214 .n(n)
14215 .k(16)
14216 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080014217 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014218 }
14219 }
14220 }
14221
14222 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_eq_16_subtile_m) {
14223 TEST_REQUIRES_ARM_NEON_V8;
14224 for (uint32_t m = 1; m <= 1; m++) {
14225 GemmMicrokernelTester()
14226 .mr(1)
14227 .nr(8)
14228 .kr(8)
14229 .sr(1)
14230 .m(m)
14231 .n(8)
14232 .k(16)
14233 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080014234 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014235 }
14236 }
14237
14238 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_eq_16_subtile_n) {
14239 TEST_REQUIRES_ARM_NEON_V8;
14240 for (uint32_t n = 1; n <= 8; n++) {
14241 GemmMicrokernelTester()
14242 .mr(1)
14243 .nr(8)
14244 .kr(8)
14245 .sr(1)
14246 .m(1)
14247 .n(n)
14248 .k(16)
14249 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080014250 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014251 }
14252 }
14253
14254 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_lt_16) {
14255 TEST_REQUIRES_ARM_NEON_V8;
14256 for (size_t k = 1; k < 16; k++) {
14257 GemmMicrokernelTester()
14258 .mr(1)
14259 .nr(8)
14260 .kr(8)
14261 .sr(1)
14262 .m(1)
14263 .n(8)
14264 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080014265 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014266 }
14267 }
14268
14269 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_lt_16_strided_a) {
14270 TEST_REQUIRES_ARM_NEON_V8;
14271 for (size_t k = 1; k < 16; k++) {
14272 GemmMicrokernelTester()
14273 .mr(1)
14274 .nr(8)
14275 .kr(8)
14276 .sr(1)
14277 .m(1)
14278 .n(8)
14279 .k(k)
14280 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080014281 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014282 }
14283 }
14284
14285 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_lt_16_subtile) {
14286 TEST_REQUIRES_ARM_NEON_V8;
14287 for (size_t k = 1; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080014288 for (uint32_t n = 1; n <= 8; n++) {
14289 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014290 GemmMicrokernelTester()
14291 .mr(1)
14292 .nr(8)
14293 .kr(8)
14294 .sr(1)
14295 .m(m)
14296 .n(n)
14297 .k(k)
14298 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080014299 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014300 }
14301 }
14302 }
14303 }
14304
14305 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_gt_16) {
14306 TEST_REQUIRES_ARM_NEON_V8;
14307 for (size_t k = 17; k < 32; k++) {
14308 GemmMicrokernelTester()
14309 .mr(1)
14310 .nr(8)
14311 .kr(8)
14312 .sr(1)
14313 .m(1)
14314 .n(8)
14315 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080014316 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014317 }
14318 }
14319
14320 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_gt_16_strided_a) {
14321 TEST_REQUIRES_ARM_NEON_V8;
14322 for (size_t k = 17; k < 32; k++) {
14323 GemmMicrokernelTester()
14324 .mr(1)
14325 .nr(8)
14326 .kr(8)
14327 .sr(1)
14328 .m(1)
14329 .n(8)
14330 .k(k)
14331 .a_stride(37)
Marat Dukhan50323b82022-01-11 00:12:01 -080014332 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014333 }
14334 }
14335
14336 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_gt_16_subtile) {
14337 TEST_REQUIRES_ARM_NEON_V8;
14338 for (size_t k = 17; k < 32; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080014339 for (uint32_t n = 1; n <= 8; n++) {
14340 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014341 GemmMicrokernelTester()
14342 .mr(1)
14343 .nr(8)
14344 .kr(8)
14345 .sr(1)
14346 .m(m)
14347 .n(n)
14348 .k(k)
14349 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080014350 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014351 }
14352 }
14353 }
14354 }
14355
14356 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_div_16) {
14357 TEST_REQUIRES_ARM_NEON_V8;
14358 for (size_t k = 32; k <= 160; k += 16) {
14359 GemmMicrokernelTester()
14360 .mr(1)
14361 .nr(8)
14362 .kr(8)
14363 .sr(1)
14364 .m(1)
14365 .n(8)
14366 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080014367 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014368 }
14369 }
14370
14371 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_div_16_strided_a) {
14372 TEST_REQUIRES_ARM_NEON_V8;
14373 for (size_t k = 32; k <= 160; k += 16) {
14374 GemmMicrokernelTester()
14375 .mr(1)
14376 .nr(8)
14377 .kr(8)
14378 .sr(1)
14379 .m(1)
14380 .n(8)
14381 .k(k)
14382 .a_stride(163)
Marat Dukhan50323b82022-01-11 00:12:01 -080014383 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014384 }
14385 }
14386
14387 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, k_div_16_subtile) {
14388 TEST_REQUIRES_ARM_NEON_V8;
14389 for (size_t k = 32; k <= 160; k += 16) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080014390 for (uint32_t n = 1; n <= 8; n++) {
14391 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014392 GemmMicrokernelTester()
14393 .mr(1)
14394 .nr(8)
14395 .kr(8)
14396 .sr(1)
14397 .m(m)
14398 .n(n)
14399 .k(k)
14400 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080014401 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014402 }
14403 }
14404 }
14405 }
14406
14407 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, n_gt_8) {
14408 TEST_REQUIRES_ARM_NEON_V8;
14409 for (uint32_t n = 9; n < 16; n++) {
14410 for (size_t k = 1; k <= 80; k += 17) {
14411 GemmMicrokernelTester()
14412 .mr(1)
14413 .nr(8)
14414 .kr(8)
14415 .sr(1)
14416 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080014417 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014418 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080014419 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014420 }
14421 }
14422 }
14423
14424 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, n_gt_8_strided_cn) {
14425 TEST_REQUIRES_ARM_NEON_V8;
14426 for (uint32_t n = 9; n < 16; n++) {
14427 for (size_t k = 1; k <= 80; k += 17) {
14428 GemmMicrokernelTester()
14429 .mr(1)
14430 .nr(8)
14431 .kr(8)
14432 .sr(1)
14433 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080014434 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014435 .k(k)
14436 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080014437 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014438 }
14439 }
14440 }
14441
14442 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, n_gt_8_strided_a) {
14443 TEST_REQUIRES_ARM_NEON_V8;
14444 for (uint32_t n = 9; n < 16; n++) {
14445 for (size_t k = 1; k <= 80; k += 17) {
14446 GemmMicrokernelTester()
14447 .mr(1)
14448 .nr(8)
14449 .kr(8)
14450 .sr(1)
14451 .m(1)
14452 .n(n)
14453 .k(k)
14454 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080014455 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014456 }
14457 }
14458 }
14459
14460 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, n_gt_8_subtile) {
14461 TEST_REQUIRES_ARM_NEON_V8;
14462 for (uint32_t n = 9; n < 16; n++) {
14463 for (size_t k = 1; k <= 80; k += 17) {
14464 for (uint32_t m = 1; m <= 1; m++) {
14465 GemmMicrokernelTester()
14466 .mr(1)
14467 .nr(8)
14468 .kr(8)
14469 .sr(1)
14470 .m(m)
14471 .n(n)
14472 .k(k)
14473 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080014474 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014475 }
14476 }
14477 }
14478 }
14479
14480 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, n_div_8) {
14481 TEST_REQUIRES_ARM_NEON_V8;
14482 for (uint32_t n = 16; n <= 24; n += 8) {
14483 for (size_t k = 1; k <= 80; k += 17) {
14484 GemmMicrokernelTester()
14485 .mr(1)
14486 .nr(8)
14487 .kr(8)
14488 .sr(1)
14489 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080014490 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014491 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080014492 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014493 }
14494 }
14495 }
14496
14497 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, n_div_8_strided_cn) {
14498 TEST_REQUIRES_ARM_NEON_V8;
14499 for (uint32_t n = 16; n <= 24; n += 8) {
14500 for (size_t k = 1; k <= 80; k += 17) {
14501 GemmMicrokernelTester()
14502 .mr(1)
14503 .nr(8)
14504 .kr(8)
14505 .sr(1)
14506 .m(1)
14507 .n(n)
14508 .k(k)
14509 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080014510 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014511 }
14512 }
14513 }
14514
14515 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, n_div_8_strided_a) {
14516 TEST_REQUIRES_ARM_NEON_V8;
14517 for (uint32_t n = 16; n <= 24; n += 8) {
14518 for (size_t k = 1; k <= 80; k += 17) {
14519 GemmMicrokernelTester()
14520 .mr(1)
14521 .nr(8)
14522 .kr(8)
14523 .sr(1)
14524 .m(1)
14525 .n(n)
14526 .k(k)
14527 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080014528 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014529 }
14530 }
14531 }
14532
14533 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, n_div_8_subtile) {
14534 TEST_REQUIRES_ARM_NEON_V8;
14535 for (uint32_t n = 16; n <= 24; n += 8) {
14536 for (size_t k = 1; k <= 80; k += 17) {
14537 for (uint32_t m = 1; m <= 1; m++) {
14538 GemmMicrokernelTester()
14539 .mr(1)
14540 .nr(8)
14541 .kr(8)
14542 .sr(1)
14543 .m(m)
14544 .n(n)
14545 .k(k)
14546 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080014547 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014548 }
14549 }
14550 }
14551 }
14552
14553 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, strided_cm_subtile) {
14554 TEST_REQUIRES_ARM_NEON_V8;
14555 for (size_t k = 1; k <= 80; k += 17) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080014556 for (uint32_t n = 1; n <= 8; n++) {
14557 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014558 GemmMicrokernelTester()
14559 .mr(1)
14560 .nr(8)
14561 .kr(8)
14562 .sr(1)
14563 .m(m)
14564 .n(n)
14565 .k(k)
14566 .cm_stride(11)
14567 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080014568 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014569 }
14570 }
14571 }
14572 }
14573
14574 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, qmin) {
14575 TEST_REQUIRES_ARM_NEON_V8;
14576 GemmMicrokernelTester()
14577 .mr(1)
14578 .nr(8)
14579 .kr(8)
14580 .sr(1)
14581 .m(1)
14582 .n(8)
14583 .k(16)
14584 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080014585 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014586 }
14587
14588 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, qmax) {
14589 TEST_REQUIRES_ARM_NEON_V8;
14590 GemmMicrokernelTester()
14591 .mr(1)
14592 .nr(8)
14593 .kr(8)
14594 .sr(1)
14595 .m(1)
14596 .n(8)
14597 .k(16)
14598 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080014599 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014600 }
14601
14602 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL, strided_cm) {
14603 TEST_REQUIRES_ARM_NEON_V8;
14604 GemmMicrokernelTester()
14605 .mr(1)
14606 .nr(8)
14607 .kr(8)
14608 .sr(1)
14609 .m(1)
14610 .n(8)
14611 .k(16)
14612 .cm_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080014613 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014614 }
14615#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
14616
14617
14618#if XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64
14619 TEST(QC8_GEMM_MINMAX_FP32_6X16C4__NEONDOT, k_eq_8) {
14620 TEST_REQUIRES_ARM_NEON_DOT;
14621 GemmMicrokernelTester()
14622 .mr(6)
14623 .nr(16)
14624 .kr(4)
14625 .sr(1)
14626 .m(6)
14627 .n(16)
14628 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080014629 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014630 }
14631
14632 TEST(QC8_GEMM_MINMAX_FP32_6X16C4__NEONDOT, strided_cn) {
14633 TEST_REQUIRES_ARM_NEON_DOT;
14634 GemmMicrokernelTester()
14635 .mr(6)
14636 .nr(16)
14637 .kr(4)
14638 .sr(1)
14639 .m(6)
14640 .n(16)
14641 .k(8)
14642 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080014643 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014644 }
14645
14646 TEST(QC8_GEMM_MINMAX_FP32_6X16C4__NEONDOT, k_eq_8_strided_a) {
14647 TEST_REQUIRES_ARM_NEON_DOT;
14648 GemmMicrokernelTester()
14649 .mr(6)
14650 .nr(16)
14651 .kr(4)
14652 .sr(1)
14653 .m(6)
14654 .n(16)
14655 .k(8)
14656 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080014657 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014658 }
14659
14660 TEST(QC8_GEMM_MINMAX_FP32_6X16C4__NEONDOT, k_eq_8_subtile) {
14661 TEST_REQUIRES_ARM_NEON_DOT;
Zhi An Ng83844ae2022-01-14 09:52:25 -080014662 for (uint32_t n = 1; n <= 16; n++) {
14663 for (uint32_t m = 1; m <= 6; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014664 GemmMicrokernelTester()
14665 .mr(6)
14666 .nr(16)
14667 .kr(4)
14668 .sr(1)
14669 .m(m)
14670 .n(n)
14671 .k(8)
14672 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080014673 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014674 }
14675 }
14676 }
14677
14678 TEST(QC8_GEMM_MINMAX_FP32_6X16C4__NEONDOT, k_eq_8_subtile_m) {
14679 TEST_REQUIRES_ARM_NEON_DOT;
14680 for (uint32_t m = 1; m <= 6; m++) {
14681 GemmMicrokernelTester()
14682 .mr(6)
14683 .nr(16)
14684 .kr(4)
14685 .sr(1)
14686 .m(m)
14687 .n(16)
14688 .k(8)
14689 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080014690 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014691 }
14692 }
14693
14694 TEST(QC8_GEMM_MINMAX_FP32_6X16C4__NEONDOT, k_eq_8_subtile_n) {
14695 TEST_REQUIRES_ARM_NEON_DOT;
14696 for (uint32_t n = 1; n <= 16; n++) {
14697 GemmMicrokernelTester()
14698 .mr(6)
14699 .nr(16)
14700 .kr(4)
14701 .sr(1)
14702 .m(6)
14703 .n(n)
14704 .k(8)
14705 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080014706 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014707 }
14708 }
14709
14710 TEST(QC8_GEMM_MINMAX_FP32_6X16C4__NEONDOT, k_lt_8) {
14711 TEST_REQUIRES_ARM_NEON_DOT;
14712 for (size_t k = 1; k < 8; k++) {
14713 GemmMicrokernelTester()
14714 .mr(6)
14715 .nr(16)
14716 .kr(4)
14717 .sr(1)
14718 .m(6)
14719 .n(16)
14720 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080014721 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014722 }
14723 }
14724
14725 TEST(QC8_GEMM_MINMAX_FP32_6X16C4__NEONDOT, k_lt_8_strided_a) {
14726 TEST_REQUIRES_ARM_NEON_DOT;
14727 for (size_t k = 1; k < 8; k++) {
14728 GemmMicrokernelTester()
14729 .mr(6)
14730 .nr(16)
14731 .kr(4)
14732 .sr(1)
14733 .m(6)
14734 .n(16)
14735 .k(k)
14736 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080014737 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014738 }
14739 }
14740
14741 TEST(QC8_GEMM_MINMAX_FP32_6X16C4__NEONDOT, k_lt_8_subtile) {
14742 TEST_REQUIRES_ARM_NEON_DOT;
14743 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080014744 for (uint32_t n = 1; n <= 16; n++) {
14745 for (uint32_t m = 1; m <= 6; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014746 GemmMicrokernelTester()
14747 .mr(6)
14748 .nr(16)
14749 .kr(4)
14750 .sr(1)
14751 .m(m)
14752 .n(n)
14753 .k(k)
14754 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080014755 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014756 }
14757 }
14758 }
14759 }
14760
14761 TEST(QC8_GEMM_MINMAX_FP32_6X16C4__NEONDOT, k_gt_8) {
14762 TEST_REQUIRES_ARM_NEON_DOT;
14763 for (size_t k = 9; k < 16; k++) {
14764 GemmMicrokernelTester()
14765 .mr(6)
14766 .nr(16)
14767 .kr(4)
14768 .sr(1)
14769 .m(6)
14770 .n(16)
14771 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080014772 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014773 }
14774 }
14775
14776 TEST(QC8_GEMM_MINMAX_FP32_6X16C4__NEONDOT, k_gt_8_strided_a) {
14777 TEST_REQUIRES_ARM_NEON_DOT;
14778 for (size_t k = 9; k < 16; k++) {
14779 GemmMicrokernelTester()
14780 .mr(6)
14781 .nr(16)
14782 .kr(4)
14783 .sr(1)
14784 .m(6)
14785 .n(16)
14786 .k(k)
14787 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080014788 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014789 }
14790 }
14791
14792 TEST(QC8_GEMM_MINMAX_FP32_6X16C4__NEONDOT, k_gt_8_subtile) {
14793 TEST_REQUIRES_ARM_NEON_DOT;
14794 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080014795 for (uint32_t n = 1; n <= 16; n++) {
14796 for (uint32_t m = 1; m <= 6; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014797 GemmMicrokernelTester()
14798 .mr(6)
14799 .nr(16)
14800 .kr(4)
14801 .sr(1)
14802 .m(m)
14803 .n(n)
14804 .k(k)
14805 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080014806 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014807 }
14808 }
14809 }
14810 }
14811
14812 TEST(QC8_GEMM_MINMAX_FP32_6X16C4__NEONDOT, k_div_8) {
14813 TEST_REQUIRES_ARM_NEON_DOT;
14814 for (size_t k = 16; k <= 80; k += 8) {
14815 GemmMicrokernelTester()
14816 .mr(6)
14817 .nr(16)
14818 .kr(4)
14819 .sr(1)
14820 .m(6)
14821 .n(16)
14822 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080014823 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014824 }
14825 }
14826
14827 TEST(QC8_GEMM_MINMAX_FP32_6X16C4__NEONDOT, k_div_8_strided_a) {
14828 TEST_REQUIRES_ARM_NEON_DOT;
14829 for (size_t k = 16; k <= 80; k += 8) {
14830 GemmMicrokernelTester()
14831 .mr(6)
14832 .nr(16)
14833 .kr(4)
14834 .sr(1)
14835 .m(6)
14836 .n(16)
14837 .k(k)
14838 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080014839 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014840 }
14841 }
14842
14843 TEST(QC8_GEMM_MINMAX_FP32_6X16C4__NEONDOT, k_div_8_subtile) {
14844 TEST_REQUIRES_ARM_NEON_DOT;
14845 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080014846 for (uint32_t n = 1; n <= 16; n++) {
14847 for (uint32_t m = 1; m <= 6; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014848 GemmMicrokernelTester()
14849 .mr(6)
14850 .nr(16)
14851 .kr(4)
14852 .sr(1)
14853 .m(m)
14854 .n(n)
14855 .k(k)
14856 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080014857 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014858 }
14859 }
14860 }
14861 }
14862
14863 TEST(QC8_GEMM_MINMAX_FP32_6X16C4__NEONDOT, n_gt_16) {
14864 TEST_REQUIRES_ARM_NEON_DOT;
14865 for (uint32_t n = 17; n < 32; n++) {
14866 for (size_t k = 1; k <= 40; k += 9) {
14867 GemmMicrokernelTester()
14868 .mr(6)
14869 .nr(16)
14870 .kr(4)
14871 .sr(1)
14872 .m(6)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080014873 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014874 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080014875 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014876 }
14877 }
14878 }
14879
14880 TEST(QC8_GEMM_MINMAX_FP32_6X16C4__NEONDOT, n_gt_16_strided_cn) {
14881 TEST_REQUIRES_ARM_NEON_DOT;
14882 for (uint32_t n = 17; n < 32; n++) {
14883 for (size_t k = 1; k <= 40; k += 9) {
14884 GemmMicrokernelTester()
14885 .mr(6)
14886 .nr(16)
14887 .kr(4)
14888 .sr(1)
14889 .m(6)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080014890 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014891 .k(k)
14892 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080014893 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014894 }
14895 }
14896 }
14897
14898 TEST(QC8_GEMM_MINMAX_FP32_6X16C4__NEONDOT, n_gt_16_strided_a) {
14899 TEST_REQUIRES_ARM_NEON_DOT;
14900 for (uint32_t n = 17; n < 32; n++) {
14901 for (size_t k = 1; k <= 40; k += 9) {
14902 GemmMicrokernelTester()
14903 .mr(6)
14904 .nr(16)
14905 .kr(4)
14906 .sr(1)
14907 .m(6)
14908 .n(n)
14909 .k(k)
14910 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080014911 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014912 }
14913 }
14914 }
14915
14916 TEST(QC8_GEMM_MINMAX_FP32_6X16C4__NEONDOT, n_gt_16_subtile) {
14917 TEST_REQUIRES_ARM_NEON_DOT;
14918 for (uint32_t n = 17; n < 32; n++) {
14919 for (size_t k = 1; k <= 40; k += 9) {
14920 for (uint32_t m = 1; m <= 6; m++) {
14921 GemmMicrokernelTester()
14922 .mr(6)
14923 .nr(16)
14924 .kr(4)
14925 .sr(1)
14926 .m(m)
14927 .n(n)
14928 .k(k)
14929 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080014930 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014931 }
14932 }
14933 }
14934 }
14935
14936 TEST(QC8_GEMM_MINMAX_FP32_6X16C4__NEONDOT, n_div_16) {
14937 TEST_REQUIRES_ARM_NEON_DOT;
14938 for (uint32_t n = 32; n <= 48; n += 16) {
14939 for (size_t k = 1; k <= 40; k += 9) {
14940 GemmMicrokernelTester()
14941 .mr(6)
14942 .nr(16)
14943 .kr(4)
14944 .sr(1)
14945 .m(6)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080014946 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014947 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080014948 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014949 }
14950 }
14951 }
14952
14953 TEST(QC8_GEMM_MINMAX_FP32_6X16C4__NEONDOT, n_div_16_strided_cn) {
14954 TEST_REQUIRES_ARM_NEON_DOT;
14955 for (uint32_t n = 32; n <= 48; n += 16) {
14956 for (size_t k = 1; k <= 40; k += 9) {
14957 GemmMicrokernelTester()
14958 .mr(6)
14959 .nr(16)
14960 .kr(4)
14961 .sr(1)
14962 .m(6)
14963 .n(n)
14964 .k(k)
14965 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080014966 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014967 }
14968 }
14969 }
14970
14971 TEST(QC8_GEMM_MINMAX_FP32_6X16C4__NEONDOT, n_div_16_strided_a) {
14972 TEST_REQUIRES_ARM_NEON_DOT;
14973 for (uint32_t n = 32; n <= 48; n += 16) {
14974 for (size_t k = 1; k <= 40; k += 9) {
14975 GemmMicrokernelTester()
14976 .mr(6)
14977 .nr(16)
14978 .kr(4)
14979 .sr(1)
14980 .m(6)
14981 .n(n)
14982 .k(k)
14983 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080014984 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080014985 }
14986 }
14987 }
14988
14989 TEST(QC8_GEMM_MINMAX_FP32_6X16C4__NEONDOT, n_div_16_subtile) {
14990 TEST_REQUIRES_ARM_NEON_DOT;
14991 for (uint32_t n = 32; n <= 48; n += 16) {
14992 for (size_t k = 1; k <= 40; k += 9) {
14993 for (uint32_t m = 1; m <= 6; m++) {
14994 GemmMicrokernelTester()
14995 .mr(6)
14996 .nr(16)
14997 .kr(4)
14998 .sr(1)
14999 .m(m)
15000 .n(n)
15001 .k(k)
15002 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080015003 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015004 }
15005 }
15006 }
15007 }
15008
15009 TEST(QC8_GEMM_MINMAX_FP32_6X16C4__NEONDOT, strided_cm_subtile) {
15010 TEST_REQUIRES_ARM_NEON_DOT;
15011 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080015012 for (uint32_t n = 1; n <= 16; n++) {
15013 for (uint32_t m = 1; m <= 6; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015014 GemmMicrokernelTester()
15015 .mr(6)
15016 .nr(16)
15017 .kr(4)
15018 .sr(1)
15019 .m(m)
15020 .n(n)
15021 .k(k)
15022 .cm_stride(19)
15023 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080015024 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015025 }
15026 }
15027 }
15028 }
15029
15030 TEST(QC8_GEMM_MINMAX_FP32_6X16C4__NEONDOT, qmin) {
15031 TEST_REQUIRES_ARM_NEON_DOT;
15032 GemmMicrokernelTester()
15033 .mr(6)
15034 .nr(16)
15035 .kr(4)
15036 .sr(1)
15037 .m(6)
15038 .n(16)
15039 .k(8)
15040 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080015041 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015042 }
15043
15044 TEST(QC8_GEMM_MINMAX_FP32_6X16C4__NEONDOT, qmax) {
15045 TEST_REQUIRES_ARM_NEON_DOT;
15046 GemmMicrokernelTester()
15047 .mr(6)
15048 .nr(16)
15049 .kr(4)
15050 .sr(1)
15051 .m(6)
15052 .n(16)
15053 .k(8)
15054 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080015055 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015056 }
15057
15058 TEST(QC8_GEMM_MINMAX_FP32_6X16C4__NEONDOT, strided_cm) {
15059 TEST_REQUIRES_ARM_NEON_DOT;
15060 GemmMicrokernelTester()
15061 .mr(6)
15062 .nr(16)
15063 .kr(4)
15064 .sr(1)
15065 .m(6)
15066 .n(16)
15067 .k(8)
15068 .cm_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080015069 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_6x16c4__neondot, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015070 }
15071#endif // XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64
15072
15073
15074#if XNN_ARCH_X86 || XNN_ARCH_X86_64
15075 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_eq_8) {
15076 TEST_REQUIRES_X86_SSE2;
15077 GemmMicrokernelTester()
15078 .mr(4)
15079 .nr(4)
15080 .kr(2)
15081 .sr(1)
15082 .m(4)
15083 .n(4)
15084 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080015085 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015086 }
15087
15088 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, strided_cn) {
15089 TEST_REQUIRES_X86_SSE2;
15090 GemmMicrokernelTester()
15091 .mr(4)
15092 .nr(4)
15093 .kr(2)
15094 .sr(1)
15095 .m(4)
15096 .n(4)
15097 .k(8)
15098 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080015099 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015100 }
15101
15102 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_eq_8_strided_a) {
15103 TEST_REQUIRES_X86_SSE2;
15104 GemmMicrokernelTester()
15105 .mr(4)
15106 .nr(4)
15107 .kr(2)
15108 .sr(1)
15109 .m(4)
15110 .n(4)
15111 .k(8)
15112 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080015113 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015114 }
15115
15116 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_eq_8_subtile) {
15117 TEST_REQUIRES_X86_SSE2;
Zhi An Ng83844ae2022-01-14 09:52:25 -080015118 for (uint32_t n = 1; n <= 4; n++) {
15119 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015120 GemmMicrokernelTester()
15121 .mr(4)
15122 .nr(4)
15123 .kr(2)
15124 .sr(1)
15125 .m(m)
15126 .n(n)
15127 .k(8)
15128 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080015129 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015130 }
15131 }
15132 }
15133
15134 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_eq_8_subtile_m) {
15135 TEST_REQUIRES_X86_SSE2;
15136 for (uint32_t m = 1; m <= 4; m++) {
15137 GemmMicrokernelTester()
15138 .mr(4)
15139 .nr(4)
15140 .kr(2)
15141 .sr(1)
15142 .m(m)
15143 .n(4)
15144 .k(8)
15145 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080015146 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015147 }
15148 }
15149
15150 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_eq_8_subtile_n) {
15151 TEST_REQUIRES_X86_SSE2;
15152 for (uint32_t n = 1; n <= 4; n++) {
15153 GemmMicrokernelTester()
15154 .mr(4)
15155 .nr(4)
15156 .kr(2)
15157 .sr(1)
15158 .m(4)
15159 .n(n)
15160 .k(8)
15161 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080015162 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015163 }
15164 }
15165
15166 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_lt_8) {
15167 TEST_REQUIRES_X86_SSE2;
15168 for (size_t k = 1; k < 8; k++) {
15169 GemmMicrokernelTester()
15170 .mr(4)
15171 .nr(4)
15172 .kr(2)
15173 .sr(1)
15174 .m(4)
15175 .n(4)
15176 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080015177 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015178 }
15179 }
15180
15181 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_lt_8_strided_a) {
15182 TEST_REQUIRES_X86_SSE2;
15183 for (size_t k = 1; k < 8; k++) {
15184 GemmMicrokernelTester()
15185 .mr(4)
15186 .nr(4)
15187 .kr(2)
15188 .sr(1)
15189 .m(4)
15190 .n(4)
15191 .k(k)
15192 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080015193 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015194 }
15195 }
15196
15197 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_lt_8_subtile) {
15198 TEST_REQUIRES_X86_SSE2;
15199 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080015200 for (uint32_t n = 1; n <= 4; n++) {
15201 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015202 GemmMicrokernelTester()
15203 .mr(4)
15204 .nr(4)
15205 .kr(2)
15206 .sr(1)
15207 .m(m)
15208 .n(n)
15209 .k(k)
15210 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080015211 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015212 }
15213 }
15214 }
15215 }
15216
15217 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_gt_8) {
15218 TEST_REQUIRES_X86_SSE2;
15219 for (size_t k = 9; k < 16; k++) {
15220 GemmMicrokernelTester()
15221 .mr(4)
15222 .nr(4)
15223 .kr(2)
15224 .sr(1)
15225 .m(4)
15226 .n(4)
15227 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080015228 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015229 }
15230 }
15231
15232 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_gt_8_strided_a) {
15233 TEST_REQUIRES_X86_SSE2;
15234 for (size_t k = 9; k < 16; k++) {
15235 GemmMicrokernelTester()
15236 .mr(4)
15237 .nr(4)
15238 .kr(2)
15239 .sr(1)
15240 .m(4)
15241 .n(4)
15242 .k(k)
15243 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080015244 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015245 }
15246 }
15247
15248 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_gt_8_subtile) {
15249 TEST_REQUIRES_X86_SSE2;
15250 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080015251 for (uint32_t n = 1; n <= 4; n++) {
15252 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015253 GemmMicrokernelTester()
15254 .mr(4)
15255 .nr(4)
15256 .kr(2)
15257 .sr(1)
15258 .m(m)
15259 .n(n)
15260 .k(k)
15261 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080015262 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015263 }
15264 }
15265 }
15266 }
15267
15268 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_div_8) {
15269 TEST_REQUIRES_X86_SSE2;
15270 for (size_t k = 16; k <= 80; k += 8) {
15271 GemmMicrokernelTester()
15272 .mr(4)
15273 .nr(4)
15274 .kr(2)
15275 .sr(1)
15276 .m(4)
15277 .n(4)
15278 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080015279 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015280 }
15281 }
15282
15283 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_div_8_strided_a) {
15284 TEST_REQUIRES_X86_SSE2;
15285 for (size_t k = 16; k <= 80; k += 8) {
15286 GemmMicrokernelTester()
15287 .mr(4)
15288 .nr(4)
15289 .kr(2)
15290 .sr(1)
15291 .m(4)
15292 .n(4)
15293 .k(k)
15294 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080015295 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015296 }
15297 }
15298
15299 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_div_8_subtile) {
15300 TEST_REQUIRES_X86_SSE2;
15301 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080015302 for (uint32_t n = 1; n <= 4; n++) {
15303 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015304 GemmMicrokernelTester()
15305 .mr(4)
15306 .nr(4)
15307 .kr(2)
15308 .sr(1)
15309 .m(m)
15310 .n(n)
15311 .k(k)
15312 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080015313 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015314 }
15315 }
15316 }
15317 }
15318
15319 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, n_gt_4) {
15320 TEST_REQUIRES_X86_SSE2;
15321 for (uint32_t n = 5; n < 8; n++) {
15322 for (size_t k = 1; k <= 40; k += 9) {
15323 GemmMicrokernelTester()
15324 .mr(4)
15325 .nr(4)
15326 .kr(2)
15327 .sr(1)
15328 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080015329 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015330 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080015331 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015332 }
15333 }
15334 }
15335
15336 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, n_gt_4_strided_cn) {
15337 TEST_REQUIRES_X86_SSE2;
15338 for (uint32_t n = 5; n < 8; n++) {
15339 for (size_t k = 1; k <= 40; k += 9) {
15340 GemmMicrokernelTester()
15341 .mr(4)
15342 .nr(4)
15343 .kr(2)
15344 .sr(1)
15345 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080015346 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015347 .k(k)
15348 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080015349 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015350 }
15351 }
15352 }
15353
15354 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, n_gt_4_strided_a) {
15355 TEST_REQUIRES_X86_SSE2;
15356 for (uint32_t n = 5; n < 8; n++) {
15357 for (size_t k = 1; k <= 40; k += 9) {
15358 GemmMicrokernelTester()
15359 .mr(4)
15360 .nr(4)
15361 .kr(2)
15362 .sr(1)
15363 .m(4)
15364 .n(n)
15365 .k(k)
15366 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080015367 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015368 }
15369 }
15370 }
15371
15372 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, n_gt_4_subtile) {
15373 TEST_REQUIRES_X86_SSE2;
15374 for (uint32_t n = 5; n < 8; n++) {
15375 for (size_t k = 1; k <= 40; k += 9) {
15376 for (uint32_t m = 1; m <= 4; m++) {
15377 GemmMicrokernelTester()
15378 .mr(4)
15379 .nr(4)
15380 .kr(2)
15381 .sr(1)
15382 .m(m)
15383 .n(n)
15384 .k(k)
15385 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080015386 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015387 }
15388 }
15389 }
15390 }
15391
15392 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, n_div_4) {
15393 TEST_REQUIRES_X86_SSE2;
15394 for (uint32_t n = 8; n <= 12; n += 4) {
15395 for (size_t k = 1; k <= 40; k += 9) {
15396 GemmMicrokernelTester()
15397 .mr(4)
15398 .nr(4)
15399 .kr(2)
15400 .sr(1)
15401 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080015402 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015403 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080015404 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015405 }
15406 }
15407 }
15408
15409 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, n_div_4_strided_cn) {
15410 TEST_REQUIRES_X86_SSE2;
15411 for (uint32_t n = 8; n <= 12; n += 4) {
15412 for (size_t k = 1; k <= 40; k += 9) {
15413 GemmMicrokernelTester()
15414 .mr(4)
15415 .nr(4)
15416 .kr(2)
15417 .sr(1)
15418 .m(4)
15419 .n(n)
15420 .k(k)
15421 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080015422 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015423 }
15424 }
15425 }
15426
15427 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, n_div_4_strided_a) {
15428 TEST_REQUIRES_X86_SSE2;
15429 for (uint32_t n = 8; n <= 12; n += 4) {
15430 for (size_t k = 1; k <= 40; k += 9) {
15431 GemmMicrokernelTester()
15432 .mr(4)
15433 .nr(4)
15434 .kr(2)
15435 .sr(1)
15436 .m(4)
15437 .n(n)
15438 .k(k)
15439 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080015440 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015441 }
15442 }
15443 }
15444
15445 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, n_div_4_subtile) {
15446 TEST_REQUIRES_X86_SSE2;
15447 for (uint32_t n = 8; n <= 12; n += 4) {
15448 for (size_t k = 1; k <= 40; k += 9) {
15449 for (uint32_t m = 1; m <= 4; m++) {
15450 GemmMicrokernelTester()
15451 .mr(4)
15452 .nr(4)
15453 .kr(2)
15454 .sr(1)
15455 .m(m)
15456 .n(n)
15457 .k(k)
15458 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080015459 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015460 }
15461 }
15462 }
15463 }
15464
15465 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, strided_cm_subtile) {
15466 TEST_REQUIRES_X86_SSE2;
15467 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080015468 for (uint32_t n = 1; n <= 4; n++) {
15469 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015470 GemmMicrokernelTester()
15471 .mr(4)
15472 .nr(4)
15473 .kr(2)
15474 .sr(1)
15475 .m(m)
15476 .n(n)
15477 .k(k)
15478 .cm_stride(7)
15479 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080015480 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015481 }
15482 }
15483 }
15484 }
15485
15486 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, qmin) {
15487 TEST_REQUIRES_X86_SSE2;
15488 GemmMicrokernelTester()
15489 .mr(4)
15490 .nr(4)
15491 .kr(2)
15492 .sr(1)
15493 .m(4)
15494 .n(4)
15495 .k(8)
15496 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080015497 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015498 }
15499
15500 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, qmax) {
15501 TEST_REQUIRES_X86_SSE2;
15502 GemmMicrokernelTester()
15503 .mr(4)
15504 .nr(4)
15505 .kr(2)
15506 .sr(1)
15507 .m(4)
15508 .n(4)
15509 .k(8)
15510 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080015511 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015512 }
15513
15514 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, strided_cm) {
15515 TEST_REQUIRES_X86_SSE2;
15516 GemmMicrokernelTester()
15517 .mr(4)
15518 .nr(4)
15519 .kr(2)
15520 .sr(1)
15521 .m(4)
15522 .n(4)
15523 .k(8)
15524 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080015525 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015526 }
15527#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
15528
15529
15530#if XNN_ARCH_X86 || XNN_ARCH_X86_64
15531 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_eq_8) {
15532 TEST_REQUIRES_X86_AVX;
15533 GemmMicrokernelTester()
15534 .mr(4)
15535 .nr(4)
15536 .kr(2)
15537 .sr(1)
15538 .m(4)
15539 .n(4)
15540 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080015541 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015542 }
15543
15544 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, strided_cn) {
15545 TEST_REQUIRES_X86_AVX;
15546 GemmMicrokernelTester()
15547 .mr(4)
15548 .nr(4)
15549 .kr(2)
15550 .sr(1)
15551 .m(4)
15552 .n(4)
15553 .k(8)
15554 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080015555 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015556 }
15557
15558 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_eq_8_strided_a) {
15559 TEST_REQUIRES_X86_AVX;
15560 GemmMicrokernelTester()
15561 .mr(4)
15562 .nr(4)
15563 .kr(2)
15564 .sr(1)
15565 .m(4)
15566 .n(4)
15567 .k(8)
15568 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080015569 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015570 }
15571
15572 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_eq_8_subtile) {
15573 TEST_REQUIRES_X86_AVX;
Zhi An Ng83844ae2022-01-14 09:52:25 -080015574 for (uint32_t n = 1; n <= 4; n++) {
15575 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015576 GemmMicrokernelTester()
15577 .mr(4)
15578 .nr(4)
15579 .kr(2)
15580 .sr(1)
15581 .m(m)
15582 .n(n)
15583 .k(8)
15584 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080015585 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015586 }
15587 }
15588 }
15589
15590 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_eq_8_subtile_m) {
15591 TEST_REQUIRES_X86_AVX;
15592 for (uint32_t m = 1; m <= 4; m++) {
15593 GemmMicrokernelTester()
15594 .mr(4)
15595 .nr(4)
15596 .kr(2)
15597 .sr(1)
15598 .m(m)
15599 .n(4)
15600 .k(8)
15601 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080015602 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015603 }
15604 }
15605
15606 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_eq_8_subtile_n) {
15607 TEST_REQUIRES_X86_AVX;
15608 for (uint32_t n = 1; n <= 4; n++) {
15609 GemmMicrokernelTester()
15610 .mr(4)
15611 .nr(4)
15612 .kr(2)
15613 .sr(1)
15614 .m(4)
15615 .n(n)
15616 .k(8)
15617 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080015618 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015619 }
15620 }
15621
15622 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_lt_8) {
15623 TEST_REQUIRES_X86_AVX;
15624 for (size_t k = 1; k < 8; k++) {
15625 GemmMicrokernelTester()
15626 .mr(4)
15627 .nr(4)
15628 .kr(2)
15629 .sr(1)
15630 .m(4)
15631 .n(4)
15632 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080015633 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015634 }
15635 }
15636
15637 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_lt_8_strided_a) {
15638 TEST_REQUIRES_X86_AVX;
15639 for (size_t k = 1; k < 8; k++) {
15640 GemmMicrokernelTester()
15641 .mr(4)
15642 .nr(4)
15643 .kr(2)
15644 .sr(1)
15645 .m(4)
15646 .n(4)
15647 .k(k)
15648 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080015649 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015650 }
15651 }
15652
15653 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_lt_8_subtile) {
15654 TEST_REQUIRES_X86_AVX;
15655 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080015656 for (uint32_t n = 1; n <= 4; n++) {
15657 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015658 GemmMicrokernelTester()
15659 .mr(4)
15660 .nr(4)
15661 .kr(2)
15662 .sr(1)
15663 .m(m)
15664 .n(n)
15665 .k(k)
15666 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080015667 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015668 }
15669 }
15670 }
15671 }
15672
15673 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_gt_8) {
15674 TEST_REQUIRES_X86_AVX;
15675 for (size_t k = 9; k < 16; k++) {
15676 GemmMicrokernelTester()
15677 .mr(4)
15678 .nr(4)
15679 .kr(2)
15680 .sr(1)
15681 .m(4)
15682 .n(4)
15683 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080015684 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015685 }
15686 }
15687
15688 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_gt_8_strided_a) {
15689 TEST_REQUIRES_X86_AVX;
15690 for (size_t k = 9; k < 16; k++) {
15691 GemmMicrokernelTester()
15692 .mr(4)
15693 .nr(4)
15694 .kr(2)
15695 .sr(1)
15696 .m(4)
15697 .n(4)
15698 .k(k)
15699 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080015700 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015701 }
15702 }
15703
15704 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_gt_8_subtile) {
15705 TEST_REQUIRES_X86_AVX;
15706 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080015707 for (uint32_t n = 1; n <= 4; n++) {
15708 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015709 GemmMicrokernelTester()
15710 .mr(4)
15711 .nr(4)
15712 .kr(2)
15713 .sr(1)
15714 .m(m)
15715 .n(n)
15716 .k(k)
15717 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080015718 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015719 }
15720 }
15721 }
15722 }
15723
15724 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_div_8) {
15725 TEST_REQUIRES_X86_AVX;
15726 for (size_t k = 16; k <= 80; k += 8) {
15727 GemmMicrokernelTester()
15728 .mr(4)
15729 .nr(4)
15730 .kr(2)
15731 .sr(1)
15732 .m(4)
15733 .n(4)
15734 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080015735 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015736 }
15737 }
15738
15739 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_div_8_strided_a) {
15740 TEST_REQUIRES_X86_AVX;
15741 for (size_t k = 16; k <= 80; k += 8) {
15742 GemmMicrokernelTester()
15743 .mr(4)
15744 .nr(4)
15745 .kr(2)
15746 .sr(1)
15747 .m(4)
15748 .n(4)
15749 .k(k)
15750 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080015751 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015752 }
15753 }
15754
15755 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_div_8_subtile) {
15756 TEST_REQUIRES_X86_AVX;
15757 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080015758 for (uint32_t n = 1; n <= 4; n++) {
15759 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015760 GemmMicrokernelTester()
15761 .mr(4)
15762 .nr(4)
15763 .kr(2)
15764 .sr(1)
15765 .m(m)
15766 .n(n)
15767 .k(k)
15768 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080015769 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015770 }
15771 }
15772 }
15773 }
15774
15775 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, n_gt_4) {
15776 TEST_REQUIRES_X86_AVX;
15777 for (uint32_t n = 5; n < 8; n++) {
15778 for (size_t k = 1; k <= 40; k += 9) {
15779 GemmMicrokernelTester()
15780 .mr(4)
15781 .nr(4)
15782 .kr(2)
15783 .sr(1)
15784 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080015785 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015786 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080015787 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015788 }
15789 }
15790 }
15791
15792 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, n_gt_4_strided_cn) {
15793 TEST_REQUIRES_X86_AVX;
15794 for (uint32_t n = 5; n < 8; n++) {
15795 for (size_t k = 1; k <= 40; k += 9) {
15796 GemmMicrokernelTester()
15797 .mr(4)
15798 .nr(4)
15799 .kr(2)
15800 .sr(1)
15801 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080015802 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015803 .k(k)
15804 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080015805 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015806 }
15807 }
15808 }
15809
15810 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, n_gt_4_strided_a) {
15811 TEST_REQUIRES_X86_AVX;
15812 for (uint32_t n = 5; n < 8; n++) {
15813 for (size_t k = 1; k <= 40; k += 9) {
15814 GemmMicrokernelTester()
15815 .mr(4)
15816 .nr(4)
15817 .kr(2)
15818 .sr(1)
15819 .m(4)
15820 .n(n)
15821 .k(k)
15822 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080015823 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015824 }
15825 }
15826 }
15827
15828 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, n_gt_4_subtile) {
15829 TEST_REQUIRES_X86_AVX;
15830 for (uint32_t n = 5; n < 8; n++) {
15831 for (size_t k = 1; k <= 40; k += 9) {
15832 for (uint32_t m = 1; m <= 4; m++) {
15833 GemmMicrokernelTester()
15834 .mr(4)
15835 .nr(4)
15836 .kr(2)
15837 .sr(1)
15838 .m(m)
15839 .n(n)
15840 .k(k)
15841 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080015842 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015843 }
15844 }
15845 }
15846 }
15847
15848 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, n_div_4) {
15849 TEST_REQUIRES_X86_AVX;
15850 for (uint32_t n = 8; n <= 12; n += 4) {
15851 for (size_t k = 1; k <= 40; k += 9) {
15852 GemmMicrokernelTester()
15853 .mr(4)
15854 .nr(4)
15855 .kr(2)
15856 .sr(1)
15857 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080015858 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015859 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080015860 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015861 }
15862 }
15863 }
15864
15865 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, n_div_4_strided_cn) {
15866 TEST_REQUIRES_X86_AVX;
15867 for (uint32_t n = 8; n <= 12; n += 4) {
15868 for (size_t k = 1; k <= 40; k += 9) {
15869 GemmMicrokernelTester()
15870 .mr(4)
15871 .nr(4)
15872 .kr(2)
15873 .sr(1)
15874 .m(4)
15875 .n(n)
15876 .k(k)
15877 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080015878 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015879 }
15880 }
15881 }
15882
15883 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, n_div_4_strided_a) {
15884 TEST_REQUIRES_X86_AVX;
15885 for (uint32_t n = 8; n <= 12; n += 4) {
15886 for (size_t k = 1; k <= 40; k += 9) {
15887 GemmMicrokernelTester()
15888 .mr(4)
15889 .nr(4)
15890 .kr(2)
15891 .sr(1)
15892 .m(4)
15893 .n(n)
15894 .k(k)
15895 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080015896 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015897 }
15898 }
15899 }
15900
15901 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, n_div_4_subtile) {
15902 TEST_REQUIRES_X86_AVX;
15903 for (uint32_t n = 8; n <= 12; n += 4) {
15904 for (size_t k = 1; k <= 40; k += 9) {
15905 for (uint32_t m = 1; m <= 4; m++) {
15906 GemmMicrokernelTester()
15907 .mr(4)
15908 .nr(4)
15909 .kr(2)
15910 .sr(1)
15911 .m(m)
15912 .n(n)
15913 .k(k)
15914 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080015915 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015916 }
15917 }
15918 }
15919 }
15920
15921 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, strided_cm_subtile) {
15922 TEST_REQUIRES_X86_AVX;
15923 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080015924 for (uint32_t n = 1; n <= 4; n++) {
15925 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015926 GemmMicrokernelTester()
15927 .mr(4)
15928 .nr(4)
15929 .kr(2)
15930 .sr(1)
15931 .m(m)
15932 .n(n)
15933 .k(k)
15934 .cm_stride(7)
15935 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080015936 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015937 }
15938 }
15939 }
15940 }
15941
15942 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, qmin) {
15943 TEST_REQUIRES_X86_AVX;
15944 GemmMicrokernelTester()
15945 .mr(4)
15946 .nr(4)
15947 .kr(2)
15948 .sr(1)
15949 .m(4)
15950 .n(4)
15951 .k(8)
15952 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080015953 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015954 }
15955
15956 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, qmax) {
15957 TEST_REQUIRES_X86_AVX;
15958 GemmMicrokernelTester()
15959 .mr(4)
15960 .nr(4)
15961 .kr(2)
15962 .sr(1)
15963 .m(4)
15964 .n(4)
15965 .k(8)
15966 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080015967 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015968 }
15969
15970 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, strided_cm) {
15971 TEST_REQUIRES_X86_AVX;
15972 GemmMicrokernelTester()
15973 .mr(4)
15974 .nr(4)
15975 .kr(2)
15976 .sr(1)
15977 .m(4)
15978 .n(4)
15979 .k(8)
15980 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080015981 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015982 }
15983#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
15984
15985
15986#if XNN_ARCH_X86 || XNN_ARCH_X86_64
15987 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_eq_8) {
15988 TEST_REQUIRES_X86_AVX;
15989 GemmMicrokernelTester()
15990 .mr(2)
15991 .nr(4)
15992 .kr(2)
15993 .sr(1)
15994 .m(2)
15995 .n(4)
15996 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080015997 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080015998 }
15999
16000 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, strided_cn) {
16001 TEST_REQUIRES_X86_AVX;
16002 GemmMicrokernelTester()
16003 .mr(2)
16004 .nr(4)
16005 .kr(2)
16006 .sr(1)
16007 .m(2)
16008 .n(4)
16009 .k(8)
16010 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080016011 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016012 }
16013
16014 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_eq_8_strided_a) {
16015 TEST_REQUIRES_X86_AVX;
16016 GemmMicrokernelTester()
16017 .mr(2)
16018 .nr(4)
16019 .kr(2)
16020 .sr(1)
16021 .m(2)
16022 .n(4)
16023 .k(8)
16024 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080016025 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016026 }
16027
16028 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_eq_8_subtile) {
16029 TEST_REQUIRES_X86_AVX;
Zhi An Ng83844ae2022-01-14 09:52:25 -080016030 for (uint32_t n = 1; n <= 4; n++) {
16031 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016032 GemmMicrokernelTester()
16033 .mr(2)
16034 .nr(4)
16035 .kr(2)
16036 .sr(1)
16037 .m(m)
16038 .n(n)
16039 .k(8)
16040 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080016041 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016042 }
16043 }
16044 }
16045
16046 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_eq_8_subtile_m) {
16047 TEST_REQUIRES_X86_AVX;
16048 for (uint32_t m = 1; m <= 2; m++) {
16049 GemmMicrokernelTester()
16050 .mr(2)
16051 .nr(4)
16052 .kr(2)
16053 .sr(1)
16054 .m(m)
16055 .n(4)
16056 .k(8)
16057 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080016058 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016059 }
16060 }
16061
16062 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_eq_8_subtile_n) {
16063 TEST_REQUIRES_X86_AVX;
16064 for (uint32_t n = 1; n <= 4; n++) {
16065 GemmMicrokernelTester()
16066 .mr(2)
16067 .nr(4)
16068 .kr(2)
16069 .sr(1)
16070 .m(2)
16071 .n(n)
16072 .k(8)
16073 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080016074 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016075 }
16076 }
16077
16078 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_lt_8) {
16079 TEST_REQUIRES_X86_AVX;
16080 for (size_t k = 1; k < 8; k++) {
16081 GemmMicrokernelTester()
16082 .mr(2)
16083 .nr(4)
16084 .kr(2)
16085 .sr(1)
16086 .m(2)
16087 .n(4)
16088 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080016089 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016090 }
16091 }
16092
16093 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_lt_8_strided_a) {
16094 TEST_REQUIRES_X86_AVX;
16095 for (size_t k = 1; k < 8; k++) {
16096 GemmMicrokernelTester()
16097 .mr(2)
16098 .nr(4)
16099 .kr(2)
16100 .sr(1)
16101 .m(2)
16102 .n(4)
16103 .k(k)
16104 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080016105 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016106 }
16107 }
16108
16109 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_lt_8_subtile) {
16110 TEST_REQUIRES_X86_AVX;
16111 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080016112 for (uint32_t n = 1; n <= 4; n++) {
16113 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016114 GemmMicrokernelTester()
16115 .mr(2)
16116 .nr(4)
16117 .kr(2)
16118 .sr(1)
16119 .m(m)
16120 .n(n)
16121 .k(k)
16122 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080016123 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016124 }
16125 }
16126 }
16127 }
16128
16129 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_gt_8) {
16130 TEST_REQUIRES_X86_AVX;
16131 for (size_t k = 9; k < 16; k++) {
16132 GemmMicrokernelTester()
16133 .mr(2)
16134 .nr(4)
16135 .kr(2)
16136 .sr(1)
16137 .m(2)
16138 .n(4)
16139 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080016140 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016141 }
16142 }
16143
16144 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_gt_8_strided_a) {
16145 TEST_REQUIRES_X86_AVX;
16146 for (size_t k = 9; k < 16; k++) {
16147 GemmMicrokernelTester()
16148 .mr(2)
16149 .nr(4)
16150 .kr(2)
16151 .sr(1)
16152 .m(2)
16153 .n(4)
16154 .k(k)
16155 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080016156 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016157 }
16158 }
16159
16160 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_gt_8_subtile) {
16161 TEST_REQUIRES_X86_AVX;
16162 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080016163 for (uint32_t n = 1; n <= 4; n++) {
16164 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016165 GemmMicrokernelTester()
16166 .mr(2)
16167 .nr(4)
16168 .kr(2)
16169 .sr(1)
16170 .m(m)
16171 .n(n)
16172 .k(k)
16173 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080016174 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016175 }
16176 }
16177 }
16178 }
16179
16180 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_div_8) {
16181 TEST_REQUIRES_X86_AVX;
16182 for (size_t k = 16; k <= 80; k += 8) {
16183 GemmMicrokernelTester()
16184 .mr(2)
16185 .nr(4)
16186 .kr(2)
16187 .sr(1)
16188 .m(2)
16189 .n(4)
16190 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080016191 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016192 }
16193 }
16194
16195 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_div_8_strided_a) {
16196 TEST_REQUIRES_X86_AVX;
16197 for (size_t k = 16; k <= 80; k += 8) {
16198 GemmMicrokernelTester()
16199 .mr(2)
16200 .nr(4)
16201 .kr(2)
16202 .sr(1)
16203 .m(2)
16204 .n(4)
16205 .k(k)
16206 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080016207 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016208 }
16209 }
16210
16211 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_div_8_subtile) {
16212 TEST_REQUIRES_X86_AVX;
16213 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080016214 for (uint32_t n = 1; n <= 4; n++) {
16215 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016216 GemmMicrokernelTester()
16217 .mr(2)
16218 .nr(4)
16219 .kr(2)
16220 .sr(1)
16221 .m(m)
16222 .n(n)
16223 .k(k)
16224 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080016225 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016226 }
16227 }
16228 }
16229 }
16230
16231 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, n_gt_4) {
16232 TEST_REQUIRES_X86_AVX;
16233 for (uint32_t n = 5; n < 8; n++) {
16234 for (size_t k = 1; k <= 40; k += 9) {
16235 GemmMicrokernelTester()
16236 .mr(2)
16237 .nr(4)
16238 .kr(2)
16239 .sr(1)
16240 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080016241 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016242 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080016243 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016244 }
16245 }
16246 }
16247
16248 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, n_gt_4_strided_cn) {
16249 TEST_REQUIRES_X86_AVX;
16250 for (uint32_t n = 5; n < 8; n++) {
16251 for (size_t k = 1; k <= 40; k += 9) {
16252 GemmMicrokernelTester()
16253 .mr(2)
16254 .nr(4)
16255 .kr(2)
16256 .sr(1)
16257 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080016258 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016259 .k(k)
16260 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080016261 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016262 }
16263 }
16264 }
16265
16266 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, n_gt_4_strided_a) {
16267 TEST_REQUIRES_X86_AVX;
16268 for (uint32_t n = 5; n < 8; n++) {
16269 for (size_t k = 1; k <= 40; k += 9) {
16270 GemmMicrokernelTester()
16271 .mr(2)
16272 .nr(4)
16273 .kr(2)
16274 .sr(1)
16275 .m(2)
16276 .n(n)
16277 .k(k)
16278 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080016279 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016280 }
16281 }
16282 }
16283
16284 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, n_gt_4_subtile) {
16285 TEST_REQUIRES_X86_AVX;
16286 for (uint32_t n = 5; n < 8; n++) {
16287 for (size_t k = 1; k <= 40; k += 9) {
16288 for (uint32_t m = 1; m <= 2; m++) {
16289 GemmMicrokernelTester()
16290 .mr(2)
16291 .nr(4)
16292 .kr(2)
16293 .sr(1)
16294 .m(m)
16295 .n(n)
16296 .k(k)
16297 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080016298 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016299 }
16300 }
16301 }
16302 }
16303
16304 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, n_div_4) {
16305 TEST_REQUIRES_X86_AVX;
16306 for (uint32_t n = 8; n <= 12; n += 4) {
16307 for (size_t k = 1; k <= 40; k += 9) {
16308 GemmMicrokernelTester()
16309 .mr(2)
16310 .nr(4)
16311 .kr(2)
16312 .sr(1)
16313 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080016314 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016315 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080016316 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016317 }
16318 }
16319 }
16320
16321 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, n_div_4_strided_cn) {
16322 TEST_REQUIRES_X86_AVX;
16323 for (uint32_t n = 8; n <= 12; n += 4) {
16324 for (size_t k = 1; k <= 40; k += 9) {
16325 GemmMicrokernelTester()
16326 .mr(2)
16327 .nr(4)
16328 .kr(2)
16329 .sr(1)
16330 .m(2)
16331 .n(n)
16332 .k(k)
16333 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080016334 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016335 }
16336 }
16337 }
16338
16339 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, n_div_4_strided_a) {
16340 TEST_REQUIRES_X86_AVX;
16341 for (uint32_t n = 8; n <= 12; n += 4) {
16342 for (size_t k = 1; k <= 40; k += 9) {
16343 GemmMicrokernelTester()
16344 .mr(2)
16345 .nr(4)
16346 .kr(2)
16347 .sr(1)
16348 .m(2)
16349 .n(n)
16350 .k(k)
16351 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080016352 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016353 }
16354 }
16355 }
16356
16357 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, n_div_4_subtile) {
16358 TEST_REQUIRES_X86_AVX;
16359 for (uint32_t n = 8; n <= 12; n += 4) {
16360 for (size_t k = 1; k <= 40; k += 9) {
16361 for (uint32_t m = 1; m <= 2; m++) {
16362 GemmMicrokernelTester()
16363 .mr(2)
16364 .nr(4)
16365 .kr(2)
16366 .sr(1)
16367 .m(m)
16368 .n(n)
16369 .k(k)
16370 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080016371 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016372 }
16373 }
16374 }
16375 }
16376
16377 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, strided_cm_subtile) {
16378 TEST_REQUIRES_X86_AVX;
16379 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080016380 for (uint32_t n = 1; n <= 4; n++) {
16381 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016382 GemmMicrokernelTester()
16383 .mr(2)
16384 .nr(4)
16385 .kr(2)
16386 .sr(1)
16387 .m(m)
16388 .n(n)
16389 .k(k)
16390 .cm_stride(7)
16391 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080016392 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016393 }
16394 }
16395 }
16396 }
16397
16398 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, qmin) {
16399 TEST_REQUIRES_X86_AVX;
16400 GemmMicrokernelTester()
16401 .mr(2)
16402 .nr(4)
16403 .kr(2)
16404 .sr(1)
16405 .m(2)
16406 .n(4)
16407 .k(8)
16408 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080016409 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016410 }
16411
16412 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, qmax) {
16413 TEST_REQUIRES_X86_AVX;
16414 GemmMicrokernelTester()
16415 .mr(2)
16416 .nr(4)
16417 .kr(2)
16418 .sr(1)
16419 .m(2)
16420 .n(4)
16421 .k(8)
16422 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080016423 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016424 }
16425
16426 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, strided_cm) {
16427 TEST_REQUIRES_X86_AVX;
16428 GemmMicrokernelTester()
16429 .mr(2)
16430 .nr(4)
16431 .kr(2)
16432 .sr(1)
16433 .m(2)
16434 .n(4)
16435 .k(8)
16436 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080016437 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016438 }
16439#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
16440
16441
16442#if XNN_ARCH_X86 || XNN_ARCH_X86_64
16443 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_eq_8) {
16444 TEST_REQUIRES_X86_AVX;
16445 GemmMicrokernelTester()
16446 .mr(3)
16447 .nr(4)
16448 .kr(2)
16449 .sr(1)
16450 .m(3)
16451 .n(4)
16452 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080016453 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016454 }
16455
16456 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, strided_cn) {
16457 TEST_REQUIRES_X86_AVX;
16458 GemmMicrokernelTester()
16459 .mr(3)
16460 .nr(4)
16461 .kr(2)
16462 .sr(1)
16463 .m(3)
16464 .n(4)
16465 .k(8)
16466 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080016467 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016468 }
16469
16470 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_eq_8_strided_a) {
16471 TEST_REQUIRES_X86_AVX;
16472 GemmMicrokernelTester()
16473 .mr(3)
16474 .nr(4)
16475 .kr(2)
16476 .sr(1)
16477 .m(3)
16478 .n(4)
16479 .k(8)
16480 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080016481 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016482 }
16483
16484 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_eq_8_subtile) {
16485 TEST_REQUIRES_X86_AVX;
Zhi An Ng83844ae2022-01-14 09:52:25 -080016486 for (uint32_t n = 1; n <= 4; n++) {
16487 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016488 GemmMicrokernelTester()
16489 .mr(3)
16490 .nr(4)
16491 .kr(2)
16492 .sr(1)
16493 .m(m)
16494 .n(n)
16495 .k(8)
16496 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080016497 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016498 }
16499 }
16500 }
16501
16502 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_eq_8_subtile_m) {
16503 TEST_REQUIRES_X86_AVX;
16504 for (uint32_t m = 1; m <= 3; m++) {
16505 GemmMicrokernelTester()
16506 .mr(3)
16507 .nr(4)
16508 .kr(2)
16509 .sr(1)
16510 .m(m)
16511 .n(4)
16512 .k(8)
16513 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080016514 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016515 }
16516 }
16517
16518 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_eq_8_subtile_n) {
16519 TEST_REQUIRES_X86_AVX;
16520 for (uint32_t n = 1; n <= 4; n++) {
16521 GemmMicrokernelTester()
16522 .mr(3)
16523 .nr(4)
16524 .kr(2)
16525 .sr(1)
16526 .m(3)
16527 .n(n)
16528 .k(8)
16529 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080016530 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016531 }
16532 }
16533
16534 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_lt_8) {
16535 TEST_REQUIRES_X86_AVX;
16536 for (size_t k = 1; k < 8; k++) {
16537 GemmMicrokernelTester()
16538 .mr(3)
16539 .nr(4)
16540 .kr(2)
16541 .sr(1)
16542 .m(3)
16543 .n(4)
16544 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080016545 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016546 }
16547 }
16548
16549 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_lt_8_strided_a) {
16550 TEST_REQUIRES_X86_AVX;
16551 for (size_t k = 1; k < 8; k++) {
16552 GemmMicrokernelTester()
16553 .mr(3)
16554 .nr(4)
16555 .kr(2)
16556 .sr(1)
16557 .m(3)
16558 .n(4)
16559 .k(k)
16560 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080016561 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016562 }
16563 }
16564
16565 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_lt_8_subtile) {
16566 TEST_REQUIRES_X86_AVX;
16567 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080016568 for (uint32_t n = 1; n <= 4; n++) {
16569 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016570 GemmMicrokernelTester()
16571 .mr(3)
16572 .nr(4)
16573 .kr(2)
16574 .sr(1)
16575 .m(m)
16576 .n(n)
16577 .k(k)
16578 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080016579 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016580 }
16581 }
16582 }
16583 }
16584
16585 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_gt_8) {
16586 TEST_REQUIRES_X86_AVX;
16587 for (size_t k = 9; k < 16; k++) {
16588 GemmMicrokernelTester()
16589 .mr(3)
16590 .nr(4)
16591 .kr(2)
16592 .sr(1)
16593 .m(3)
16594 .n(4)
16595 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080016596 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016597 }
16598 }
16599
16600 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_gt_8_strided_a) {
16601 TEST_REQUIRES_X86_AVX;
16602 for (size_t k = 9; k < 16; k++) {
16603 GemmMicrokernelTester()
16604 .mr(3)
16605 .nr(4)
16606 .kr(2)
16607 .sr(1)
16608 .m(3)
16609 .n(4)
16610 .k(k)
16611 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080016612 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016613 }
16614 }
16615
16616 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_gt_8_subtile) {
16617 TEST_REQUIRES_X86_AVX;
16618 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080016619 for (uint32_t n = 1; n <= 4; n++) {
16620 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016621 GemmMicrokernelTester()
16622 .mr(3)
16623 .nr(4)
16624 .kr(2)
16625 .sr(1)
16626 .m(m)
16627 .n(n)
16628 .k(k)
16629 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080016630 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016631 }
16632 }
16633 }
16634 }
16635
16636 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_div_8) {
16637 TEST_REQUIRES_X86_AVX;
16638 for (size_t k = 16; k <= 80; k += 8) {
16639 GemmMicrokernelTester()
16640 .mr(3)
16641 .nr(4)
16642 .kr(2)
16643 .sr(1)
16644 .m(3)
16645 .n(4)
16646 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080016647 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016648 }
16649 }
16650
16651 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_div_8_strided_a) {
16652 TEST_REQUIRES_X86_AVX;
16653 for (size_t k = 16; k <= 80; k += 8) {
16654 GemmMicrokernelTester()
16655 .mr(3)
16656 .nr(4)
16657 .kr(2)
16658 .sr(1)
16659 .m(3)
16660 .n(4)
16661 .k(k)
16662 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080016663 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016664 }
16665 }
16666
16667 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_div_8_subtile) {
16668 TEST_REQUIRES_X86_AVX;
16669 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080016670 for (uint32_t n = 1; n <= 4; n++) {
16671 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016672 GemmMicrokernelTester()
16673 .mr(3)
16674 .nr(4)
16675 .kr(2)
16676 .sr(1)
16677 .m(m)
16678 .n(n)
16679 .k(k)
16680 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080016681 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016682 }
16683 }
16684 }
16685 }
16686
16687 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, n_gt_4) {
16688 TEST_REQUIRES_X86_AVX;
16689 for (uint32_t n = 5; n < 8; n++) {
16690 for (size_t k = 1; k <= 40; k += 9) {
16691 GemmMicrokernelTester()
16692 .mr(3)
16693 .nr(4)
16694 .kr(2)
16695 .sr(1)
16696 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080016697 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016698 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080016699 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016700 }
16701 }
16702 }
16703
16704 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, n_gt_4_strided_cn) {
16705 TEST_REQUIRES_X86_AVX;
16706 for (uint32_t n = 5; n < 8; n++) {
16707 for (size_t k = 1; k <= 40; k += 9) {
16708 GemmMicrokernelTester()
16709 .mr(3)
16710 .nr(4)
16711 .kr(2)
16712 .sr(1)
16713 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080016714 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016715 .k(k)
16716 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080016717 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016718 }
16719 }
16720 }
16721
16722 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, n_gt_4_strided_a) {
16723 TEST_REQUIRES_X86_AVX;
16724 for (uint32_t n = 5; n < 8; n++) {
16725 for (size_t k = 1; k <= 40; k += 9) {
16726 GemmMicrokernelTester()
16727 .mr(3)
16728 .nr(4)
16729 .kr(2)
16730 .sr(1)
16731 .m(3)
16732 .n(n)
16733 .k(k)
16734 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080016735 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016736 }
16737 }
16738 }
16739
16740 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, n_gt_4_subtile) {
16741 TEST_REQUIRES_X86_AVX;
16742 for (uint32_t n = 5; n < 8; n++) {
16743 for (size_t k = 1; k <= 40; k += 9) {
16744 for (uint32_t m = 1; m <= 3; m++) {
16745 GemmMicrokernelTester()
16746 .mr(3)
16747 .nr(4)
16748 .kr(2)
16749 .sr(1)
16750 .m(m)
16751 .n(n)
16752 .k(k)
16753 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080016754 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016755 }
16756 }
16757 }
16758 }
16759
16760 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, n_div_4) {
16761 TEST_REQUIRES_X86_AVX;
16762 for (uint32_t n = 8; n <= 12; n += 4) {
16763 for (size_t k = 1; k <= 40; k += 9) {
16764 GemmMicrokernelTester()
16765 .mr(3)
16766 .nr(4)
16767 .kr(2)
16768 .sr(1)
16769 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080016770 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016771 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080016772 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016773 }
16774 }
16775 }
16776
16777 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, n_div_4_strided_cn) {
16778 TEST_REQUIRES_X86_AVX;
16779 for (uint32_t n = 8; n <= 12; n += 4) {
16780 for (size_t k = 1; k <= 40; k += 9) {
16781 GemmMicrokernelTester()
16782 .mr(3)
16783 .nr(4)
16784 .kr(2)
16785 .sr(1)
16786 .m(3)
16787 .n(n)
16788 .k(k)
16789 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080016790 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016791 }
16792 }
16793 }
16794
16795 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, n_div_4_strided_a) {
16796 TEST_REQUIRES_X86_AVX;
16797 for (uint32_t n = 8; n <= 12; n += 4) {
16798 for (size_t k = 1; k <= 40; k += 9) {
16799 GemmMicrokernelTester()
16800 .mr(3)
16801 .nr(4)
16802 .kr(2)
16803 .sr(1)
16804 .m(3)
16805 .n(n)
16806 .k(k)
16807 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080016808 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016809 }
16810 }
16811 }
16812
16813 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, n_div_4_subtile) {
16814 TEST_REQUIRES_X86_AVX;
16815 for (uint32_t n = 8; n <= 12; n += 4) {
16816 for (size_t k = 1; k <= 40; k += 9) {
16817 for (uint32_t m = 1; m <= 3; m++) {
16818 GemmMicrokernelTester()
16819 .mr(3)
16820 .nr(4)
16821 .kr(2)
16822 .sr(1)
16823 .m(m)
16824 .n(n)
16825 .k(k)
16826 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080016827 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016828 }
16829 }
16830 }
16831 }
16832
16833 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, strided_cm_subtile) {
16834 TEST_REQUIRES_X86_AVX;
16835 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080016836 for (uint32_t n = 1; n <= 4; n++) {
16837 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016838 GemmMicrokernelTester()
16839 .mr(3)
16840 .nr(4)
16841 .kr(2)
16842 .sr(1)
16843 .m(m)
16844 .n(n)
16845 .k(k)
16846 .cm_stride(7)
16847 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080016848 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016849 }
16850 }
16851 }
16852 }
16853
16854 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, qmin) {
16855 TEST_REQUIRES_X86_AVX;
16856 GemmMicrokernelTester()
16857 .mr(3)
16858 .nr(4)
16859 .kr(2)
16860 .sr(1)
16861 .m(3)
16862 .n(4)
16863 .k(8)
16864 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080016865 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016866 }
16867
16868 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, qmax) {
16869 TEST_REQUIRES_X86_AVX;
16870 GemmMicrokernelTester()
16871 .mr(3)
16872 .nr(4)
16873 .kr(2)
16874 .sr(1)
16875 .m(3)
16876 .n(4)
16877 .k(8)
16878 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080016879 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016880 }
16881
16882 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, strided_cm) {
16883 TEST_REQUIRES_X86_AVX;
16884 GemmMicrokernelTester()
16885 .mr(3)
16886 .nr(4)
16887 .kr(2)
16888 .sr(1)
16889 .m(3)
16890 .n(4)
16891 .k(8)
16892 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080016893 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016894 }
16895#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
16896
16897
16898#if XNN_ARCH_X86 || XNN_ARCH_X86_64
16899 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_eq_8) {
16900 TEST_REQUIRES_X86_AVX;
16901 GemmMicrokernelTester()
16902 .mr(4)
16903 .nr(4)
16904 .kr(2)
16905 .sr(1)
16906 .m(4)
16907 .n(4)
16908 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080016909 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016910 }
16911
16912 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, strided_cn) {
16913 TEST_REQUIRES_X86_AVX;
16914 GemmMicrokernelTester()
16915 .mr(4)
16916 .nr(4)
16917 .kr(2)
16918 .sr(1)
16919 .m(4)
16920 .n(4)
16921 .k(8)
16922 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080016923 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016924 }
16925
16926 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_eq_8_strided_a) {
16927 TEST_REQUIRES_X86_AVX;
16928 GemmMicrokernelTester()
16929 .mr(4)
16930 .nr(4)
16931 .kr(2)
16932 .sr(1)
16933 .m(4)
16934 .n(4)
16935 .k(8)
16936 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080016937 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016938 }
16939
16940 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_eq_8_subtile) {
16941 TEST_REQUIRES_X86_AVX;
Zhi An Ng83844ae2022-01-14 09:52:25 -080016942 for (uint32_t n = 1; n <= 4; n++) {
16943 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016944 GemmMicrokernelTester()
16945 .mr(4)
16946 .nr(4)
16947 .kr(2)
16948 .sr(1)
16949 .m(m)
16950 .n(n)
16951 .k(8)
16952 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080016953 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016954 }
16955 }
16956 }
16957
16958 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_eq_8_subtile_m) {
16959 TEST_REQUIRES_X86_AVX;
16960 for (uint32_t m = 1; m <= 4; m++) {
16961 GemmMicrokernelTester()
16962 .mr(4)
16963 .nr(4)
16964 .kr(2)
16965 .sr(1)
16966 .m(m)
16967 .n(4)
16968 .k(8)
16969 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080016970 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016971 }
16972 }
16973
16974 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_eq_8_subtile_n) {
16975 TEST_REQUIRES_X86_AVX;
16976 for (uint32_t n = 1; n <= 4; n++) {
16977 GemmMicrokernelTester()
16978 .mr(4)
16979 .nr(4)
16980 .kr(2)
16981 .sr(1)
16982 .m(4)
16983 .n(n)
16984 .k(8)
16985 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080016986 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080016987 }
16988 }
16989
16990 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_lt_8) {
16991 TEST_REQUIRES_X86_AVX;
16992 for (size_t k = 1; k < 8; k++) {
16993 GemmMicrokernelTester()
16994 .mr(4)
16995 .nr(4)
16996 .kr(2)
16997 .sr(1)
16998 .m(4)
16999 .n(4)
17000 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080017001 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017002 }
17003 }
17004
17005 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_lt_8_strided_a) {
17006 TEST_REQUIRES_X86_AVX;
17007 for (size_t k = 1; k < 8; k++) {
17008 GemmMicrokernelTester()
17009 .mr(4)
17010 .nr(4)
17011 .kr(2)
17012 .sr(1)
17013 .m(4)
17014 .n(4)
17015 .k(k)
17016 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080017017 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017018 }
17019 }
17020
17021 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_lt_8_subtile) {
17022 TEST_REQUIRES_X86_AVX;
17023 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080017024 for (uint32_t n = 1; n <= 4; n++) {
17025 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017026 GemmMicrokernelTester()
17027 .mr(4)
17028 .nr(4)
17029 .kr(2)
17030 .sr(1)
17031 .m(m)
17032 .n(n)
17033 .k(k)
17034 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080017035 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017036 }
17037 }
17038 }
17039 }
17040
17041 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_gt_8) {
17042 TEST_REQUIRES_X86_AVX;
17043 for (size_t k = 9; k < 16; k++) {
17044 GemmMicrokernelTester()
17045 .mr(4)
17046 .nr(4)
17047 .kr(2)
17048 .sr(1)
17049 .m(4)
17050 .n(4)
17051 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080017052 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017053 }
17054 }
17055
17056 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_gt_8_strided_a) {
17057 TEST_REQUIRES_X86_AVX;
17058 for (size_t k = 9; k < 16; k++) {
17059 GemmMicrokernelTester()
17060 .mr(4)
17061 .nr(4)
17062 .kr(2)
17063 .sr(1)
17064 .m(4)
17065 .n(4)
17066 .k(k)
17067 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080017068 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017069 }
17070 }
17071
17072 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_gt_8_subtile) {
17073 TEST_REQUIRES_X86_AVX;
17074 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080017075 for (uint32_t n = 1; n <= 4; n++) {
17076 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017077 GemmMicrokernelTester()
17078 .mr(4)
17079 .nr(4)
17080 .kr(2)
17081 .sr(1)
17082 .m(m)
17083 .n(n)
17084 .k(k)
17085 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080017086 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017087 }
17088 }
17089 }
17090 }
17091
17092 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_div_8) {
17093 TEST_REQUIRES_X86_AVX;
17094 for (size_t k = 16; k <= 80; k += 8) {
17095 GemmMicrokernelTester()
17096 .mr(4)
17097 .nr(4)
17098 .kr(2)
17099 .sr(1)
17100 .m(4)
17101 .n(4)
17102 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080017103 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017104 }
17105 }
17106
17107 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_div_8_strided_a) {
17108 TEST_REQUIRES_X86_AVX;
17109 for (size_t k = 16; k <= 80; k += 8) {
17110 GemmMicrokernelTester()
17111 .mr(4)
17112 .nr(4)
17113 .kr(2)
17114 .sr(1)
17115 .m(4)
17116 .n(4)
17117 .k(k)
17118 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080017119 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017120 }
17121 }
17122
17123 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_div_8_subtile) {
17124 TEST_REQUIRES_X86_AVX;
17125 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080017126 for (uint32_t n = 1; n <= 4; n++) {
17127 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017128 GemmMicrokernelTester()
17129 .mr(4)
17130 .nr(4)
17131 .kr(2)
17132 .sr(1)
17133 .m(m)
17134 .n(n)
17135 .k(k)
17136 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080017137 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017138 }
17139 }
17140 }
17141 }
17142
17143 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, n_gt_4) {
17144 TEST_REQUIRES_X86_AVX;
17145 for (uint32_t n = 5; n < 8; n++) {
17146 for (size_t k = 1; k <= 40; k += 9) {
17147 GemmMicrokernelTester()
17148 .mr(4)
17149 .nr(4)
17150 .kr(2)
17151 .sr(1)
17152 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080017153 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017154 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080017155 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017156 }
17157 }
17158 }
17159
17160 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, n_gt_4_strided_cn) {
17161 TEST_REQUIRES_X86_AVX;
17162 for (uint32_t n = 5; n < 8; n++) {
17163 for (size_t k = 1; k <= 40; k += 9) {
17164 GemmMicrokernelTester()
17165 .mr(4)
17166 .nr(4)
17167 .kr(2)
17168 .sr(1)
17169 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080017170 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017171 .k(k)
17172 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080017173 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017174 }
17175 }
17176 }
17177
17178 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, n_gt_4_strided_a) {
17179 TEST_REQUIRES_X86_AVX;
17180 for (uint32_t n = 5; n < 8; n++) {
17181 for (size_t k = 1; k <= 40; k += 9) {
17182 GemmMicrokernelTester()
17183 .mr(4)
17184 .nr(4)
17185 .kr(2)
17186 .sr(1)
17187 .m(4)
17188 .n(n)
17189 .k(k)
17190 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080017191 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017192 }
17193 }
17194 }
17195
17196 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, n_gt_4_subtile) {
17197 TEST_REQUIRES_X86_AVX;
17198 for (uint32_t n = 5; n < 8; n++) {
17199 for (size_t k = 1; k <= 40; k += 9) {
17200 for (uint32_t m = 1; m <= 4; m++) {
17201 GemmMicrokernelTester()
17202 .mr(4)
17203 .nr(4)
17204 .kr(2)
17205 .sr(1)
17206 .m(m)
17207 .n(n)
17208 .k(k)
17209 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080017210 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017211 }
17212 }
17213 }
17214 }
17215
17216 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, n_div_4) {
17217 TEST_REQUIRES_X86_AVX;
17218 for (uint32_t n = 8; n <= 12; n += 4) {
17219 for (size_t k = 1; k <= 40; k += 9) {
17220 GemmMicrokernelTester()
17221 .mr(4)
17222 .nr(4)
17223 .kr(2)
17224 .sr(1)
17225 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080017226 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017227 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080017228 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017229 }
17230 }
17231 }
17232
17233 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, n_div_4_strided_cn) {
17234 TEST_REQUIRES_X86_AVX;
17235 for (uint32_t n = 8; n <= 12; n += 4) {
17236 for (size_t k = 1; k <= 40; k += 9) {
17237 GemmMicrokernelTester()
17238 .mr(4)
17239 .nr(4)
17240 .kr(2)
17241 .sr(1)
17242 .m(4)
17243 .n(n)
17244 .k(k)
17245 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080017246 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017247 }
17248 }
17249 }
17250
17251 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, n_div_4_strided_a) {
17252 TEST_REQUIRES_X86_AVX;
17253 for (uint32_t n = 8; n <= 12; n += 4) {
17254 for (size_t k = 1; k <= 40; k += 9) {
17255 GemmMicrokernelTester()
17256 .mr(4)
17257 .nr(4)
17258 .kr(2)
17259 .sr(1)
17260 .m(4)
17261 .n(n)
17262 .k(k)
17263 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080017264 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017265 }
17266 }
17267 }
17268
17269 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, n_div_4_subtile) {
17270 TEST_REQUIRES_X86_AVX;
17271 for (uint32_t n = 8; n <= 12; n += 4) {
17272 for (size_t k = 1; k <= 40; k += 9) {
17273 for (uint32_t m = 1; m <= 4; m++) {
17274 GemmMicrokernelTester()
17275 .mr(4)
17276 .nr(4)
17277 .kr(2)
17278 .sr(1)
17279 .m(m)
17280 .n(n)
17281 .k(k)
17282 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080017283 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017284 }
17285 }
17286 }
17287 }
17288
17289 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, strided_cm_subtile) {
17290 TEST_REQUIRES_X86_AVX;
17291 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080017292 for (uint32_t n = 1; n <= 4; n++) {
17293 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017294 GemmMicrokernelTester()
17295 .mr(4)
17296 .nr(4)
17297 .kr(2)
17298 .sr(1)
17299 .m(m)
17300 .n(n)
17301 .k(k)
17302 .cm_stride(7)
17303 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080017304 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017305 }
17306 }
17307 }
17308 }
17309
17310 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, qmin) {
17311 TEST_REQUIRES_X86_AVX;
17312 GemmMicrokernelTester()
17313 .mr(4)
17314 .nr(4)
17315 .kr(2)
17316 .sr(1)
17317 .m(4)
17318 .n(4)
17319 .k(8)
17320 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080017321 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017322 }
17323
17324 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, qmax) {
17325 TEST_REQUIRES_X86_AVX;
17326 GemmMicrokernelTester()
17327 .mr(4)
17328 .nr(4)
17329 .kr(2)
17330 .sr(1)
17331 .m(4)
17332 .n(4)
17333 .k(8)
17334 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080017335 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017336 }
17337
17338 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, strided_cm) {
17339 TEST_REQUIRES_X86_AVX;
17340 GemmMicrokernelTester()
17341 .mr(4)
17342 .nr(4)
17343 .kr(2)
17344 .sr(1)
17345 .m(4)
17346 .n(4)
17347 .k(8)
17348 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080017349 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017350 }
17351#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
17352
17353
17354#if XNN_ARCH_X86 || XNN_ARCH_X86_64
17355 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_eq_8) {
17356 TEST_REQUIRES_X86_SSE2;
17357 GemmMicrokernelTester()
17358 .mr(3)
17359 .nr(4)
17360 .kr(8)
17361 .sr(1)
17362 .m(3)
17363 .n(4)
17364 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080017365 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017366 }
17367
17368 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, strided_cn) {
17369 TEST_REQUIRES_X86_SSE2;
17370 GemmMicrokernelTester()
17371 .mr(3)
17372 .nr(4)
17373 .kr(8)
17374 .sr(1)
17375 .m(3)
17376 .n(4)
17377 .k(8)
17378 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080017379 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017380 }
17381
17382 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_eq_8_strided_a) {
17383 TEST_REQUIRES_X86_SSE2;
17384 GemmMicrokernelTester()
17385 .mr(3)
17386 .nr(4)
17387 .kr(8)
17388 .sr(1)
17389 .m(3)
17390 .n(4)
17391 .k(8)
17392 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080017393 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017394 }
17395
17396 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_eq_8_subtile) {
17397 TEST_REQUIRES_X86_SSE2;
Zhi An Ng83844ae2022-01-14 09:52:25 -080017398 for (uint32_t n = 1; n <= 4; n++) {
17399 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017400 GemmMicrokernelTester()
17401 .mr(3)
17402 .nr(4)
17403 .kr(8)
17404 .sr(1)
17405 .m(m)
17406 .n(n)
17407 .k(8)
17408 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080017409 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017410 }
17411 }
17412 }
17413
17414 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_eq_8_subtile_m) {
17415 TEST_REQUIRES_X86_SSE2;
17416 for (uint32_t m = 1; m <= 3; m++) {
17417 GemmMicrokernelTester()
17418 .mr(3)
17419 .nr(4)
17420 .kr(8)
17421 .sr(1)
17422 .m(m)
17423 .n(4)
17424 .k(8)
17425 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080017426 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017427 }
17428 }
17429
17430 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_eq_8_subtile_n) {
17431 TEST_REQUIRES_X86_SSE2;
17432 for (uint32_t n = 1; n <= 4; n++) {
17433 GemmMicrokernelTester()
17434 .mr(3)
17435 .nr(4)
17436 .kr(8)
17437 .sr(1)
17438 .m(3)
17439 .n(n)
17440 .k(8)
17441 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080017442 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017443 }
17444 }
17445
17446 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_lt_8) {
17447 TEST_REQUIRES_X86_SSE2;
17448 for (size_t k = 1; k < 8; k++) {
17449 GemmMicrokernelTester()
17450 .mr(3)
17451 .nr(4)
17452 .kr(8)
17453 .sr(1)
17454 .m(3)
17455 .n(4)
17456 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080017457 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017458 }
17459 }
17460
17461 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_lt_8_strided_a) {
17462 TEST_REQUIRES_X86_SSE2;
17463 for (size_t k = 1; k < 8; k++) {
17464 GemmMicrokernelTester()
17465 .mr(3)
17466 .nr(4)
17467 .kr(8)
17468 .sr(1)
17469 .m(3)
17470 .n(4)
17471 .k(k)
17472 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080017473 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017474 }
17475 }
17476
17477 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_lt_8_subtile) {
17478 TEST_REQUIRES_X86_SSE2;
17479 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080017480 for (uint32_t n = 1; n <= 4; n++) {
17481 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017482 GemmMicrokernelTester()
17483 .mr(3)
17484 .nr(4)
17485 .kr(8)
17486 .sr(1)
17487 .m(m)
17488 .n(n)
17489 .k(k)
17490 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080017491 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017492 }
17493 }
17494 }
17495 }
17496
17497 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_gt_8) {
17498 TEST_REQUIRES_X86_SSE2;
17499 for (size_t k = 9; k < 16; k++) {
17500 GemmMicrokernelTester()
17501 .mr(3)
17502 .nr(4)
17503 .kr(8)
17504 .sr(1)
17505 .m(3)
17506 .n(4)
17507 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080017508 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017509 }
17510 }
17511
17512 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_gt_8_strided_a) {
17513 TEST_REQUIRES_X86_SSE2;
17514 for (size_t k = 9; k < 16; k++) {
17515 GemmMicrokernelTester()
17516 .mr(3)
17517 .nr(4)
17518 .kr(8)
17519 .sr(1)
17520 .m(3)
17521 .n(4)
17522 .k(k)
17523 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080017524 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017525 }
17526 }
17527
17528 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_gt_8_subtile) {
17529 TEST_REQUIRES_X86_SSE2;
17530 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080017531 for (uint32_t n = 1; n <= 4; n++) {
17532 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017533 GemmMicrokernelTester()
17534 .mr(3)
17535 .nr(4)
17536 .kr(8)
17537 .sr(1)
17538 .m(m)
17539 .n(n)
17540 .k(k)
17541 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080017542 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017543 }
17544 }
17545 }
17546 }
17547
17548 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_div_8) {
17549 TEST_REQUIRES_X86_SSE2;
17550 for (size_t k = 16; k <= 80; k += 8) {
17551 GemmMicrokernelTester()
17552 .mr(3)
17553 .nr(4)
17554 .kr(8)
17555 .sr(1)
17556 .m(3)
17557 .n(4)
17558 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080017559 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017560 }
17561 }
17562
17563 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_div_8_strided_a) {
17564 TEST_REQUIRES_X86_SSE2;
17565 for (size_t k = 16; k <= 80; k += 8) {
17566 GemmMicrokernelTester()
17567 .mr(3)
17568 .nr(4)
17569 .kr(8)
17570 .sr(1)
17571 .m(3)
17572 .n(4)
17573 .k(k)
17574 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080017575 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017576 }
17577 }
17578
17579 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_div_8_subtile) {
17580 TEST_REQUIRES_X86_SSE2;
17581 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080017582 for (uint32_t n = 1; n <= 4; n++) {
17583 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017584 GemmMicrokernelTester()
17585 .mr(3)
17586 .nr(4)
17587 .kr(8)
17588 .sr(1)
17589 .m(m)
17590 .n(n)
17591 .k(k)
17592 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080017593 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017594 }
17595 }
17596 }
17597 }
17598
17599 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, n_gt_4) {
17600 TEST_REQUIRES_X86_SSE2;
17601 for (uint32_t n = 5; n < 8; n++) {
17602 for (size_t k = 1; k <= 40; k += 9) {
17603 GemmMicrokernelTester()
17604 .mr(3)
17605 .nr(4)
17606 .kr(8)
17607 .sr(1)
17608 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080017609 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017610 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080017611 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017612 }
17613 }
17614 }
17615
17616 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, n_gt_4_strided_cn) {
17617 TEST_REQUIRES_X86_SSE2;
17618 for (uint32_t n = 5; n < 8; n++) {
17619 for (size_t k = 1; k <= 40; k += 9) {
17620 GemmMicrokernelTester()
17621 .mr(3)
17622 .nr(4)
17623 .kr(8)
17624 .sr(1)
17625 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080017626 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017627 .k(k)
17628 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080017629 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017630 }
17631 }
17632 }
17633
17634 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, n_gt_4_strided_a) {
17635 TEST_REQUIRES_X86_SSE2;
17636 for (uint32_t n = 5; n < 8; n++) {
17637 for (size_t k = 1; k <= 40; k += 9) {
17638 GemmMicrokernelTester()
17639 .mr(3)
17640 .nr(4)
17641 .kr(8)
17642 .sr(1)
17643 .m(3)
17644 .n(n)
17645 .k(k)
17646 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080017647 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017648 }
17649 }
17650 }
17651
17652 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, n_gt_4_subtile) {
17653 TEST_REQUIRES_X86_SSE2;
17654 for (uint32_t n = 5; n < 8; n++) {
17655 for (size_t k = 1; k <= 40; k += 9) {
17656 for (uint32_t m = 1; m <= 3; m++) {
17657 GemmMicrokernelTester()
17658 .mr(3)
17659 .nr(4)
17660 .kr(8)
17661 .sr(1)
17662 .m(m)
17663 .n(n)
17664 .k(k)
17665 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080017666 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017667 }
17668 }
17669 }
17670 }
17671
17672 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, n_div_4) {
17673 TEST_REQUIRES_X86_SSE2;
17674 for (uint32_t n = 8; n <= 12; n += 4) {
17675 for (size_t k = 1; k <= 40; k += 9) {
17676 GemmMicrokernelTester()
17677 .mr(3)
17678 .nr(4)
17679 .kr(8)
17680 .sr(1)
17681 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080017682 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017683 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080017684 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017685 }
17686 }
17687 }
17688
17689 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, n_div_4_strided_cn) {
17690 TEST_REQUIRES_X86_SSE2;
17691 for (uint32_t n = 8; n <= 12; n += 4) {
17692 for (size_t k = 1; k <= 40; k += 9) {
17693 GemmMicrokernelTester()
17694 .mr(3)
17695 .nr(4)
17696 .kr(8)
17697 .sr(1)
17698 .m(3)
17699 .n(n)
17700 .k(k)
17701 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080017702 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017703 }
17704 }
17705 }
17706
17707 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, n_div_4_strided_a) {
17708 TEST_REQUIRES_X86_SSE2;
17709 for (uint32_t n = 8; n <= 12; n += 4) {
17710 for (size_t k = 1; k <= 40; k += 9) {
17711 GemmMicrokernelTester()
17712 .mr(3)
17713 .nr(4)
17714 .kr(8)
17715 .sr(1)
17716 .m(3)
17717 .n(n)
17718 .k(k)
17719 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080017720 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017721 }
17722 }
17723 }
17724
17725 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, n_div_4_subtile) {
17726 TEST_REQUIRES_X86_SSE2;
17727 for (uint32_t n = 8; n <= 12; n += 4) {
17728 for (size_t k = 1; k <= 40; k += 9) {
17729 for (uint32_t m = 1; m <= 3; m++) {
17730 GemmMicrokernelTester()
17731 .mr(3)
17732 .nr(4)
17733 .kr(8)
17734 .sr(1)
17735 .m(m)
17736 .n(n)
17737 .k(k)
17738 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080017739 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017740 }
17741 }
17742 }
17743 }
17744
17745 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, strided_cm_subtile) {
17746 TEST_REQUIRES_X86_SSE2;
17747 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080017748 for (uint32_t n = 1; n <= 4; n++) {
17749 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017750 GemmMicrokernelTester()
17751 .mr(3)
17752 .nr(4)
17753 .kr(8)
17754 .sr(1)
17755 .m(m)
17756 .n(n)
17757 .k(k)
17758 .cm_stride(7)
17759 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080017760 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017761 }
17762 }
17763 }
17764 }
17765
17766 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, qmin) {
17767 TEST_REQUIRES_X86_SSE2;
17768 GemmMicrokernelTester()
17769 .mr(3)
17770 .nr(4)
17771 .kr(8)
17772 .sr(1)
17773 .m(3)
17774 .n(4)
17775 .k(8)
17776 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080017777 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017778 }
17779
17780 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, qmax) {
17781 TEST_REQUIRES_X86_SSE2;
17782 GemmMicrokernelTester()
17783 .mr(3)
17784 .nr(4)
17785 .kr(8)
17786 .sr(1)
17787 .m(3)
17788 .n(4)
17789 .k(8)
17790 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080017791 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017792 }
17793
17794 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, strided_cm) {
17795 TEST_REQUIRES_X86_SSE2;
17796 GemmMicrokernelTester()
17797 .mr(3)
17798 .nr(4)
17799 .kr(8)
17800 .sr(1)
17801 .m(3)
17802 .n(4)
17803 .k(8)
17804 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080017805 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017806 }
17807#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
17808
17809
17810#if XNN_ARCH_X86 || XNN_ARCH_X86_64
17811 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_eq_8) {
17812 TEST_REQUIRES_X86_AVX;
17813 GemmMicrokernelTester()
17814 .mr(1)
17815 .nr(4)
17816 .kr(8)
17817 .sr(1)
17818 .m(1)
17819 .n(4)
17820 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080017821 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017822 }
17823
17824 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, strided_cn) {
17825 TEST_REQUIRES_X86_AVX;
17826 GemmMicrokernelTester()
17827 .mr(1)
17828 .nr(4)
17829 .kr(8)
17830 .sr(1)
17831 .m(1)
17832 .n(4)
17833 .k(8)
17834 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080017835 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017836 }
17837
17838 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_eq_8_strided_a) {
17839 TEST_REQUIRES_X86_AVX;
17840 GemmMicrokernelTester()
17841 .mr(1)
17842 .nr(4)
17843 .kr(8)
17844 .sr(1)
17845 .m(1)
17846 .n(4)
17847 .k(8)
17848 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080017849 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017850 }
17851
17852 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_eq_8_subtile) {
17853 TEST_REQUIRES_X86_AVX;
Zhi An Ng83844ae2022-01-14 09:52:25 -080017854 for (uint32_t n = 1; n <= 4; n++) {
17855 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017856 GemmMicrokernelTester()
17857 .mr(1)
17858 .nr(4)
17859 .kr(8)
17860 .sr(1)
17861 .m(m)
17862 .n(n)
17863 .k(8)
17864 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080017865 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017866 }
17867 }
17868 }
17869
17870 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_eq_8_subtile_m) {
17871 TEST_REQUIRES_X86_AVX;
17872 for (uint32_t m = 1; m <= 1; m++) {
17873 GemmMicrokernelTester()
17874 .mr(1)
17875 .nr(4)
17876 .kr(8)
17877 .sr(1)
17878 .m(m)
17879 .n(4)
17880 .k(8)
17881 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080017882 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017883 }
17884 }
17885
17886 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_eq_8_subtile_n) {
17887 TEST_REQUIRES_X86_AVX;
17888 for (uint32_t n = 1; n <= 4; n++) {
17889 GemmMicrokernelTester()
17890 .mr(1)
17891 .nr(4)
17892 .kr(8)
17893 .sr(1)
17894 .m(1)
17895 .n(n)
17896 .k(8)
17897 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080017898 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017899 }
17900 }
17901
17902 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_lt_8) {
17903 TEST_REQUIRES_X86_AVX;
17904 for (size_t k = 1; k < 8; k++) {
17905 GemmMicrokernelTester()
17906 .mr(1)
17907 .nr(4)
17908 .kr(8)
17909 .sr(1)
17910 .m(1)
17911 .n(4)
17912 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080017913 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017914 }
17915 }
17916
17917 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_lt_8_strided_a) {
17918 TEST_REQUIRES_X86_AVX;
17919 for (size_t k = 1; k < 8; k++) {
17920 GemmMicrokernelTester()
17921 .mr(1)
17922 .nr(4)
17923 .kr(8)
17924 .sr(1)
17925 .m(1)
17926 .n(4)
17927 .k(k)
17928 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080017929 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017930 }
17931 }
17932
17933 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_lt_8_subtile) {
17934 TEST_REQUIRES_X86_AVX;
17935 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080017936 for (uint32_t n = 1; n <= 4; n++) {
17937 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017938 GemmMicrokernelTester()
17939 .mr(1)
17940 .nr(4)
17941 .kr(8)
17942 .sr(1)
17943 .m(m)
17944 .n(n)
17945 .k(k)
17946 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080017947 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017948 }
17949 }
17950 }
17951 }
17952
17953 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_gt_8) {
17954 TEST_REQUIRES_X86_AVX;
17955 for (size_t k = 9; k < 16; k++) {
17956 GemmMicrokernelTester()
17957 .mr(1)
17958 .nr(4)
17959 .kr(8)
17960 .sr(1)
17961 .m(1)
17962 .n(4)
17963 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080017964 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017965 }
17966 }
17967
17968 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_gt_8_strided_a) {
17969 TEST_REQUIRES_X86_AVX;
17970 for (size_t k = 9; k < 16; k++) {
17971 GemmMicrokernelTester()
17972 .mr(1)
17973 .nr(4)
17974 .kr(8)
17975 .sr(1)
17976 .m(1)
17977 .n(4)
17978 .k(k)
17979 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080017980 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017981 }
17982 }
17983
17984 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_gt_8_subtile) {
17985 TEST_REQUIRES_X86_AVX;
17986 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080017987 for (uint32_t n = 1; n <= 4; n++) {
17988 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017989 GemmMicrokernelTester()
17990 .mr(1)
17991 .nr(4)
17992 .kr(8)
17993 .sr(1)
17994 .m(m)
17995 .n(n)
17996 .k(k)
17997 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080017998 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080017999 }
18000 }
18001 }
18002 }
18003
18004 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_div_8) {
18005 TEST_REQUIRES_X86_AVX;
18006 for (size_t k = 16; k <= 80; k += 8) {
18007 GemmMicrokernelTester()
18008 .mr(1)
18009 .nr(4)
18010 .kr(8)
18011 .sr(1)
18012 .m(1)
18013 .n(4)
18014 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080018015 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018016 }
18017 }
18018
18019 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_div_8_strided_a) {
18020 TEST_REQUIRES_X86_AVX;
18021 for (size_t k = 16; k <= 80; k += 8) {
18022 GemmMicrokernelTester()
18023 .mr(1)
18024 .nr(4)
18025 .kr(8)
18026 .sr(1)
18027 .m(1)
18028 .n(4)
18029 .k(k)
18030 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080018031 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018032 }
18033 }
18034
18035 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_div_8_subtile) {
18036 TEST_REQUIRES_X86_AVX;
18037 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080018038 for (uint32_t n = 1; n <= 4; n++) {
18039 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018040 GemmMicrokernelTester()
18041 .mr(1)
18042 .nr(4)
18043 .kr(8)
18044 .sr(1)
18045 .m(m)
18046 .n(n)
18047 .k(k)
18048 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080018049 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018050 }
18051 }
18052 }
18053 }
18054
18055 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, n_gt_4) {
18056 TEST_REQUIRES_X86_AVX;
18057 for (uint32_t n = 5; n < 8; n++) {
18058 for (size_t k = 1; k <= 40; k += 9) {
18059 GemmMicrokernelTester()
18060 .mr(1)
18061 .nr(4)
18062 .kr(8)
18063 .sr(1)
18064 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080018065 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018066 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080018067 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018068 }
18069 }
18070 }
18071
18072 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, n_gt_4_strided_cn) {
18073 TEST_REQUIRES_X86_AVX;
18074 for (uint32_t n = 5; n < 8; n++) {
18075 for (size_t k = 1; k <= 40; k += 9) {
18076 GemmMicrokernelTester()
18077 .mr(1)
18078 .nr(4)
18079 .kr(8)
18080 .sr(1)
18081 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080018082 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018083 .k(k)
18084 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080018085 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018086 }
18087 }
18088 }
18089
18090 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, n_gt_4_strided_a) {
18091 TEST_REQUIRES_X86_AVX;
18092 for (uint32_t n = 5; n < 8; n++) {
18093 for (size_t k = 1; k <= 40; k += 9) {
18094 GemmMicrokernelTester()
18095 .mr(1)
18096 .nr(4)
18097 .kr(8)
18098 .sr(1)
18099 .m(1)
18100 .n(n)
18101 .k(k)
18102 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080018103 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018104 }
18105 }
18106 }
18107
18108 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, n_gt_4_subtile) {
18109 TEST_REQUIRES_X86_AVX;
18110 for (uint32_t n = 5; n < 8; n++) {
18111 for (size_t k = 1; k <= 40; k += 9) {
18112 for (uint32_t m = 1; m <= 1; m++) {
18113 GemmMicrokernelTester()
18114 .mr(1)
18115 .nr(4)
18116 .kr(8)
18117 .sr(1)
18118 .m(m)
18119 .n(n)
18120 .k(k)
18121 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080018122 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018123 }
18124 }
18125 }
18126 }
18127
18128 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, n_div_4) {
18129 TEST_REQUIRES_X86_AVX;
18130 for (uint32_t n = 8; n <= 12; n += 4) {
18131 for (size_t k = 1; k <= 40; k += 9) {
18132 GemmMicrokernelTester()
18133 .mr(1)
18134 .nr(4)
18135 .kr(8)
18136 .sr(1)
18137 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080018138 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018139 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080018140 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018141 }
18142 }
18143 }
18144
18145 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, n_div_4_strided_cn) {
18146 TEST_REQUIRES_X86_AVX;
18147 for (uint32_t n = 8; n <= 12; n += 4) {
18148 for (size_t k = 1; k <= 40; k += 9) {
18149 GemmMicrokernelTester()
18150 .mr(1)
18151 .nr(4)
18152 .kr(8)
18153 .sr(1)
18154 .m(1)
18155 .n(n)
18156 .k(k)
18157 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080018158 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018159 }
18160 }
18161 }
18162
18163 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, n_div_4_strided_a) {
18164 TEST_REQUIRES_X86_AVX;
18165 for (uint32_t n = 8; n <= 12; n += 4) {
18166 for (size_t k = 1; k <= 40; k += 9) {
18167 GemmMicrokernelTester()
18168 .mr(1)
18169 .nr(4)
18170 .kr(8)
18171 .sr(1)
18172 .m(1)
18173 .n(n)
18174 .k(k)
18175 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080018176 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018177 }
18178 }
18179 }
18180
18181 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, n_div_4_subtile) {
18182 TEST_REQUIRES_X86_AVX;
18183 for (uint32_t n = 8; n <= 12; n += 4) {
18184 for (size_t k = 1; k <= 40; k += 9) {
18185 for (uint32_t m = 1; m <= 1; m++) {
18186 GemmMicrokernelTester()
18187 .mr(1)
18188 .nr(4)
18189 .kr(8)
18190 .sr(1)
18191 .m(m)
18192 .n(n)
18193 .k(k)
18194 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080018195 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018196 }
18197 }
18198 }
18199 }
18200
18201 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, strided_cm_subtile) {
18202 TEST_REQUIRES_X86_AVX;
18203 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080018204 for (uint32_t n = 1; n <= 4; n++) {
18205 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018206 GemmMicrokernelTester()
18207 .mr(1)
18208 .nr(4)
18209 .kr(8)
18210 .sr(1)
18211 .m(m)
18212 .n(n)
18213 .k(k)
18214 .cm_stride(7)
18215 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080018216 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018217 }
18218 }
18219 }
18220 }
18221
18222 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, qmin) {
18223 TEST_REQUIRES_X86_AVX;
18224 GemmMicrokernelTester()
18225 .mr(1)
18226 .nr(4)
18227 .kr(8)
18228 .sr(1)
18229 .m(1)
18230 .n(4)
18231 .k(8)
18232 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080018233 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018234 }
18235
18236 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, qmax) {
18237 TEST_REQUIRES_X86_AVX;
18238 GemmMicrokernelTester()
18239 .mr(1)
18240 .nr(4)
18241 .kr(8)
18242 .sr(1)
18243 .m(1)
18244 .n(4)
18245 .k(8)
18246 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080018247 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018248 }
18249
18250 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, strided_cm) {
18251 TEST_REQUIRES_X86_AVX;
18252 GemmMicrokernelTester()
18253 .mr(1)
18254 .nr(4)
18255 .kr(8)
18256 .sr(1)
18257 .m(1)
18258 .n(4)
18259 .k(8)
18260 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080018261 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018262 }
18263#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
18264
18265
18266#if XNN_ARCH_X86 || XNN_ARCH_X86_64
18267 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_eq_8) {
18268 TEST_REQUIRES_X86_AVX;
18269 GemmMicrokernelTester()
18270 .mr(2)
18271 .nr(4)
18272 .kr(8)
18273 .sr(1)
18274 .m(2)
18275 .n(4)
18276 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080018277 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018278 }
18279
18280 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, strided_cn) {
18281 TEST_REQUIRES_X86_AVX;
18282 GemmMicrokernelTester()
18283 .mr(2)
18284 .nr(4)
18285 .kr(8)
18286 .sr(1)
18287 .m(2)
18288 .n(4)
18289 .k(8)
18290 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080018291 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018292 }
18293
18294 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_eq_8_strided_a) {
18295 TEST_REQUIRES_X86_AVX;
18296 GemmMicrokernelTester()
18297 .mr(2)
18298 .nr(4)
18299 .kr(8)
18300 .sr(1)
18301 .m(2)
18302 .n(4)
18303 .k(8)
18304 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080018305 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018306 }
18307
18308 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_eq_8_subtile) {
18309 TEST_REQUIRES_X86_AVX;
Zhi An Ng83844ae2022-01-14 09:52:25 -080018310 for (uint32_t n = 1; n <= 4; n++) {
18311 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018312 GemmMicrokernelTester()
18313 .mr(2)
18314 .nr(4)
18315 .kr(8)
18316 .sr(1)
18317 .m(m)
18318 .n(n)
18319 .k(8)
18320 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080018321 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018322 }
18323 }
18324 }
18325
18326 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_eq_8_subtile_m) {
18327 TEST_REQUIRES_X86_AVX;
18328 for (uint32_t m = 1; m <= 2; m++) {
18329 GemmMicrokernelTester()
18330 .mr(2)
18331 .nr(4)
18332 .kr(8)
18333 .sr(1)
18334 .m(m)
18335 .n(4)
18336 .k(8)
18337 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080018338 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018339 }
18340 }
18341
18342 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_eq_8_subtile_n) {
18343 TEST_REQUIRES_X86_AVX;
18344 for (uint32_t n = 1; n <= 4; n++) {
18345 GemmMicrokernelTester()
18346 .mr(2)
18347 .nr(4)
18348 .kr(8)
18349 .sr(1)
18350 .m(2)
18351 .n(n)
18352 .k(8)
18353 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080018354 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018355 }
18356 }
18357
18358 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_lt_8) {
18359 TEST_REQUIRES_X86_AVX;
18360 for (size_t k = 1; k < 8; k++) {
18361 GemmMicrokernelTester()
18362 .mr(2)
18363 .nr(4)
18364 .kr(8)
18365 .sr(1)
18366 .m(2)
18367 .n(4)
18368 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080018369 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018370 }
18371 }
18372
18373 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_lt_8_strided_a) {
18374 TEST_REQUIRES_X86_AVX;
18375 for (size_t k = 1; k < 8; k++) {
18376 GemmMicrokernelTester()
18377 .mr(2)
18378 .nr(4)
18379 .kr(8)
18380 .sr(1)
18381 .m(2)
18382 .n(4)
18383 .k(k)
18384 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080018385 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018386 }
18387 }
18388
18389 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_lt_8_subtile) {
18390 TEST_REQUIRES_X86_AVX;
18391 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080018392 for (uint32_t n = 1; n <= 4; n++) {
18393 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018394 GemmMicrokernelTester()
18395 .mr(2)
18396 .nr(4)
18397 .kr(8)
18398 .sr(1)
18399 .m(m)
18400 .n(n)
18401 .k(k)
18402 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080018403 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018404 }
18405 }
18406 }
18407 }
18408
18409 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_gt_8) {
18410 TEST_REQUIRES_X86_AVX;
18411 for (size_t k = 9; k < 16; k++) {
18412 GemmMicrokernelTester()
18413 .mr(2)
18414 .nr(4)
18415 .kr(8)
18416 .sr(1)
18417 .m(2)
18418 .n(4)
18419 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080018420 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018421 }
18422 }
18423
18424 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_gt_8_strided_a) {
18425 TEST_REQUIRES_X86_AVX;
18426 for (size_t k = 9; k < 16; k++) {
18427 GemmMicrokernelTester()
18428 .mr(2)
18429 .nr(4)
18430 .kr(8)
18431 .sr(1)
18432 .m(2)
18433 .n(4)
18434 .k(k)
18435 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080018436 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018437 }
18438 }
18439
18440 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_gt_8_subtile) {
18441 TEST_REQUIRES_X86_AVX;
18442 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080018443 for (uint32_t n = 1; n <= 4; n++) {
18444 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018445 GemmMicrokernelTester()
18446 .mr(2)
18447 .nr(4)
18448 .kr(8)
18449 .sr(1)
18450 .m(m)
18451 .n(n)
18452 .k(k)
18453 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080018454 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018455 }
18456 }
18457 }
18458 }
18459
18460 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_div_8) {
18461 TEST_REQUIRES_X86_AVX;
18462 for (size_t k = 16; k <= 80; k += 8) {
18463 GemmMicrokernelTester()
18464 .mr(2)
18465 .nr(4)
18466 .kr(8)
18467 .sr(1)
18468 .m(2)
18469 .n(4)
18470 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080018471 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018472 }
18473 }
18474
18475 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_div_8_strided_a) {
18476 TEST_REQUIRES_X86_AVX;
18477 for (size_t k = 16; k <= 80; k += 8) {
18478 GemmMicrokernelTester()
18479 .mr(2)
18480 .nr(4)
18481 .kr(8)
18482 .sr(1)
18483 .m(2)
18484 .n(4)
18485 .k(k)
18486 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080018487 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018488 }
18489 }
18490
18491 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_div_8_subtile) {
18492 TEST_REQUIRES_X86_AVX;
18493 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080018494 for (uint32_t n = 1; n <= 4; n++) {
18495 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018496 GemmMicrokernelTester()
18497 .mr(2)
18498 .nr(4)
18499 .kr(8)
18500 .sr(1)
18501 .m(m)
18502 .n(n)
18503 .k(k)
18504 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080018505 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018506 }
18507 }
18508 }
18509 }
18510
18511 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, n_gt_4) {
18512 TEST_REQUIRES_X86_AVX;
18513 for (uint32_t n = 5; n < 8; n++) {
18514 for (size_t k = 1; k <= 40; k += 9) {
18515 GemmMicrokernelTester()
18516 .mr(2)
18517 .nr(4)
18518 .kr(8)
18519 .sr(1)
18520 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080018521 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018522 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080018523 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018524 }
18525 }
18526 }
18527
18528 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, n_gt_4_strided_cn) {
18529 TEST_REQUIRES_X86_AVX;
18530 for (uint32_t n = 5; n < 8; n++) {
18531 for (size_t k = 1; k <= 40; k += 9) {
18532 GemmMicrokernelTester()
18533 .mr(2)
18534 .nr(4)
18535 .kr(8)
18536 .sr(1)
18537 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080018538 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018539 .k(k)
18540 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080018541 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018542 }
18543 }
18544 }
18545
18546 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, n_gt_4_strided_a) {
18547 TEST_REQUIRES_X86_AVX;
18548 for (uint32_t n = 5; n < 8; n++) {
18549 for (size_t k = 1; k <= 40; k += 9) {
18550 GemmMicrokernelTester()
18551 .mr(2)
18552 .nr(4)
18553 .kr(8)
18554 .sr(1)
18555 .m(2)
18556 .n(n)
18557 .k(k)
18558 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080018559 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018560 }
18561 }
18562 }
18563
18564 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, n_gt_4_subtile) {
18565 TEST_REQUIRES_X86_AVX;
18566 for (uint32_t n = 5; n < 8; n++) {
18567 for (size_t k = 1; k <= 40; k += 9) {
18568 for (uint32_t m = 1; m <= 2; m++) {
18569 GemmMicrokernelTester()
18570 .mr(2)
18571 .nr(4)
18572 .kr(8)
18573 .sr(1)
18574 .m(m)
18575 .n(n)
18576 .k(k)
18577 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080018578 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018579 }
18580 }
18581 }
18582 }
18583
18584 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, n_div_4) {
18585 TEST_REQUIRES_X86_AVX;
18586 for (uint32_t n = 8; n <= 12; n += 4) {
18587 for (size_t k = 1; k <= 40; k += 9) {
18588 GemmMicrokernelTester()
18589 .mr(2)
18590 .nr(4)
18591 .kr(8)
18592 .sr(1)
18593 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080018594 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018595 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080018596 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018597 }
18598 }
18599 }
18600
18601 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, n_div_4_strided_cn) {
18602 TEST_REQUIRES_X86_AVX;
18603 for (uint32_t n = 8; n <= 12; n += 4) {
18604 for (size_t k = 1; k <= 40; k += 9) {
18605 GemmMicrokernelTester()
18606 .mr(2)
18607 .nr(4)
18608 .kr(8)
18609 .sr(1)
18610 .m(2)
18611 .n(n)
18612 .k(k)
18613 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080018614 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018615 }
18616 }
18617 }
18618
18619 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, n_div_4_strided_a) {
18620 TEST_REQUIRES_X86_AVX;
18621 for (uint32_t n = 8; n <= 12; n += 4) {
18622 for (size_t k = 1; k <= 40; k += 9) {
18623 GemmMicrokernelTester()
18624 .mr(2)
18625 .nr(4)
18626 .kr(8)
18627 .sr(1)
18628 .m(2)
18629 .n(n)
18630 .k(k)
18631 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080018632 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018633 }
18634 }
18635 }
18636
18637 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, n_div_4_subtile) {
18638 TEST_REQUIRES_X86_AVX;
18639 for (uint32_t n = 8; n <= 12; n += 4) {
18640 for (size_t k = 1; k <= 40; k += 9) {
18641 for (uint32_t m = 1; m <= 2; m++) {
18642 GemmMicrokernelTester()
18643 .mr(2)
18644 .nr(4)
18645 .kr(8)
18646 .sr(1)
18647 .m(m)
18648 .n(n)
18649 .k(k)
18650 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080018651 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018652 }
18653 }
18654 }
18655 }
18656
18657 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, strided_cm_subtile) {
18658 TEST_REQUIRES_X86_AVX;
18659 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080018660 for (uint32_t n = 1; n <= 4; n++) {
18661 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018662 GemmMicrokernelTester()
18663 .mr(2)
18664 .nr(4)
18665 .kr(8)
18666 .sr(1)
18667 .m(m)
18668 .n(n)
18669 .k(k)
18670 .cm_stride(7)
18671 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080018672 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018673 }
18674 }
18675 }
18676 }
18677
18678 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, qmin) {
18679 TEST_REQUIRES_X86_AVX;
18680 GemmMicrokernelTester()
18681 .mr(2)
18682 .nr(4)
18683 .kr(8)
18684 .sr(1)
18685 .m(2)
18686 .n(4)
18687 .k(8)
18688 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080018689 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018690 }
18691
18692 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, qmax) {
18693 TEST_REQUIRES_X86_AVX;
18694 GemmMicrokernelTester()
18695 .mr(2)
18696 .nr(4)
18697 .kr(8)
18698 .sr(1)
18699 .m(2)
18700 .n(4)
18701 .k(8)
18702 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080018703 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018704 }
18705
18706 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, strided_cm) {
18707 TEST_REQUIRES_X86_AVX;
18708 GemmMicrokernelTester()
18709 .mr(2)
18710 .nr(4)
18711 .kr(8)
18712 .sr(1)
18713 .m(2)
18714 .n(4)
18715 .k(8)
18716 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080018717 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018718 }
18719#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
18720
18721
18722#if XNN_ARCH_X86 || XNN_ARCH_X86_64
18723 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_eq_8) {
18724 TEST_REQUIRES_X86_AVX;
18725 GemmMicrokernelTester()
18726 .mr(2)
18727 .nr(4)
18728 .kr(8)
18729 .sr(1)
18730 .m(2)
18731 .n(4)
18732 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080018733 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018734 }
18735
18736 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, strided_cn) {
18737 TEST_REQUIRES_X86_AVX;
18738 GemmMicrokernelTester()
18739 .mr(2)
18740 .nr(4)
18741 .kr(8)
18742 .sr(1)
18743 .m(2)
18744 .n(4)
18745 .k(8)
18746 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080018747 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018748 }
18749
18750 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_eq_8_strided_a) {
18751 TEST_REQUIRES_X86_AVX;
18752 GemmMicrokernelTester()
18753 .mr(2)
18754 .nr(4)
18755 .kr(8)
18756 .sr(1)
18757 .m(2)
18758 .n(4)
18759 .k(8)
18760 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080018761 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018762 }
18763
18764 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_eq_8_subtile) {
18765 TEST_REQUIRES_X86_AVX;
Zhi An Ng83844ae2022-01-14 09:52:25 -080018766 for (uint32_t n = 1; n <= 4; n++) {
18767 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018768 GemmMicrokernelTester()
18769 .mr(2)
18770 .nr(4)
18771 .kr(8)
18772 .sr(1)
18773 .m(m)
18774 .n(n)
18775 .k(8)
18776 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080018777 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018778 }
18779 }
18780 }
18781
18782 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_eq_8_subtile_m) {
18783 TEST_REQUIRES_X86_AVX;
18784 for (uint32_t m = 1; m <= 2; m++) {
18785 GemmMicrokernelTester()
18786 .mr(2)
18787 .nr(4)
18788 .kr(8)
18789 .sr(1)
18790 .m(m)
18791 .n(4)
18792 .k(8)
18793 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080018794 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018795 }
18796 }
18797
18798 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_eq_8_subtile_n) {
18799 TEST_REQUIRES_X86_AVX;
18800 for (uint32_t n = 1; n <= 4; n++) {
18801 GemmMicrokernelTester()
18802 .mr(2)
18803 .nr(4)
18804 .kr(8)
18805 .sr(1)
18806 .m(2)
18807 .n(n)
18808 .k(8)
18809 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080018810 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018811 }
18812 }
18813
18814 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_lt_8) {
18815 TEST_REQUIRES_X86_AVX;
18816 for (size_t k = 1; k < 8; k++) {
18817 GemmMicrokernelTester()
18818 .mr(2)
18819 .nr(4)
18820 .kr(8)
18821 .sr(1)
18822 .m(2)
18823 .n(4)
18824 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080018825 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018826 }
18827 }
18828
18829 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_lt_8_strided_a) {
18830 TEST_REQUIRES_X86_AVX;
18831 for (size_t k = 1; k < 8; k++) {
18832 GemmMicrokernelTester()
18833 .mr(2)
18834 .nr(4)
18835 .kr(8)
18836 .sr(1)
18837 .m(2)
18838 .n(4)
18839 .k(k)
18840 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080018841 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018842 }
18843 }
18844
18845 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_lt_8_subtile) {
18846 TEST_REQUIRES_X86_AVX;
18847 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080018848 for (uint32_t n = 1; n <= 4; n++) {
18849 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018850 GemmMicrokernelTester()
18851 .mr(2)
18852 .nr(4)
18853 .kr(8)
18854 .sr(1)
18855 .m(m)
18856 .n(n)
18857 .k(k)
18858 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080018859 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018860 }
18861 }
18862 }
18863 }
18864
18865 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_gt_8) {
18866 TEST_REQUIRES_X86_AVX;
18867 for (size_t k = 9; k < 16; k++) {
18868 GemmMicrokernelTester()
18869 .mr(2)
18870 .nr(4)
18871 .kr(8)
18872 .sr(1)
18873 .m(2)
18874 .n(4)
18875 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080018876 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018877 }
18878 }
18879
18880 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_gt_8_strided_a) {
18881 TEST_REQUIRES_X86_AVX;
18882 for (size_t k = 9; k < 16; k++) {
18883 GemmMicrokernelTester()
18884 .mr(2)
18885 .nr(4)
18886 .kr(8)
18887 .sr(1)
18888 .m(2)
18889 .n(4)
18890 .k(k)
18891 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080018892 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018893 }
18894 }
18895
18896 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_gt_8_subtile) {
18897 TEST_REQUIRES_X86_AVX;
18898 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080018899 for (uint32_t n = 1; n <= 4; n++) {
18900 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018901 GemmMicrokernelTester()
18902 .mr(2)
18903 .nr(4)
18904 .kr(8)
18905 .sr(1)
18906 .m(m)
18907 .n(n)
18908 .k(k)
18909 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080018910 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018911 }
18912 }
18913 }
18914 }
18915
18916 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_div_8) {
18917 TEST_REQUIRES_X86_AVX;
18918 for (size_t k = 16; k <= 80; k += 8) {
18919 GemmMicrokernelTester()
18920 .mr(2)
18921 .nr(4)
18922 .kr(8)
18923 .sr(1)
18924 .m(2)
18925 .n(4)
18926 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080018927 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018928 }
18929 }
18930
18931 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_div_8_strided_a) {
18932 TEST_REQUIRES_X86_AVX;
18933 for (size_t k = 16; k <= 80; k += 8) {
18934 GemmMicrokernelTester()
18935 .mr(2)
18936 .nr(4)
18937 .kr(8)
18938 .sr(1)
18939 .m(2)
18940 .n(4)
18941 .k(k)
18942 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080018943 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018944 }
18945 }
18946
18947 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_div_8_subtile) {
18948 TEST_REQUIRES_X86_AVX;
18949 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080018950 for (uint32_t n = 1; n <= 4; n++) {
18951 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018952 GemmMicrokernelTester()
18953 .mr(2)
18954 .nr(4)
18955 .kr(8)
18956 .sr(1)
18957 .m(m)
18958 .n(n)
18959 .k(k)
18960 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080018961 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018962 }
18963 }
18964 }
18965 }
18966
18967 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, n_gt_4) {
18968 TEST_REQUIRES_X86_AVX;
18969 for (uint32_t n = 5; n < 8; n++) {
18970 for (size_t k = 1; k <= 40; k += 9) {
18971 GemmMicrokernelTester()
18972 .mr(2)
18973 .nr(4)
18974 .kr(8)
18975 .sr(1)
18976 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080018977 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018978 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080018979 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018980 }
18981 }
18982 }
18983
18984 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, n_gt_4_strided_cn) {
18985 TEST_REQUIRES_X86_AVX;
18986 for (uint32_t n = 5; n < 8; n++) {
18987 for (size_t k = 1; k <= 40; k += 9) {
18988 GemmMicrokernelTester()
18989 .mr(2)
18990 .nr(4)
18991 .kr(8)
18992 .sr(1)
18993 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080018994 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018995 .k(k)
18996 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080018997 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080018998 }
18999 }
19000 }
19001
19002 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, n_gt_4_strided_a) {
19003 TEST_REQUIRES_X86_AVX;
19004 for (uint32_t n = 5; n < 8; n++) {
19005 for (size_t k = 1; k <= 40; k += 9) {
19006 GemmMicrokernelTester()
19007 .mr(2)
19008 .nr(4)
19009 .kr(8)
19010 .sr(1)
19011 .m(2)
19012 .n(n)
19013 .k(k)
19014 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080019015 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019016 }
19017 }
19018 }
19019
19020 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, n_gt_4_subtile) {
19021 TEST_REQUIRES_X86_AVX;
19022 for (uint32_t n = 5; n < 8; n++) {
19023 for (size_t k = 1; k <= 40; k += 9) {
19024 for (uint32_t m = 1; m <= 2; m++) {
19025 GemmMicrokernelTester()
19026 .mr(2)
19027 .nr(4)
19028 .kr(8)
19029 .sr(1)
19030 .m(m)
19031 .n(n)
19032 .k(k)
19033 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080019034 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019035 }
19036 }
19037 }
19038 }
19039
19040 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, n_div_4) {
19041 TEST_REQUIRES_X86_AVX;
19042 for (uint32_t n = 8; n <= 12; n += 4) {
19043 for (size_t k = 1; k <= 40; k += 9) {
19044 GemmMicrokernelTester()
19045 .mr(2)
19046 .nr(4)
19047 .kr(8)
19048 .sr(1)
19049 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080019050 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019051 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080019052 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019053 }
19054 }
19055 }
19056
19057 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, n_div_4_strided_cn) {
19058 TEST_REQUIRES_X86_AVX;
19059 for (uint32_t n = 8; n <= 12; n += 4) {
19060 for (size_t k = 1; k <= 40; k += 9) {
19061 GemmMicrokernelTester()
19062 .mr(2)
19063 .nr(4)
19064 .kr(8)
19065 .sr(1)
19066 .m(2)
19067 .n(n)
19068 .k(k)
19069 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080019070 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019071 }
19072 }
19073 }
19074
19075 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, n_div_4_strided_a) {
19076 TEST_REQUIRES_X86_AVX;
19077 for (uint32_t n = 8; n <= 12; n += 4) {
19078 for (size_t k = 1; k <= 40; k += 9) {
19079 GemmMicrokernelTester()
19080 .mr(2)
19081 .nr(4)
19082 .kr(8)
19083 .sr(1)
19084 .m(2)
19085 .n(n)
19086 .k(k)
19087 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080019088 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019089 }
19090 }
19091 }
19092
19093 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, n_div_4_subtile) {
19094 TEST_REQUIRES_X86_AVX;
19095 for (uint32_t n = 8; n <= 12; n += 4) {
19096 for (size_t k = 1; k <= 40; k += 9) {
19097 for (uint32_t m = 1; m <= 2; m++) {
19098 GemmMicrokernelTester()
19099 .mr(2)
19100 .nr(4)
19101 .kr(8)
19102 .sr(1)
19103 .m(m)
19104 .n(n)
19105 .k(k)
19106 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080019107 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019108 }
19109 }
19110 }
19111 }
19112
19113 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, strided_cm_subtile) {
19114 TEST_REQUIRES_X86_AVX;
19115 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080019116 for (uint32_t n = 1; n <= 4; n++) {
19117 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019118 GemmMicrokernelTester()
19119 .mr(2)
19120 .nr(4)
19121 .kr(8)
19122 .sr(1)
19123 .m(m)
19124 .n(n)
19125 .k(k)
19126 .cm_stride(7)
19127 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080019128 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019129 }
19130 }
19131 }
19132 }
19133
19134 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, qmin) {
19135 TEST_REQUIRES_X86_AVX;
19136 GemmMicrokernelTester()
19137 .mr(2)
19138 .nr(4)
19139 .kr(8)
19140 .sr(1)
19141 .m(2)
19142 .n(4)
19143 .k(8)
19144 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080019145 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019146 }
19147
19148 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, qmax) {
19149 TEST_REQUIRES_X86_AVX;
19150 GemmMicrokernelTester()
19151 .mr(2)
19152 .nr(4)
19153 .kr(8)
19154 .sr(1)
19155 .m(2)
19156 .n(4)
19157 .k(8)
19158 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080019159 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019160 }
19161
19162 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, strided_cm) {
19163 TEST_REQUIRES_X86_AVX;
19164 GemmMicrokernelTester()
19165 .mr(2)
19166 .nr(4)
19167 .kr(8)
19168 .sr(1)
19169 .m(2)
19170 .n(4)
19171 .k(8)
19172 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080019173 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019174 }
19175#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
19176
19177
19178#if XNN_ARCH_X86 || XNN_ARCH_X86_64
19179 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_eq_8) {
19180 TEST_REQUIRES_X86_AVX;
19181 GemmMicrokernelTester()
19182 .mr(3)
19183 .nr(4)
19184 .kr(8)
19185 .sr(1)
19186 .m(3)
19187 .n(4)
19188 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080019189 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019190 }
19191
19192 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, strided_cn) {
19193 TEST_REQUIRES_X86_AVX;
19194 GemmMicrokernelTester()
19195 .mr(3)
19196 .nr(4)
19197 .kr(8)
19198 .sr(1)
19199 .m(3)
19200 .n(4)
19201 .k(8)
19202 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080019203 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019204 }
19205
19206 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_eq_8_strided_a) {
19207 TEST_REQUIRES_X86_AVX;
19208 GemmMicrokernelTester()
19209 .mr(3)
19210 .nr(4)
19211 .kr(8)
19212 .sr(1)
19213 .m(3)
19214 .n(4)
19215 .k(8)
19216 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080019217 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019218 }
19219
19220 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_eq_8_subtile) {
19221 TEST_REQUIRES_X86_AVX;
Zhi An Ng83844ae2022-01-14 09:52:25 -080019222 for (uint32_t n = 1; n <= 4; n++) {
19223 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019224 GemmMicrokernelTester()
19225 .mr(3)
19226 .nr(4)
19227 .kr(8)
19228 .sr(1)
19229 .m(m)
19230 .n(n)
19231 .k(8)
19232 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080019233 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019234 }
19235 }
19236 }
19237
19238 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_eq_8_subtile_m) {
19239 TEST_REQUIRES_X86_AVX;
19240 for (uint32_t m = 1; m <= 3; m++) {
19241 GemmMicrokernelTester()
19242 .mr(3)
19243 .nr(4)
19244 .kr(8)
19245 .sr(1)
19246 .m(m)
19247 .n(4)
19248 .k(8)
19249 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080019250 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019251 }
19252 }
19253
19254 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_eq_8_subtile_n) {
19255 TEST_REQUIRES_X86_AVX;
19256 for (uint32_t n = 1; n <= 4; n++) {
19257 GemmMicrokernelTester()
19258 .mr(3)
19259 .nr(4)
19260 .kr(8)
19261 .sr(1)
19262 .m(3)
19263 .n(n)
19264 .k(8)
19265 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080019266 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019267 }
19268 }
19269
19270 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_lt_8) {
19271 TEST_REQUIRES_X86_AVX;
19272 for (size_t k = 1; k < 8; k++) {
19273 GemmMicrokernelTester()
19274 .mr(3)
19275 .nr(4)
19276 .kr(8)
19277 .sr(1)
19278 .m(3)
19279 .n(4)
19280 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080019281 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019282 }
19283 }
19284
19285 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_lt_8_strided_a) {
19286 TEST_REQUIRES_X86_AVX;
19287 for (size_t k = 1; k < 8; k++) {
19288 GemmMicrokernelTester()
19289 .mr(3)
19290 .nr(4)
19291 .kr(8)
19292 .sr(1)
19293 .m(3)
19294 .n(4)
19295 .k(k)
19296 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080019297 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019298 }
19299 }
19300
19301 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_lt_8_subtile) {
19302 TEST_REQUIRES_X86_AVX;
19303 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080019304 for (uint32_t n = 1; n <= 4; n++) {
19305 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019306 GemmMicrokernelTester()
19307 .mr(3)
19308 .nr(4)
19309 .kr(8)
19310 .sr(1)
19311 .m(m)
19312 .n(n)
19313 .k(k)
19314 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080019315 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019316 }
19317 }
19318 }
19319 }
19320
19321 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_gt_8) {
19322 TEST_REQUIRES_X86_AVX;
19323 for (size_t k = 9; k < 16; k++) {
19324 GemmMicrokernelTester()
19325 .mr(3)
19326 .nr(4)
19327 .kr(8)
19328 .sr(1)
19329 .m(3)
19330 .n(4)
19331 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080019332 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019333 }
19334 }
19335
19336 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_gt_8_strided_a) {
19337 TEST_REQUIRES_X86_AVX;
19338 for (size_t k = 9; k < 16; k++) {
19339 GemmMicrokernelTester()
19340 .mr(3)
19341 .nr(4)
19342 .kr(8)
19343 .sr(1)
19344 .m(3)
19345 .n(4)
19346 .k(k)
19347 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080019348 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019349 }
19350 }
19351
19352 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_gt_8_subtile) {
19353 TEST_REQUIRES_X86_AVX;
19354 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080019355 for (uint32_t n = 1; n <= 4; n++) {
19356 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019357 GemmMicrokernelTester()
19358 .mr(3)
19359 .nr(4)
19360 .kr(8)
19361 .sr(1)
19362 .m(m)
19363 .n(n)
19364 .k(k)
19365 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080019366 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019367 }
19368 }
19369 }
19370 }
19371
19372 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_div_8) {
19373 TEST_REQUIRES_X86_AVX;
19374 for (size_t k = 16; k <= 80; k += 8) {
19375 GemmMicrokernelTester()
19376 .mr(3)
19377 .nr(4)
19378 .kr(8)
19379 .sr(1)
19380 .m(3)
19381 .n(4)
19382 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080019383 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019384 }
19385 }
19386
19387 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_div_8_strided_a) {
19388 TEST_REQUIRES_X86_AVX;
19389 for (size_t k = 16; k <= 80; k += 8) {
19390 GemmMicrokernelTester()
19391 .mr(3)
19392 .nr(4)
19393 .kr(8)
19394 .sr(1)
19395 .m(3)
19396 .n(4)
19397 .k(k)
19398 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080019399 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019400 }
19401 }
19402
19403 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_div_8_subtile) {
19404 TEST_REQUIRES_X86_AVX;
19405 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080019406 for (uint32_t n = 1; n <= 4; n++) {
19407 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019408 GemmMicrokernelTester()
19409 .mr(3)
19410 .nr(4)
19411 .kr(8)
19412 .sr(1)
19413 .m(m)
19414 .n(n)
19415 .k(k)
19416 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080019417 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019418 }
19419 }
19420 }
19421 }
19422
19423 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, n_gt_4) {
19424 TEST_REQUIRES_X86_AVX;
19425 for (uint32_t n = 5; n < 8; n++) {
19426 for (size_t k = 1; k <= 40; k += 9) {
19427 GemmMicrokernelTester()
19428 .mr(3)
19429 .nr(4)
19430 .kr(8)
19431 .sr(1)
19432 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080019433 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019434 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080019435 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019436 }
19437 }
19438 }
19439
19440 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, n_gt_4_strided_cn) {
19441 TEST_REQUIRES_X86_AVX;
19442 for (uint32_t n = 5; n < 8; n++) {
19443 for (size_t k = 1; k <= 40; k += 9) {
19444 GemmMicrokernelTester()
19445 .mr(3)
19446 .nr(4)
19447 .kr(8)
19448 .sr(1)
19449 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080019450 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019451 .k(k)
19452 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080019453 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019454 }
19455 }
19456 }
19457
19458 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, n_gt_4_strided_a) {
19459 TEST_REQUIRES_X86_AVX;
19460 for (uint32_t n = 5; n < 8; n++) {
19461 for (size_t k = 1; k <= 40; k += 9) {
19462 GemmMicrokernelTester()
19463 .mr(3)
19464 .nr(4)
19465 .kr(8)
19466 .sr(1)
19467 .m(3)
19468 .n(n)
19469 .k(k)
19470 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080019471 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019472 }
19473 }
19474 }
19475
19476 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, n_gt_4_subtile) {
19477 TEST_REQUIRES_X86_AVX;
19478 for (uint32_t n = 5; n < 8; n++) {
19479 for (size_t k = 1; k <= 40; k += 9) {
19480 for (uint32_t m = 1; m <= 3; m++) {
19481 GemmMicrokernelTester()
19482 .mr(3)
19483 .nr(4)
19484 .kr(8)
19485 .sr(1)
19486 .m(m)
19487 .n(n)
19488 .k(k)
19489 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080019490 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019491 }
19492 }
19493 }
19494 }
19495
19496 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, n_div_4) {
19497 TEST_REQUIRES_X86_AVX;
19498 for (uint32_t n = 8; n <= 12; n += 4) {
19499 for (size_t k = 1; k <= 40; k += 9) {
19500 GemmMicrokernelTester()
19501 .mr(3)
19502 .nr(4)
19503 .kr(8)
19504 .sr(1)
19505 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080019506 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019507 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080019508 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019509 }
19510 }
19511 }
19512
19513 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, n_div_4_strided_cn) {
19514 TEST_REQUIRES_X86_AVX;
19515 for (uint32_t n = 8; n <= 12; n += 4) {
19516 for (size_t k = 1; k <= 40; k += 9) {
19517 GemmMicrokernelTester()
19518 .mr(3)
19519 .nr(4)
19520 .kr(8)
19521 .sr(1)
19522 .m(3)
19523 .n(n)
19524 .k(k)
19525 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080019526 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019527 }
19528 }
19529 }
19530
19531 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, n_div_4_strided_a) {
19532 TEST_REQUIRES_X86_AVX;
19533 for (uint32_t n = 8; n <= 12; n += 4) {
19534 for (size_t k = 1; k <= 40; k += 9) {
19535 GemmMicrokernelTester()
19536 .mr(3)
19537 .nr(4)
19538 .kr(8)
19539 .sr(1)
19540 .m(3)
19541 .n(n)
19542 .k(k)
19543 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080019544 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019545 }
19546 }
19547 }
19548
19549 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, n_div_4_subtile) {
19550 TEST_REQUIRES_X86_AVX;
19551 for (uint32_t n = 8; n <= 12; n += 4) {
19552 for (size_t k = 1; k <= 40; k += 9) {
19553 for (uint32_t m = 1; m <= 3; m++) {
19554 GemmMicrokernelTester()
19555 .mr(3)
19556 .nr(4)
19557 .kr(8)
19558 .sr(1)
19559 .m(m)
19560 .n(n)
19561 .k(k)
19562 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080019563 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019564 }
19565 }
19566 }
19567 }
19568
19569 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, strided_cm_subtile) {
19570 TEST_REQUIRES_X86_AVX;
19571 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080019572 for (uint32_t n = 1; n <= 4; n++) {
19573 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019574 GemmMicrokernelTester()
19575 .mr(3)
19576 .nr(4)
19577 .kr(8)
19578 .sr(1)
19579 .m(m)
19580 .n(n)
19581 .k(k)
19582 .cm_stride(7)
19583 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080019584 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019585 }
19586 }
19587 }
19588 }
19589
19590 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, qmin) {
19591 TEST_REQUIRES_X86_AVX;
19592 GemmMicrokernelTester()
19593 .mr(3)
19594 .nr(4)
19595 .kr(8)
19596 .sr(1)
19597 .m(3)
19598 .n(4)
19599 .k(8)
19600 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080019601 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019602 }
19603
19604 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, qmax) {
19605 TEST_REQUIRES_X86_AVX;
19606 GemmMicrokernelTester()
19607 .mr(3)
19608 .nr(4)
19609 .kr(8)
19610 .sr(1)
19611 .m(3)
19612 .n(4)
19613 .k(8)
19614 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080019615 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019616 }
19617
19618 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, strided_cm) {
19619 TEST_REQUIRES_X86_AVX;
19620 GemmMicrokernelTester()
19621 .mr(3)
19622 .nr(4)
19623 .kr(8)
19624 .sr(1)
19625 .m(3)
19626 .n(4)
19627 .k(8)
19628 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080019629 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019630 }
19631#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
19632
19633
19634#if XNN_ARCH_X86 || XNN_ARCH_X86_64
19635 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_eq_8) {
19636 TEST_REQUIRES_X86_XOP;
19637 GemmMicrokernelTester()
19638 .mr(1)
19639 .nr(4)
19640 .kr(8)
19641 .sr(1)
19642 .m(1)
19643 .n(4)
19644 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080019645 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019646 }
19647
19648 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, strided_cn) {
19649 TEST_REQUIRES_X86_XOP;
19650 GemmMicrokernelTester()
19651 .mr(1)
19652 .nr(4)
19653 .kr(8)
19654 .sr(1)
19655 .m(1)
19656 .n(4)
19657 .k(8)
19658 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080019659 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019660 }
19661
19662 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_eq_8_strided_a) {
19663 TEST_REQUIRES_X86_XOP;
19664 GemmMicrokernelTester()
19665 .mr(1)
19666 .nr(4)
19667 .kr(8)
19668 .sr(1)
19669 .m(1)
19670 .n(4)
19671 .k(8)
19672 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080019673 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019674 }
19675
19676 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_eq_8_subtile) {
19677 TEST_REQUIRES_X86_XOP;
Zhi An Ng83844ae2022-01-14 09:52:25 -080019678 for (uint32_t n = 1; n <= 4; n++) {
19679 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019680 GemmMicrokernelTester()
19681 .mr(1)
19682 .nr(4)
19683 .kr(8)
19684 .sr(1)
19685 .m(m)
19686 .n(n)
19687 .k(8)
19688 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080019689 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019690 }
19691 }
19692 }
19693
19694 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_eq_8_subtile_m) {
19695 TEST_REQUIRES_X86_XOP;
19696 for (uint32_t m = 1; m <= 1; m++) {
19697 GemmMicrokernelTester()
19698 .mr(1)
19699 .nr(4)
19700 .kr(8)
19701 .sr(1)
19702 .m(m)
19703 .n(4)
19704 .k(8)
19705 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080019706 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019707 }
19708 }
19709
19710 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_eq_8_subtile_n) {
19711 TEST_REQUIRES_X86_XOP;
19712 for (uint32_t n = 1; n <= 4; n++) {
19713 GemmMicrokernelTester()
19714 .mr(1)
19715 .nr(4)
19716 .kr(8)
19717 .sr(1)
19718 .m(1)
19719 .n(n)
19720 .k(8)
19721 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080019722 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019723 }
19724 }
19725
19726 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_lt_8) {
19727 TEST_REQUIRES_X86_XOP;
19728 for (size_t k = 1; k < 8; k++) {
19729 GemmMicrokernelTester()
19730 .mr(1)
19731 .nr(4)
19732 .kr(8)
19733 .sr(1)
19734 .m(1)
19735 .n(4)
19736 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080019737 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019738 }
19739 }
19740
19741 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_lt_8_strided_a) {
19742 TEST_REQUIRES_X86_XOP;
19743 for (size_t k = 1; k < 8; k++) {
19744 GemmMicrokernelTester()
19745 .mr(1)
19746 .nr(4)
19747 .kr(8)
19748 .sr(1)
19749 .m(1)
19750 .n(4)
19751 .k(k)
19752 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080019753 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019754 }
19755 }
19756
19757 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_lt_8_subtile) {
19758 TEST_REQUIRES_X86_XOP;
19759 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080019760 for (uint32_t n = 1; n <= 4; n++) {
19761 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019762 GemmMicrokernelTester()
19763 .mr(1)
19764 .nr(4)
19765 .kr(8)
19766 .sr(1)
19767 .m(m)
19768 .n(n)
19769 .k(k)
19770 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080019771 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019772 }
19773 }
19774 }
19775 }
19776
19777 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_gt_8) {
19778 TEST_REQUIRES_X86_XOP;
19779 for (size_t k = 9; k < 16; k++) {
19780 GemmMicrokernelTester()
19781 .mr(1)
19782 .nr(4)
19783 .kr(8)
19784 .sr(1)
19785 .m(1)
19786 .n(4)
19787 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080019788 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019789 }
19790 }
19791
19792 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_gt_8_strided_a) {
19793 TEST_REQUIRES_X86_XOP;
19794 for (size_t k = 9; k < 16; k++) {
19795 GemmMicrokernelTester()
19796 .mr(1)
19797 .nr(4)
19798 .kr(8)
19799 .sr(1)
19800 .m(1)
19801 .n(4)
19802 .k(k)
19803 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080019804 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019805 }
19806 }
19807
19808 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_gt_8_subtile) {
19809 TEST_REQUIRES_X86_XOP;
19810 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080019811 for (uint32_t n = 1; n <= 4; n++) {
19812 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019813 GemmMicrokernelTester()
19814 .mr(1)
19815 .nr(4)
19816 .kr(8)
19817 .sr(1)
19818 .m(m)
19819 .n(n)
19820 .k(k)
19821 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080019822 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019823 }
19824 }
19825 }
19826 }
19827
19828 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_div_8) {
19829 TEST_REQUIRES_X86_XOP;
19830 for (size_t k = 16; k <= 80; k += 8) {
19831 GemmMicrokernelTester()
19832 .mr(1)
19833 .nr(4)
19834 .kr(8)
19835 .sr(1)
19836 .m(1)
19837 .n(4)
19838 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080019839 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019840 }
19841 }
19842
19843 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_div_8_strided_a) {
19844 TEST_REQUIRES_X86_XOP;
19845 for (size_t k = 16; k <= 80; k += 8) {
19846 GemmMicrokernelTester()
19847 .mr(1)
19848 .nr(4)
19849 .kr(8)
19850 .sr(1)
19851 .m(1)
19852 .n(4)
19853 .k(k)
19854 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080019855 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019856 }
19857 }
19858
19859 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_div_8_subtile) {
19860 TEST_REQUIRES_X86_XOP;
19861 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080019862 for (uint32_t n = 1; n <= 4; n++) {
19863 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019864 GemmMicrokernelTester()
19865 .mr(1)
19866 .nr(4)
19867 .kr(8)
19868 .sr(1)
19869 .m(m)
19870 .n(n)
19871 .k(k)
19872 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080019873 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019874 }
19875 }
19876 }
19877 }
19878
19879 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, n_gt_4) {
19880 TEST_REQUIRES_X86_XOP;
19881 for (uint32_t n = 5; n < 8; n++) {
19882 for (size_t k = 1; k <= 40; k += 9) {
19883 GemmMicrokernelTester()
19884 .mr(1)
19885 .nr(4)
19886 .kr(8)
19887 .sr(1)
19888 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080019889 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019890 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080019891 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019892 }
19893 }
19894 }
19895
19896 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, n_gt_4_strided_cn) {
19897 TEST_REQUIRES_X86_XOP;
19898 for (uint32_t n = 5; n < 8; n++) {
19899 for (size_t k = 1; k <= 40; k += 9) {
19900 GemmMicrokernelTester()
19901 .mr(1)
19902 .nr(4)
19903 .kr(8)
19904 .sr(1)
19905 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080019906 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019907 .k(k)
19908 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080019909 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019910 }
19911 }
19912 }
19913
19914 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, n_gt_4_strided_a) {
19915 TEST_REQUIRES_X86_XOP;
19916 for (uint32_t n = 5; n < 8; n++) {
19917 for (size_t k = 1; k <= 40; k += 9) {
19918 GemmMicrokernelTester()
19919 .mr(1)
19920 .nr(4)
19921 .kr(8)
19922 .sr(1)
19923 .m(1)
19924 .n(n)
19925 .k(k)
19926 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080019927 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019928 }
19929 }
19930 }
19931
19932 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, n_gt_4_subtile) {
19933 TEST_REQUIRES_X86_XOP;
19934 for (uint32_t n = 5; n < 8; n++) {
19935 for (size_t k = 1; k <= 40; k += 9) {
19936 for (uint32_t m = 1; m <= 1; m++) {
19937 GemmMicrokernelTester()
19938 .mr(1)
19939 .nr(4)
19940 .kr(8)
19941 .sr(1)
19942 .m(m)
19943 .n(n)
19944 .k(k)
19945 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080019946 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019947 }
19948 }
19949 }
19950 }
19951
19952 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, n_div_4) {
19953 TEST_REQUIRES_X86_XOP;
19954 for (uint32_t n = 8; n <= 12; n += 4) {
19955 for (size_t k = 1; k <= 40; k += 9) {
19956 GemmMicrokernelTester()
19957 .mr(1)
19958 .nr(4)
19959 .kr(8)
19960 .sr(1)
19961 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080019962 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019963 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080019964 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019965 }
19966 }
19967 }
19968
19969 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, n_div_4_strided_cn) {
19970 TEST_REQUIRES_X86_XOP;
19971 for (uint32_t n = 8; n <= 12; n += 4) {
19972 for (size_t k = 1; k <= 40; k += 9) {
19973 GemmMicrokernelTester()
19974 .mr(1)
19975 .nr(4)
19976 .kr(8)
19977 .sr(1)
19978 .m(1)
19979 .n(n)
19980 .k(k)
19981 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080019982 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080019983 }
19984 }
19985 }
19986
19987 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, n_div_4_strided_a) {
19988 TEST_REQUIRES_X86_XOP;
19989 for (uint32_t n = 8; n <= 12; n += 4) {
19990 for (size_t k = 1; k <= 40; k += 9) {
19991 GemmMicrokernelTester()
19992 .mr(1)
19993 .nr(4)
19994 .kr(8)
19995 .sr(1)
19996 .m(1)
19997 .n(n)
19998 .k(k)
19999 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080020000 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020001 }
20002 }
20003 }
20004
20005 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, n_div_4_subtile) {
20006 TEST_REQUIRES_X86_XOP;
20007 for (uint32_t n = 8; n <= 12; n += 4) {
20008 for (size_t k = 1; k <= 40; k += 9) {
20009 for (uint32_t m = 1; m <= 1; m++) {
20010 GemmMicrokernelTester()
20011 .mr(1)
20012 .nr(4)
20013 .kr(8)
20014 .sr(1)
20015 .m(m)
20016 .n(n)
20017 .k(k)
20018 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080020019 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020020 }
20021 }
20022 }
20023 }
20024
20025 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, strided_cm_subtile) {
20026 TEST_REQUIRES_X86_XOP;
20027 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080020028 for (uint32_t n = 1; n <= 4; n++) {
20029 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020030 GemmMicrokernelTester()
20031 .mr(1)
20032 .nr(4)
20033 .kr(8)
20034 .sr(1)
20035 .m(m)
20036 .n(n)
20037 .k(k)
20038 .cm_stride(7)
20039 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080020040 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020041 }
20042 }
20043 }
20044 }
20045
20046 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, qmin) {
20047 TEST_REQUIRES_X86_XOP;
20048 GemmMicrokernelTester()
20049 .mr(1)
20050 .nr(4)
20051 .kr(8)
20052 .sr(1)
20053 .m(1)
20054 .n(4)
20055 .k(8)
20056 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080020057 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020058 }
20059
20060 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, qmax) {
20061 TEST_REQUIRES_X86_XOP;
20062 GemmMicrokernelTester()
20063 .mr(1)
20064 .nr(4)
20065 .kr(8)
20066 .sr(1)
20067 .m(1)
20068 .n(4)
20069 .k(8)
20070 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080020071 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020072 }
20073
20074 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, strided_cm) {
20075 TEST_REQUIRES_X86_XOP;
20076 GemmMicrokernelTester()
20077 .mr(1)
20078 .nr(4)
20079 .kr(8)
20080 .sr(1)
20081 .m(1)
20082 .n(4)
20083 .k(8)
20084 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080020085 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020086 }
20087#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
20088
20089
20090#if XNN_ARCH_X86 || XNN_ARCH_X86_64
20091 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_eq_8) {
20092 TEST_REQUIRES_X86_XOP;
20093 GemmMicrokernelTester()
20094 .mr(2)
20095 .nr(4)
20096 .kr(8)
20097 .sr(1)
20098 .m(2)
20099 .n(4)
20100 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080020101 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020102 }
20103
20104 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, strided_cn) {
20105 TEST_REQUIRES_X86_XOP;
20106 GemmMicrokernelTester()
20107 .mr(2)
20108 .nr(4)
20109 .kr(8)
20110 .sr(1)
20111 .m(2)
20112 .n(4)
20113 .k(8)
20114 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080020115 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020116 }
20117
20118 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_eq_8_strided_a) {
20119 TEST_REQUIRES_X86_XOP;
20120 GemmMicrokernelTester()
20121 .mr(2)
20122 .nr(4)
20123 .kr(8)
20124 .sr(1)
20125 .m(2)
20126 .n(4)
20127 .k(8)
20128 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080020129 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020130 }
20131
20132 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_eq_8_subtile) {
20133 TEST_REQUIRES_X86_XOP;
Zhi An Ng83844ae2022-01-14 09:52:25 -080020134 for (uint32_t n = 1; n <= 4; n++) {
20135 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020136 GemmMicrokernelTester()
20137 .mr(2)
20138 .nr(4)
20139 .kr(8)
20140 .sr(1)
20141 .m(m)
20142 .n(n)
20143 .k(8)
20144 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080020145 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020146 }
20147 }
20148 }
20149
20150 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_eq_8_subtile_m) {
20151 TEST_REQUIRES_X86_XOP;
20152 for (uint32_t m = 1; m <= 2; m++) {
20153 GemmMicrokernelTester()
20154 .mr(2)
20155 .nr(4)
20156 .kr(8)
20157 .sr(1)
20158 .m(m)
20159 .n(4)
20160 .k(8)
20161 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080020162 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020163 }
20164 }
20165
20166 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_eq_8_subtile_n) {
20167 TEST_REQUIRES_X86_XOP;
20168 for (uint32_t n = 1; n <= 4; n++) {
20169 GemmMicrokernelTester()
20170 .mr(2)
20171 .nr(4)
20172 .kr(8)
20173 .sr(1)
20174 .m(2)
20175 .n(n)
20176 .k(8)
20177 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080020178 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020179 }
20180 }
20181
20182 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_lt_8) {
20183 TEST_REQUIRES_X86_XOP;
20184 for (size_t k = 1; k < 8; k++) {
20185 GemmMicrokernelTester()
20186 .mr(2)
20187 .nr(4)
20188 .kr(8)
20189 .sr(1)
20190 .m(2)
20191 .n(4)
20192 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080020193 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020194 }
20195 }
20196
20197 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_lt_8_strided_a) {
20198 TEST_REQUIRES_X86_XOP;
20199 for (size_t k = 1; k < 8; k++) {
20200 GemmMicrokernelTester()
20201 .mr(2)
20202 .nr(4)
20203 .kr(8)
20204 .sr(1)
20205 .m(2)
20206 .n(4)
20207 .k(k)
20208 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080020209 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020210 }
20211 }
20212
20213 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_lt_8_subtile) {
20214 TEST_REQUIRES_X86_XOP;
20215 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080020216 for (uint32_t n = 1; n <= 4; n++) {
20217 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020218 GemmMicrokernelTester()
20219 .mr(2)
20220 .nr(4)
20221 .kr(8)
20222 .sr(1)
20223 .m(m)
20224 .n(n)
20225 .k(k)
20226 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080020227 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020228 }
20229 }
20230 }
20231 }
20232
20233 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_gt_8) {
20234 TEST_REQUIRES_X86_XOP;
20235 for (size_t k = 9; k < 16; k++) {
20236 GemmMicrokernelTester()
20237 .mr(2)
20238 .nr(4)
20239 .kr(8)
20240 .sr(1)
20241 .m(2)
20242 .n(4)
20243 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080020244 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020245 }
20246 }
20247
20248 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_gt_8_strided_a) {
20249 TEST_REQUIRES_X86_XOP;
20250 for (size_t k = 9; k < 16; k++) {
20251 GemmMicrokernelTester()
20252 .mr(2)
20253 .nr(4)
20254 .kr(8)
20255 .sr(1)
20256 .m(2)
20257 .n(4)
20258 .k(k)
20259 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080020260 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020261 }
20262 }
20263
20264 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_gt_8_subtile) {
20265 TEST_REQUIRES_X86_XOP;
20266 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080020267 for (uint32_t n = 1; n <= 4; n++) {
20268 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020269 GemmMicrokernelTester()
20270 .mr(2)
20271 .nr(4)
20272 .kr(8)
20273 .sr(1)
20274 .m(m)
20275 .n(n)
20276 .k(k)
20277 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080020278 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020279 }
20280 }
20281 }
20282 }
20283
20284 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_div_8) {
20285 TEST_REQUIRES_X86_XOP;
20286 for (size_t k = 16; k <= 80; k += 8) {
20287 GemmMicrokernelTester()
20288 .mr(2)
20289 .nr(4)
20290 .kr(8)
20291 .sr(1)
20292 .m(2)
20293 .n(4)
20294 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080020295 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020296 }
20297 }
20298
20299 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_div_8_strided_a) {
20300 TEST_REQUIRES_X86_XOP;
20301 for (size_t k = 16; k <= 80; k += 8) {
20302 GemmMicrokernelTester()
20303 .mr(2)
20304 .nr(4)
20305 .kr(8)
20306 .sr(1)
20307 .m(2)
20308 .n(4)
20309 .k(k)
20310 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080020311 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020312 }
20313 }
20314
20315 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_div_8_subtile) {
20316 TEST_REQUIRES_X86_XOP;
20317 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080020318 for (uint32_t n = 1; n <= 4; n++) {
20319 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020320 GemmMicrokernelTester()
20321 .mr(2)
20322 .nr(4)
20323 .kr(8)
20324 .sr(1)
20325 .m(m)
20326 .n(n)
20327 .k(k)
20328 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080020329 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020330 }
20331 }
20332 }
20333 }
20334
20335 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, n_gt_4) {
20336 TEST_REQUIRES_X86_XOP;
20337 for (uint32_t n = 5; n < 8; n++) {
20338 for (size_t k = 1; k <= 40; k += 9) {
20339 GemmMicrokernelTester()
20340 .mr(2)
20341 .nr(4)
20342 .kr(8)
20343 .sr(1)
20344 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080020345 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020346 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080020347 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020348 }
20349 }
20350 }
20351
20352 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, n_gt_4_strided_cn) {
20353 TEST_REQUIRES_X86_XOP;
20354 for (uint32_t n = 5; n < 8; n++) {
20355 for (size_t k = 1; k <= 40; k += 9) {
20356 GemmMicrokernelTester()
20357 .mr(2)
20358 .nr(4)
20359 .kr(8)
20360 .sr(1)
20361 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080020362 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020363 .k(k)
20364 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080020365 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020366 }
20367 }
20368 }
20369
20370 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, n_gt_4_strided_a) {
20371 TEST_REQUIRES_X86_XOP;
20372 for (uint32_t n = 5; n < 8; n++) {
20373 for (size_t k = 1; k <= 40; k += 9) {
20374 GemmMicrokernelTester()
20375 .mr(2)
20376 .nr(4)
20377 .kr(8)
20378 .sr(1)
20379 .m(2)
20380 .n(n)
20381 .k(k)
20382 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080020383 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020384 }
20385 }
20386 }
20387
20388 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, n_gt_4_subtile) {
20389 TEST_REQUIRES_X86_XOP;
20390 for (uint32_t n = 5; n < 8; n++) {
20391 for (size_t k = 1; k <= 40; k += 9) {
20392 for (uint32_t m = 1; m <= 2; m++) {
20393 GemmMicrokernelTester()
20394 .mr(2)
20395 .nr(4)
20396 .kr(8)
20397 .sr(1)
20398 .m(m)
20399 .n(n)
20400 .k(k)
20401 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080020402 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020403 }
20404 }
20405 }
20406 }
20407
20408 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, n_div_4) {
20409 TEST_REQUIRES_X86_XOP;
20410 for (uint32_t n = 8; n <= 12; n += 4) {
20411 for (size_t k = 1; k <= 40; k += 9) {
20412 GemmMicrokernelTester()
20413 .mr(2)
20414 .nr(4)
20415 .kr(8)
20416 .sr(1)
20417 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080020418 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020419 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080020420 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020421 }
20422 }
20423 }
20424
20425 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, n_div_4_strided_cn) {
20426 TEST_REQUIRES_X86_XOP;
20427 for (uint32_t n = 8; n <= 12; n += 4) {
20428 for (size_t k = 1; k <= 40; k += 9) {
20429 GemmMicrokernelTester()
20430 .mr(2)
20431 .nr(4)
20432 .kr(8)
20433 .sr(1)
20434 .m(2)
20435 .n(n)
20436 .k(k)
20437 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080020438 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020439 }
20440 }
20441 }
20442
20443 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, n_div_4_strided_a) {
20444 TEST_REQUIRES_X86_XOP;
20445 for (uint32_t n = 8; n <= 12; n += 4) {
20446 for (size_t k = 1; k <= 40; k += 9) {
20447 GemmMicrokernelTester()
20448 .mr(2)
20449 .nr(4)
20450 .kr(8)
20451 .sr(1)
20452 .m(2)
20453 .n(n)
20454 .k(k)
20455 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080020456 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020457 }
20458 }
20459 }
20460
20461 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, n_div_4_subtile) {
20462 TEST_REQUIRES_X86_XOP;
20463 for (uint32_t n = 8; n <= 12; n += 4) {
20464 for (size_t k = 1; k <= 40; k += 9) {
20465 for (uint32_t m = 1; m <= 2; m++) {
20466 GemmMicrokernelTester()
20467 .mr(2)
20468 .nr(4)
20469 .kr(8)
20470 .sr(1)
20471 .m(m)
20472 .n(n)
20473 .k(k)
20474 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080020475 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020476 }
20477 }
20478 }
20479 }
20480
20481 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, strided_cm_subtile) {
20482 TEST_REQUIRES_X86_XOP;
20483 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080020484 for (uint32_t n = 1; n <= 4; n++) {
20485 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020486 GemmMicrokernelTester()
20487 .mr(2)
20488 .nr(4)
20489 .kr(8)
20490 .sr(1)
20491 .m(m)
20492 .n(n)
20493 .k(k)
20494 .cm_stride(7)
20495 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080020496 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020497 }
20498 }
20499 }
20500 }
20501
20502 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, qmin) {
20503 TEST_REQUIRES_X86_XOP;
20504 GemmMicrokernelTester()
20505 .mr(2)
20506 .nr(4)
20507 .kr(8)
20508 .sr(1)
20509 .m(2)
20510 .n(4)
20511 .k(8)
20512 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080020513 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020514 }
20515
20516 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, qmax) {
20517 TEST_REQUIRES_X86_XOP;
20518 GemmMicrokernelTester()
20519 .mr(2)
20520 .nr(4)
20521 .kr(8)
20522 .sr(1)
20523 .m(2)
20524 .n(4)
20525 .k(8)
20526 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080020527 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020528 }
20529
20530 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, strided_cm) {
20531 TEST_REQUIRES_X86_XOP;
20532 GemmMicrokernelTester()
20533 .mr(2)
20534 .nr(4)
20535 .kr(8)
20536 .sr(1)
20537 .m(2)
20538 .n(4)
20539 .k(8)
20540 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080020541 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020542 }
20543#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
20544
20545
20546#if XNN_ARCH_X86 || XNN_ARCH_X86_64
20547 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AVX2, k_eq_8) {
20548 TEST_REQUIRES_X86_AVX2;
20549 GemmMicrokernelTester()
20550 .mr(2)
20551 .nr(8)
20552 .kr(8)
20553 .sr(1)
20554 .m(2)
20555 .n(8)
20556 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080020557 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020558 }
20559
20560 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AVX2, strided_cn) {
20561 TEST_REQUIRES_X86_AVX2;
20562 GemmMicrokernelTester()
20563 .mr(2)
20564 .nr(8)
20565 .kr(8)
20566 .sr(1)
20567 .m(2)
20568 .n(8)
20569 .k(8)
20570 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080020571 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020572 }
20573
20574 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AVX2, k_eq_8_strided_a) {
20575 TEST_REQUIRES_X86_AVX2;
20576 GemmMicrokernelTester()
20577 .mr(2)
20578 .nr(8)
20579 .kr(8)
20580 .sr(1)
20581 .m(2)
20582 .n(8)
20583 .k(8)
20584 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080020585 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020586 }
20587
20588 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AVX2, k_eq_8_subtile) {
20589 TEST_REQUIRES_X86_AVX2;
Zhi An Ng83844ae2022-01-14 09:52:25 -080020590 for (uint32_t n = 1; n <= 8; n++) {
20591 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020592 GemmMicrokernelTester()
20593 .mr(2)
20594 .nr(8)
20595 .kr(8)
20596 .sr(1)
20597 .m(m)
20598 .n(n)
20599 .k(8)
20600 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080020601 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020602 }
20603 }
20604 }
20605
20606 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AVX2, k_eq_8_subtile_m) {
20607 TEST_REQUIRES_X86_AVX2;
20608 for (uint32_t m = 1; m <= 2; m++) {
20609 GemmMicrokernelTester()
20610 .mr(2)
20611 .nr(8)
20612 .kr(8)
20613 .sr(1)
20614 .m(m)
20615 .n(8)
20616 .k(8)
20617 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080020618 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020619 }
20620 }
20621
20622 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AVX2, k_eq_8_subtile_n) {
20623 TEST_REQUIRES_X86_AVX2;
20624 for (uint32_t n = 1; n <= 8; n++) {
20625 GemmMicrokernelTester()
20626 .mr(2)
20627 .nr(8)
20628 .kr(8)
20629 .sr(1)
20630 .m(2)
20631 .n(n)
20632 .k(8)
20633 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080020634 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020635 }
20636 }
20637
20638 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AVX2, k_lt_8) {
20639 TEST_REQUIRES_X86_AVX2;
20640 for (size_t k = 1; k < 8; k++) {
20641 GemmMicrokernelTester()
20642 .mr(2)
20643 .nr(8)
20644 .kr(8)
20645 .sr(1)
20646 .m(2)
20647 .n(8)
20648 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080020649 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020650 }
20651 }
20652
20653 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AVX2, k_lt_8_strided_a) {
20654 TEST_REQUIRES_X86_AVX2;
20655 for (size_t k = 1; k < 8; k++) {
20656 GemmMicrokernelTester()
20657 .mr(2)
20658 .nr(8)
20659 .kr(8)
20660 .sr(1)
20661 .m(2)
20662 .n(8)
20663 .k(k)
20664 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080020665 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020666 }
20667 }
20668
20669 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AVX2, k_lt_8_subtile) {
20670 TEST_REQUIRES_X86_AVX2;
20671 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080020672 for (uint32_t n = 1; n <= 8; n++) {
20673 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020674 GemmMicrokernelTester()
20675 .mr(2)
20676 .nr(8)
20677 .kr(8)
20678 .sr(1)
20679 .m(m)
20680 .n(n)
20681 .k(k)
20682 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080020683 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020684 }
20685 }
20686 }
20687 }
20688
20689 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AVX2, k_gt_8) {
20690 TEST_REQUIRES_X86_AVX2;
20691 for (size_t k = 9; k < 16; k++) {
20692 GemmMicrokernelTester()
20693 .mr(2)
20694 .nr(8)
20695 .kr(8)
20696 .sr(1)
20697 .m(2)
20698 .n(8)
20699 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080020700 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020701 }
20702 }
20703
20704 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AVX2, k_gt_8_strided_a) {
20705 TEST_REQUIRES_X86_AVX2;
20706 for (size_t k = 9; k < 16; k++) {
20707 GemmMicrokernelTester()
20708 .mr(2)
20709 .nr(8)
20710 .kr(8)
20711 .sr(1)
20712 .m(2)
20713 .n(8)
20714 .k(k)
20715 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080020716 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020717 }
20718 }
20719
20720 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AVX2, k_gt_8_subtile) {
20721 TEST_REQUIRES_X86_AVX2;
20722 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080020723 for (uint32_t n = 1; n <= 8; n++) {
20724 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020725 GemmMicrokernelTester()
20726 .mr(2)
20727 .nr(8)
20728 .kr(8)
20729 .sr(1)
20730 .m(m)
20731 .n(n)
20732 .k(k)
20733 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080020734 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020735 }
20736 }
20737 }
20738 }
20739
20740 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AVX2, k_div_8) {
20741 TEST_REQUIRES_X86_AVX2;
20742 for (size_t k = 16; k <= 80; k += 8) {
20743 GemmMicrokernelTester()
20744 .mr(2)
20745 .nr(8)
20746 .kr(8)
20747 .sr(1)
20748 .m(2)
20749 .n(8)
20750 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080020751 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020752 }
20753 }
20754
20755 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AVX2, k_div_8_strided_a) {
20756 TEST_REQUIRES_X86_AVX2;
20757 for (size_t k = 16; k <= 80; k += 8) {
20758 GemmMicrokernelTester()
20759 .mr(2)
20760 .nr(8)
20761 .kr(8)
20762 .sr(1)
20763 .m(2)
20764 .n(8)
20765 .k(k)
20766 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080020767 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020768 }
20769 }
20770
20771 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AVX2, k_div_8_subtile) {
20772 TEST_REQUIRES_X86_AVX2;
20773 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080020774 for (uint32_t n = 1; n <= 8; n++) {
20775 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020776 GemmMicrokernelTester()
20777 .mr(2)
20778 .nr(8)
20779 .kr(8)
20780 .sr(1)
20781 .m(m)
20782 .n(n)
20783 .k(k)
20784 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080020785 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020786 }
20787 }
20788 }
20789 }
20790
20791 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AVX2, n_gt_8) {
20792 TEST_REQUIRES_X86_AVX2;
20793 for (uint32_t n = 9; n < 16; n++) {
20794 for (size_t k = 1; k <= 40; k += 9) {
20795 GemmMicrokernelTester()
20796 .mr(2)
20797 .nr(8)
20798 .kr(8)
20799 .sr(1)
20800 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080020801 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020802 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080020803 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020804 }
20805 }
20806 }
20807
20808 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AVX2, n_gt_8_strided_cn) {
20809 TEST_REQUIRES_X86_AVX2;
20810 for (uint32_t n = 9; n < 16; n++) {
20811 for (size_t k = 1; k <= 40; k += 9) {
20812 GemmMicrokernelTester()
20813 .mr(2)
20814 .nr(8)
20815 .kr(8)
20816 .sr(1)
20817 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080020818 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020819 .k(k)
20820 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080020821 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020822 }
20823 }
20824 }
20825
20826 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AVX2, n_gt_8_strided_a) {
20827 TEST_REQUIRES_X86_AVX2;
20828 for (uint32_t n = 9; n < 16; n++) {
20829 for (size_t k = 1; k <= 40; k += 9) {
20830 GemmMicrokernelTester()
20831 .mr(2)
20832 .nr(8)
20833 .kr(8)
20834 .sr(1)
20835 .m(2)
20836 .n(n)
20837 .k(k)
20838 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080020839 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020840 }
20841 }
20842 }
20843
20844 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AVX2, n_gt_8_subtile) {
20845 TEST_REQUIRES_X86_AVX2;
20846 for (uint32_t n = 9; n < 16; n++) {
20847 for (size_t k = 1; k <= 40; k += 9) {
20848 for (uint32_t m = 1; m <= 2; m++) {
20849 GemmMicrokernelTester()
20850 .mr(2)
20851 .nr(8)
20852 .kr(8)
20853 .sr(1)
20854 .m(m)
20855 .n(n)
20856 .k(k)
20857 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080020858 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020859 }
20860 }
20861 }
20862 }
20863
20864 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AVX2, n_div_8) {
20865 TEST_REQUIRES_X86_AVX2;
20866 for (uint32_t n = 16; n <= 24; n += 8) {
20867 for (size_t k = 1; k <= 40; k += 9) {
20868 GemmMicrokernelTester()
20869 .mr(2)
20870 .nr(8)
20871 .kr(8)
20872 .sr(1)
20873 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080020874 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020875 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080020876 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020877 }
20878 }
20879 }
20880
20881 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AVX2, n_div_8_strided_cn) {
20882 TEST_REQUIRES_X86_AVX2;
20883 for (uint32_t n = 16; n <= 24; n += 8) {
20884 for (size_t k = 1; k <= 40; k += 9) {
20885 GemmMicrokernelTester()
20886 .mr(2)
20887 .nr(8)
20888 .kr(8)
20889 .sr(1)
20890 .m(2)
20891 .n(n)
20892 .k(k)
20893 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080020894 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020895 }
20896 }
20897 }
20898
20899 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AVX2, n_div_8_strided_a) {
20900 TEST_REQUIRES_X86_AVX2;
20901 for (uint32_t n = 16; n <= 24; n += 8) {
20902 for (size_t k = 1; k <= 40; k += 9) {
20903 GemmMicrokernelTester()
20904 .mr(2)
20905 .nr(8)
20906 .kr(8)
20907 .sr(1)
20908 .m(2)
20909 .n(n)
20910 .k(k)
20911 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080020912 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020913 }
20914 }
20915 }
20916
20917 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AVX2, n_div_8_subtile) {
20918 TEST_REQUIRES_X86_AVX2;
20919 for (uint32_t n = 16; n <= 24; n += 8) {
20920 for (size_t k = 1; k <= 40; k += 9) {
20921 for (uint32_t m = 1; m <= 2; m++) {
20922 GemmMicrokernelTester()
20923 .mr(2)
20924 .nr(8)
20925 .kr(8)
20926 .sr(1)
20927 .m(m)
20928 .n(n)
20929 .k(k)
20930 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080020931 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020932 }
20933 }
20934 }
20935 }
20936
20937 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AVX2, strided_cm_subtile) {
20938 TEST_REQUIRES_X86_AVX2;
20939 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080020940 for (uint32_t n = 1; n <= 8; n++) {
20941 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020942 GemmMicrokernelTester()
20943 .mr(2)
20944 .nr(8)
20945 .kr(8)
20946 .sr(1)
20947 .m(m)
20948 .n(n)
20949 .k(k)
20950 .cm_stride(11)
20951 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080020952 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020953 }
20954 }
20955 }
20956 }
20957
20958 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AVX2, qmin) {
20959 TEST_REQUIRES_X86_AVX2;
20960 GemmMicrokernelTester()
20961 .mr(2)
20962 .nr(8)
20963 .kr(8)
20964 .sr(1)
20965 .m(2)
20966 .n(8)
20967 .k(8)
20968 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080020969 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020970 }
20971
20972 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AVX2, qmax) {
20973 TEST_REQUIRES_X86_AVX2;
20974 GemmMicrokernelTester()
20975 .mr(2)
20976 .nr(8)
20977 .kr(8)
20978 .sr(1)
20979 .m(2)
20980 .n(8)
20981 .k(8)
20982 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080020983 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020984 }
20985
20986 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AVX2, strided_cm) {
20987 TEST_REQUIRES_X86_AVX2;
20988 GemmMicrokernelTester()
20989 .mr(2)
20990 .nr(8)
20991 .kr(8)
20992 .sr(1)
20993 .m(2)
20994 .n(8)
20995 .k(8)
20996 .cm_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080020997 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080020998 }
20999#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
21000
21001
21002#if XNN_ARCH_X86 || XNN_ARCH_X86_64
21003 TEST(QC8_GEMM_XW_MINMAX_FP32_1X8C8__AVX2, k_eq_8) {
21004 TEST_REQUIRES_X86_AVX2;
21005 GemmMicrokernelTester()
21006 .extended_weights(true)
21007 .mr(1)
21008 .nr(8)
21009 .kr(8)
21010 .sr(1)
21011 .m(1)
21012 .n(8)
21013 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080021014 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021015 }
21016
21017 TEST(QC8_GEMM_XW_MINMAX_FP32_1X8C8__AVX2, strided_cn) {
21018 TEST_REQUIRES_X86_AVX2;
21019 GemmMicrokernelTester()
21020 .extended_weights(true)
21021 .mr(1)
21022 .nr(8)
21023 .kr(8)
21024 .sr(1)
21025 .m(1)
21026 .n(8)
21027 .k(8)
21028 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080021029 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021030 }
21031
21032 TEST(QC8_GEMM_XW_MINMAX_FP32_1X8C8__AVX2, k_eq_8_strided_a) {
21033 TEST_REQUIRES_X86_AVX2;
21034 GemmMicrokernelTester()
21035 .extended_weights(true)
21036 .mr(1)
21037 .nr(8)
21038 .kr(8)
21039 .sr(1)
21040 .m(1)
21041 .n(8)
21042 .k(8)
21043 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080021044 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021045 }
21046
21047 TEST(QC8_GEMM_XW_MINMAX_FP32_1X8C8__AVX2, k_eq_8_subtile) {
21048 TEST_REQUIRES_X86_AVX2;
Zhi An Ng83844ae2022-01-14 09:52:25 -080021049 for (uint32_t n = 1; n <= 8; n++) {
21050 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021051 GemmMicrokernelTester()
21052 .extended_weights(true)
21053 .mr(1)
21054 .nr(8)
21055 .kr(8)
21056 .sr(1)
21057 .m(m)
21058 .n(n)
21059 .k(8)
21060 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080021061 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021062 }
21063 }
21064 }
21065
21066 TEST(QC8_GEMM_XW_MINMAX_FP32_1X8C8__AVX2, k_eq_8_subtile_m) {
21067 TEST_REQUIRES_X86_AVX2;
21068 for (uint32_t m = 1; m <= 1; m++) {
21069 GemmMicrokernelTester()
21070 .extended_weights(true)
21071 .mr(1)
21072 .nr(8)
21073 .kr(8)
21074 .sr(1)
21075 .m(m)
21076 .n(8)
21077 .k(8)
21078 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080021079 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021080 }
21081 }
21082
21083 TEST(QC8_GEMM_XW_MINMAX_FP32_1X8C8__AVX2, k_eq_8_subtile_n) {
21084 TEST_REQUIRES_X86_AVX2;
21085 for (uint32_t n = 1; n <= 8; n++) {
21086 GemmMicrokernelTester()
21087 .extended_weights(true)
21088 .mr(1)
21089 .nr(8)
21090 .kr(8)
21091 .sr(1)
21092 .m(1)
21093 .n(n)
21094 .k(8)
21095 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080021096 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021097 }
21098 }
21099
21100 TEST(QC8_GEMM_XW_MINMAX_FP32_1X8C8__AVX2, k_lt_8) {
21101 TEST_REQUIRES_X86_AVX2;
21102 for (size_t k = 1; k < 8; k++) {
21103 GemmMicrokernelTester()
21104 .extended_weights(true)
21105 .mr(1)
21106 .nr(8)
21107 .kr(8)
21108 .sr(1)
21109 .m(1)
21110 .n(8)
21111 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080021112 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021113 }
21114 }
21115
21116 TEST(QC8_GEMM_XW_MINMAX_FP32_1X8C8__AVX2, k_lt_8_strided_a) {
21117 TEST_REQUIRES_X86_AVX2;
21118 for (size_t k = 1; k < 8; k++) {
21119 GemmMicrokernelTester()
21120 .extended_weights(true)
21121 .mr(1)
21122 .nr(8)
21123 .kr(8)
21124 .sr(1)
21125 .m(1)
21126 .n(8)
21127 .k(k)
21128 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080021129 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021130 }
21131 }
21132
21133 TEST(QC8_GEMM_XW_MINMAX_FP32_1X8C8__AVX2, k_lt_8_subtile) {
21134 TEST_REQUIRES_X86_AVX2;
21135 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080021136 for (uint32_t n = 1; n <= 8; n++) {
21137 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021138 GemmMicrokernelTester()
21139 .extended_weights(true)
21140 .mr(1)
21141 .nr(8)
21142 .kr(8)
21143 .sr(1)
21144 .m(m)
21145 .n(n)
21146 .k(k)
21147 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080021148 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021149 }
21150 }
21151 }
21152 }
21153
21154 TEST(QC8_GEMM_XW_MINMAX_FP32_1X8C8__AVX2, k_gt_8) {
21155 TEST_REQUIRES_X86_AVX2;
21156 for (size_t k = 9; k < 16; k++) {
21157 GemmMicrokernelTester()
21158 .extended_weights(true)
21159 .mr(1)
21160 .nr(8)
21161 .kr(8)
21162 .sr(1)
21163 .m(1)
21164 .n(8)
21165 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080021166 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021167 }
21168 }
21169
21170 TEST(QC8_GEMM_XW_MINMAX_FP32_1X8C8__AVX2, k_gt_8_strided_a) {
21171 TEST_REQUIRES_X86_AVX2;
21172 for (size_t k = 9; k < 16; k++) {
21173 GemmMicrokernelTester()
21174 .extended_weights(true)
21175 .mr(1)
21176 .nr(8)
21177 .kr(8)
21178 .sr(1)
21179 .m(1)
21180 .n(8)
21181 .k(k)
21182 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080021183 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021184 }
21185 }
21186
21187 TEST(QC8_GEMM_XW_MINMAX_FP32_1X8C8__AVX2, k_gt_8_subtile) {
21188 TEST_REQUIRES_X86_AVX2;
21189 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080021190 for (uint32_t n = 1; n <= 8; n++) {
21191 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021192 GemmMicrokernelTester()
21193 .extended_weights(true)
21194 .mr(1)
21195 .nr(8)
21196 .kr(8)
21197 .sr(1)
21198 .m(m)
21199 .n(n)
21200 .k(k)
21201 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080021202 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021203 }
21204 }
21205 }
21206 }
21207
21208 TEST(QC8_GEMM_XW_MINMAX_FP32_1X8C8__AVX2, k_div_8) {
21209 TEST_REQUIRES_X86_AVX2;
21210 for (size_t k = 16; k <= 80; k += 8) {
21211 GemmMicrokernelTester()
21212 .extended_weights(true)
21213 .mr(1)
21214 .nr(8)
21215 .kr(8)
21216 .sr(1)
21217 .m(1)
21218 .n(8)
21219 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080021220 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021221 }
21222 }
21223
21224 TEST(QC8_GEMM_XW_MINMAX_FP32_1X8C8__AVX2, k_div_8_strided_a) {
21225 TEST_REQUIRES_X86_AVX2;
21226 for (size_t k = 16; k <= 80; k += 8) {
21227 GemmMicrokernelTester()
21228 .extended_weights(true)
21229 .mr(1)
21230 .nr(8)
21231 .kr(8)
21232 .sr(1)
21233 .m(1)
21234 .n(8)
21235 .k(k)
21236 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080021237 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021238 }
21239 }
21240
21241 TEST(QC8_GEMM_XW_MINMAX_FP32_1X8C8__AVX2, k_div_8_subtile) {
21242 TEST_REQUIRES_X86_AVX2;
21243 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080021244 for (uint32_t n = 1; n <= 8; n++) {
21245 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021246 GemmMicrokernelTester()
21247 .extended_weights(true)
21248 .mr(1)
21249 .nr(8)
21250 .kr(8)
21251 .sr(1)
21252 .m(m)
21253 .n(n)
21254 .k(k)
21255 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080021256 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021257 }
21258 }
21259 }
21260 }
21261
21262 TEST(QC8_GEMM_XW_MINMAX_FP32_1X8C8__AVX2, n_gt_8) {
21263 TEST_REQUIRES_X86_AVX2;
21264 for (uint32_t n = 9; n < 16; n++) {
21265 for (size_t k = 1; k <= 40; k += 9) {
21266 GemmMicrokernelTester()
21267 .extended_weights(true)
21268 .mr(1)
21269 .nr(8)
21270 .kr(8)
21271 .sr(1)
21272 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080021273 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021274 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080021275 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021276 }
21277 }
21278 }
21279
21280 TEST(QC8_GEMM_XW_MINMAX_FP32_1X8C8__AVX2, n_gt_8_strided_cn) {
21281 TEST_REQUIRES_X86_AVX2;
21282 for (uint32_t n = 9; n < 16; n++) {
21283 for (size_t k = 1; k <= 40; k += 9) {
21284 GemmMicrokernelTester()
21285 .extended_weights(true)
21286 .mr(1)
21287 .nr(8)
21288 .kr(8)
21289 .sr(1)
21290 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080021291 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021292 .k(k)
21293 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080021294 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021295 }
21296 }
21297 }
21298
21299 TEST(QC8_GEMM_XW_MINMAX_FP32_1X8C8__AVX2, n_gt_8_strided_a) {
21300 TEST_REQUIRES_X86_AVX2;
21301 for (uint32_t n = 9; n < 16; n++) {
21302 for (size_t k = 1; k <= 40; k += 9) {
21303 GemmMicrokernelTester()
21304 .extended_weights(true)
21305 .mr(1)
21306 .nr(8)
21307 .kr(8)
21308 .sr(1)
21309 .m(1)
21310 .n(n)
21311 .k(k)
21312 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080021313 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021314 }
21315 }
21316 }
21317
21318 TEST(QC8_GEMM_XW_MINMAX_FP32_1X8C8__AVX2, n_gt_8_subtile) {
21319 TEST_REQUIRES_X86_AVX2;
21320 for (uint32_t n = 9; n < 16; n++) {
21321 for (size_t k = 1; k <= 40; k += 9) {
21322 for (uint32_t m = 1; m <= 1; m++) {
21323 GemmMicrokernelTester()
21324 .extended_weights(true)
21325 .mr(1)
21326 .nr(8)
21327 .kr(8)
21328 .sr(1)
21329 .m(m)
21330 .n(n)
21331 .k(k)
21332 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080021333 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021334 }
21335 }
21336 }
21337 }
21338
21339 TEST(QC8_GEMM_XW_MINMAX_FP32_1X8C8__AVX2, n_div_8) {
21340 TEST_REQUIRES_X86_AVX2;
21341 for (uint32_t n = 16; n <= 24; n += 8) {
21342 for (size_t k = 1; k <= 40; k += 9) {
21343 GemmMicrokernelTester()
21344 .extended_weights(true)
21345 .mr(1)
21346 .nr(8)
21347 .kr(8)
21348 .sr(1)
21349 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080021350 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021351 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080021352 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021353 }
21354 }
21355 }
21356
21357 TEST(QC8_GEMM_XW_MINMAX_FP32_1X8C8__AVX2, n_div_8_strided_cn) {
21358 TEST_REQUIRES_X86_AVX2;
21359 for (uint32_t n = 16; n <= 24; n += 8) {
21360 for (size_t k = 1; k <= 40; k += 9) {
21361 GemmMicrokernelTester()
21362 .extended_weights(true)
21363 .mr(1)
21364 .nr(8)
21365 .kr(8)
21366 .sr(1)
21367 .m(1)
21368 .n(n)
21369 .k(k)
21370 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080021371 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021372 }
21373 }
21374 }
21375
21376 TEST(QC8_GEMM_XW_MINMAX_FP32_1X8C8__AVX2, n_div_8_strided_a) {
21377 TEST_REQUIRES_X86_AVX2;
21378 for (uint32_t n = 16; n <= 24; n += 8) {
21379 for (size_t k = 1; k <= 40; k += 9) {
21380 GemmMicrokernelTester()
21381 .extended_weights(true)
21382 .mr(1)
21383 .nr(8)
21384 .kr(8)
21385 .sr(1)
21386 .m(1)
21387 .n(n)
21388 .k(k)
21389 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080021390 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021391 }
21392 }
21393 }
21394
21395 TEST(QC8_GEMM_XW_MINMAX_FP32_1X8C8__AVX2, n_div_8_subtile) {
21396 TEST_REQUIRES_X86_AVX2;
21397 for (uint32_t n = 16; n <= 24; n += 8) {
21398 for (size_t k = 1; k <= 40; k += 9) {
21399 for (uint32_t m = 1; m <= 1; m++) {
21400 GemmMicrokernelTester()
21401 .extended_weights(true)
21402 .mr(1)
21403 .nr(8)
21404 .kr(8)
21405 .sr(1)
21406 .m(m)
21407 .n(n)
21408 .k(k)
21409 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080021410 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021411 }
21412 }
21413 }
21414 }
21415
21416 TEST(QC8_GEMM_XW_MINMAX_FP32_1X8C8__AVX2, strided_cm_subtile) {
21417 TEST_REQUIRES_X86_AVX2;
21418 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080021419 for (uint32_t n = 1; n <= 8; n++) {
21420 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021421 GemmMicrokernelTester()
21422 .extended_weights(true)
21423 .mr(1)
21424 .nr(8)
21425 .kr(8)
21426 .sr(1)
21427 .m(m)
21428 .n(n)
21429 .k(k)
21430 .cm_stride(11)
21431 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080021432 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021433 }
21434 }
21435 }
21436 }
21437
21438 TEST(QC8_GEMM_XW_MINMAX_FP32_1X8C8__AVX2, strided_cm) {
21439 TEST_REQUIRES_X86_AVX2;
21440 GemmMicrokernelTester()
21441 .extended_weights(true)
21442 .mr(1)
21443 .nr(8)
21444 .kr(8)
21445 .sr(1)
21446 .m(1)
21447 .n(8)
21448 .k(8)
21449 .cm_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080021450 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021451 }
21452#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
21453
21454
21455#if XNN_ARCH_X86 || XNN_ARCH_X86_64
21456 TEST(QC8_GEMM_XW_MINMAX_FP32_3X8C8__AVX2, k_eq_8) {
21457 TEST_REQUIRES_X86_AVX2;
21458 GemmMicrokernelTester()
21459 .extended_weights(true)
21460 .mr(3)
21461 .nr(8)
21462 .kr(8)
21463 .sr(1)
21464 .m(3)
21465 .n(8)
21466 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080021467 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021468 }
21469
21470 TEST(QC8_GEMM_XW_MINMAX_FP32_3X8C8__AVX2, strided_cn) {
21471 TEST_REQUIRES_X86_AVX2;
21472 GemmMicrokernelTester()
21473 .extended_weights(true)
21474 .mr(3)
21475 .nr(8)
21476 .kr(8)
21477 .sr(1)
21478 .m(3)
21479 .n(8)
21480 .k(8)
21481 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080021482 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021483 }
21484
21485 TEST(QC8_GEMM_XW_MINMAX_FP32_3X8C8__AVX2, k_eq_8_strided_a) {
21486 TEST_REQUIRES_X86_AVX2;
21487 GemmMicrokernelTester()
21488 .extended_weights(true)
21489 .mr(3)
21490 .nr(8)
21491 .kr(8)
21492 .sr(1)
21493 .m(3)
21494 .n(8)
21495 .k(8)
21496 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080021497 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021498 }
21499
21500 TEST(QC8_GEMM_XW_MINMAX_FP32_3X8C8__AVX2, k_eq_8_subtile) {
21501 TEST_REQUIRES_X86_AVX2;
Zhi An Ng83844ae2022-01-14 09:52:25 -080021502 for (uint32_t n = 1; n <= 8; n++) {
21503 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021504 GemmMicrokernelTester()
21505 .extended_weights(true)
21506 .mr(3)
21507 .nr(8)
21508 .kr(8)
21509 .sr(1)
21510 .m(m)
21511 .n(n)
21512 .k(8)
21513 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080021514 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021515 }
21516 }
21517 }
21518
21519 TEST(QC8_GEMM_XW_MINMAX_FP32_3X8C8__AVX2, k_eq_8_subtile_m) {
21520 TEST_REQUIRES_X86_AVX2;
21521 for (uint32_t m = 1; m <= 3; m++) {
21522 GemmMicrokernelTester()
21523 .extended_weights(true)
21524 .mr(3)
21525 .nr(8)
21526 .kr(8)
21527 .sr(1)
21528 .m(m)
21529 .n(8)
21530 .k(8)
21531 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080021532 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021533 }
21534 }
21535
21536 TEST(QC8_GEMM_XW_MINMAX_FP32_3X8C8__AVX2, k_eq_8_subtile_n) {
21537 TEST_REQUIRES_X86_AVX2;
21538 for (uint32_t n = 1; n <= 8; n++) {
21539 GemmMicrokernelTester()
21540 .extended_weights(true)
21541 .mr(3)
21542 .nr(8)
21543 .kr(8)
21544 .sr(1)
21545 .m(3)
21546 .n(n)
21547 .k(8)
21548 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080021549 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021550 }
21551 }
21552
21553 TEST(QC8_GEMM_XW_MINMAX_FP32_3X8C8__AVX2, k_lt_8) {
21554 TEST_REQUIRES_X86_AVX2;
21555 for (size_t k = 1; k < 8; k++) {
21556 GemmMicrokernelTester()
21557 .extended_weights(true)
21558 .mr(3)
21559 .nr(8)
21560 .kr(8)
21561 .sr(1)
21562 .m(3)
21563 .n(8)
21564 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080021565 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021566 }
21567 }
21568
21569 TEST(QC8_GEMM_XW_MINMAX_FP32_3X8C8__AVX2, k_lt_8_strided_a) {
21570 TEST_REQUIRES_X86_AVX2;
21571 for (size_t k = 1; k < 8; k++) {
21572 GemmMicrokernelTester()
21573 .extended_weights(true)
21574 .mr(3)
21575 .nr(8)
21576 .kr(8)
21577 .sr(1)
21578 .m(3)
21579 .n(8)
21580 .k(k)
21581 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080021582 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021583 }
21584 }
21585
21586 TEST(QC8_GEMM_XW_MINMAX_FP32_3X8C8__AVX2, k_lt_8_subtile) {
21587 TEST_REQUIRES_X86_AVX2;
21588 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080021589 for (uint32_t n = 1; n <= 8; n++) {
21590 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021591 GemmMicrokernelTester()
21592 .extended_weights(true)
21593 .mr(3)
21594 .nr(8)
21595 .kr(8)
21596 .sr(1)
21597 .m(m)
21598 .n(n)
21599 .k(k)
21600 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080021601 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021602 }
21603 }
21604 }
21605 }
21606
21607 TEST(QC8_GEMM_XW_MINMAX_FP32_3X8C8__AVX2, k_gt_8) {
21608 TEST_REQUIRES_X86_AVX2;
21609 for (size_t k = 9; k < 16; k++) {
21610 GemmMicrokernelTester()
21611 .extended_weights(true)
21612 .mr(3)
21613 .nr(8)
21614 .kr(8)
21615 .sr(1)
21616 .m(3)
21617 .n(8)
21618 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080021619 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021620 }
21621 }
21622
21623 TEST(QC8_GEMM_XW_MINMAX_FP32_3X8C8__AVX2, k_gt_8_strided_a) {
21624 TEST_REQUIRES_X86_AVX2;
21625 for (size_t k = 9; k < 16; k++) {
21626 GemmMicrokernelTester()
21627 .extended_weights(true)
21628 .mr(3)
21629 .nr(8)
21630 .kr(8)
21631 .sr(1)
21632 .m(3)
21633 .n(8)
21634 .k(k)
21635 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080021636 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021637 }
21638 }
21639
21640 TEST(QC8_GEMM_XW_MINMAX_FP32_3X8C8__AVX2, k_gt_8_subtile) {
21641 TEST_REQUIRES_X86_AVX2;
21642 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080021643 for (uint32_t n = 1; n <= 8; n++) {
21644 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021645 GemmMicrokernelTester()
21646 .extended_weights(true)
21647 .mr(3)
21648 .nr(8)
21649 .kr(8)
21650 .sr(1)
21651 .m(m)
21652 .n(n)
21653 .k(k)
21654 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080021655 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021656 }
21657 }
21658 }
21659 }
21660
21661 TEST(QC8_GEMM_XW_MINMAX_FP32_3X8C8__AVX2, k_div_8) {
21662 TEST_REQUIRES_X86_AVX2;
21663 for (size_t k = 16; k <= 80; k += 8) {
21664 GemmMicrokernelTester()
21665 .extended_weights(true)
21666 .mr(3)
21667 .nr(8)
21668 .kr(8)
21669 .sr(1)
21670 .m(3)
21671 .n(8)
21672 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080021673 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021674 }
21675 }
21676
21677 TEST(QC8_GEMM_XW_MINMAX_FP32_3X8C8__AVX2, k_div_8_strided_a) {
21678 TEST_REQUIRES_X86_AVX2;
21679 for (size_t k = 16; k <= 80; k += 8) {
21680 GemmMicrokernelTester()
21681 .extended_weights(true)
21682 .mr(3)
21683 .nr(8)
21684 .kr(8)
21685 .sr(1)
21686 .m(3)
21687 .n(8)
21688 .k(k)
21689 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080021690 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021691 }
21692 }
21693
21694 TEST(QC8_GEMM_XW_MINMAX_FP32_3X8C8__AVX2, k_div_8_subtile) {
21695 TEST_REQUIRES_X86_AVX2;
21696 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080021697 for (uint32_t n = 1; n <= 8; n++) {
21698 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021699 GemmMicrokernelTester()
21700 .extended_weights(true)
21701 .mr(3)
21702 .nr(8)
21703 .kr(8)
21704 .sr(1)
21705 .m(m)
21706 .n(n)
21707 .k(k)
21708 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080021709 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021710 }
21711 }
21712 }
21713 }
21714
21715 TEST(QC8_GEMM_XW_MINMAX_FP32_3X8C8__AVX2, n_gt_8) {
21716 TEST_REQUIRES_X86_AVX2;
21717 for (uint32_t n = 9; n < 16; n++) {
21718 for (size_t k = 1; k <= 40; k += 9) {
21719 GemmMicrokernelTester()
21720 .extended_weights(true)
21721 .mr(3)
21722 .nr(8)
21723 .kr(8)
21724 .sr(1)
21725 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080021726 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021727 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080021728 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021729 }
21730 }
21731 }
21732
21733 TEST(QC8_GEMM_XW_MINMAX_FP32_3X8C8__AVX2, n_gt_8_strided_cn) {
21734 TEST_REQUIRES_X86_AVX2;
21735 for (uint32_t n = 9; n < 16; n++) {
21736 for (size_t k = 1; k <= 40; k += 9) {
21737 GemmMicrokernelTester()
21738 .extended_weights(true)
21739 .mr(3)
21740 .nr(8)
21741 .kr(8)
21742 .sr(1)
21743 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080021744 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021745 .k(k)
21746 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080021747 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021748 }
21749 }
21750 }
21751
21752 TEST(QC8_GEMM_XW_MINMAX_FP32_3X8C8__AVX2, n_gt_8_strided_a) {
21753 TEST_REQUIRES_X86_AVX2;
21754 for (uint32_t n = 9; n < 16; n++) {
21755 for (size_t k = 1; k <= 40; k += 9) {
21756 GemmMicrokernelTester()
21757 .extended_weights(true)
21758 .mr(3)
21759 .nr(8)
21760 .kr(8)
21761 .sr(1)
21762 .m(3)
21763 .n(n)
21764 .k(k)
21765 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080021766 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021767 }
21768 }
21769 }
21770
21771 TEST(QC8_GEMM_XW_MINMAX_FP32_3X8C8__AVX2, n_gt_8_subtile) {
21772 TEST_REQUIRES_X86_AVX2;
21773 for (uint32_t n = 9; n < 16; n++) {
21774 for (size_t k = 1; k <= 40; k += 9) {
21775 for (uint32_t m = 1; m <= 3; m++) {
21776 GemmMicrokernelTester()
21777 .extended_weights(true)
21778 .mr(3)
21779 .nr(8)
21780 .kr(8)
21781 .sr(1)
21782 .m(m)
21783 .n(n)
21784 .k(k)
21785 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080021786 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021787 }
21788 }
21789 }
21790 }
21791
21792 TEST(QC8_GEMM_XW_MINMAX_FP32_3X8C8__AVX2, n_div_8) {
21793 TEST_REQUIRES_X86_AVX2;
21794 for (uint32_t n = 16; n <= 24; n += 8) {
21795 for (size_t k = 1; k <= 40; k += 9) {
21796 GemmMicrokernelTester()
21797 .extended_weights(true)
21798 .mr(3)
21799 .nr(8)
21800 .kr(8)
21801 .sr(1)
21802 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080021803 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021804 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080021805 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021806 }
21807 }
21808 }
21809
21810 TEST(QC8_GEMM_XW_MINMAX_FP32_3X8C8__AVX2, n_div_8_strided_cn) {
21811 TEST_REQUIRES_X86_AVX2;
21812 for (uint32_t n = 16; n <= 24; n += 8) {
21813 for (size_t k = 1; k <= 40; k += 9) {
21814 GemmMicrokernelTester()
21815 .extended_weights(true)
21816 .mr(3)
21817 .nr(8)
21818 .kr(8)
21819 .sr(1)
21820 .m(3)
21821 .n(n)
21822 .k(k)
21823 .cn_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080021824 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021825 }
21826 }
21827 }
21828
21829 TEST(QC8_GEMM_XW_MINMAX_FP32_3X8C8__AVX2, n_div_8_strided_a) {
21830 TEST_REQUIRES_X86_AVX2;
21831 for (uint32_t n = 16; n <= 24; n += 8) {
21832 for (size_t k = 1; k <= 40; k += 9) {
21833 GemmMicrokernelTester()
21834 .extended_weights(true)
21835 .mr(3)
21836 .nr(8)
21837 .kr(8)
21838 .sr(1)
21839 .m(3)
21840 .n(n)
21841 .k(k)
21842 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080021843 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021844 }
21845 }
21846 }
21847
21848 TEST(QC8_GEMM_XW_MINMAX_FP32_3X8C8__AVX2, n_div_8_subtile) {
21849 TEST_REQUIRES_X86_AVX2;
21850 for (uint32_t n = 16; n <= 24; n += 8) {
21851 for (size_t k = 1; k <= 40; k += 9) {
21852 for (uint32_t m = 1; m <= 3; m++) {
21853 GemmMicrokernelTester()
21854 .extended_weights(true)
21855 .mr(3)
21856 .nr(8)
21857 .kr(8)
21858 .sr(1)
21859 .m(m)
21860 .n(n)
21861 .k(k)
21862 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080021863 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021864 }
21865 }
21866 }
21867 }
21868
21869 TEST(QC8_GEMM_XW_MINMAX_FP32_3X8C8__AVX2, strided_cm_subtile) {
21870 TEST_REQUIRES_X86_AVX2;
21871 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080021872 for (uint32_t n = 1; n <= 8; n++) {
21873 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021874 GemmMicrokernelTester()
21875 .extended_weights(true)
21876 .mr(3)
21877 .nr(8)
21878 .kr(8)
21879 .sr(1)
21880 .m(m)
21881 .n(n)
21882 .k(k)
21883 .cm_stride(11)
21884 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080021885 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021886 }
21887 }
21888 }
21889 }
21890
21891 TEST(QC8_GEMM_XW_MINMAX_FP32_3X8C8__AVX2, strided_cm) {
21892 TEST_REQUIRES_X86_AVX2;
21893 GemmMicrokernelTester()
21894 .extended_weights(true)
21895 .mr(3)
21896 .nr(8)
21897 .kr(8)
21898 .sr(1)
21899 .m(3)
21900 .n(8)
21901 .k(8)
21902 .cm_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080021903 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021904 }
21905#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
21906
21907
21908#if XNN_ARCH_X86 || XNN_ARCH_X86_64
21909 TEST(QC8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, k_eq_8) {
21910 TEST_REQUIRES_X86_AVX512SKX;
21911 GemmMicrokernelTester()
21912 .mr(1)
21913 .nr(16)
21914 .kr(8)
21915 .sr(1)
21916 .m(1)
21917 .n(16)
21918 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080021919 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021920 }
21921
21922 TEST(QC8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, strided_cn) {
21923 TEST_REQUIRES_X86_AVX512SKX;
21924 GemmMicrokernelTester()
21925 .mr(1)
21926 .nr(16)
21927 .kr(8)
21928 .sr(1)
21929 .m(1)
21930 .n(16)
21931 .k(8)
21932 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080021933 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021934 }
21935
21936 TEST(QC8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, k_eq_8_strided_a) {
21937 TEST_REQUIRES_X86_AVX512SKX;
21938 GemmMicrokernelTester()
21939 .mr(1)
21940 .nr(16)
21941 .kr(8)
21942 .sr(1)
21943 .m(1)
21944 .n(16)
21945 .k(8)
21946 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080021947 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021948 }
21949
21950 TEST(QC8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, k_eq_8_subtile) {
21951 TEST_REQUIRES_X86_AVX512SKX;
Zhi An Ng83844ae2022-01-14 09:52:25 -080021952 for (uint32_t n = 1; n <= 16; n++) {
21953 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021954 GemmMicrokernelTester()
21955 .mr(1)
21956 .nr(16)
21957 .kr(8)
21958 .sr(1)
21959 .m(m)
21960 .n(n)
21961 .k(8)
21962 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080021963 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021964 }
21965 }
21966 }
21967
21968 TEST(QC8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, k_eq_8_subtile_m) {
21969 TEST_REQUIRES_X86_AVX512SKX;
21970 for (uint32_t m = 1; m <= 1; m++) {
21971 GemmMicrokernelTester()
21972 .mr(1)
21973 .nr(16)
21974 .kr(8)
21975 .sr(1)
21976 .m(m)
21977 .n(16)
21978 .k(8)
21979 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080021980 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021981 }
21982 }
21983
21984 TEST(QC8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, k_eq_8_subtile_n) {
21985 TEST_REQUIRES_X86_AVX512SKX;
21986 for (uint32_t n = 1; n <= 16; n++) {
21987 GemmMicrokernelTester()
21988 .mr(1)
21989 .nr(16)
21990 .kr(8)
21991 .sr(1)
21992 .m(1)
21993 .n(n)
21994 .k(8)
21995 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080021996 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080021997 }
21998 }
21999
22000 TEST(QC8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, k_lt_8) {
22001 TEST_REQUIRES_X86_AVX512SKX;
22002 for (size_t k = 1; k < 8; k++) {
22003 GemmMicrokernelTester()
22004 .mr(1)
22005 .nr(16)
22006 .kr(8)
22007 .sr(1)
22008 .m(1)
22009 .n(16)
22010 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080022011 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022012 }
22013 }
22014
22015 TEST(QC8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, k_lt_8_strided_a) {
22016 TEST_REQUIRES_X86_AVX512SKX;
22017 for (size_t k = 1; k < 8; k++) {
22018 GemmMicrokernelTester()
22019 .mr(1)
22020 .nr(16)
22021 .kr(8)
22022 .sr(1)
22023 .m(1)
22024 .n(16)
22025 .k(k)
22026 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080022027 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022028 }
22029 }
22030
22031 TEST(QC8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, k_lt_8_subtile) {
22032 TEST_REQUIRES_X86_AVX512SKX;
22033 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080022034 for (uint32_t n = 1; n <= 16; n++) {
22035 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022036 GemmMicrokernelTester()
22037 .mr(1)
22038 .nr(16)
22039 .kr(8)
22040 .sr(1)
22041 .m(m)
22042 .n(n)
22043 .k(k)
22044 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080022045 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022046 }
22047 }
22048 }
22049 }
22050
22051 TEST(QC8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, k_gt_8) {
22052 TEST_REQUIRES_X86_AVX512SKX;
22053 for (size_t k = 9; k < 16; k++) {
22054 GemmMicrokernelTester()
22055 .mr(1)
22056 .nr(16)
22057 .kr(8)
22058 .sr(1)
22059 .m(1)
22060 .n(16)
22061 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080022062 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022063 }
22064 }
22065
22066 TEST(QC8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, k_gt_8_strided_a) {
22067 TEST_REQUIRES_X86_AVX512SKX;
22068 for (size_t k = 9; k < 16; k++) {
22069 GemmMicrokernelTester()
22070 .mr(1)
22071 .nr(16)
22072 .kr(8)
22073 .sr(1)
22074 .m(1)
22075 .n(16)
22076 .k(k)
22077 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080022078 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022079 }
22080 }
22081
22082 TEST(QC8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, k_gt_8_subtile) {
22083 TEST_REQUIRES_X86_AVX512SKX;
22084 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080022085 for (uint32_t n = 1; n <= 16; n++) {
22086 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022087 GemmMicrokernelTester()
22088 .mr(1)
22089 .nr(16)
22090 .kr(8)
22091 .sr(1)
22092 .m(m)
22093 .n(n)
22094 .k(k)
22095 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080022096 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022097 }
22098 }
22099 }
22100 }
22101
22102 TEST(QC8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, k_div_8) {
22103 TEST_REQUIRES_X86_AVX512SKX;
22104 for (size_t k = 16; k <= 80; k += 8) {
22105 GemmMicrokernelTester()
22106 .mr(1)
22107 .nr(16)
22108 .kr(8)
22109 .sr(1)
22110 .m(1)
22111 .n(16)
22112 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080022113 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022114 }
22115 }
22116
22117 TEST(QC8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, k_div_8_strided_a) {
22118 TEST_REQUIRES_X86_AVX512SKX;
22119 for (size_t k = 16; k <= 80; k += 8) {
22120 GemmMicrokernelTester()
22121 .mr(1)
22122 .nr(16)
22123 .kr(8)
22124 .sr(1)
22125 .m(1)
22126 .n(16)
22127 .k(k)
22128 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080022129 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022130 }
22131 }
22132
22133 TEST(QC8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, k_div_8_subtile) {
22134 TEST_REQUIRES_X86_AVX512SKX;
22135 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080022136 for (uint32_t n = 1; n <= 16; n++) {
22137 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022138 GemmMicrokernelTester()
22139 .mr(1)
22140 .nr(16)
22141 .kr(8)
22142 .sr(1)
22143 .m(m)
22144 .n(n)
22145 .k(k)
22146 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080022147 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022148 }
22149 }
22150 }
22151 }
22152
22153 TEST(QC8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, n_gt_16) {
22154 TEST_REQUIRES_X86_AVX512SKX;
22155 for (uint32_t n = 17; n < 32; n++) {
22156 for (size_t k = 1; k <= 40; k += 9) {
22157 GemmMicrokernelTester()
22158 .mr(1)
22159 .nr(16)
22160 .kr(8)
22161 .sr(1)
22162 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080022163 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022164 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080022165 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022166 }
22167 }
22168 }
22169
22170 TEST(QC8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, n_gt_16_strided_cn) {
22171 TEST_REQUIRES_X86_AVX512SKX;
22172 for (uint32_t n = 17; n < 32; n++) {
22173 for (size_t k = 1; k <= 40; k += 9) {
22174 GemmMicrokernelTester()
22175 .mr(1)
22176 .nr(16)
22177 .kr(8)
22178 .sr(1)
22179 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080022180 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022181 .k(k)
22182 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080022183 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022184 }
22185 }
22186 }
22187
22188 TEST(QC8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, n_gt_16_strided_a) {
22189 TEST_REQUIRES_X86_AVX512SKX;
22190 for (uint32_t n = 17; n < 32; n++) {
22191 for (size_t k = 1; k <= 40; k += 9) {
22192 GemmMicrokernelTester()
22193 .mr(1)
22194 .nr(16)
22195 .kr(8)
22196 .sr(1)
22197 .m(1)
22198 .n(n)
22199 .k(k)
22200 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080022201 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022202 }
22203 }
22204 }
22205
22206 TEST(QC8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, n_gt_16_subtile) {
22207 TEST_REQUIRES_X86_AVX512SKX;
22208 for (uint32_t n = 17; n < 32; n++) {
22209 for (size_t k = 1; k <= 40; k += 9) {
22210 for (uint32_t m = 1; m <= 1; m++) {
22211 GemmMicrokernelTester()
22212 .mr(1)
22213 .nr(16)
22214 .kr(8)
22215 .sr(1)
22216 .m(m)
22217 .n(n)
22218 .k(k)
22219 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080022220 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022221 }
22222 }
22223 }
22224 }
22225
22226 TEST(QC8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, n_div_16) {
22227 TEST_REQUIRES_X86_AVX512SKX;
22228 for (uint32_t n = 32; n <= 48; n += 16) {
22229 for (size_t k = 1; k <= 40; k += 9) {
22230 GemmMicrokernelTester()
22231 .mr(1)
22232 .nr(16)
22233 .kr(8)
22234 .sr(1)
22235 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080022236 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022237 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080022238 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022239 }
22240 }
22241 }
22242
22243 TEST(QC8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, n_div_16_strided_cn) {
22244 TEST_REQUIRES_X86_AVX512SKX;
22245 for (uint32_t n = 32; n <= 48; n += 16) {
22246 for (size_t k = 1; k <= 40; k += 9) {
22247 GemmMicrokernelTester()
22248 .mr(1)
22249 .nr(16)
22250 .kr(8)
22251 .sr(1)
22252 .m(1)
22253 .n(n)
22254 .k(k)
22255 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080022256 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022257 }
22258 }
22259 }
22260
22261 TEST(QC8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, n_div_16_strided_a) {
22262 TEST_REQUIRES_X86_AVX512SKX;
22263 for (uint32_t n = 32; n <= 48; n += 16) {
22264 for (size_t k = 1; k <= 40; k += 9) {
22265 GemmMicrokernelTester()
22266 .mr(1)
22267 .nr(16)
22268 .kr(8)
22269 .sr(1)
22270 .m(1)
22271 .n(n)
22272 .k(k)
22273 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080022274 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022275 }
22276 }
22277 }
22278
22279 TEST(QC8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, n_div_16_subtile) {
22280 TEST_REQUIRES_X86_AVX512SKX;
22281 for (uint32_t n = 32; n <= 48; n += 16) {
22282 for (size_t k = 1; k <= 40; k += 9) {
22283 for (uint32_t m = 1; m <= 1; m++) {
22284 GemmMicrokernelTester()
22285 .mr(1)
22286 .nr(16)
22287 .kr(8)
22288 .sr(1)
22289 .m(m)
22290 .n(n)
22291 .k(k)
22292 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080022293 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022294 }
22295 }
22296 }
22297 }
22298
22299 TEST(QC8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, strided_cm_subtile) {
22300 TEST_REQUIRES_X86_AVX512SKX;
22301 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080022302 for (uint32_t n = 1; n <= 16; n++) {
22303 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022304 GemmMicrokernelTester()
22305 .mr(1)
22306 .nr(16)
22307 .kr(8)
22308 .sr(1)
22309 .m(m)
22310 .n(n)
22311 .k(k)
22312 .cm_stride(19)
22313 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080022314 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022315 }
22316 }
22317 }
22318 }
22319
22320 TEST(QC8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, qmin) {
22321 TEST_REQUIRES_X86_AVX512SKX;
22322 GemmMicrokernelTester()
22323 .mr(1)
22324 .nr(16)
22325 .kr(8)
22326 .sr(1)
22327 .m(1)
22328 .n(16)
22329 .k(8)
22330 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080022331 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022332 }
22333
22334 TEST(QC8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, qmax) {
22335 TEST_REQUIRES_X86_AVX512SKX;
22336 GemmMicrokernelTester()
22337 .mr(1)
22338 .nr(16)
22339 .kr(8)
22340 .sr(1)
22341 .m(1)
22342 .n(16)
22343 .k(8)
22344 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080022345 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022346 }
22347
22348 TEST(QC8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, strided_cm) {
22349 TEST_REQUIRES_X86_AVX512SKX;
22350 GemmMicrokernelTester()
22351 .mr(1)
22352 .nr(16)
22353 .kr(8)
22354 .sr(1)
22355 .m(1)
22356 .n(16)
22357 .k(8)
22358 .cm_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080022359 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022360 }
22361#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
22362
22363
22364#if XNN_ARCH_X86 || XNN_ARCH_X86_64
22365 TEST(QC8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, k_eq_8) {
22366 TEST_REQUIRES_X86_AVX512SKX;
22367 GemmMicrokernelTester()
22368 .mr(2)
22369 .nr(16)
22370 .kr(8)
22371 .sr(1)
22372 .m(2)
22373 .n(16)
22374 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080022375 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022376 }
22377
22378 TEST(QC8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, strided_cn) {
22379 TEST_REQUIRES_X86_AVX512SKX;
22380 GemmMicrokernelTester()
22381 .mr(2)
22382 .nr(16)
22383 .kr(8)
22384 .sr(1)
22385 .m(2)
22386 .n(16)
22387 .k(8)
22388 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080022389 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022390 }
22391
22392 TEST(QC8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, k_eq_8_strided_a) {
22393 TEST_REQUIRES_X86_AVX512SKX;
22394 GemmMicrokernelTester()
22395 .mr(2)
22396 .nr(16)
22397 .kr(8)
22398 .sr(1)
22399 .m(2)
22400 .n(16)
22401 .k(8)
22402 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080022403 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022404 }
22405
22406 TEST(QC8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, k_eq_8_subtile) {
22407 TEST_REQUIRES_X86_AVX512SKX;
Zhi An Ng83844ae2022-01-14 09:52:25 -080022408 for (uint32_t n = 1; n <= 16; n++) {
22409 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022410 GemmMicrokernelTester()
22411 .mr(2)
22412 .nr(16)
22413 .kr(8)
22414 .sr(1)
22415 .m(m)
22416 .n(n)
22417 .k(8)
22418 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080022419 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022420 }
22421 }
22422 }
22423
22424 TEST(QC8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, k_eq_8_subtile_m) {
22425 TEST_REQUIRES_X86_AVX512SKX;
22426 for (uint32_t m = 1; m <= 2; m++) {
22427 GemmMicrokernelTester()
22428 .mr(2)
22429 .nr(16)
22430 .kr(8)
22431 .sr(1)
22432 .m(m)
22433 .n(16)
22434 .k(8)
22435 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080022436 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022437 }
22438 }
22439
22440 TEST(QC8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, k_eq_8_subtile_n) {
22441 TEST_REQUIRES_X86_AVX512SKX;
22442 for (uint32_t n = 1; n <= 16; n++) {
22443 GemmMicrokernelTester()
22444 .mr(2)
22445 .nr(16)
22446 .kr(8)
22447 .sr(1)
22448 .m(2)
22449 .n(n)
22450 .k(8)
22451 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080022452 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022453 }
22454 }
22455
22456 TEST(QC8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, k_lt_8) {
22457 TEST_REQUIRES_X86_AVX512SKX;
22458 for (size_t k = 1; k < 8; k++) {
22459 GemmMicrokernelTester()
22460 .mr(2)
22461 .nr(16)
22462 .kr(8)
22463 .sr(1)
22464 .m(2)
22465 .n(16)
22466 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080022467 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022468 }
22469 }
22470
22471 TEST(QC8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, k_lt_8_strided_a) {
22472 TEST_REQUIRES_X86_AVX512SKX;
22473 for (size_t k = 1; k < 8; k++) {
22474 GemmMicrokernelTester()
22475 .mr(2)
22476 .nr(16)
22477 .kr(8)
22478 .sr(1)
22479 .m(2)
22480 .n(16)
22481 .k(k)
22482 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080022483 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022484 }
22485 }
22486
22487 TEST(QC8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, k_lt_8_subtile) {
22488 TEST_REQUIRES_X86_AVX512SKX;
22489 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080022490 for (uint32_t n = 1; n <= 16; n++) {
22491 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022492 GemmMicrokernelTester()
22493 .mr(2)
22494 .nr(16)
22495 .kr(8)
22496 .sr(1)
22497 .m(m)
22498 .n(n)
22499 .k(k)
22500 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080022501 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022502 }
22503 }
22504 }
22505 }
22506
22507 TEST(QC8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, k_gt_8) {
22508 TEST_REQUIRES_X86_AVX512SKX;
22509 for (size_t k = 9; k < 16; k++) {
22510 GemmMicrokernelTester()
22511 .mr(2)
22512 .nr(16)
22513 .kr(8)
22514 .sr(1)
22515 .m(2)
22516 .n(16)
22517 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080022518 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022519 }
22520 }
22521
22522 TEST(QC8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, k_gt_8_strided_a) {
22523 TEST_REQUIRES_X86_AVX512SKX;
22524 for (size_t k = 9; k < 16; k++) {
22525 GemmMicrokernelTester()
22526 .mr(2)
22527 .nr(16)
22528 .kr(8)
22529 .sr(1)
22530 .m(2)
22531 .n(16)
22532 .k(k)
22533 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080022534 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022535 }
22536 }
22537
22538 TEST(QC8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, k_gt_8_subtile) {
22539 TEST_REQUIRES_X86_AVX512SKX;
22540 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080022541 for (uint32_t n = 1; n <= 16; n++) {
22542 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022543 GemmMicrokernelTester()
22544 .mr(2)
22545 .nr(16)
22546 .kr(8)
22547 .sr(1)
22548 .m(m)
22549 .n(n)
22550 .k(k)
22551 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080022552 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022553 }
22554 }
22555 }
22556 }
22557
22558 TEST(QC8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, k_div_8) {
22559 TEST_REQUIRES_X86_AVX512SKX;
22560 for (size_t k = 16; k <= 80; k += 8) {
22561 GemmMicrokernelTester()
22562 .mr(2)
22563 .nr(16)
22564 .kr(8)
22565 .sr(1)
22566 .m(2)
22567 .n(16)
22568 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080022569 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022570 }
22571 }
22572
22573 TEST(QC8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, k_div_8_strided_a) {
22574 TEST_REQUIRES_X86_AVX512SKX;
22575 for (size_t k = 16; k <= 80; k += 8) {
22576 GemmMicrokernelTester()
22577 .mr(2)
22578 .nr(16)
22579 .kr(8)
22580 .sr(1)
22581 .m(2)
22582 .n(16)
22583 .k(k)
22584 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080022585 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022586 }
22587 }
22588
22589 TEST(QC8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, k_div_8_subtile) {
22590 TEST_REQUIRES_X86_AVX512SKX;
22591 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080022592 for (uint32_t n = 1; n <= 16; n++) {
22593 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022594 GemmMicrokernelTester()
22595 .mr(2)
22596 .nr(16)
22597 .kr(8)
22598 .sr(1)
22599 .m(m)
22600 .n(n)
22601 .k(k)
22602 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080022603 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022604 }
22605 }
22606 }
22607 }
22608
22609 TEST(QC8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, n_gt_16) {
22610 TEST_REQUIRES_X86_AVX512SKX;
22611 for (uint32_t n = 17; n < 32; n++) {
22612 for (size_t k = 1; k <= 40; k += 9) {
22613 GemmMicrokernelTester()
22614 .mr(2)
22615 .nr(16)
22616 .kr(8)
22617 .sr(1)
22618 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080022619 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022620 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080022621 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022622 }
22623 }
22624 }
22625
22626 TEST(QC8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, n_gt_16_strided_cn) {
22627 TEST_REQUIRES_X86_AVX512SKX;
22628 for (uint32_t n = 17; n < 32; n++) {
22629 for (size_t k = 1; k <= 40; k += 9) {
22630 GemmMicrokernelTester()
22631 .mr(2)
22632 .nr(16)
22633 .kr(8)
22634 .sr(1)
22635 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080022636 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022637 .k(k)
22638 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080022639 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022640 }
22641 }
22642 }
22643
22644 TEST(QC8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, n_gt_16_strided_a) {
22645 TEST_REQUIRES_X86_AVX512SKX;
22646 for (uint32_t n = 17; n < 32; n++) {
22647 for (size_t k = 1; k <= 40; k += 9) {
22648 GemmMicrokernelTester()
22649 .mr(2)
22650 .nr(16)
22651 .kr(8)
22652 .sr(1)
22653 .m(2)
22654 .n(n)
22655 .k(k)
22656 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080022657 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022658 }
22659 }
22660 }
22661
22662 TEST(QC8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, n_gt_16_subtile) {
22663 TEST_REQUIRES_X86_AVX512SKX;
22664 for (uint32_t n = 17; n < 32; n++) {
22665 for (size_t k = 1; k <= 40; k += 9) {
22666 for (uint32_t m = 1; m <= 2; m++) {
22667 GemmMicrokernelTester()
22668 .mr(2)
22669 .nr(16)
22670 .kr(8)
22671 .sr(1)
22672 .m(m)
22673 .n(n)
22674 .k(k)
22675 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080022676 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022677 }
22678 }
22679 }
22680 }
22681
22682 TEST(QC8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, n_div_16) {
22683 TEST_REQUIRES_X86_AVX512SKX;
22684 for (uint32_t n = 32; n <= 48; n += 16) {
22685 for (size_t k = 1; k <= 40; k += 9) {
22686 GemmMicrokernelTester()
22687 .mr(2)
22688 .nr(16)
22689 .kr(8)
22690 .sr(1)
22691 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080022692 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022693 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080022694 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022695 }
22696 }
22697 }
22698
22699 TEST(QC8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, n_div_16_strided_cn) {
22700 TEST_REQUIRES_X86_AVX512SKX;
22701 for (uint32_t n = 32; n <= 48; n += 16) {
22702 for (size_t k = 1; k <= 40; k += 9) {
22703 GemmMicrokernelTester()
22704 .mr(2)
22705 .nr(16)
22706 .kr(8)
22707 .sr(1)
22708 .m(2)
22709 .n(n)
22710 .k(k)
22711 .cn_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080022712 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022713 }
22714 }
22715 }
22716
22717 TEST(QC8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, n_div_16_strided_a) {
22718 TEST_REQUIRES_X86_AVX512SKX;
22719 for (uint32_t n = 32; n <= 48; n += 16) {
22720 for (size_t k = 1; k <= 40; k += 9) {
22721 GemmMicrokernelTester()
22722 .mr(2)
22723 .nr(16)
22724 .kr(8)
22725 .sr(1)
22726 .m(2)
22727 .n(n)
22728 .k(k)
22729 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080022730 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022731 }
22732 }
22733 }
22734
22735 TEST(QC8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, n_div_16_subtile) {
22736 TEST_REQUIRES_X86_AVX512SKX;
22737 for (uint32_t n = 32; n <= 48; n += 16) {
22738 for (size_t k = 1; k <= 40; k += 9) {
22739 for (uint32_t m = 1; m <= 2; m++) {
22740 GemmMicrokernelTester()
22741 .mr(2)
22742 .nr(16)
22743 .kr(8)
22744 .sr(1)
22745 .m(m)
22746 .n(n)
22747 .k(k)
22748 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080022749 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022750 }
22751 }
22752 }
22753 }
22754
22755 TEST(QC8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, strided_cm_subtile) {
22756 TEST_REQUIRES_X86_AVX512SKX;
22757 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080022758 for (uint32_t n = 1; n <= 16; n++) {
22759 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022760 GemmMicrokernelTester()
22761 .mr(2)
22762 .nr(16)
22763 .kr(8)
22764 .sr(1)
22765 .m(m)
22766 .n(n)
22767 .k(k)
22768 .cm_stride(19)
22769 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080022770 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022771 }
22772 }
22773 }
22774 }
22775
22776 TEST(QC8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, qmin) {
22777 TEST_REQUIRES_X86_AVX512SKX;
22778 GemmMicrokernelTester()
22779 .mr(2)
22780 .nr(16)
22781 .kr(8)
22782 .sr(1)
22783 .m(2)
22784 .n(16)
22785 .k(8)
22786 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080022787 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022788 }
22789
22790 TEST(QC8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, qmax) {
22791 TEST_REQUIRES_X86_AVX512SKX;
22792 GemmMicrokernelTester()
22793 .mr(2)
22794 .nr(16)
22795 .kr(8)
22796 .sr(1)
22797 .m(2)
22798 .n(16)
22799 .k(8)
22800 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080022801 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022802 }
22803
22804 TEST(QC8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, strided_cm) {
22805 TEST_REQUIRES_X86_AVX512SKX;
22806 GemmMicrokernelTester()
22807 .mr(2)
22808 .nr(16)
22809 .kr(8)
22810 .sr(1)
22811 .m(2)
22812 .n(16)
22813 .k(8)
22814 .cm_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080022815 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022816 }
22817#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
22818
22819
22820#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
22821 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD64, k_eq_8) {
22822 GemmMicrokernelTester()
22823 .mr(1)
22824 .nr(4)
22825 .kr(2)
22826 .sr(1)
22827 .m(1)
22828 .n(4)
22829 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080022830 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022831 }
22832
22833 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD64, strided_cn) {
22834 GemmMicrokernelTester()
22835 .mr(1)
22836 .nr(4)
22837 .kr(2)
22838 .sr(1)
22839 .m(1)
22840 .n(4)
22841 .k(8)
22842 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080022843 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022844 }
22845
22846 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD64, k_eq_8_strided_a) {
22847 GemmMicrokernelTester()
22848 .mr(1)
22849 .nr(4)
22850 .kr(2)
22851 .sr(1)
22852 .m(1)
22853 .n(4)
22854 .k(8)
22855 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080022856 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022857 }
22858
22859 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD64, k_eq_8_subtile) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080022860 for (uint32_t n = 1; n <= 4; n++) {
22861 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022862 GemmMicrokernelTester()
22863 .mr(1)
22864 .nr(4)
22865 .kr(2)
22866 .sr(1)
22867 .m(m)
22868 .n(n)
22869 .k(8)
22870 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080022871 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022872 }
22873 }
22874 }
22875
22876 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD64, k_eq_8_subtile_m) {
22877 for (uint32_t m = 1; m <= 1; m++) {
22878 GemmMicrokernelTester()
22879 .mr(1)
22880 .nr(4)
22881 .kr(2)
22882 .sr(1)
22883 .m(m)
22884 .n(4)
22885 .k(8)
22886 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080022887 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022888 }
22889 }
22890
22891 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD64, k_eq_8_subtile_n) {
22892 for (uint32_t n = 1; n <= 4; n++) {
22893 GemmMicrokernelTester()
22894 .mr(1)
22895 .nr(4)
22896 .kr(2)
22897 .sr(1)
22898 .m(1)
22899 .n(n)
22900 .k(8)
22901 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080022902 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022903 }
22904 }
22905
22906 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD64, k_lt_8) {
22907 for (size_t k = 1; k < 8; k++) {
22908 GemmMicrokernelTester()
22909 .mr(1)
22910 .nr(4)
22911 .kr(2)
22912 .sr(1)
22913 .m(1)
22914 .n(4)
22915 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080022916 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022917 }
22918 }
22919
22920 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD64, k_lt_8_strided_a) {
22921 for (size_t k = 1; k < 8; k++) {
22922 GemmMicrokernelTester()
22923 .mr(1)
22924 .nr(4)
22925 .kr(2)
22926 .sr(1)
22927 .m(1)
22928 .n(4)
22929 .k(k)
22930 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080022931 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022932 }
22933 }
22934
22935 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD64, k_lt_8_subtile) {
22936 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080022937 for (uint32_t n = 1; n <= 4; n++) {
22938 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022939 GemmMicrokernelTester()
22940 .mr(1)
22941 .nr(4)
22942 .kr(2)
22943 .sr(1)
22944 .m(m)
22945 .n(n)
22946 .k(k)
22947 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080022948 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022949 }
22950 }
22951 }
22952 }
22953
22954 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD64, k_gt_8) {
22955 for (size_t k = 9; k < 16; k++) {
22956 GemmMicrokernelTester()
22957 .mr(1)
22958 .nr(4)
22959 .kr(2)
22960 .sr(1)
22961 .m(1)
22962 .n(4)
22963 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080022964 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022965 }
22966 }
22967
22968 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD64, k_gt_8_strided_a) {
22969 for (size_t k = 9; k < 16; k++) {
22970 GemmMicrokernelTester()
22971 .mr(1)
22972 .nr(4)
22973 .kr(2)
22974 .sr(1)
22975 .m(1)
22976 .n(4)
22977 .k(k)
22978 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080022979 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022980 }
22981 }
22982
22983 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD64, k_gt_8_subtile) {
22984 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080022985 for (uint32_t n = 1; n <= 4; n++) {
22986 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022987 GemmMicrokernelTester()
22988 .mr(1)
22989 .nr(4)
22990 .kr(2)
22991 .sr(1)
22992 .m(m)
22993 .n(n)
22994 .k(k)
22995 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080022996 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080022997 }
22998 }
22999 }
23000 }
23001
23002 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD64, k_div_8) {
23003 for (size_t k = 16; k <= 80; k += 8) {
23004 GemmMicrokernelTester()
23005 .mr(1)
23006 .nr(4)
23007 .kr(2)
23008 .sr(1)
23009 .m(1)
23010 .n(4)
23011 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080023012 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023013 }
23014 }
23015
23016 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD64, k_div_8_strided_a) {
23017 for (size_t k = 16; k <= 80; k += 8) {
23018 GemmMicrokernelTester()
23019 .mr(1)
23020 .nr(4)
23021 .kr(2)
23022 .sr(1)
23023 .m(1)
23024 .n(4)
23025 .k(k)
23026 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080023027 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023028 }
23029 }
23030
23031 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD64, k_div_8_subtile) {
23032 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080023033 for (uint32_t n = 1; n <= 4; n++) {
23034 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023035 GemmMicrokernelTester()
23036 .mr(1)
23037 .nr(4)
23038 .kr(2)
23039 .sr(1)
23040 .m(m)
23041 .n(n)
23042 .k(k)
23043 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080023044 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023045 }
23046 }
23047 }
23048 }
23049
23050 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD64, n_gt_4) {
23051 for (uint32_t n = 5; n < 8; n++) {
23052 for (size_t k = 1; k <= 40; k += 9) {
23053 GemmMicrokernelTester()
23054 .mr(1)
23055 .nr(4)
23056 .kr(2)
23057 .sr(1)
23058 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080023059 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023060 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080023061 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023062 }
23063 }
23064 }
23065
23066 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD64, n_gt_4_strided_cn) {
23067 for (uint32_t n = 5; n < 8; n++) {
23068 for (size_t k = 1; k <= 40; k += 9) {
23069 GemmMicrokernelTester()
23070 .mr(1)
23071 .nr(4)
23072 .kr(2)
23073 .sr(1)
23074 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080023075 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023076 .k(k)
23077 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080023078 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023079 }
23080 }
23081 }
23082
23083 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD64, n_gt_4_strided_a) {
23084 for (uint32_t n = 5; n < 8; n++) {
23085 for (size_t k = 1; k <= 40; k += 9) {
23086 GemmMicrokernelTester()
23087 .mr(1)
23088 .nr(4)
23089 .kr(2)
23090 .sr(1)
23091 .m(1)
23092 .n(n)
23093 .k(k)
23094 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080023095 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023096 }
23097 }
23098 }
23099
23100 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD64, n_gt_4_subtile) {
23101 for (uint32_t n = 5; n < 8; n++) {
23102 for (size_t k = 1; k <= 40; k += 9) {
23103 for (uint32_t m = 1; m <= 1; m++) {
23104 GemmMicrokernelTester()
23105 .mr(1)
23106 .nr(4)
23107 .kr(2)
23108 .sr(1)
23109 .m(m)
23110 .n(n)
23111 .k(k)
23112 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080023113 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023114 }
23115 }
23116 }
23117 }
23118
23119 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD64, n_div_4) {
23120 for (uint32_t n = 8; n <= 12; n += 4) {
23121 for (size_t k = 1; k <= 40; k += 9) {
23122 GemmMicrokernelTester()
23123 .mr(1)
23124 .nr(4)
23125 .kr(2)
23126 .sr(1)
23127 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080023128 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023129 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080023130 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023131 }
23132 }
23133 }
23134
23135 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD64, n_div_4_strided_cn) {
23136 for (uint32_t n = 8; n <= 12; n += 4) {
23137 for (size_t k = 1; k <= 40; k += 9) {
23138 GemmMicrokernelTester()
23139 .mr(1)
23140 .nr(4)
23141 .kr(2)
23142 .sr(1)
23143 .m(1)
23144 .n(n)
23145 .k(k)
23146 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080023147 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023148 }
23149 }
23150 }
23151
23152 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD64, n_div_4_strided_a) {
23153 for (uint32_t n = 8; n <= 12; n += 4) {
23154 for (size_t k = 1; k <= 40; k += 9) {
23155 GemmMicrokernelTester()
23156 .mr(1)
23157 .nr(4)
23158 .kr(2)
23159 .sr(1)
23160 .m(1)
23161 .n(n)
23162 .k(k)
23163 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080023164 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023165 }
23166 }
23167 }
23168
23169 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD64, n_div_4_subtile) {
23170 for (uint32_t n = 8; n <= 12; n += 4) {
23171 for (size_t k = 1; k <= 40; k += 9) {
23172 for (uint32_t m = 1; m <= 1; m++) {
23173 GemmMicrokernelTester()
23174 .mr(1)
23175 .nr(4)
23176 .kr(2)
23177 .sr(1)
23178 .m(m)
23179 .n(n)
23180 .k(k)
23181 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080023182 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023183 }
23184 }
23185 }
23186 }
23187
23188 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD64, strided_cm_subtile) {
23189 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080023190 for (uint32_t n = 1; n <= 4; n++) {
23191 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023192 GemmMicrokernelTester()
23193 .mr(1)
23194 .nr(4)
23195 .kr(2)
23196 .sr(1)
23197 .m(m)
23198 .n(n)
23199 .k(k)
23200 .cm_stride(7)
23201 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080023202 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023203 }
23204 }
23205 }
23206 }
23207
23208 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD64, qmin) {
23209 GemmMicrokernelTester()
23210 .mr(1)
23211 .nr(4)
23212 .kr(2)
23213 .sr(1)
23214 .m(1)
23215 .n(4)
23216 .k(8)
23217 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080023218 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023219 }
23220
23221 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD64, qmax) {
23222 GemmMicrokernelTester()
23223 .mr(1)
23224 .nr(4)
23225 .kr(2)
23226 .sr(1)
23227 .m(1)
23228 .n(4)
23229 .k(8)
23230 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080023231 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023232 }
23233
23234 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD64, strided_cm) {
23235 GemmMicrokernelTester()
23236 .mr(1)
23237 .nr(4)
23238 .kr(2)
23239 .sr(1)
23240 .m(1)
23241 .n(4)
23242 .k(8)
23243 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080023244 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023245 }
23246#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
23247
23248
23249#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
23250 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__WASMSIMD_DOT16X2_LD64, k_eq_8) {
23251 GemmMicrokernelTester()
23252 .mr(2)
23253 .nr(4)
23254 .kr(2)
23255 .sr(1)
23256 .m(2)
23257 .n(4)
23258 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080023259 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023260 }
23261
23262 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__WASMSIMD_DOT16X2_LD64, strided_cn) {
23263 GemmMicrokernelTester()
23264 .mr(2)
23265 .nr(4)
23266 .kr(2)
23267 .sr(1)
23268 .m(2)
23269 .n(4)
23270 .k(8)
23271 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080023272 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023273 }
23274
23275 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__WASMSIMD_DOT16X2_LD64, k_eq_8_strided_a) {
23276 GemmMicrokernelTester()
23277 .mr(2)
23278 .nr(4)
23279 .kr(2)
23280 .sr(1)
23281 .m(2)
23282 .n(4)
23283 .k(8)
23284 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080023285 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023286 }
23287
23288 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__WASMSIMD_DOT16X2_LD64, k_eq_8_subtile) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080023289 for (uint32_t n = 1; n <= 4; n++) {
23290 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023291 GemmMicrokernelTester()
23292 .mr(2)
23293 .nr(4)
23294 .kr(2)
23295 .sr(1)
23296 .m(m)
23297 .n(n)
23298 .k(8)
23299 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080023300 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023301 }
23302 }
23303 }
23304
23305 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__WASMSIMD_DOT16X2_LD64, k_eq_8_subtile_m) {
23306 for (uint32_t m = 1; m <= 2; m++) {
23307 GemmMicrokernelTester()
23308 .mr(2)
23309 .nr(4)
23310 .kr(2)
23311 .sr(1)
23312 .m(m)
23313 .n(4)
23314 .k(8)
23315 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080023316 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023317 }
23318 }
23319
23320 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__WASMSIMD_DOT16X2_LD64, k_eq_8_subtile_n) {
23321 for (uint32_t n = 1; n <= 4; n++) {
23322 GemmMicrokernelTester()
23323 .mr(2)
23324 .nr(4)
23325 .kr(2)
23326 .sr(1)
23327 .m(2)
23328 .n(n)
23329 .k(8)
23330 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080023331 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023332 }
23333 }
23334
23335 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__WASMSIMD_DOT16X2_LD64, k_lt_8) {
23336 for (size_t k = 1; k < 8; k++) {
23337 GemmMicrokernelTester()
23338 .mr(2)
23339 .nr(4)
23340 .kr(2)
23341 .sr(1)
23342 .m(2)
23343 .n(4)
23344 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080023345 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023346 }
23347 }
23348
23349 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__WASMSIMD_DOT16X2_LD64, k_lt_8_strided_a) {
23350 for (size_t k = 1; k < 8; k++) {
23351 GemmMicrokernelTester()
23352 .mr(2)
23353 .nr(4)
23354 .kr(2)
23355 .sr(1)
23356 .m(2)
23357 .n(4)
23358 .k(k)
23359 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080023360 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023361 }
23362 }
23363
23364 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__WASMSIMD_DOT16X2_LD64, k_lt_8_subtile) {
23365 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080023366 for (uint32_t n = 1; n <= 4; n++) {
23367 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023368 GemmMicrokernelTester()
23369 .mr(2)
23370 .nr(4)
23371 .kr(2)
23372 .sr(1)
23373 .m(m)
23374 .n(n)
23375 .k(k)
23376 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080023377 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023378 }
23379 }
23380 }
23381 }
23382
23383 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__WASMSIMD_DOT16X2_LD64, k_gt_8) {
23384 for (size_t k = 9; k < 16; k++) {
23385 GemmMicrokernelTester()
23386 .mr(2)
23387 .nr(4)
23388 .kr(2)
23389 .sr(1)
23390 .m(2)
23391 .n(4)
23392 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080023393 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023394 }
23395 }
23396
23397 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__WASMSIMD_DOT16X2_LD64, k_gt_8_strided_a) {
23398 for (size_t k = 9; k < 16; k++) {
23399 GemmMicrokernelTester()
23400 .mr(2)
23401 .nr(4)
23402 .kr(2)
23403 .sr(1)
23404 .m(2)
23405 .n(4)
23406 .k(k)
23407 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080023408 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023409 }
23410 }
23411
23412 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__WASMSIMD_DOT16X2_LD64, k_gt_8_subtile) {
23413 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080023414 for (uint32_t n = 1; n <= 4; n++) {
23415 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023416 GemmMicrokernelTester()
23417 .mr(2)
23418 .nr(4)
23419 .kr(2)
23420 .sr(1)
23421 .m(m)
23422 .n(n)
23423 .k(k)
23424 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080023425 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023426 }
23427 }
23428 }
23429 }
23430
23431 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__WASMSIMD_DOT16X2_LD64, k_div_8) {
23432 for (size_t k = 16; k <= 80; k += 8) {
23433 GemmMicrokernelTester()
23434 .mr(2)
23435 .nr(4)
23436 .kr(2)
23437 .sr(1)
23438 .m(2)
23439 .n(4)
23440 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080023441 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023442 }
23443 }
23444
23445 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__WASMSIMD_DOT16X2_LD64, k_div_8_strided_a) {
23446 for (size_t k = 16; k <= 80; k += 8) {
23447 GemmMicrokernelTester()
23448 .mr(2)
23449 .nr(4)
23450 .kr(2)
23451 .sr(1)
23452 .m(2)
23453 .n(4)
23454 .k(k)
23455 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080023456 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023457 }
23458 }
23459
23460 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__WASMSIMD_DOT16X2_LD64, k_div_8_subtile) {
23461 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080023462 for (uint32_t n = 1; n <= 4; n++) {
23463 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023464 GemmMicrokernelTester()
23465 .mr(2)
23466 .nr(4)
23467 .kr(2)
23468 .sr(1)
23469 .m(m)
23470 .n(n)
23471 .k(k)
23472 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080023473 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023474 }
23475 }
23476 }
23477 }
23478
23479 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__WASMSIMD_DOT16X2_LD64, n_gt_4) {
23480 for (uint32_t n = 5; n < 8; n++) {
23481 for (size_t k = 1; k <= 40; k += 9) {
23482 GemmMicrokernelTester()
23483 .mr(2)
23484 .nr(4)
23485 .kr(2)
23486 .sr(1)
23487 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080023488 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023489 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080023490 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023491 }
23492 }
23493 }
23494
23495 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__WASMSIMD_DOT16X2_LD64, n_gt_4_strided_cn) {
23496 for (uint32_t n = 5; n < 8; n++) {
23497 for (size_t k = 1; k <= 40; k += 9) {
23498 GemmMicrokernelTester()
23499 .mr(2)
23500 .nr(4)
23501 .kr(2)
23502 .sr(1)
23503 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080023504 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023505 .k(k)
23506 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080023507 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023508 }
23509 }
23510 }
23511
23512 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__WASMSIMD_DOT16X2_LD64, n_gt_4_strided_a) {
23513 for (uint32_t n = 5; n < 8; n++) {
23514 for (size_t k = 1; k <= 40; k += 9) {
23515 GemmMicrokernelTester()
23516 .mr(2)
23517 .nr(4)
23518 .kr(2)
23519 .sr(1)
23520 .m(2)
23521 .n(n)
23522 .k(k)
23523 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080023524 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023525 }
23526 }
23527 }
23528
23529 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__WASMSIMD_DOT16X2_LD64, n_gt_4_subtile) {
23530 for (uint32_t n = 5; n < 8; n++) {
23531 for (size_t k = 1; k <= 40; k += 9) {
23532 for (uint32_t m = 1; m <= 2; m++) {
23533 GemmMicrokernelTester()
23534 .mr(2)
23535 .nr(4)
23536 .kr(2)
23537 .sr(1)
23538 .m(m)
23539 .n(n)
23540 .k(k)
23541 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080023542 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023543 }
23544 }
23545 }
23546 }
23547
23548 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__WASMSIMD_DOT16X2_LD64, n_div_4) {
23549 for (uint32_t n = 8; n <= 12; n += 4) {
23550 for (size_t k = 1; k <= 40; k += 9) {
23551 GemmMicrokernelTester()
23552 .mr(2)
23553 .nr(4)
23554 .kr(2)
23555 .sr(1)
23556 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080023557 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023558 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080023559 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023560 }
23561 }
23562 }
23563
23564 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__WASMSIMD_DOT16X2_LD64, n_div_4_strided_cn) {
23565 for (uint32_t n = 8; n <= 12; n += 4) {
23566 for (size_t k = 1; k <= 40; k += 9) {
23567 GemmMicrokernelTester()
23568 .mr(2)
23569 .nr(4)
23570 .kr(2)
23571 .sr(1)
23572 .m(2)
23573 .n(n)
23574 .k(k)
23575 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080023576 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023577 }
23578 }
23579 }
23580
23581 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__WASMSIMD_DOT16X2_LD64, n_div_4_strided_a) {
23582 for (uint32_t n = 8; n <= 12; n += 4) {
23583 for (size_t k = 1; k <= 40; k += 9) {
23584 GemmMicrokernelTester()
23585 .mr(2)
23586 .nr(4)
23587 .kr(2)
23588 .sr(1)
23589 .m(2)
23590 .n(n)
23591 .k(k)
23592 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080023593 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023594 }
23595 }
23596 }
23597
23598 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__WASMSIMD_DOT16X2_LD64, n_div_4_subtile) {
23599 for (uint32_t n = 8; n <= 12; n += 4) {
23600 for (size_t k = 1; k <= 40; k += 9) {
23601 for (uint32_t m = 1; m <= 2; m++) {
23602 GemmMicrokernelTester()
23603 .mr(2)
23604 .nr(4)
23605 .kr(2)
23606 .sr(1)
23607 .m(m)
23608 .n(n)
23609 .k(k)
23610 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080023611 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023612 }
23613 }
23614 }
23615 }
23616
23617 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__WASMSIMD_DOT16X2_LD64, strided_cm_subtile) {
23618 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080023619 for (uint32_t n = 1; n <= 4; n++) {
23620 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023621 GemmMicrokernelTester()
23622 .mr(2)
23623 .nr(4)
23624 .kr(2)
23625 .sr(1)
23626 .m(m)
23627 .n(n)
23628 .k(k)
23629 .cm_stride(7)
23630 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080023631 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023632 }
23633 }
23634 }
23635 }
23636
23637 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__WASMSIMD_DOT16X2_LD64, qmin) {
23638 GemmMicrokernelTester()
23639 .mr(2)
23640 .nr(4)
23641 .kr(2)
23642 .sr(1)
23643 .m(2)
23644 .n(4)
23645 .k(8)
23646 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080023647 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023648 }
23649
23650 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__WASMSIMD_DOT16X2_LD64, qmax) {
23651 GemmMicrokernelTester()
23652 .mr(2)
23653 .nr(4)
23654 .kr(2)
23655 .sr(1)
23656 .m(2)
23657 .n(4)
23658 .k(8)
23659 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080023660 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023661 }
23662
23663 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__WASMSIMD_DOT16X2_LD64, strided_cm) {
23664 GemmMicrokernelTester()
23665 .mr(2)
23666 .nr(4)
23667 .kr(2)
23668 .sr(1)
23669 .m(2)
23670 .n(4)
23671 .k(8)
23672 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080023673 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023674 }
23675#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
23676
23677
23678#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
23679 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD64, k_eq_8) {
23680 GemmMicrokernelTester()
23681 .mr(3)
23682 .nr(4)
23683 .kr(2)
23684 .sr(1)
23685 .m(3)
23686 .n(4)
23687 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080023688 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023689 }
23690
23691 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD64, strided_cn) {
23692 GemmMicrokernelTester()
23693 .mr(3)
23694 .nr(4)
23695 .kr(2)
23696 .sr(1)
23697 .m(3)
23698 .n(4)
23699 .k(8)
23700 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080023701 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023702 }
23703
23704 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD64, k_eq_8_strided_a) {
23705 GemmMicrokernelTester()
23706 .mr(3)
23707 .nr(4)
23708 .kr(2)
23709 .sr(1)
23710 .m(3)
23711 .n(4)
23712 .k(8)
23713 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080023714 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023715 }
23716
23717 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD64, k_eq_8_subtile) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080023718 for (uint32_t n = 1; n <= 4; n++) {
23719 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023720 GemmMicrokernelTester()
23721 .mr(3)
23722 .nr(4)
23723 .kr(2)
23724 .sr(1)
23725 .m(m)
23726 .n(n)
23727 .k(8)
23728 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080023729 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023730 }
23731 }
23732 }
23733
23734 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD64, k_eq_8_subtile_m) {
23735 for (uint32_t m = 1; m <= 3; m++) {
23736 GemmMicrokernelTester()
23737 .mr(3)
23738 .nr(4)
23739 .kr(2)
23740 .sr(1)
23741 .m(m)
23742 .n(4)
23743 .k(8)
23744 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080023745 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023746 }
23747 }
23748
23749 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD64, k_eq_8_subtile_n) {
23750 for (uint32_t n = 1; n <= 4; n++) {
23751 GemmMicrokernelTester()
23752 .mr(3)
23753 .nr(4)
23754 .kr(2)
23755 .sr(1)
23756 .m(3)
23757 .n(n)
23758 .k(8)
23759 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080023760 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023761 }
23762 }
23763
23764 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD64, k_lt_8) {
23765 for (size_t k = 1; k < 8; k++) {
23766 GemmMicrokernelTester()
23767 .mr(3)
23768 .nr(4)
23769 .kr(2)
23770 .sr(1)
23771 .m(3)
23772 .n(4)
23773 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080023774 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023775 }
23776 }
23777
23778 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD64, k_lt_8_strided_a) {
23779 for (size_t k = 1; k < 8; k++) {
23780 GemmMicrokernelTester()
23781 .mr(3)
23782 .nr(4)
23783 .kr(2)
23784 .sr(1)
23785 .m(3)
23786 .n(4)
23787 .k(k)
23788 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080023789 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023790 }
23791 }
23792
23793 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD64, k_lt_8_subtile) {
23794 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080023795 for (uint32_t n = 1; n <= 4; n++) {
23796 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023797 GemmMicrokernelTester()
23798 .mr(3)
23799 .nr(4)
23800 .kr(2)
23801 .sr(1)
23802 .m(m)
23803 .n(n)
23804 .k(k)
23805 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080023806 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023807 }
23808 }
23809 }
23810 }
23811
23812 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD64, k_gt_8) {
23813 for (size_t k = 9; k < 16; k++) {
23814 GemmMicrokernelTester()
23815 .mr(3)
23816 .nr(4)
23817 .kr(2)
23818 .sr(1)
23819 .m(3)
23820 .n(4)
23821 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080023822 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023823 }
23824 }
23825
23826 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD64, k_gt_8_strided_a) {
23827 for (size_t k = 9; k < 16; k++) {
23828 GemmMicrokernelTester()
23829 .mr(3)
23830 .nr(4)
23831 .kr(2)
23832 .sr(1)
23833 .m(3)
23834 .n(4)
23835 .k(k)
23836 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080023837 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023838 }
23839 }
23840
23841 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD64, k_gt_8_subtile) {
23842 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080023843 for (uint32_t n = 1; n <= 4; n++) {
23844 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023845 GemmMicrokernelTester()
23846 .mr(3)
23847 .nr(4)
23848 .kr(2)
23849 .sr(1)
23850 .m(m)
23851 .n(n)
23852 .k(k)
23853 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080023854 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023855 }
23856 }
23857 }
23858 }
23859
23860 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD64, k_div_8) {
23861 for (size_t k = 16; k <= 80; k += 8) {
23862 GemmMicrokernelTester()
23863 .mr(3)
23864 .nr(4)
23865 .kr(2)
23866 .sr(1)
23867 .m(3)
23868 .n(4)
23869 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080023870 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023871 }
23872 }
23873
23874 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD64, k_div_8_strided_a) {
23875 for (size_t k = 16; k <= 80; k += 8) {
23876 GemmMicrokernelTester()
23877 .mr(3)
23878 .nr(4)
23879 .kr(2)
23880 .sr(1)
23881 .m(3)
23882 .n(4)
23883 .k(k)
23884 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080023885 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023886 }
23887 }
23888
23889 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD64, k_div_8_subtile) {
23890 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080023891 for (uint32_t n = 1; n <= 4; n++) {
23892 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023893 GemmMicrokernelTester()
23894 .mr(3)
23895 .nr(4)
23896 .kr(2)
23897 .sr(1)
23898 .m(m)
23899 .n(n)
23900 .k(k)
23901 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080023902 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023903 }
23904 }
23905 }
23906 }
23907
23908 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD64, n_gt_4) {
23909 for (uint32_t n = 5; n < 8; n++) {
23910 for (size_t k = 1; k <= 40; k += 9) {
23911 GemmMicrokernelTester()
23912 .mr(3)
23913 .nr(4)
23914 .kr(2)
23915 .sr(1)
23916 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080023917 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023918 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080023919 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023920 }
23921 }
23922 }
23923
23924 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD64, n_gt_4_strided_cn) {
23925 for (uint32_t n = 5; n < 8; n++) {
23926 for (size_t k = 1; k <= 40; k += 9) {
23927 GemmMicrokernelTester()
23928 .mr(3)
23929 .nr(4)
23930 .kr(2)
23931 .sr(1)
23932 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080023933 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023934 .k(k)
23935 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080023936 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023937 }
23938 }
23939 }
23940
23941 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD64, n_gt_4_strided_a) {
23942 for (uint32_t n = 5; n < 8; n++) {
23943 for (size_t k = 1; k <= 40; k += 9) {
23944 GemmMicrokernelTester()
23945 .mr(3)
23946 .nr(4)
23947 .kr(2)
23948 .sr(1)
23949 .m(3)
23950 .n(n)
23951 .k(k)
23952 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080023953 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023954 }
23955 }
23956 }
23957
23958 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD64, n_gt_4_subtile) {
23959 for (uint32_t n = 5; n < 8; n++) {
23960 for (size_t k = 1; k <= 40; k += 9) {
23961 for (uint32_t m = 1; m <= 3; m++) {
23962 GemmMicrokernelTester()
23963 .mr(3)
23964 .nr(4)
23965 .kr(2)
23966 .sr(1)
23967 .m(m)
23968 .n(n)
23969 .k(k)
23970 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080023971 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023972 }
23973 }
23974 }
23975 }
23976
23977 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD64, n_div_4) {
23978 for (uint32_t n = 8; n <= 12; n += 4) {
23979 for (size_t k = 1; k <= 40; k += 9) {
23980 GemmMicrokernelTester()
23981 .mr(3)
23982 .nr(4)
23983 .kr(2)
23984 .sr(1)
23985 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080023986 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023987 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080023988 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080023989 }
23990 }
23991 }
23992
23993 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD64, n_div_4_strided_cn) {
23994 for (uint32_t n = 8; n <= 12; n += 4) {
23995 for (size_t k = 1; k <= 40; k += 9) {
23996 GemmMicrokernelTester()
23997 .mr(3)
23998 .nr(4)
23999 .kr(2)
24000 .sr(1)
24001 .m(3)
24002 .n(n)
24003 .k(k)
24004 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080024005 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024006 }
24007 }
24008 }
24009
24010 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD64, n_div_4_strided_a) {
24011 for (uint32_t n = 8; n <= 12; n += 4) {
24012 for (size_t k = 1; k <= 40; k += 9) {
24013 GemmMicrokernelTester()
24014 .mr(3)
24015 .nr(4)
24016 .kr(2)
24017 .sr(1)
24018 .m(3)
24019 .n(n)
24020 .k(k)
24021 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080024022 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024023 }
24024 }
24025 }
24026
24027 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD64, n_div_4_subtile) {
24028 for (uint32_t n = 8; n <= 12; n += 4) {
24029 for (size_t k = 1; k <= 40; k += 9) {
24030 for (uint32_t m = 1; m <= 3; m++) {
24031 GemmMicrokernelTester()
24032 .mr(3)
24033 .nr(4)
24034 .kr(2)
24035 .sr(1)
24036 .m(m)
24037 .n(n)
24038 .k(k)
24039 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080024040 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024041 }
24042 }
24043 }
24044 }
24045
24046 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD64, strided_cm_subtile) {
24047 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080024048 for (uint32_t n = 1; n <= 4; n++) {
24049 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024050 GemmMicrokernelTester()
24051 .mr(3)
24052 .nr(4)
24053 .kr(2)
24054 .sr(1)
24055 .m(m)
24056 .n(n)
24057 .k(k)
24058 .cm_stride(7)
24059 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080024060 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024061 }
24062 }
24063 }
24064 }
24065
24066 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD64, qmin) {
24067 GemmMicrokernelTester()
24068 .mr(3)
24069 .nr(4)
24070 .kr(2)
24071 .sr(1)
24072 .m(3)
24073 .n(4)
24074 .k(8)
24075 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080024076 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024077 }
24078
24079 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD64, qmax) {
24080 GemmMicrokernelTester()
24081 .mr(3)
24082 .nr(4)
24083 .kr(2)
24084 .sr(1)
24085 .m(3)
24086 .n(4)
24087 .k(8)
24088 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080024089 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024090 }
24091
24092 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD64, strided_cm) {
24093 GemmMicrokernelTester()
24094 .mr(3)
24095 .nr(4)
24096 .kr(2)
24097 .sr(1)
24098 .m(3)
24099 .n(4)
24100 .k(8)
24101 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080024102 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024103 }
24104#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
24105
24106
24107#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
24108 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__WASMSIMD_DOT16X2_LD64, k_eq_8) {
24109 GemmMicrokernelTester()
24110 .mr(4)
24111 .nr(4)
24112 .kr(2)
24113 .sr(1)
24114 .m(4)
24115 .n(4)
24116 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080024117 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024118 }
24119
24120 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__WASMSIMD_DOT16X2_LD64, strided_cn) {
24121 GemmMicrokernelTester()
24122 .mr(4)
24123 .nr(4)
24124 .kr(2)
24125 .sr(1)
24126 .m(4)
24127 .n(4)
24128 .k(8)
24129 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080024130 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024131 }
24132
24133 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__WASMSIMD_DOT16X2_LD64, k_eq_8_strided_a) {
24134 GemmMicrokernelTester()
24135 .mr(4)
24136 .nr(4)
24137 .kr(2)
24138 .sr(1)
24139 .m(4)
24140 .n(4)
24141 .k(8)
24142 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080024143 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024144 }
24145
24146 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__WASMSIMD_DOT16X2_LD64, k_eq_8_subtile) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080024147 for (uint32_t n = 1; n <= 4; n++) {
24148 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024149 GemmMicrokernelTester()
24150 .mr(4)
24151 .nr(4)
24152 .kr(2)
24153 .sr(1)
24154 .m(m)
24155 .n(n)
24156 .k(8)
24157 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080024158 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024159 }
24160 }
24161 }
24162
24163 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__WASMSIMD_DOT16X2_LD64, k_eq_8_subtile_m) {
24164 for (uint32_t m = 1; m <= 4; m++) {
24165 GemmMicrokernelTester()
24166 .mr(4)
24167 .nr(4)
24168 .kr(2)
24169 .sr(1)
24170 .m(m)
24171 .n(4)
24172 .k(8)
24173 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080024174 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024175 }
24176 }
24177
24178 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__WASMSIMD_DOT16X2_LD64, k_eq_8_subtile_n) {
24179 for (uint32_t n = 1; n <= 4; n++) {
24180 GemmMicrokernelTester()
24181 .mr(4)
24182 .nr(4)
24183 .kr(2)
24184 .sr(1)
24185 .m(4)
24186 .n(n)
24187 .k(8)
24188 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080024189 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024190 }
24191 }
24192
24193 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__WASMSIMD_DOT16X2_LD64, k_lt_8) {
24194 for (size_t k = 1; k < 8; k++) {
24195 GemmMicrokernelTester()
24196 .mr(4)
24197 .nr(4)
24198 .kr(2)
24199 .sr(1)
24200 .m(4)
24201 .n(4)
24202 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080024203 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024204 }
24205 }
24206
24207 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__WASMSIMD_DOT16X2_LD64, k_lt_8_strided_a) {
24208 for (size_t k = 1; k < 8; k++) {
24209 GemmMicrokernelTester()
24210 .mr(4)
24211 .nr(4)
24212 .kr(2)
24213 .sr(1)
24214 .m(4)
24215 .n(4)
24216 .k(k)
24217 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080024218 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024219 }
24220 }
24221
24222 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__WASMSIMD_DOT16X2_LD64, k_lt_8_subtile) {
24223 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080024224 for (uint32_t n = 1; n <= 4; n++) {
24225 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024226 GemmMicrokernelTester()
24227 .mr(4)
24228 .nr(4)
24229 .kr(2)
24230 .sr(1)
24231 .m(m)
24232 .n(n)
24233 .k(k)
24234 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080024235 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024236 }
24237 }
24238 }
24239 }
24240
24241 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__WASMSIMD_DOT16X2_LD64, k_gt_8) {
24242 for (size_t k = 9; k < 16; k++) {
24243 GemmMicrokernelTester()
24244 .mr(4)
24245 .nr(4)
24246 .kr(2)
24247 .sr(1)
24248 .m(4)
24249 .n(4)
24250 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080024251 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024252 }
24253 }
24254
24255 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__WASMSIMD_DOT16X2_LD64, k_gt_8_strided_a) {
24256 for (size_t k = 9; k < 16; k++) {
24257 GemmMicrokernelTester()
24258 .mr(4)
24259 .nr(4)
24260 .kr(2)
24261 .sr(1)
24262 .m(4)
24263 .n(4)
24264 .k(k)
24265 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080024266 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024267 }
24268 }
24269
24270 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__WASMSIMD_DOT16X2_LD64, k_gt_8_subtile) {
24271 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080024272 for (uint32_t n = 1; n <= 4; n++) {
24273 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024274 GemmMicrokernelTester()
24275 .mr(4)
24276 .nr(4)
24277 .kr(2)
24278 .sr(1)
24279 .m(m)
24280 .n(n)
24281 .k(k)
24282 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080024283 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024284 }
24285 }
24286 }
24287 }
24288
24289 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__WASMSIMD_DOT16X2_LD64, k_div_8) {
24290 for (size_t k = 16; k <= 80; k += 8) {
24291 GemmMicrokernelTester()
24292 .mr(4)
24293 .nr(4)
24294 .kr(2)
24295 .sr(1)
24296 .m(4)
24297 .n(4)
24298 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080024299 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024300 }
24301 }
24302
24303 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__WASMSIMD_DOT16X2_LD64, k_div_8_strided_a) {
24304 for (size_t k = 16; k <= 80; k += 8) {
24305 GemmMicrokernelTester()
24306 .mr(4)
24307 .nr(4)
24308 .kr(2)
24309 .sr(1)
24310 .m(4)
24311 .n(4)
24312 .k(k)
24313 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080024314 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024315 }
24316 }
24317
24318 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__WASMSIMD_DOT16X2_LD64, k_div_8_subtile) {
24319 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080024320 for (uint32_t n = 1; n <= 4; n++) {
24321 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024322 GemmMicrokernelTester()
24323 .mr(4)
24324 .nr(4)
24325 .kr(2)
24326 .sr(1)
24327 .m(m)
24328 .n(n)
24329 .k(k)
24330 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080024331 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024332 }
24333 }
24334 }
24335 }
24336
24337 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__WASMSIMD_DOT16X2_LD64, n_gt_4) {
24338 for (uint32_t n = 5; n < 8; n++) {
24339 for (size_t k = 1; k <= 40; k += 9) {
24340 GemmMicrokernelTester()
24341 .mr(4)
24342 .nr(4)
24343 .kr(2)
24344 .sr(1)
24345 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080024346 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024347 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080024348 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024349 }
24350 }
24351 }
24352
24353 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__WASMSIMD_DOT16X2_LD64, n_gt_4_strided_cn) {
24354 for (uint32_t n = 5; n < 8; n++) {
24355 for (size_t k = 1; k <= 40; k += 9) {
24356 GemmMicrokernelTester()
24357 .mr(4)
24358 .nr(4)
24359 .kr(2)
24360 .sr(1)
24361 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080024362 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024363 .k(k)
24364 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080024365 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024366 }
24367 }
24368 }
24369
24370 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__WASMSIMD_DOT16X2_LD64, n_gt_4_strided_a) {
24371 for (uint32_t n = 5; n < 8; n++) {
24372 for (size_t k = 1; k <= 40; k += 9) {
24373 GemmMicrokernelTester()
24374 .mr(4)
24375 .nr(4)
24376 .kr(2)
24377 .sr(1)
24378 .m(4)
24379 .n(n)
24380 .k(k)
24381 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080024382 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024383 }
24384 }
24385 }
24386
24387 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__WASMSIMD_DOT16X2_LD64, n_gt_4_subtile) {
24388 for (uint32_t n = 5; n < 8; n++) {
24389 for (size_t k = 1; k <= 40; k += 9) {
24390 for (uint32_t m = 1; m <= 4; m++) {
24391 GemmMicrokernelTester()
24392 .mr(4)
24393 .nr(4)
24394 .kr(2)
24395 .sr(1)
24396 .m(m)
24397 .n(n)
24398 .k(k)
24399 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080024400 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024401 }
24402 }
24403 }
24404 }
24405
24406 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__WASMSIMD_DOT16X2_LD64, n_div_4) {
24407 for (uint32_t n = 8; n <= 12; n += 4) {
24408 for (size_t k = 1; k <= 40; k += 9) {
24409 GemmMicrokernelTester()
24410 .mr(4)
24411 .nr(4)
24412 .kr(2)
24413 .sr(1)
24414 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080024415 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024416 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080024417 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024418 }
24419 }
24420 }
24421
24422 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__WASMSIMD_DOT16X2_LD64, n_div_4_strided_cn) {
24423 for (uint32_t n = 8; n <= 12; n += 4) {
24424 for (size_t k = 1; k <= 40; k += 9) {
24425 GemmMicrokernelTester()
24426 .mr(4)
24427 .nr(4)
24428 .kr(2)
24429 .sr(1)
24430 .m(4)
24431 .n(n)
24432 .k(k)
24433 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080024434 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024435 }
24436 }
24437 }
24438
24439 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__WASMSIMD_DOT16X2_LD64, n_div_4_strided_a) {
24440 for (uint32_t n = 8; n <= 12; n += 4) {
24441 for (size_t k = 1; k <= 40; k += 9) {
24442 GemmMicrokernelTester()
24443 .mr(4)
24444 .nr(4)
24445 .kr(2)
24446 .sr(1)
24447 .m(4)
24448 .n(n)
24449 .k(k)
24450 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080024451 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024452 }
24453 }
24454 }
24455
24456 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__WASMSIMD_DOT16X2_LD64, n_div_4_subtile) {
24457 for (uint32_t n = 8; n <= 12; n += 4) {
24458 for (size_t k = 1; k <= 40; k += 9) {
24459 for (uint32_t m = 1; m <= 4; m++) {
24460 GemmMicrokernelTester()
24461 .mr(4)
24462 .nr(4)
24463 .kr(2)
24464 .sr(1)
24465 .m(m)
24466 .n(n)
24467 .k(k)
24468 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080024469 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024470 }
24471 }
24472 }
24473 }
24474
24475 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__WASMSIMD_DOT16X2_LD64, strided_cm_subtile) {
24476 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080024477 for (uint32_t n = 1; n <= 4; n++) {
24478 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024479 GemmMicrokernelTester()
24480 .mr(4)
24481 .nr(4)
24482 .kr(2)
24483 .sr(1)
24484 .m(m)
24485 .n(n)
24486 .k(k)
24487 .cm_stride(7)
24488 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080024489 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024490 }
24491 }
24492 }
24493 }
24494
24495 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__WASMSIMD_DOT16X2_LD64, qmin) {
24496 GemmMicrokernelTester()
24497 .mr(4)
24498 .nr(4)
24499 .kr(2)
24500 .sr(1)
24501 .m(4)
24502 .n(4)
24503 .k(8)
24504 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080024505 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024506 }
24507
24508 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__WASMSIMD_DOT16X2_LD64, qmax) {
24509 GemmMicrokernelTester()
24510 .mr(4)
24511 .nr(4)
24512 .kr(2)
24513 .sr(1)
24514 .m(4)
24515 .n(4)
24516 .k(8)
24517 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080024518 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024519 }
24520
24521 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__WASMSIMD_DOT16X2_LD64, strided_cm) {
24522 GemmMicrokernelTester()
24523 .mr(4)
24524 .nr(4)
24525 .kr(2)
24526 .sr(1)
24527 .m(4)
24528 .n(4)
24529 .k(8)
24530 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080024531 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024532 }
24533#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
24534
24535
24536#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
24537 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD128, k_eq_8) {
24538 GemmMicrokernelTester()
24539 .mr(3)
24540 .nr(4)
24541 .kr(2)
24542 .sr(1)
24543 .m(3)
24544 .n(4)
24545 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080024546 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024547 }
24548
24549 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD128, strided_cn) {
24550 GemmMicrokernelTester()
24551 .mr(3)
24552 .nr(4)
24553 .kr(2)
24554 .sr(1)
24555 .m(3)
24556 .n(4)
24557 .k(8)
24558 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080024559 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024560 }
24561
24562 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD128, k_eq_8_strided_a) {
24563 GemmMicrokernelTester()
24564 .mr(3)
24565 .nr(4)
24566 .kr(2)
24567 .sr(1)
24568 .m(3)
24569 .n(4)
24570 .k(8)
24571 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080024572 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024573 }
24574
24575 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD128, k_eq_8_subtile) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080024576 for (uint32_t n = 1; n <= 4; n++) {
24577 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024578 GemmMicrokernelTester()
24579 .mr(3)
24580 .nr(4)
24581 .kr(2)
24582 .sr(1)
24583 .m(m)
24584 .n(n)
24585 .k(8)
24586 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080024587 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024588 }
24589 }
24590 }
24591
24592 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD128, k_eq_8_subtile_m) {
24593 for (uint32_t m = 1; m <= 3; m++) {
24594 GemmMicrokernelTester()
24595 .mr(3)
24596 .nr(4)
24597 .kr(2)
24598 .sr(1)
24599 .m(m)
24600 .n(4)
24601 .k(8)
24602 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080024603 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024604 }
24605 }
24606
24607 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD128, k_eq_8_subtile_n) {
24608 for (uint32_t n = 1; n <= 4; n++) {
24609 GemmMicrokernelTester()
24610 .mr(3)
24611 .nr(4)
24612 .kr(2)
24613 .sr(1)
24614 .m(3)
24615 .n(n)
24616 .k(8)
24617 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080024618 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024619 }
24620 }
24621
24622 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD128, k_lt_8) {
24623 for (size_t k = 1; k < 8; k++) {
24624 GemmMicrokernelTester()
24625 .mr(3)
24626 .nr(4)
24627 .kr(2)
24628 .sr(1)
24629 .m(3)
24630 .n(4)
24631 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080024632 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024633 }
24634 }
24635
24636 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD128, k_lt_8_strided_a) {
24637 for (size_t k = 1; k < 8; k++) {
24638 GemmMicrokernelTester()
24639 .mr(3)
24640 .nr(4)
24641 .kr(2)
24642 .sr(1)
24643 .m(3)
24644 .n(4)
24645 .k(k)
24646 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080024647 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024648 }
24649 }
24650
24651 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD128, k_lt_8_subtile) {
24652 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080024653 for (uint32_t n = 1; n <= 4; n++) {
24654 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024655 GemmMicrokernelTester()
24656 .mr(3)
24657 .nr(4)
24658 .kr(2)
24659 .sr(1)
24660 .m(m)
24661 .n(n)
24662 .k(k)
24663 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080024664 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024665 }
24666 }
24667 }
24668 }
24669
24670 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD128, k_gt_8) {
24671 for (size_t k = 9; k < 16; k++) {
24672 GemmMicrokernelTester()
24673 .mr(3)
24674 .nr(4)
24675 .kr(2)
24676 .sr(1)
24677 .m(3)
24678 .n(4)
24679 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080024680 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024681 }
24682 }
24683
24684 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD128, k_gt_8_strided_a) {
24685 for (size_t k = 9; k < 16; k++) {
24686 GemmMicrokernelTester()
24687 .mr(3)
24688 .nr(4)
24689 .kr(2)
24690 .sr(1)
24691 .m(3)
24692 .n(4)
24693 .k(k)
24694 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080024695 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024696 }
24697 }
24698
24699 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD128, k_gt_8_subtile) {
24700 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080024701 for (uint32_t n = 1; n <= 4; n++) {
24702 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024703 GemmMicrokernelTester()
24704 .mr(3)
24705 .nr(4)
24706 .kr(2)
24707 .sr(1)
24708 .m(m)
24709 .n(n)
24710 .k(k)
24711 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080024712 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024713 }
24714 }
24715 }
24716 }
24717
24718 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD128, k_div_8) {
24719 for (size_t k = 16; k <= 80; k += 8) {
24720 GemmMicrokernelTester()
24721 .mr(3)
24722 .nr(4)
24723 .kr(2)
24724 .sr(1)
24725 .m(3)
24726 .n(4)
24727 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080024728 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024729 }
24730 }
24731
24732 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD128, k_div_8_strided_a) {
24733 for (size_t k = 16; k <= 80; k += 8) {
24734 GemmMicrokernelTester()
24735 .mr(3)
24736 .nr(4)
24737 .kr(2)
24738 .sr(1)
24739 .m(3)
24740 .n(4)
24741 .k(k)
24742 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080024743 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024744 }
24745 }
24746
24747 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD128, k_div_8_subtile) {
24748 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080024749 for (uint32_t n = 1; n <= 4; n++) {
24750 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024751 GemmMicrokernelTester()
24752 .mr(3)
24753 .nr(4)
24754 .kr(2)
24755 .sr(1)
24756 .m(m)
24757 .n(n)
24758 .k(k)
24759 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080024760 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024761 }
24762 }
24763 }
24764 }
24765
24766 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD128, n_gt_4) {
24767 for (uint32_t n = 5; n < 8; n++) {
24768 for (size_t k = 1; k <= 40; k += 9) {
24769 GemmMicrokernelTester()
24770 .mr(3)
24771 .nr(4)
24772 .kr(2)
24773 .sr(1)
24774 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080024775 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024776 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080024777 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024778 }
24779 }
24780 }
24781
24782 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD128, n_gt_4_strided_cn) {
24783 for (uint32_t n = 5; n < 8; n++) {
24784 for (size_t k = 1; k <= 40; k += 9) {
24785 GemmMicrokernelTester()
24786 .mr(3)
24787 .nr(4)
24788 .kr(2)
24789 .sr(1)
24790 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080024791 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024792 .k(k)
24793 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080024794 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024795 }
24796 }
24797 }
24798
24799 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD128, n_gt_4_strided_a) {
24800 for (uint32_t n = 5; n < 8; n++) {
24801 for (size_t k = 1; k <= 40; k += 9) {
24802 GemmMicrokernelTester()
24803 .mr(3)
24804 .nr(4)
24805 .kr(2)
24806 .sr(1)
24807 .m(3)
24808 .n(n)
24809 .k(k)
24810 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080024811 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024812 }
24813 }
24814 }
24815
24816 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD128, n_gt_4_subtile) {
24817 for (uint32_t n = 5; n < 8; n++) {
24818 for (size_t k = 1; k <= 40; k += 9) {
24819 for (uint32_t m = 1; m <= 3; m++) {
24820 GemmMicrokernelTester()
24821 .mr(3)
24822 .nr(4)
24823 .kr(2)
24824 .sr(1)
24825 .m(m)
24826 .n(n)
24827 .k(k)
24828 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080024829 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024830 }
24831 }
24832 }
24833 }
24834
24835 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD128, n_div_4) {
24836 for (uint32_t n = 8; n <= 12; n += 4) {
24837 for (size_t k = 1; k <= 40; k += 9) {
24838 GemmMicrokernelTester()
24839 .mr(3)
24840 .nr(4)
24841 .kr(2)
24842 .sr(1)
24843 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080024844 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024845 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080024846 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024847 }
24848 }
24849 }
24850
24851 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD128, n_div_4_strided_cn) {
24852 for (uint32_t n = 8; n <= 12; n += 4) {
24853 for (size_t k = 1; k <= 40; k += 9) {
24854 GemmMicrokernelTester()
24855 .mr(3)
24856 .nr(4)
24857 .kr(2)
24858 .sr(1)
24859 .m(3)
24860 .n(n)
24861 .k(k)
24862 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080024863 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024864 }
24865 }
24866 }
24867
24868 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD128, n_div_4_strided_a) {
24869 for (uint32_t n = 8; n <= 12; n += 4) {
24870 for (size_t k = 1; k <= 40; k += 9) {
24871 GemmMicrokernelTester()
24872 .mr(3)
24873 .nr(4)
24874 .kr(2)
24875 .sr(1)
24876 .m(3)
24877 .n(n)
24878 .k(k)
24879 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080024880 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024881 }
24882 }
24883 }
24884
24885 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD128, n_div_4_subtile) {
24886 for (uint32_t n = 8; n <= 12; n += 4) {
24887 for (size_t k = 1; k <= 40; k += 9) {
24888 for (uint32_t m = 1; m <= 3; m++) {
24889 GemmMicrokernelTester()
24890 .mr(3)
24891 .nr(4)
24892 .kr(2)
24893 .sr(1)
24894 .m(m)
24895 .n(n)
24896 .k(k)
24897 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080024898 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024899 }
24900 }
24901 }
24902 }
24903
24904 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD128, strided_cm_subtile) {
24905 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080024906 for (uint32_t n = 1; n <= 4; n++) {
24907 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024908 GemmMicrokernelTester()
24909 .mr(3)
24910 .nr(4)
24911 .kr(2)
24912 .sr(1)
24913 .m(m)
24914 .n(n)
24915 .k(k)
24916 .cm_stride(7)
24917 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080024918 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024919 }
24920 }
24921 }
24922 }
24923
24924 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD128, qmin) {
24925 GemmMicrokernelTester()
24926 .mr(3)
24927 .nr(4)
24928 .kr(2)
24929 .sr(1)
24930 .m(3)
24931 .n(4)
24932 .k(8)
24933 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080024934 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024935 }
24936
24937 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD128, qmax) {
24938 GemmMicrokernelTester()
24939 .mr(3)
24940 .nr(4)
24941 .kr(2)
24942 .sr(1)
24943 .m(3)
24944 .n(4)
24945 .k(8)
24946 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080024947 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024948 }
24949
24950 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD128, strided_cm) {
24951 GemmMicrokernelTester()
24952 .mr(3)
24953 .nr(4)
24954 .kr(2)
24955 .sr(1)
24956 .m(3)
24957 .n(4)
24958 .k(8)
24959 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080024960 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024961 }
24962#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
24963
24964
24965#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
24966 TEST(QC8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD128, k_eq_8) {
24967 GemmMicrokernelTester()
24968 .mr(4)
24969 .nr(4)
24970 .kr(2)
24971 .sr(4)
24972 .m(4)
24973 .n(4)
24974 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080024975 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024976 }
24977
24978 TEST(QC8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD128, strided_cn) {
24979 GemmMicrokernelTester()
24980 .mr(4)
24981 .nr(4)
24982 .kr(2)
24983 .sr(4)
24984 .m(4)
24985 .n(4)
24986 .k(8)
24987 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080024988 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080024989 }
24990
24991 TEST(QC8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD128, k_eq_8_strided_a) {
24992 GemmMicrokernelTester()
24993 .mr(4)
24994 .nr(4)
24995 .kr(2)
24996 .sr(4)
24997 .m(4)
24998 .n(4)
24999 .k(8)
25000 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080025001 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025002 }
25003
25004 TEST(QC8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD128, k_eq_8_subtile) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080025005 for (uint32_t n = 1; n <= 4; n++) {
25006 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025007 GemmMicrokernelTester()
25008 .mr(4)
25009 .nr(4)
25010 .kr(2)
25011 .sr(4)
25012 .m(m)
25013 .n(n)
25014 .k(8)
25015 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080025016 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025017 }
25018 }
25019 }
25020
25021 TEST(QC8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD128, k_eq_8_subtile_m) {
25022 for (uint32_t m = 1; m <= 4; m++) {
25023 GemmMicrokernelTester()
25024 .mr(4)
25025 .nr(4)
25026 .kr(2)
25027 .sr(4)
25028 .m(m)
25029 .n(4)
25030 .k(8)
25031 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080025032 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025033 }
25034 }
25035
25036 TEST(QC8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD128, k_eq_8_subtile_n) {
25037 for (uint32_t n = 1; n <= 4; n++) {
25038 GemmMicrokernelTester()
25039 .mr(4)
25040 .nr(4)
25041 .kr(2)
25042 .sr(4)
25043 .m(4)
25044 .n(n)
25045 .k(8)
25046 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080025047 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025048 }
25049 }
25050
25051 TEST(QC8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD128, k_lt_8) {
25052 for (size_t k = 1; k < 8; k++) {
25053 GemmMicrokernelTester()
25054 .mr(4)
25055 .nr(4)
25056 .kr(2)
25057 .sr(4)
25058 .m(4)
25059 .n(4)
25060 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080025061 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025062 }
25063 }
25064
25065 TEST(QC8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD128, k_lt_8_strided_a) {
25066 for (size_t k = 1; k < 8; k++) {
25067 GemmMicrokernelTester()
25068 .mr(4)
25069 .nr(4)
25070 .kr(2)
25071 .sr(4)
25072 .m(4)
25073 .n(4)
25074 .k(k)
25075 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080025076 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025077 }
25078 }
25079
25080 TEST(QC8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD128, k_lt_8_subtile) {
25081 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080025082 for (uint32_t n = 1; n <= 4; n++) {
25083 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025084 GemmMicrokernelTester()
25085 .mr(4)
25086 .nr(4)
25087 .kr(2)
25088 .sr(4)
25089 .m(m)
25090 .n(n)
25091 .k(k)
25092 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080025093 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025094 }
25095 }
25096 }
25097 }
25098
25099 TEST(QC8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD128, k_gt_8) {
25100 for (size_t k = 9; k < 16; k++) {
25101 GemmMicrokernelTester()
25102 .mr(4)
25103 .nr(4)
25104 .kr(2)
25105 .sr(4)
25106 .m(4)
25107 .n(4)
25108 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080025109 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025110 }
25111 }
25112
25113 TEST(QC8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD128, k_gt_8_strided_a) {
25114 for (size_t k = 9; k < 16; k++) {
25115 GemmMicrokernelTester()
25116 .mr(4)
25117 .nr(4)
25118 .kr(2)
25119 .sr(4)
25120 .m(4)
25121 .n(4)
25122 .k(k)
25123 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080025124 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025125 }
25126 }
25127
25128 TEST(QC8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD128, k_gt_8_subtile) {
25129 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080025130 for (uint32_t n = 1; n <= 4; n++) {
25131 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025132 GemmMicrokernelTester()
25133 .mr(4)
25134 .nr(4)
25135 .kr(2)
25136 .sr(4)
25137 .m(m)
25138 .n(n)
25139 .k(k)
25140 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080025141 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025142 }
25143 }
25144 }
25145 }
25146
25147 TEST(QC8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD128, k_div_8) {
25148 for (size_t k = 16; k <= 80; k += 8) {
25149 GemmMicrokernelTester()
25150 .mr(4)
25151 .nr(4)
25152 .kr(2)
25153 .sr(4)
25154 .m(4)
25155 .n(4)
25156 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080025157 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025158 }
25159 }
25160
25161 TEST(QC8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD128, k_div_8_strided_a) {
25162 for (size_t k = 16; k <= 80; k += 8) {
25163 GemmMicrokernelTester()
25164 .mr(4)
25165 .nr(4)
25166 .kr(2)
25167 .sr(4)
25168 .m(4)
25169 .n(4)
25170 .k(k)
25171 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080025172 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025173 }
25174 }
25175
25176 TEST(QC8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD128, k_div_8_subtile) {
25177 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080025178 for (uint32_t n = 1; n <= 4; n++) {
25179 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025180 GemmMicrokernelTester()
25181 .mr(4)
25182 .nr(4)
25183 .kr(2)
25184 .sr(4)
25185 .m(m)
25186 .n(n)
25187 .k(k)
25188 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080025189 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025190 }
25191 }
25192 }
25193 }
25194
25195 TEST(QC8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD128, n_gt_4) {
25196 for (uint32_t n = 5; n < 8; n++) {
25197 for (size_t k = 1; k <= 40; k += 9) {
25198 GemmMicrokernelTester()
25199 .mr(4)
25200 .nr(4)
25201 .kr(2)
25202 .sr(4)
25203 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080025204 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025205 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080025206 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025207 }
25208 }
25209 }
25210
25211 TEST(QC8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD128, n_gt_4_strided_cn) {
25212 for (uint32_t n = 5; n < 8; n++) {
25213 for (size_t k = 1; k <= 40; k += 9) {
25214 GemmMicrokernelTester()
25215 .mr(4)
25216 .nr(4)
25217 .kr(2)
25218 .sr(4)
25219 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080025220 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025221 .k(k)
25222 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080025223 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025224 }
25225 }
25226 }
25227
25228 TEST(QC8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD128, n_gt_4_strided_a) {
25229 for (uint32_t n = 5; n < 8; n++) {
25230 for (size_t k = 1; k <= 40; k += 9) {
25231 GemmMicrokernelTester()
25232 .mr(4)
25233 .nr(4)
25234 .kr(2)
25235 .sr(4)
25236 .m(4)
25237 .n(n)
25238 .k(k)
25239 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080025240 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025241 }
25242 }
25243 }
25244
25245 TEST(QC8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD128, n_gt_4_subtile) {
25246 for (uint32_t n = 5; n < 8; n++) {
25247 for (size_t k = 1; k <= 40; k += 9) {
25248 for (uint32_t m = 1; m <= 4; m++) {
25249 GemmMicrokernelTester()
25250 .mr(4)
25251 .nr(4)
25252 .kr(2)
25253 .sr(4)
25254 .m(m)
25255 .n(n)
25256 .k(k)
25257 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080025258 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025259 }
25260 }
25261 }
25262 }
25263
25264 TEST(QC8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD128, n_div_4) {
25265 for (uint32_t n = 8; n <= 12; n += 4) {
25266 for (size_t k = 1; k <= 40; k += 9) {
25267 GemmMicrokernelTester()
25268 .mr(4)
25269 .nr(4)
25270 .kr(2)
25271 .sr(4)
25272 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080025273 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025274 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080025275 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025276 }
25277 }
25278 }
25279
25280 TEST(QC8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD128, n_div_4_strided_cn) {
25281 for (uint32_t n = 8; n <= 12; n += 4) {
25282 for (size_t k = 1; k <= 40; k += 9) {
25283 GemmMicrokernelTester()
25284 .mr(4)
25285 .nr(4)
25286 .kr(2)
25287 .sr(4)
25288 .m(4)
25289 .n(n)
25290 .k(k)
25291 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080025292 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025293 }
25294 }
25295 }
25296
25297 TEST(QC8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD128, n_div_4_strided_a) {
25298 for (uint32_t n = 8; n <= 12; n += 4) {
25299 for (size_t k = 1; k <= 40; k += 9) {
25300 GemmMicrokernelTester()
25301 .mr(4)
25302 .nr(4)
25303 .kr(2)
25304 .sr(4)
25305 .m(4)
25306 .n(n)
25307 .k(k)
25308 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080025309 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025310 }
25311 }
25312 }
25313
25314 TEST(QC8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD128, n_div_4_subtile) {
25315 for (uint32_t n = 8; n <= 12; n += 4) {
25316 for (size_t k = 1; k <= 40; k += 9) {
25317 for (uint32_t m = 1; m <= 4; m++) {
25318 GemmMicrokernelTester()
25319 .mr(4)
25320 .nr(4)
25321 .kr(2)
25322 .sr(4)
25323 .m(m)
25324 .n(n)
25325 .k(k)
25326 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080025327 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025328 }
25329 }
25330 }
25331 }
25332
25333 TEST(QC8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD128, strided_cm_subtile) {
25334 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080025335 for (uint32_t n = 1; n <= 4; n++) {
25336 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025337 GemmMicrokernelTester()
25338 .mr(4)
25339 .nr(4)
25340 .kr(2)
25341 .sr(4)
25342 .m(m)
25343 .n(n)
25344 .k(k)
25345 .cm_stride(7)
25346 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080025347 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025348 }
25349 }
25350 }
25351 }
25352
25353 TEST(QC8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD128, qmin) {
25354 GemmMicrokernelTester()
25355 .mr(4)
25356 .nr(4)
25357 .kr(2)
25358 .sr(4)
25359 .m(4)
25360 .n(4)
25361 .k(8)
25362 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080025363 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025364 }
25365
25366 TEST(QC8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD128, qmax) {
25367 GemmMicrokernelTester()
25368 .mr(4)
25369 .nr(4)
25370 .kr(2)
25371 .sr(4)
25372 .m(4)
25373 .n(4)
25374 .k(8)
25375 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080025376 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025377 }
25378
25379 TEST(QC8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD128, strided_cm) {
25380 GemmMicrokernelTester()
25381 .mr(4)
25382 .nr(4)
25383 .kr(2)
25384 .sr(4)
25385 .m(4)
25386 .n(4)
25387 .k(8)
25388 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080025389 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025390 }
25391#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
25392
25393
25394#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
25395 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD64, k_eq_8) {
25396 GemmMicrokernelTester()
25397 .mr(3)
25398 .nr(4)
25399 .kr(8)
25400 .sr(1)
25401 .m(3)
25402 .n(4)
25403 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080025404 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025405 }
25406
25407 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD64, strided_cn) {
25408 GemmMicrokernelTester()
25409 .mr(3)
25410 .nr(4)
25411 .kr(8)
25412 .sr(1)
25413 .m(3)
25414 .n(4)
25415 .k(8)
25416 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080025417 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025418 }
25419
25420 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD64, k_eq_8_strided_a) {
25421 GemmMicrokernelTester()
25422 .mr(3)
25423 .nr(4)
25424 .kr(8)
25425 .sr(1)
25426 .m(3)
25427 .n(4)
25428 .k(8)
25429 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080025430 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025431 }
25432
25433 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD64, k_eq_8_subtile) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080025434 for (uint32_t n = 1; n <= 4; n++) {
25435 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025436 GemmMicrokernelTester()
25437 .mr(3)
25438 .nr(4)
25439 .kr(8)
25440 .sr(1)
25441 .m(m)
25442 .n(n)
25443 .k(8)
25444 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080025445 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025446 }
25447 }
25448 }
25449
25450 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD64, k_eq_8_subtile_m) {
25451 for (uint32_t m = 1; m <= 3; m++) {
25452 GemmMicrokernelTester()
25453 .mr(3)
25454 .nr(4)
25455 .kr(8)
25456 .sr(1)
25457 .m(m)
25458 .n(4)
25459 .k(8)
25460 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080025461 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025462 }
25463 }
25464
25465 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD64, k_eq_8_subtile_n) {
25466 for (uint32_t n = 1; n <= 4; n++) {
25467 GemmMicrokernelTester()
25468 .mr(3)
25469 .nr(4)
25470 .kr(8)
25471 .sr(1)
25472 .m(3)
25473 .n(n)
25474 .k(8)
25475 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080025476 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025477 }
25478 }
25479
25480 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD64, k_lt_8) {
25481 for (size_t k = 1; k < 8; k++) {
25482 GemmMicrokernelTester()
25483 .mr(3)
25484 .nr(4)
25485 .kr(8)
25486 .sr(1)
25487 .m(3)
25488 .n(4)
25489 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080025490 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025491 }
25492 }
25493
25494 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD64, k_lt_8_strided_a) {
25495 for (size_t k = 1; k < 8; k++) {
25496 GemmMicrokernelTester()
25497 .mr(3)
25498 .nr(4)
25499 .kr(8)
25500 .sr(1)
25501 .m(3)
25502 .n(4)
25503 .k(k)
25504 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080025505 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025506 }
25507 }
25508
25509 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD64, k_lt_8_subtile) {
25510 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080025511 for (uint32_t n = 1; n <= 4; n++) {
25512 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025513 GemmMicrokernelTester()
25514 .mr(3)
25515 .nr(4)
25516 .kr(8)
25517 .sr(1)
25518 .m(m)
25519 .n(n)
25520 .k(k)
25521 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080025522 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025523 }
25524 }
25525 }
25526 }
25527
25528 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD64, k_gt_8) {
25529 for (size_t k = 9; k < 16; k++) {
25530 GemmMicrokernelTester()
25531 .mr(3)
25532 .nr(4)
25533 .kr(8)
25534 .sr(1)
25535 .m(3)
25536 .n(4)
25537 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080025538 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025539 }
25540 }
25541
25542 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD64, k_gt_8_strided_a) {
25543 for (size_t k = 9; k < 16; k++) {
25544 GemmMicrokernelTester()
25545 .mr(3)
25546 .nr(4)
25547 .kr(8)
25548 .sr(1)
25549 .m(3)
25550 .n(4)
25551 .k(k)
25552 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080025553 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025554 }
25555 }
25556
25557 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD64, k_gt_8_subtile) {
25558 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080025559 for (uint32_t n = 1; n <= 4; n++) {
25560 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025561 GemmMicrokernelTester()
25562 .mr(3)
25563 .nr(4)
25564 .kr(8)
25565 .sr(1)
25566 .m(m)
25567 .n(n)
25568 .k(k)
25569 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080025570 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025571 }
25572 }
25573 }
25574 }
25575
25576 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD64, k_div_8) {
25577 for (size_t k = 16; k <= 80; k += 8) {
25578 GemmMicrokernelTester()
25579 .mr(3)
25580 .nr(4)
25581 .kr(8)
25582 .sr(1)
25583 .m(3)
25584 .n(4)
25585 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080025586 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025587 }
25588 }
25589
25590 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD64, k_div_8_strided_a) {
25591 for (size_t k = 16; k <= 80; k += 8) {
25592 GemmMicrokernelTester()
25593 .mr(3)
25594 .nr(4)
25595 .kr(8)
25596 .sr(1)
25597 .m(3)
25598 .n(4)
25599 .k(k)
25600 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080025601 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025602 }
25603 }
25604
25605 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD64, k_div_8_subtile) {
25606 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080025607 for (uint32_t n = 1; n <= 4; n++) {
25608 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025609 GemmMicrokernelTester()
25610 .mr(3)
25611 .nr(4)
25612 .kr(8)
25613 .sr(1)
25614 .m(m)
25615 .n(n)
25616 .k(k)
25617 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080025618 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025619 }
25620 }
25621 }
25622 }
25623
25624 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD64, n_gt_4) {
25625 for (uint32_t n = 5; n < 8; n++) {
25626 for (size_t k = 1; k <= 40; k += 9) {
25627 GemmMicrokernelTester()
25628 .mr(3)
25629 .nr(4)
25630 .kr(8)
25631 .sr(1)
25632 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080025633 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025634 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080025635 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025636 }
25637 }
25638 }
25639
25640 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD64, n_gt_4_strided_cn) {
25641 for (uint32_t n = 5; n < 8; n++) {
25642 for (size_t k = 1; k <= 40; k += 9) {
25643 GemmMicrokernelTester()
25644 .mr(3)
25645 .nr(4)
25646 .kr(8)
25647 .sr(1)
25648 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080025649 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025650 .k(k)
25651 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080025652 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025653 }
25654 }
25655 }
25656
25657 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD64, n_gt_4_strided_a) {
25658 for (uint32_t n = 5; n < 8; n++) {
25659 for (size_t k = 1; k <= 40; k += 9) {
25660 GemmMicrokernelTester()
25661 .mr(3)
25662 .nr(4)
25663 .kr(8)
25664 .sr(1)
25665 .m(3)
25666 .n(n)
25667 .k(k)
25668 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080025669 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025670 }
25671 }
25672 }
25673
25674 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD64, n_gt_4_subtile) {
25675 for (uint32_t n = 5; n < 8; n++) {
25676 for (size_t k = 1; k <= 40; k += 9) {
25677 for (uint32_t m = 1; m <= 3; m++) {
25678 GemmMicrokernelTester()
25679 .mr(3)
25680 .nr(4)
25681 .kr(8)
25682 .sr(1)
25683 .m(m)
25684 .n(n)
25685 .k(k)
25686 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080025687 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025688 }
25689 }
25690 }
25691 }
25692
25693 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD64, n_div_4) {
25694 for (uint32_t n = 8; n <= 12; n += 4) {
25695 for (size_t k = 1; k <= 40; k += 9) {
25696 GemmMicrokernelTester()
25697 .mr(3)
25698 .nr(4)
25699 .kr(8)
25700 .sr(1)
25701 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080025702 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025703 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080025704 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025705 }
25706 }
25707 }
25708
25709 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD64, n_div_4_strided_cn) {
25710 for (uint32_t n = 8; n <= 12; n += 4) {
25711 for (size_t k = 1; k <= 40; k += 9) {
25712 GemmMicrokernelTester()
25713 .mr(3)
25714 .nr(4)
25715 .kr(8)
25716 .sr(1)
25717 .m(3)
25718 .n(n)
25719 .k(k)
25720 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080025721 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025722 }
25723 }
25724 }
25725
25726 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD64, n_div_4_strided_a) {
25727 for (uint32_t n = 8; n <= 12; n += 4) {
25728 for (size_t k = 1; k <= 40; k += 9) {
25729 GemmMicrokernelTester()
25730 .mr(3)
25731 .nr(4)
25732 .kr(8)
25733 .sr(1)
25734 .m(3)
25735 .n(n)
25736 .k(k)
25737 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080025738 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025739 }
25740 }
25741 }
25742
25743 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD64, n_div_4_subtile) {
25744 for (uint32_t n = 8; n <= 12; n += 4) {
25745 for (size_t k = 1; k <= 40; k += 9) {
25746 for (uint32_t m = 1; m <= 3; m++) {
25747 GemmMicrokernelTester()
25748 .mr(3)
25749 .nr(4)
25750 .kr(8)
25751 .sr(1)
25752 .m(m)
25753 .n(n)
25754 .k(k)
25755 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080025756 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025757 }
25758 }
25759 }
25760 }
25761
25762 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD64, strided_cm_subtile) {
25763 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080025764 for (uint32_t n = 1; n <= 4; n++) {
25765 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025766 GemmMicrokernelTester()
25767 .mr(3)
25768 .nr(4)
25769 .kr(8)
25770 .sr(1)
25771 .m(m)
25772 .n(n)
25773 .k(k)
25774 .cm_stride(7)
25775 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080025776 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025777 }
25778 }
25779 }
25780 }
25781
25782 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD64, qmin) {
25783 GemmMicrokernelTester()
25784 .mr(3)
25785 .nr(4)
25786 .kr(8)
25787 .sr(1)
25788 .m(3)
25789 .n(4)
25790 .k(8)
25791 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080025792 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025793 }
25794
25795 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD64, qmax) {
25796 GemmMicrokernelTester()
25797 .mr(3)
25798 .nr(4)
25799 .kr(8)
25800 .sr(1)
25801 .m(3)
25802 .n(4)
25803 .k(8)
25804 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080025805 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025806 }
25807
25808 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD64, strided_cm) {
25809 GemmMicrokernelTester()
25810 .mr(3)
25811 .nr(4)
25812 .kr(8)
25813 .sr(1)
25814 .m(3)
25815 .n(4)
25816 .k(8)
25817 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080025818 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025819 }
25820#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
25821
25822
25823#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
25824 TEST(QC8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD128, k_eq_8) {
25825 GemmMicrokernelTester()
25826 .mr(4)
25827 .nr(4)
25828 .kr(8)
25829 .sr(1)
25830 .m(4)
25831 .n(4)
25832 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080025833 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025834 }
25835
25836 TEST(QC8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD128, strided_cn) {
25837 GemmMicrokernelTester()
25838 .mr(4)
25839 .nr(4)
25840 .kr(8)
25841 .sr(1)
25842 .m(4)
25843 .n(4)
25844 .k(8)
25845 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080025846 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025847 }
25848
25849 TEST(QC8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD128, k_eq_8_strided_a) {
25850 GemmMicrokernelTester()
25851 .mr(4)
25852 .nr(4)
25853 .kr(8)
25854 .sr(1)
25855 .m(4)
25856 .n(4)
25857 .k(8)
25858 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080025859 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025860 }
25861
25862 TEST(QC8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD128, k_eq_8_subtile) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080025863 for (uint32_t n = 1; n <= 4; n++) {
25864 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025865 GemmMicrokernelTester()
25866 .mr(4)
25867 .nr(4)
25868 .kr(8)
25869 .sr(1)
25870 .m(m)
25871 .n(n)
25872 .k(8)
25873 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080025874 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025875 }
25876 }
25877 }
25878
25879 TEST(QC8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD128, k_eq_8_subtile_m) {
25880 for (uint32_t m = 1; m <= 4; m++) {
25881 GemmMicrokernelTester()
25882 .mr(4)
25883 .nr(4)
25884 .kr(8)
25885 .sr(1)
25886 .m(m)
25887 .n(4)
25888 .k(8)
25889 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080025890 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025891 }
25892 }
25893
25894 TEST(QC8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD128, k_eq_8_subtile_n) {
25895 for (uint32_t n = 1; n <= 4; n++) {
25896 GemmMicrokernelTester()
25897 .mr(4)
25898 .nr(4)
25899 .kr(8)
25900 .sr(1)
25901 .m(4)
25902 .n(n)
25903 .k(8)
25904 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080025905 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025906 }
25907 }
25908
25909 TEST(QC8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD128, k_lt_8) {
25910 for (size_t k = 1; k < 8; k++) {
25911 GemmMicrokernelTester()
25912 .mr(4)
25913 .nr(4)
25914 .kr(8)
25915 .sr(1)
25916 .m(4)
25917 .n(4)
25918 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080025919 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025920 }
25921 }
25922
25923 TEST(QC8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD128, k_lt_8_strided_a) {
25924 for (size_t k = 1; k < 8; k++) {
25925 GemmMicrokernelTester()
25926 .mr(4)
25927 .nr(4)
25928 .kr(8)
25929 .sr(1)
25930 .m(4)
25931 .n(4)
25932 .k(k)
25933 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080025934 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025935 }
25936 }
25937
25938 TEST(QC8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD128, k_lt_8_subtile) {
25939 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080025940 for (uint32_t n = 1; n <= 4; n++) {
25941 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025942 GemmMicrokernelTester()
25943 .mr(4)
25944 .nr(4)
25945 .kr(8)
25946 .sr(1)
25947 .m(m)
25948 .n(n)
25949 .k(k)
25950 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080025951 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025952 }
25953 }
25954 }
25955 }
25956
25957 TEST(QC8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD128, k_gt_8) {
25958 for (size_t k = 9; k < 16; k++) {
25959 GemmMicrokernelTester()
25960 .mr(4)
25961 .nr(4)
25962 .kr(8)
25963 .sr(1)
25964 .m(4)
25965 .n(4)
25966 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080025967 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025968 }
25969 }
25970
25971 TEST(QC8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD128, k_gt_8_strided_a) {
25972 for (size_t k = 9; k < 16; k++) {
25973 GemmMicrokernelTester()
25974 .mr(4)
25975 .nr(4)
25976 .kr(8)
25977 .sr(1)
25978 .m(4)
25979 .n(4)
25980 .k(k)
25981 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080025982 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025983 }
25984 }
25985
25986 TEST(QC8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD128, k_gt_8_subtile) {
25987 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080025988 for (uint32_t n = 1; n <= 4; n++) {
25989 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080025990 GemmMicrokernelTester()
25991 .mr(4)
25992 .nr(4)
25993 .kr(8)
25994 .sr(1)
25995 .m(m)
25996 .n(n)
25997 .k(k)
25998 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080025999 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026000 }
26001 }
26002 }
26003 }
26004
26005 TEST(QC8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD128, k_div_8) {
26006 for (size_t k = 16; k <= 80; k += 8) {
26007 GemmMicrokernelTester()
26008 .mr(4)
26009 .nr(4)
26010 .kr(8)
26011 .sr(1)
26012 .m(4)
26013 .n(4)
26014 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080026015 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026016 }
26017 }
26018
26019 TEST(QC8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD128, k_div_8_strided_a) {
26020 for (size_t k = 16; k <= 80; k += 8) {
26021 GemmMicrokernelTester()
26022 .mr(4)
26023 .nr(4)
26024 .kr(8)
26025 .sr(1)
26026 .m(4)
26027 .n(4)
26028 .k(k)
26029 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080026030 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026031 }
26032 }
26033
26034 TEST(QC8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD128, k_div_8_subtile) {
26035 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080026036 for (uint32_t n = 1; n <= 4; n++) {
26037 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026038 GemmMicrokernelTester()
26039 .mr(4)
26040 .nr(4)
26041 .kr(8)
26042 .sr(1)
26043 .m(m)
26044 .n(n)
26045 .k(k)
26046 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080026047 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026048 }
26049 }
26050 }
26051 }
26052
26053 TEST(QC8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD128, n_gt_4) {
26054 for (uint32_t n = 5; n < 8; n++) {
26055 for (size_t k = 1; k <= 40; k += 9) {
26056 GemmMicrokernelTester()
26057 .mr(4)
26058 .nr(4)
26059 .kr(8)
26060 .sr(1)
26061 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080026062 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026063 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080026064 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026065 }
26066 }
26067 }
26068
26069 TEST(QC8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD128, n_gt_4_strided_cn) {
26070 for (uint32_t n = 5; n < 8; n++) {
26071 for (size_t k = 1; k <= 40; k += 9) {
26072 GemmMicrokernelTester()
26073 .mr(4)
26074 .nr(4)
26075 .kr(8)
26076 .sr(1)
26077 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080026078 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026079 .k(k)
26080 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080026081 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026082 }
26083 }
26084 }
26085
26086 TEST(QC8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD128, n_gt_4_strided_a) {
26087 for (uint32_t n = 5; n < 8; n++) {
26088 for (size_t k = 1; k <= 40; k += 9) {
26089 GemmMicrokernelTester()
26090 .mr(4)
26091 .nr(4)
26092 .kr(8)
26093 .sr(1)
26094 .m(4)
26095 .n(n)
26096 .k(k)
26097 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080026098 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026099 }
26100 }
26101 }
26102
26103 TEST(QC8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD128, n_gt_4_subtile) {
26104 for (uint32_t n = 5; n < 8; n++) {
26105 for (size_t k = 1; k <= 40; k += 9) {
26106 for (uint32_t m = 1; m <= 4; m++) {
26107 GemmMicrokernelTester()
26108 .mr(4)
26109 .nr(4)
26110 .kr(8)
26111 .sr(1)
26112 .m(m)
26113 .n(n)
26114 .k(k)
26115 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080026116 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026117 }
26118 }
26119 }
26120 }
26121
26122 TEST(QC8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD128, n_div_4) {
26123 for (uint32_t n = 8; n <= 12; n += 4) {
26124 for (size_t k = 1; k <= 40; k += 9) {
26125 GemmMicrokernelTester()
26126 .mr(4)
26127 .nr(4)
26128 .kr(8)
26129 .sr(1)
26130 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080026131 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026132 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080026133 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026134 }
26135 }
26136 }
26137
26138 TEST(QC8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD128, n_div_4_strided_cn) {
26139 for (uint32_t n = 8; n <= 12; n += 4) {
26140 for (size_t k = 1; k <= 40; k += 9) {
26141 GemmMicrokernelTester()
26142 .mr(4)
26143 .nr(4)
26144 .kr(8)
26145 .sr(1)
26146 .m(4)
26147 .n(n)
26148 .k(k)
26149 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080026150 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026151 }
26152 }
26153 }
26154
26155 TEST(QC8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD128, n_div_4_strided_a) {
26156 for (uint32_t n = 8; n <= 12; n += 4) {
26157 for (size_t k = 1; k <= 40; k += 9) {
26158 GemmMicrokernelTester()
26159 .mr(4)
26160 .nr(4)
26161 .kr(8)
26162 .sr(1)
26163 .m(4)
26164 .n(n)
26165 .k(k)
26166 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080026167 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026168 }
26169 }
26170 }
26171
26172 TEST(QC8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD128, n_div_4_subtile) {
26173 for (uint32_t n = 8; n <= 12; n += 4) {
26174 for (size_t k = 1; k <= 40; k += 9) {
26175 for (uint32_t m = 1; m <= 4; m++) {
26176 GemmMicrokernelTester()
26177 .mr(4)
26178 .nr(4)
26179 .kr(8)
26180 .sr(1)
26181 .m(m)
26182 .n(n)
26183 .k(k)
26184 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080026185 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026186 }
26187 }
26188 }
26189 }
26190
26191 TEST(QC8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD128, strided_cm_subtile) {
26192 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080026193 for (uint32_t n = 1; n <= 4; n++) {
26194 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026195 GemmMicrokernelTester()
26196 .mr(4)
26197 .nr(4)
26198 .kr(8)
26199 .sr(1)
26200 .m(m)
26201 .n(n)
26202 .k(k)
26203 .cm_stride(7)
26204 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080026205 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026206 }
26207 }
26208 }
26209 }
26210
26211 TEST(QC8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD128, qmin) {
26212 GemmMicrokernelTester()
26213 .mr(4)
26214 .nr(4)
26215 .kr(8)
26216 .sr(1)
26217 .m(4)
26218 .n(4)
26219 .k(8)
26220 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080026221 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026222 }
26223
26224 TEST(QC8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD128, qmax) {
26225 GemmMicrokernelTester()
26226 .mr(4)
26227 .nr(4)
26228 .kr(8)
26229 .sr(1)
26230 .m(4)
26231 .n(4)
26232 .k(8)
26233 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080026234 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026235 }
26236
26237 TEST(QC8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD128, strided_cm) {
26238 GemmMicrokernelTester()
26239 .mr(4)
26240 .nr(4)
26241 .kr(8)
26242 .sr(1)
26243 .m(4)
26244 .n(4)
26245 .k(8)
26246 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080026247 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026248 }
26249#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
26250
26251
26252#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
26253 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_eq_8) {
26254 GemmMicrokernelTester()
26255 .mr(2)
26256 .nr(4)
26257 .kr(8)
26258 .sr(1)
26259 .m(2)
26260 .n(4)
26261 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080026262 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026263 }
26264
26265 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, strided_cn) {
26266 GemmMicrokernelTester()
26267 .mr(2)
26268 .nr(4)
26269 .kr(8)
26270 .sr(1)
26271 .m(2)
26272 .n(4)
26273 .k(8)
26274 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080026275 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026276 }
26277
26278 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_eq_8_strided_a) {
26279 GemmMicrokernelTester()
26280 .mr(2)
26281 .nr(4)
26282 .kr(8)
26283 .sr(1)
26284 .m(2)
26285 .n(4)
26286 .k(8)
26287 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080026288 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026289 }
26290
26291 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_eq_8_subtile) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080026292 for (uint32_t n = 1; n <= 4; n++) {
26293 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026294 GemmMicrokernelTester()
26295 .mr(2)
26296 .nr(4)
26297 .kr(8)
26298 .sr(1)
26299 .m(m)
26300 .n(n)
26301 .k(8)
26302 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080026303 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026304 }
26305 }
26306 }
26307
26308 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_eq_8_subtile_m) {
26309 for (uint32_t m = 1; m <= 2; m++) {
26310 GemmMicrokernelTester()
26311 .mr(2)
26312 .nr(4)
26313 .kr(8)
26314 .sr(1)
26315 .m(m)
26316 .n(4)
26317 .k(8)
26318 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080026319 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026320 }
26321 }
26322
26323 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_eq_8_subtile_n) {
26324 for (uint32_t n = 1; n <= 4; n++) {
26325 GemmMicrokernelTester()
26326 .mr(2)
26327 .nr(4)
26328 .kr(8)
26329 .sr(1)
26330 .m(2)
26331 .n(n)
26332 .k(8)
26333 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080026334 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026335 }
26336 }
26337
26338 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_lt_8) {
26339 for (size_t k = 1; k < 8; k++) {
26340 GemmMicrokernelTester()
26341 .mr(2)
26342 .nr(4)
26343 .kr(8)
26344 .sr(1)
26345 .m(2)
26346 .n(4)
26347 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080026348 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026349 }
26350 }
26351
26352 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_lt_8_strided_a) {
26353 for (size_t k = 1; k < 8; k++) {
26354 GemmMicrokernelTester()
26355 .mr(2)
26356 .nr(4)
26357 .kr(8)
26358 .sr(1)
26359 .m(2)
26360 .n(4)
26361 .k(k)
26362 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080026363 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026364 }
26365 }
26366
26367 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_lt_8_subtile) {
26368 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080026369 for (uint32_t n = 1; n <= 4; n++) {
26370 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026371 GemmMicrokernelTester()
26372 .mr(2)
26373 .nr(4)
26374 .kr(8)
26375 .sr(1)
26376 .m(m)
26377 .n(n)
26378 .k(k)
26379 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080026380 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026381 }
26382 }
26383 }
26384 }
26385
26386 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_gt_8) {
26387 for (size_t k = 9; k < 16; k++) {
26388 GemmMicrokernelTester()
26389 .mr(2)
26390 .nr(4)
26391 .kr(8)
26392 .sr(1)
26393 .m(2)
26394 .n(4)
26395 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080026396 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026397 }
26398 }
26399
26400 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_gt_8_strided_a) {
26401 for (size_t k = 9; k < 16; k++) {
26402 GemmMicrokernelTester()
26403 .mr(2)
26404 .nr(4)
26405 .kr(8)
26406 .sr(1)
26407 .m(2)
26408 .n(4)
26409 .k(k)
26410 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080026411 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026412 }
26413 }
26414
26415 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_gt_8_subtile) {
26416 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080026417 for (uint32_t n = 1; n <= 4; n++) {
26418 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026419 GemmMicrokernelTester()
26420 .mr(2)
26421 .nr(4)
26422 .kr(8)
26423 .sr(1)
26424 .m(m)
26425 .n(n)
26426 .k(k)
26427 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080026428 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026429 }
26430 }
26431 }
26432 }
26433
26434 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_div_8) {
26435 for (size_t k = 16; k <= 80; k += 8) {
26436 GemmMicrokernelTester()
26437 .mr(2)
26438 .nr(4)
26439 .kr(8)
26440 .sr(1)
26441 .m(2)
26442 .n(4)
26443 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080026444 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026445 }
26446 }
26447
26448 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_div_8_strided_a) {
26449 for (size_t k = 16; k <= 80; k += 8) {
26450 GemmMicrokernelTester()
26451 .mr(2)
26452 .nr(4)
26453 .kr(8)
26454 .sr(1)
26455 .m(2)
26456 .n(4)
26457 .k(k)
26458 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080026459 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026460 }
26461 }
26462
26463 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_div_8_subtile) {
26464 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080026465 for (uint32_t n = 1; n <= 4; n++) {
26466 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026467 GemmMicrokernelTester()
26468 .mr(2)
26469 .nr(4)
26470 .kr(8)
26471 .sr(1)
26472 .m(m)
26473 .n(n)
26474 .k(k)
26475 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080026476 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026477 }
26478 }
26479 }
26480 }
26481
26482 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, n_gt_4) {
26483 for (uint32_t n = 5; n < 8; n++) {
26484 for (size_t k = 1; k <= 40; k += 9) {
26485 GemmMicrokernelTester()
26486 .mr(2)
26487 .nr(4)
26488 .kr(8)
26489 .sr(1)
26490 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080026491 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026492 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080026493 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026494 }
26495 }
26496 }
26497
26498 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, n_gt_4_strided_cn) {
26499 for (uint32_t n = 5; n < 8; n++) {
26500 for (size_t k = 1; k <= 40; k += 9) {
26501 GemmMicrokernelTester()
26502 .mr(2)
26503 .nr(4)
26504 .kr(8)
26505 .sr(1)
26506 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080026507 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026508 .k(k)
26509 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080026510 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026511 }
26512 }
26513 }
26514
26515 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, n_gt_4_strided_a) {
26516 for (uint32_t n = 5; n < 8; n++) {
26517 for (size_t k = 1; k <= 40; k += 9) {
26518 GemmMicrokernelTester()
26519 .mr(2)
26520 .nr(4)
26521 .kr(8)
26522 .sr(1)
26523 .m(2)
26524 .n(n)
26525 .k(k)
26526 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080026527 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026528 }
26529 }
26530 }
26531
26532 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, n_gt_4_subtile) {
26533 for (uint32_t n = 5; n < 8; n++) {
26534 for (size_t k = 1; k <= 40; k += 9) {
26535 for (uint32_t m = 1; m <= 2; m++) {
26536 GemmMicrokernelTester()
26537 .mr(2)
26538 .nr(4)
26539 .kr(8)
26540 .sr(1)
26541 .m(m)
26542 .n(n)
26543 .k(k)
26544 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080026545 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026546 }
26547 }
26548 }
26549 }
26550
26551 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, n_div_4) {
26552 for (uint32_t n = 8; n <= 12; n += 4) {
26553 for (size_t k = 1; k <= 40; k += 9) {
26554 GemmMicrokernelTester()
26555 .mr(2)
26556 .nr(4)
26557 .kr(8)
26558 .sr(1)
26559 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080026560 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026561 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080026562 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026563 }
26564 }
26565 }
26566
26567 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, n_div_4_strided_cn) {
26568 for (uint32_t n = 8; n <= 12; n += 4) {
26569 for (size_t k = 1; k <= 40; k += 9) {
26570 GemmMicrokernelTester()
26571 .mr(2)
26572 .nr(4)
26573 .kr(8)
26574 .sr(1)
26575 .m(2)
26576 .n(n)
26577 .k(k)
26578 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080026579 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026580 }
26581 }
26582 }
26583
26584 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, n_div_4_strided_a) {
26585 for (uint32_t n = 8; n <= 12; n += 4) {
26586 for (size_t k = 1; k <= 40; k += 9) {
26587 GemmMicrokernelTester()
26588 .mr(2)
26589 .nr(4)
26590 .kr(8)
26591 .sr(1)
26592 .m(2)
26593 .n(n)
26594 .k(k)
26595 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080026596 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026597 }
26598 }
26599 }
26600
26601 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, n_div_4_subtile) {
26602 for (uint32_t n = 8; n <= 12; n += 4) {
26603 for (size_t k = 1; k <= 40; k += 9) {
26604 for (uint32_t m = 1; m <= 2; m++) {
26605 GemmMicrokernelTester()
26606 .mr(2)
26607 .nr(4)
26608 .kr(8)
26609 .sr(1)
26610 .m(m)
26611 .n(n)
26612 .k(k)
26613 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080026614 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026615 }
26616 }
26617 }
26618 }
26619
26620 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, strided_cm_subtile) {
26621 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080026622 for (uint32_t n = 1; n <= 4; n++) {
26623 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026624 GemmMicrokernelTester()
26625 .mr(2)
26626 .nr(4)
26627 .kr(8)
26628 .sr(1)
26629 .m(m)
26630 .n(n)
26631 .k(k)
26632 .cm_stride(7)
26633 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080026634 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026635 }
26636 }
26637 }
26638 }
26639
26640 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, qmin) {
26641 GemmMicrokernelTester()
26642 .mr(2)
26643 .nr(4)
26644 .kr(8)
26645 .sr(1)
26646 .m(2)
26647 .n(4)
26648 .k(8)
26649 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080026650 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026651 }
26652
26653 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, qmax) {
26654 GemmMicrokernelTester()
26655 .mr(2)
26656 .nr(4)
26657 .kr(8)
26658 .sr(1)
26659 .m(2)
26660 .n(4)
26661 .k(8)
26662 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080026663 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026664 }
26665
26666 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, strided_cm) {
26667 GemmMicrokernelTester()
26668 .mr(2)
26669 .nr(4)
26670 .kr(8)
26671 .sr(1)
26672 .m(2)
26673 .n(4)
26674 .k(8)
26675 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080026676 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026677 }
26678#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
26679
26680
26681#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
26682 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_eq_8) {
26683 GemmMicrokernelTester()
26684 .mr(3)
26685 .nr(4)
26686 .kr(8)
26687 .sr(1)
26688 .m(3)
26689 .n(4)
26690 .k(8)
Marat Dukhan50323b82022-01-11 00:12:01 -080026691 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026692 }
26693
26694 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, strided_cn) {
26695 GemmMicrokernelTester()
26696 .mr(3)
26697 .nr(4)
26698 .kr(8)
26699 .sr(1)
26700 .m(3)
26701 .n(4)
26702 .k(8)
26703 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080026704 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026705 }
26706
26707 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_eq_8_strided_a) {
26708 GemmMicrokernelTester()
26709 .mr(3)
26710 .nr(4)
26711 .kr(8)
26712 .sr(1)
26713 .m(3)
26714 .n(4)
26715 .k(8)
26716 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080026717 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026718 }
26719
26720 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_eq_8_subtile) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080026721 for (uint32_t n = 1; n <= 4; n++) {
26722 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026723 GemmMicrokernelTester()
26724 .mr(3)
26725 .nr(4)
26726 .kr(8)
26727 .sr(1)
26728 .m(m)
26729 .n(n)
26730 .k(8)
26731 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080026732 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026733 }
26734 }
26735 }
26736
26737 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_eq_8_subtile_m) {
26738 for (uint32_t m = 1; m <= 3; m++) {
26739 GemmMicrokernelTester()
26740 .mr(3)
26741 .nr(4)
26742 .kr(8)
26743 .sr(1)
26744 .m(m)
26745 .n(4)
26746 .k(8)
26747 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080026748 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026749 }
26750 }
26751
26752 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_eq_8_subtile_n) {
26753 for (uint32_t n = 1; n <= 4; n++) {
26754 GemmMicrokernelTester()
26755 .mr(3)
26756 .nr(4)
26757 .kr(8)
26758 .sr(1)
26759 .m(3)
26760 .n(n)
26761 .k(8)
26762 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080026763 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026764 }
26765 }
26766
26767 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_lt_8) {
26768 for (size_t k = 1; k < 8; k++) {
26769 GemmMicrokernelTester()
26770 .mr(3)
26771 .nr(4)
26772 .kr(8)
26773 .sr(1)
26774 .m(3)
26775 .n(4)
26776 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080026777 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026778 }
26779 }
26780
26781 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_lt_8_strided_a) {
26782 for (size_t k = 1; k < 8; k++) {
26783 GemmMicrokernelTester()
26784 .mr(3)
26785 .nr(4)
26786 .kr(8)
26787 .sr(1)
26788 .m(3)
26789 .n(4)
26790 .k(k)
26791 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080026792 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026793 }
26794 }
26795
26796 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_lt_8_subtile) {
26797 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080026798 for (uint32_t n = 1; n <= 4; n++) {
26799 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026800 GemmMicrokernelTester()
26801 .mr(3)
26802 .nr(4)
26803 .kr(8)
26804 .sr(1)
26805 .m(m)
26806 .n(n)
26807 .k(k)
26808 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080026809 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026810 }
26811 }
26812 }
26813 }
26814
26815 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_gt_8) {
26816 for (size_t k = 9; k < 16; k++) {
26817 GemmMicrokernelTester()
26818 .mr(3)
26819 .nr(4)
26820 .kr(8)
26821 .sr(1)
26822 .m(3)
26823 .n(4)
26824 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080026825 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026826 }
26827 }
26828
26829 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_gt_8_strided_a) {
26830 for (size_t k = 9; k < 16; k++) {
26831 GemmMicrokernelTester()
26832 .mr(3)
26833 .nr(4)
26834 .kr(8)
26835 .sr(1)
26836 .m(3)
26837 .n(4)
26838 .k(k)
26839 .a_stride(19)
Marat Dukhan50323b82022-01-11 00:12:01 -080026840 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026841 }
26842 }
26843
26844 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_gt_8_subtile) {
26845 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080026846 for (uint32_t n = 1; n <= 4; n++) {
26847 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026848 GemmMicrokernelTester()
26849 .mr(3)
26850 .nr(4)
26851 .kr(8)
26852 .sr(1)
26853 .m(m)
26854 .n(n)
26855 .k(k)
26856 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080026857 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026858 }
26859 }
26860 }
26861 }
26862
26863 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_div_8) {
26864 for (size_t k = 16; k <= 80; k += 8) {
26865 GemmMicrokernelTester()
26866 .mr(3)
26867 .nr(4)
26868 .kr(8)
26869 .sr(1)
26870 .m(3)
26871 .n(4)
26872 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080026873 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026874 }
26875 }
26876
26877 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_div_8_strided_a) {
26878 for (size_t k = 16; k <= 80; k += 8) {
26879 GemmMicrokernelTester()
26880 .mr(3)
26881 .nr(4)
26882 .kr(8)
26883 .sr(1)
26884 .m(3)
26885 .n(4)
26886 .k(k)
26887 .a_stride(83)
Marat Dukhan50323b82022-01-11 00:12:01 -080026888 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026889 }
26890 }
26891
26892 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_div_8_subtile) {
26893 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080026894 for (uint32_t n = 1; n <= 4; n++) {
26895 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026896 GemmMicrokernelTester()
26897 .mr(3)
26898 .nr(4)
26899 .kr(8)
26900 .sr(1)
26901 .m(m)
26902 .n(n)
26903 .k(k)
26904 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080026905 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026906 }
26907 }
26908 }
26909 }
26910
26911 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, n_gt_4) {
26912 for (uint32_t n = 5; n < 8; n++) {
26913 for (size_t k = 1; k <= 40; k += 9) {
26914 GemmMicrokernelTester()
26915 .mr(3)
26916 .nr(4)
26917 .kr(8)
26918 .sr(1)
26919 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080026920 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026921 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080026922 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026923 }
26924 }
26925 }
26926
26927 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, n_gt_4_strided_cn) {
26928 for (uint32_t n = 5; n < 8; n++) {
26929 for (size_t k = 1; k <= 40; k += 9) {
26930 GemmMicrokernelTester()
26931 .mr(3)
26932 .nr(4)
26933 .kr(8)
26934 .sr(1)
26935 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080026936 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026937 .k(k)
26938 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080026939 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026940 }
26941 }
26942 }
26943
26944 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, n_gt_4_strided_a) {
26945 for (uint32_t n = 5; n < 8; n++) {
26946 for (size_t k = 1; k <= 40; k += 9) {
26947 GemmMicrokernelTester()
26948 .mr(3)
26949 .nr(4)
26950 .kr(8)
26951 .sr(1)
26952 .m(3)
26953 .n(n)
26954 .k(k)
26955 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080026956 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026957 }
26958 }
26959 }
26960
26961 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, n_gt_4_subtile) {
26962 for (uint32_t n = 5; n < 8; n++) {
26963 for (size_t k = 1; k <= 40; k += 9) {
26964 for (uint32_t m = 1; m <= 3; m++) {
26965 GemmMicrokernelTester()
26966 .mr(3)
26967 .nr(4)
26968 .kr(8)
26969 .sr(1)
26970 .m(m)
26971 .n(n)
26972 .k(k)
26973 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080026974 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026975 }
26976 }
26977 }
26978 }
26979
26980 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, n_div_4) {
26981 for (uint32_t n = 8; n <= 12; n += 4) {
26982 for (size_t k = 1; k <= 40; k += 9) {
26983 GemmMicrokernelTester()
26984 .mr(3)
26985 .nr(4)
26986 .kr(8)
26987 .sr(1)
26988 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080026989 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026990 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080026991 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080026992 }
26993 }
26994 }
26995
26996 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, n_div_4_strided_cn) {
26997 for (uint32_t n = 8; n <= 12; n += 4) {
26998 for (size_t k = 1; k <= 40; k += 9) {
26999 GemmMicrokernelTester()
27000 .mr(3)
27001 .nr(4)
27002 .kr(8)
27003 .sr(1)
27004 .m(3)
27005 .n(n)
27006 .k(k)
27007 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080027008 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027009 }
27010 }
27011 }
27012
27013 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, n_div_4_strided_a) {
27014 for (uint32_t n = 8; n <= 12; n += 4) {
27015 for (size_t k = 1; k <= 40; k += 9) {
27016 GemmMicrokernelTester()
27017 .mr(3)
27018 .nr(4)
27019 .kr(8)
27020 .sr(1)
27021 .m(3)
27022 .n(n)
27023 .k(k)
27024 .a_stride(43)
Marat Dukhan50323b82022-01-11 00:12:01 -080027025 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027026 }
27027 }
27028 }
27029
27030 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, n_div_4_subtile) {
27031 for (uint32_t n = 8; n <= 12; n += 4) {
27032 for (size_t k = 1; k <= 40; k += 9) {
27033 for (uint32_t m = 1; m <= 3; m++) {
27034 GemmMicrokernelTester()
27035 .mr(3)
27036 .nr(4)
27037 .kr(8)
27038 .sr(1)
27039 .m(m)
27040 .n(n)
27041 .k(k)
27042 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080027043 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027044 }
27045 }
27046 }
27047 }
27048
27049 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, strided_cm_subtile) {
27050 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080027051 for (uint32_t n = 1; n <= 4; n++) {
27052 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027053 GemmMicrokernelTester()
27054 .mr(3)
27055 .nr(4)
27056 .kr(8)
27057 .sr(1)
27058 .m(m)
27059 .n(n)
27060 .k(k)
27061 .cm_stride(7)
27062 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080027063 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027064 }
27065 }
27066 }
27067 }
27068
27069 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, qmin) {
27070 GemmMicrokernelTester()
27071 .mr(3)
27072 .nr(4)
27073 .kr(8)
27074 .sr(1)
27075 .m(3)
27076 .n(4)
27077 .k(8)
27078 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080027079 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027080 }
27081
27082 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, qmax) {
27083 GemmMicrokernelTester()
27084 .mr(3)
27085 .nr(4)
27086 .kr(8)
27087 .sr(1)
27088 .m(3)
27089 .n(4)
27090 .k(8)
27091 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080027092 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027093 }
27094
27095 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, strided_cm) {
27096 GemmMicrokernelTester()
27097 .mr(3)
27098 .nr(4)
27099 .kr(8)
27100 .sr(1)
27101 .m(3)
27102 .n(4)
27103 .k(8)
27104 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080027105 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027106 }
27107#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
27108
27109
27110#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
27111 TEST(QC8_GEMM_MINMAX_FP32_2X2__WASM_FMAGIC, k_eq_1) {
27112 GemmMicrokernelTester()
27113 .mr(2)
27114 .nr(2)
27115 .kr(1)
27116 .sr(1)
27117 .m(2)
27118 .n(2)
27119 .k(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080027120 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027121 }
27122
27123 TEST(QC8_GEMM_MINMAX_FP32_2X2__WASM_FMAGIC, strided_cn) {
27124 GemmMicrokernelTester()
27125 .mr(2)
27126 .nr(2)
27127 .kr(1)
27128 .sr(1)
27129 .m(2)
27130 .n(2)
27131 .k(1)
27132 .cn_stride(5)
Marat Dukhan50323b82022-01-11 00:12:01 -080027133 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027134 }
27135
27136 TEST(QC8_GEMM_MINMAX_FP32_2X2__WASM_FMAGIC, k_eq_1_strided_a) {
27137 GemmMicrokernelTester()
27138 .mr(2)
27139 .nr(2)
27140 .kr(1)
27141 .sr(1)
27142 .m(2)
27143 .n(2)
27144 .k(1)
27145 .a_stride(3)
Marat Dukhan50323b82022-01-11 00:12:01 -080027146 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027147 }
27148
27149 TEST(QC8_GEMM_MINMAX_FP32_2X2__WASM_FMAGIC, k_eq_1_subtile) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080027150 for (uint32_t n = 1; n <= 2; n++) {
27151 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027152 GemmMicrokernelTester()
27153 .mr(2)
27154 .nr(2)
27155 .kr(1)
27156 .sr(1)
27157 .m(m)
27158 .n(n)
27159 .k(1)
27160 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080027161 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027162 }
27163 }
27164 }
27165
27166 TEST(QC8_GEMM_MINMAX_FP32_2X2__WASM_FMAGIC, k_eq_1_subtile_m) {
27167 for (uint32_t m = 1; m <= 2; m++) {
27168 GemmMicrokernelTester()
27169 .mr(2)
27170 .nr(2)
27171 .kr(1)
27172 .sr(1)
27173 .m(m)
27174 .n(2)
27175 .k(1)
27176 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080027177 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027178 }
27179 }
27180
27181 TEST(QC8_GEMM_MINMAX_FP32_2X2__WASM_FMAGIC, k_eq_1_subtile_n) {
27182 for (uint32_t n = 1; n <= 2; n++) {
27183 GemmMicrokernelTester()
27184 .mr(2)
27185 .nr(2)
27186 .kr(1)
27187 .sr(1)
27188 .m(2)
27189 .n(n)
27190 .k(1)
27191 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080027192 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027193 }
27194 }
27195
27196 TEST(QC8_GEMM_MINMAX_FP32_2X2__WASM_FMAGIC, k_gt_1) {
27197 for (size_t k = 2; k < 10; k++) {
27198 GemmMicrokernelTester()
27199 .mr(2)
27200 .nr(2)
27201 .kr(1)
27202 .sr(1)
27203 .m(2)
27204 .n(2)
27205 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080027206 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027207 }
27208 }
27209
27210 TEST(QC8_GEMM_MINMAX_FP32_2X2__WASM_FMAGIC, k_gt_1_strided_a) {
27211 for (size_t k = 2; k < 10; k++) {
27212 GemmMicrokernelTester()
27213 .mr(2)
27214 .nr(2)
27215 .kr(1)
27216 .sr(1)
27217 .m(2)
27218 .n(2)
27219 .k(k)
27220 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080027221 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027222 }
27223 }
27224
27225 TEST(QC8_GEMM_MINMAX_FP32_2X2__WASM_FMAGIC, k_gt_1_subtile) {
27226 for (size_t k = 2; k < 10; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080027227 for (uint32_t n = 1; n <= 2; n++) {
27228 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027229 GemmMicrokernelTester()
27230 .mr(2)
27231 .nr(2)
27232 .kr(1)
27233 .sr(1)
27234 .m(m)
27235 .n(n)
27236 .k(k)
27237 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080027238 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027239 }
27240 }
27241 }
27242 }
27243
27244 TEST(QC8_GEMM_MINMAX_FP32_2X2__WASM_FMAGIC, n_gt_2) {
27245 for (uint32_t n = 3; n < 4; n++) {
27246 for (size_t k = 1; k <= 5; k += 2) {
27247 GemmMicrokernelTester()
27248 .mr(2)
27249 .nr(2)
27250 .kr(1)
27251 .sr(1)
27252 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080027253 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027254 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080027255 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027256 }
27257 }
27258 }
27259
27260 TEST(QC8_GEMM_MINMAX_FP32_2X2__WASM_FMAGIC, n_gt_2_strided_cn) {
27261 for (uint32_t n = 3; n < 4; n++) {
27262 for (size_t k = 1; k <= 5; k += 2) {
27263 GemmMicrokernelTester()
27264 .mr(2)
27265 .nr(2)
27266 .kr(1)
27267 .sr(1)
27268 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080027269 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027270 .k(k)
27271 .cn_stride(5)
Marat Dukhan50323b82022-01-11 00:12:01 -080027272 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027273 }
27274 }
27275 }
27276
27277 TEST(QC8_GEMM_MINMAX_FP32_2X2__WASM_FMAGIC, n_gt_2_strided_a) {
27278 for (uint32_t n = 3; n < 4; n++) {
27279 for (size_t k = 1; k <= 5; k += 2) {
27280 GemmMicrokernelTester()
27281 .mr(2)
27282 .nr(2)
27283 .kr(1)
27284 .sr(1)
27285 .m(2)
27286 .n(n)
27287 .k(k)
27288 .a_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080027289 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027290 }
27291 }
27292 }
27293
27294 TEST(QC8_GEMM_MINMAX_FP32_2X2__WASM_FMAGIC, n_gt_2_subtile) {
27295 for (uint32_t n = 3; n < 4; n++) {
27296 for (size_t k = 1; k <= 5; k += 2) {
27297 for (uint32_t m = 1; m <= 2; m++) {
27298 GemmMicrokernelTester()
27299 .mr(2)
27300 .nr(2)
27301 .kr(1)
27302 .sr(1)
27303 .m(m)
27304 .n(n)
27305 .k(k)
27306 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080027307 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027308 }
27309 }
27310 }
27311 }
27312
27313 TEST(QC8_GEMM_MINMAX_FP32_2X2__WASM_FMAGIC, n_div_2) {
27314 for (uint32_t n = 4; n <= 6; n += 2) {
27315 for (size_t k = 1; k <= 5; k += 2) {
27316 GemmMicrokernelTester()
27317 .mr(2)
27318 .nr(2)
27319 .kr(1)
27320 .sr(1)
27321 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080027322 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027323 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080027324 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027325 }
27326 }
27327 }
27328
27329 TEST(QC8_GEMM_MINMAX_FP32_2X2__WASM_FMAGIC, n_div_2_strided_cn) {
27330 for (uint32_t n = 4; n <= 6; n += 2) {
27331 for (size_t k = 1; k <= 5; k += 2) {
27332 GemmMicrokernelTester()
27333 .mr(2)
27334 .nr(2)
27335 .kr(1)
27336 .sr(1)
27337 .m(2)
27338 .n(n)
27339 .k(k)
27340 .cn_stride(5)
Marat Dukhan50323b82022-01-11 00:12:01 -080027341 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027342 }
27343 }
27344 }
27345
27346 TEST(QC8_GEMM_MINMAX_FP32_2X2__WASM_FMAGIC, n_div_2_strided_a) {
27347 for (uint32_t n = 4; n <= 6; n += 2) {
27348 for (size_t k = 1; k <= 5; k += 2) {
27349 GemmMicrokernelTester()
27350 .mr(2)
27351 .nr(2)
27352 .kr(1)
27353 .sr(1)
27354 .m(2)
27355 .n(n)
27356 .k(k)
27357 .a_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080027358 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027359 }
27360 }
27361 }
27362
27363 TEST(QC8_GEMM_MINMAX_FP32_2X2__WASM_FMAGIC, n_div_2_subtile) {
27364 for (uint32_t n = 4; n <= 6; n += 2) {
27365 for (size_t k = 1; k <= 5; k += 2) {
27366 for (uint32_t m = 1; m <= 2; m++) {
27367 GemmMicrokernelTester()
27368 .mr(2)
27369 .nr(2)
27370 .kr(1)
27371 .sr(1)
27372 .m(m)
27373 .n(n)
27374 .k(k)
27375 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080027376 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027377 }
27378 }
27379 }
27380 }
27381
27382 TEST(QC8_GEMM_MINMAX_FP32_2X2__WASM_FMAGIC, strided_cm_subtile) {
27383 for (size_t k = 1; k <= 5; k += 2) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080027384 for (uint32_t n = 1; n <= 2; n++) {
27385 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027386 GemmMicrokernelTester()
27387 .mr(2)
27388 .nr(2)
27389 .kr(1)
27390 .sr(1)
27391 .m(m)
27392 .n(n)
27393 .k(k)
27394 .cm_stride(5)
27395 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080027396 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027397 }
27398 }
27399 }
27400 }
27401
27402 TEST(QC8_GEMM_MINMAX_FP32_2X2__WASM_FMAGIC, qmin) {
27403 GemmMicrokernelTester()
27404 .mr(2)
27405 .nr(2)
27406 .kr(1)
27407 .sr(1)
27408 .m(2)
27409 .n(2)
27410 .k(1)
27411 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080027412 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027413 }
27414
27415 TEST(QC8_GEMM_MINMAX_FP32_2X2__WASM_FMAGIC, qmax) {
27416 GemmMicrokernelTester()
27417 .mr(2)
27418 .nr(2)
27419 .kr(1)
27420 .sr(1)
27421 .m(2)
27422 .n(2)
27423 .k(1)
27424 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080027425 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027426 }
27427
27428 TEST(QC8_GEMM_MINMAX_FP32_2X2__WASM_FMAGIC, strided_cm) {
27429 GemmMicrokernelTester()
27430 .mr(2)
27431 .nr(2)
27432 .kr(1)
27433 .sr(1)
27434 .m(2)
27435 .n(2)
27436 .k(1)
27437 .cm_stride(5)
Marat Dukhan50323b82022-01-11 00:12:01 -080027438 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__wasm_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027439 }
27440#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
27441
27442
27443TEST(QC8_GEMM_MINMAX_FP32_4X2__SCALAR_FMAGIC, k_eq_1) {
27444 GemmMicrokernelTester()
27445 .mr(4)
27446 .nr(2)
27447 .kr(1)
27448 .sr(1)
27449 .m(4)
27450 .n(2)
27451 .k(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080027452 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027453}
27454
27455TEST(QC8_GEMM_MINMAX_FP32_4X2__SCALAR_FMAGIC, strided_cn) {
27456 GemmMicrokernelTester()
27457 .mr(4)
27458 .nr(2)
27459 .kr(1)
27460 .sr(1)
27461 .m(4)
27462 .n(2)
27463 .k(1)
27464 .cn_stride(5)
Marat Dukhan50323b82022-01-11 00:12:01 -080027465 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027466}
27467
27468TEST(QC8_GEMM_MINMAX_FP32_4X2__SCALAR_FMAGIC, k_eq_1_strided_a) {
27469 GemmMicrokernelTester()
27470 .mr(4)
27471 .nr(2)
27472 .kr(1)
27473 .sr(1)
27474 .m(4)
27475 .n(2)
27476 .k(1)
27477 .a_stride(3)
Marat Dukhan50323b82022-01-11 00:12:01 -080027478 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027479}
27480
27481TEST(QC8_GEMM_MINMAX_FP32_4X2__SCALAR_FMAGIC, k_eq_1_subtile) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080027482 for (uint32_t n = 1; n <= 2; n++) {
27483 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027484 GemmMicrokernelTester()
27485 .mr(4)
27486 .nr(2)
27487 .kr(1)
27488 .sr(1)
27489 .m(m)
27490 .n(n)
27491 .k(1)
27492 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080027493 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027494 }
27495 }
27496}
27497
27498TEST(QC8_GEMM_MINMAX_FP32_4X2__SCALAR_FMAGIC, k_eq_1_subtile_m) {
27499 for (uint32_t m = 1; m <= 4; m++) {
27500 GemmMicrokernelTester()
27501 .mr(4)
27502 .nr(2)
27503 .kr(1)
27504 .sr(1)
27505 .m(m)
27506 .n(2)
27507 .k(1)
27508 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080027509 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027510 }
27511}
27512
27513TEST(QC8_GEMM_MINMAX_FP32_4X2__SCALAR_FMAGIC, k_eq_1_subtile_n) {
27514 for (uint32_t n = 1; n <= 2; n++) {
27515 GemmMicrokernelTester()
27516 .mr(4)
27517 .nr(2)
27518 .kr(1)
27519 .sr(1)
27520 .m(4)
27521 .n(n)
27522 .k(1)
27523 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080027524 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027525 }
27526}
27527
27528TEST(QC8_GEMM_MINMAX_FP32_4X2__SCALAR_FMAGIC, k_gt_1) {
27529 for (size_t k = 2; k < 10; k++) {
27530 GemmMicrokernelTester()
27531 .mr(4)
27532 .nr(2)
27533 .kr(1)
27534 .sr(1)
27535 .m(4)
27536 .n(2)
27537 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080027538 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027539 }
27540}
27541
27542TEST(QC8_GEMM_MINMAX_FP32_4X2__SCALAR_FMAGIC, k_gt_1_strided_a) {
27543 for (size_t k = 2; k < 10; k++) {
27544 GemmMicrokernelTester()
27545 .mr(4)
27546 .nr(2)
27547 .kr(1)
27548 .sr(1)
27549 .m(4)
27550 .n(2)
27551 .k(k)
27552 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080027553 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027554 }
27555}
27556
27557TEST(QC8_GEMM_MINMAX_FP32_4X2__SCALAR_FMAGIC, k_gt_1_subtile) {
27558 for (size_t k = 2; k < 10; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080027559 for (uint32_t n = 1; n <= 2; n++) {
27560 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027561 GemmMicrokernelTester()
27562 .mr(4)
27563 .nr(2)
27564 .kr(1)
27565 .sr(1)
27566 .m(m)
27567 .n(n)
27568 .k(k)
27569 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080027570 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027571 }
27572 }
27573 }
27574}
27575
27576TEST(QC8_GEMM_MINMAX_FP32_4X2__SCALAR_FMAGIC, n_gt_2) {
27577 for (uint32_t n = 3; n < 4; n++) {
27578 for (size_t k = 1; k <= 5; k += 2) {
27579 GemmMicrokernelTester()
27580 .mr(4)
27581 .nr(2)
27582 .kr(1)
27583 .sr(1)
27584 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080027585 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027586 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080027587 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027588 }
27589 }
27590}
27591
27592TEST(QC8_GEMM_MINMAX_FP32_4X2__SCALAR_FMAGIC, n_gt_2_strided_cn) {
27593 for (uint32_t n = 3; n < 4; n++) {
27594 for (size_t k = 1; k <= 5; k += 2) {
27595 GemmMicrokernelTester()
27596 .mr(4)
27597 .nr(2)
27598 .kr(1)
27599 .sr(1)
27600 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080027601 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027602 .k(k)
27603 .cn_stride(5)
Marat Dukhan50323b82022-01-11 00:12:01 -080027604 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027605 }
27606 }
27607}
27608
27609TEST(QC8_GEMM_MINMAX_FP32_4X2__SCALAR_FMAGIC, n_gt_2_strided_a) {
27610 for (uint32_t n = 3; n < 4; n++) {
27611 for (size_t k = 1; k <= 5; k += 2) {
27612 GemmMicrokernelTester()
27613 .mr(4)
27614 .nr(2)
27615 .kr(1)
27616 .sr(1)
27617 .m(4)
27618 .n(n)
27619 .k(k)
27620 .a_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080027621 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027622 }
27623 }
27624}
27625
27626TEST(QC8_GEMM_MINMAX_FP32_4X2__SCALAR_FMAGIC, n_gt_2_subtile) {
27627 for (uint32_t n = 3; n < 4; n++) {
27628 for (size_t k = 1; k <= 5; k += 2) {
27629 for (uint32_t m = 1; m <= 4; m++) {
27630 GemmMicrokernelTester()
27631 .mr(4)
27632 .nr(2)
27633 .kr(1)
27634 .sr(1)
27635 .m(m)
27636 .n(n)
27637 .k(k)
27638 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080027639 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027640 }
27641 }
27642 }
27643}
27644
27645TEST(QC8_GEMM_MINMAX_FP32_4X2__SCALAR_FMAGIC, n_div_2) {
27646 for (uint32_t n = 4; n <= 6; n += 2) {
27647 for (size_t k = 1; k <= 5; k += 2) {
27648 GemmMicrokernelTester()
27649 .mr(4)
27650 .nr(2)
27651 .kr(1)
27652 .sr(1)
27653 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080027654 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027655 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080027656 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027657 }
27658 }
27659}
27660
27661TEST(QC8_GEMM_MINMAX_FP32_4X2__SCALAR_FMAGIC, n_div_2_strided_cn) {
27662 for (uint32_t n = 4; n <= 6; n += 2) {
27663 for (size_t k = 1; k <= 5; k += 2) {
27664 GemmMicrokernelTester()
27665 .mr(4)
27666 .nr(2)
27667 .kr(1)
27668 .sr(1)
27669 .m(4)
27670 .n(n)
27671 .k(k)
27672 .cn_stride(5)
Marat Dukhan50323b82022-01-11 00:12:01 -080027673 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027674 }
27675 }
27676}
27677
27678TEST(QC8_GEMM_MINMAX_FP32_4X2__SCALAR_FMAGIC, n_div_2_strided_a) {
27679 for (uint32_t n = 4; n <= 6; n += 2) {
27680 for (size_t k = 1; k <= 5; k += 2) {
27681 GemmMicrokernelTester()
27682 .mr(4)
27683 .nr(2)
27684 .kr(1)
27685 .sr(1)
27686 .m(4)
27687 .n(n)
27688 .k(k)
27689 .a_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080027690 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027691 }
27692 }
27693}
27694
27695TEST(QC8_GEMM_MINMAX_FP32_4X2__SCALAR_FMAGIC, n_div_2_subtile) {
27696 for (uint32_t n = 4; n <= 6; n += 2) {
27697 for (size_t k = 1; k <= 5; k += 2) {
27698 for (uint32_t m = 1; m <= 4; m++) {
27699 GemmMicrokernelTester()
27700 .mr(4)
27701 .nr(2)
27702 .kr(1)
27703 .sr(1)
27704 .m(m)
27705 .n(n)
27706 .k(k)
27707 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080027708 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027709 }
27710 }
27711 }
27712}
27713
27714TEST(QC8_GEMM_MINMAX_FP32_4X2__SCALAR_FMAGIC, strided_cm_subtile) {
27715 for (size_t k = 1; k <= 5; k += 2) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080027716 for (uint32_t n = 1; n <= 2; n++) {
27717 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027718 GemmMicrokernelTester()
27719 .mr(4)
27720 .nr(2)
27721 .kr(1)
27722 .sr(1)
27723 .m(m)
27724 .n(n)
27725 .k(k)
27726 .cm_stride(5)
27727 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080027728 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027729 }
27730 }
27731 }
27732}
27733
27734TEST(QC8_GEMM_MINMAX_FP32_4X2__SCALAR_FMAGIC, qmin) {
27735 GemmMicrokernelTester()
27736 .mr(4)
27737 .nr(2)
27738 .kr(1)
27739 .sr(1)
27740 .m(4)
27741 .n(2)
27742 .k(1)
27743 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080027744 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027745}
27746
27747TEST(QC8_GEMM_MINMAX_FP32_4X2__SCALAR_FMAGIC, qmax) {
27748 GemmMicrokernelTester()
27749 .mr(4)
27750 .nr(2)
27751 .kr(1)
27752 .sr(1)
27753 .m(4)
27754 .n(2)
27755 .k(1)
27756 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080027757 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027758}
27759
27760TEST(QC8_GEMM_MINMAX_FP32_4X2__SCALAR_FMAGIC, strided_cm) {
27761 GemmMicrokernelTester()
27762 .mr(4)
27763 .nr(2)
27764 .kr(1)
27765 .sr(1)
27766 .m(4)
27767 .n(2)
27768 .k(1)
27769 .cm_stride(5)
Marat Dukhan50323b82022-01-11 00:12:01 -080027770 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027771}
27772
27773
27774TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_FMAGIC, k_eq_1) {
27775 GemmMicrokernelTester()
27776 .mr(2)
27777 .nr(4)
27778 .kr(1)
27779 .sr(1)
27780 .m(2)
27781 .n(4)
27782 .k(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080027783 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027784}
27785
27786TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_FMAGIC, strided_cn) {
27787 GemmMicrokernelTester()
27788 .mr(2)
27789 .nr(4)
27790 .kr(1)
27791 .sr(1)
27792 .m(2)
27793 .n(4)
27794 .k(1)
27795 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080027796 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027797}
27798
27799TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_FMAGIC, k_eq_1_strided_a) {
27800 GemmMicrokernelTester()
27801 .mr(2)
27802 .nr(4)
27803 .kr(1)
27804 .sr(1)
27805 .m(2)
27806 .n(4)
27807 .k(1)
27808 .a_stride(3)
Marat Dukhan50323b82022-01-11 00:12:01 -080027809 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027810}
27811
27812TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_FMAGIC, k_eq_1_subtile) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080027813 for (uint32_t n = 1; n <= 4; n++) {
27814 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027815 GemmMicrokernelTester()
27816 .mr(2)
27817 .nr(4)
27818 .kr(1)
27819 .sr(1)
27820 .m(m)
27821 .n(n)
27822 .k(1)
27823 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080027824 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027825 }
27826 }
27827}
27828
27829TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_FMAGIC, k_eq_1_subtile_m) {
27830 for (uint32_t m = 1; m <= 2; m++) {
27831 GemmMicrokernelTester()
27832 .mr(2)
27833 .nr(4)
27834 .kr(1)
27835 .sr(1)
27836 .m(m)
27837 .n(4)
27838 .k(1)
27839 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080027840 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027841 }
27842}
27843
27844TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_FMAGIC, k_eq_1_subtile_n) {
27845 for (uint32_t n = 1; n <= 4; n++) {
27846 GemmMicrokernelTester()
27847 .mr(2)
27848 .nr(4)
27849 .kr(1)
27850 .sr(1)
27851 .m(2)
27852 .n(n)
27853 .k(1)
27854 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080027855 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027856 }
27857}
27858
27859TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_FMAGIC, k_gt_1) {
27860 for (size_t k = 2; k < 10; k++) {
27861 GemmMicrokernelTester()
27862 .mr(2)
27863 .nr(4)
27864 .kr(1)
27865 .sr(1)
27866 .m(2)
27867 .n(4)
27868 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080027869 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027870 }
27871}
27872
27873TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_FMAGIC, k_gt_1_strided_a) {
27874 for (size_t k = 2; k < 10; k++) {
27875 GemmMicrokernelTester()
27876 .mr(2)
27877 .nr(4)
27878 .kr(1)
27879 .sr(1)
27880 .m(2)
27881 .n(4)
27882 .k(k)
27883 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080027884 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027885 }
27886}
27887
27888TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_FMAGIC, k_gt_1_subtile) {
27889 for (size_t k = 2; k < 10; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080027890 for (uint32_t n = 1; n <= 4; n++) {
27891 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027892 GemmMicrokernelTester()
27893 .mr(2)
27894 .nr(4)
27895 .kr(1)
27896 .sr(1)
27897 .m(m)
27898 .n(n)
27899 .k(k)
27900 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080027901 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027902 }
27903 }
27904 }
27905}
27906
27907TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_FMAGIC, n_gt_4) {
27908 for (uint32_t n = 5; n < 8; n++) {
27909 for (size_t k = 1; k <= 5; k += 2) {
27910 GemmMicrokernelTester()
27911 .mr(2)
27912 .nr(4)
27913 .kr(1)
27914 .sr(1)
27915 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080027916 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027917 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080027918 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027919 }
27920 }
27921}
27922
27923TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_FMAGIC, n_gt_4_strided_cn) {
27924 for (uint32_t n = 5; n < 8; n++) {
27925 for (size_t k = 1; k <= 5; k += 2) {
27926 GemmMicrokernelTester()
27927 .mr(2)
27928 .nr(4)
27929 .kr(1)
27930 .sr(1)
27931 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080027932 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027933 .k(k)
27934 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080027935 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027936 }
27937 }
27938}
27939
27940TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_FMAGIC, n_gt_4_strided_a) {
27941 for (uint32_t n = 5; n < 8; n++) {
27942 for (size_t k = 1; k <= 5; k += 2) {
27943 GemmMicrokernelTester()
27944 .mr(2)
27945 .nr(4)
27946 .kr(1)
27947 .sr(1)
27948 .m(2)
27949 .n(n)
27950 .k(k)
27951 .a_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080027952 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027953 }
27954 }
27955}
27956
27957TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_FMAGIC, n_gt_4_subtile) {
27958 for (uint32_t n = 5; n < 8; n++) {
27959 for (size_t k = 1; k <= 5; k += 2) {
27960 for (uint32_t m = 1; m <= 2; m++) {
27961 GemmMicrokernelTester()
27962 .mr(2)
27963 .nr(4)
27964 .kr(1)
27965 .sr(1)
27966 .m(m)
27967 .n(n)
27968 .k(k)
27969 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080027970 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027971 }
27972 }
27973 }
27974}
27975
27976TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_FMAGIC, n_div_4) {
27977 for (uint32_t n = 8; n <= 12; n += 4) {
27978 for (size_t k = 1; k <= 5; k += 2) {
27979 GemmMicrokernelTester()
27980 .mr(2)
27981 .nr(4)
27982 .kr(1)
27983 .sr(1)
27984 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080027985 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027986 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080027987 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080027988 }
27989 }
27990}
27991
27992TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_FMAGIC, n_div_4_strided_cn) {
27993 for (uint32_t n = 8; n <= 12; n += 4) {
27994 for (size_t k = 1; k <= 5; k += 2) {
27995 GemmMicrokernelTester()
27996 .mr(2)
27997 .nr(4)
27998 .kr(1)
27999 .sr(1)
28000 .m(2)
28001 .n(n)
28002 .k(k)
28003 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080028004 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028005 }
28006 }
28007}
28008
28009TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_FMAGIC, n_div_4_strided_a) {
28010 for (uint32_t n = 8; n <= 12; n += 4) {
28011 for (size_t k = 1; k <= 5; k += 2) {
28012 GemmMicrokernelTester()
28013 .mr(2)
28014 .nr(4)
28015 .kr(1)
28016 .sr(1)
28017 .m(2)
28018 .n(n)
28019 .k(k)
28020 .a_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080028021 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028022 }
28023 }
28024}
28025
28026TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_FMAGIC, n_div_4_subtile) {
28027 for (uint32_t n = 8; n <= 12; n += 4) {
28028 for (size_t k = 1; k <= 5; k += 2) {
28029 for (uint32_t m = 1; m <= 2; m++) {
28030 GemmMicrokernelTester()
28031 .mr(2)
28032 .nr(4)
28033 .kr(1)
28034 .sr(1)
28035 .m(m)
28036 .n(n)
28037 .k(k)
28038 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080028039 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028040 }
28041 }
28042 }
28043}
28044
28045TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_FMAGIC, strided_cm_subtile) {
28046 for (size_t k = 1; k <= 5; k += 2) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080028047 for (uint32_t n = 1; n <= 4; n++) {
28048 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028049 GemmMicrokernelTester()
28050 .mr(2)
28051 .nr(4)
28052 .kr(1)
28053 .sr(1)
28054 .m(m)
28055 .n(n)
28056 .k(k)
28057 .cm_stride(7)
28058 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080028059 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028060 }
28061 }
28062 }
28063}
28064
28065TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_FMAGIC, qmin) {
28066 GemmMicrokernelTester()
28067 .mr(2)
28068 .nr(4)
28069 .kr(1)
28070 .sr(1)
28071 .m(2)
28072 .n(4)
28073 .k(1)
28074 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080028075 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028076}
28077
28078TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_FMAGIC, qmax) {
28079 GemmMicrokernelTester()
28080 .mr(2)
28081 .nr(4)
28082 .kr(1)
28083 .sr(1)
28084 .m(2)
28085 .n(4)
28086 .k(1)
28087 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080028088 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028089}
28090
28091TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_FMAGIC, strided_cm) {
28092 GemmMicrokernelTester()
28093 .mr(2)
28094 .nr(4)
28095 .kr(1)
28096 .sr(1)
28097 .m(2)
28098 .n(4)
28099 .k(1)
28100 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080028101 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028102}
28103
28104
28105TEST(QC8_GEMM_MINMAX_FP32_4X4__SCALAR_FMAGIC, k_eq_1) {
28106 GemmMicrokernelTester()
28107 .mr(4)
28108 .nr(4)
28109 .kr(1)
28110 .sr(1)
28111 .m(4)
28112 .n(4)
28113 .k(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080028114 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028115}
28116
28117TEST(QC8_GEMM_MINMAX_FP32_4X4__SCALAR_FMAGIC, strided_cn) {
28118 GemmMicrokernelTester()
28119 .mr(4)
28120 .nr(4)
28121 .kr(1)
28122 .sr(1)
28123 .m(4)
28124 .n(4)
28125 .k(1)
28126 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080028127 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028128}
28129
28130TEST(QC8_GEMM_MINMAX_FP32_4X4__SCALAR_FMAGIC, k_eq_1_strided_a) {
28131 GemmMicrokernelTester()
28132 .mr(4)
28133 .nr(4)
28134 .kr(1)
28135 .sr(1)
28136 .m(4)
28137 .n(4)
28138 .k(1)
28139 .a_stride(3)
Marat Dukhan50323b82022-01-11 00:12:01 -080028140 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028141}
28142
28143TEST(QC8_GEMM_MINMAX_FP32_4X4__SCALAR_FMAGIC, k_eq_1_subtile) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080028144 for (uint32_t n = 1; n <= 4; n++) {
28145 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028146 GemmMicrokernelTester()
28147 .mr(4)
28148 .nr(4)
28149 .kr(1)
28150 .sr(1)
28151 .m(m)
28152 .n(n)
28153 .k(1)
28154 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080028155 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028156 }
28157 }
28158}
28159
28160TEST(QC8_GEMM_MINMAX_FP32_4X4__SCALAR_FMAGIC, k_eq_1_subtile_m) {
28161 for (uint32_t m = 1; m <= 4; m++) {
28162 GemmMicrokernelTester()
28163 .mr(4)
28164 .nr(4)
28165 .kr(1)
28166 .sr(1)
28167 .m(m)
28168 .n(4)
28169 .k(1)
28170 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080028171 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028172 }
28173}
28174
28175TEST(QC8_GEMM_MINMAX_FP32_4X4__SCALAR_FMAGIC, k_eq_1_subtile_n) {
28176 for (uint32_t n = 1; n <= 4; n++) {
28177 GemmMicrokernelTester()
28178 .mr(4)
28179 .nr(4)
28180 .kr(1)
28181 .sr(1)
28182 .m(4)
28183 .n(n)
28184 .k(1)
28185 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080028186 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028187 }
28188}
28189
28190TEST(QC8_GEMM_MINMAX_FP32_4X4__SCALAR_FMAGIC, k_gt_1) {
28191 for (size_t k = 2; k < 10; k++) {
28192 GemmMicrokernelTester()
28193 .mr(4)
28194 .nr(4)
28195 .kr(1)
28196 .sr(1)
28197 .m(4)
28198 .n(4)
28199 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080028200 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028201 }
28202}
28203
28204TEST(QC8_GEMM_MINMAX_FP32_4X4__SCALAR_FMAGIC, k_gt_1_strided_a) {
28205 for (size_t k = 2; k < 10; k++) {
28206 GemmMicrokernelTester()
28207 .mr(4)
28208 .nr(4)
28209 .kr(1)
28210 .sr(1)
28211 .m(4)
28212 .n(4)
28213 .k(k)
28214 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080028215 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028216 }
28217}
28218
28219TEST(QC8_GEMM_MINMAX_FP32_4X4__SCALAR_FMAGIC, k_gt_1_subtile) {
28220 for (size_t k = 2; k < 10; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080028221 for (uint32_t n = 1; n <= 4; n++) {
28222 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028223 GemmMicrokernelTester()
28224 .mr(4)
28225 .nr(4)
28226 .kr(1)
28227 .sr(1)
28228 .m(m)
28229 .n(n)
28230 .k(k)
28231 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080028232 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028233 }
28234 }
28235 }
28236}
28237
28238TEST(QC8_GEMM_MINMAX_FP32_4X4__SCALAR_FMAGIC, n_gt_4) {
28239 for (uint32_t n = 5; n < 8; n++) {
28240 for (size_t k = 1; k <= 5; k += 2) {
28241 GemmMicrokernelTester()
28242 .mr(4)
28243 .nr(4)
28244 .kr(1)
28245 .sr(1)
28246 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080028247 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028248 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080028249 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028250 }
28251 }
28252}
28253
28254TEST(QC8_GEMM_MINMAX_FP32_4X4__SCALAR_FMAGIC, n_gt_4_strided_cn) {
28255 for (uint32_t n = 5; n < 8; n++) {
28256 for (size_t k = 1; k <= 5; k += 2) {
28257 GemmMicrokernelTester()
28258 .mr(4)
28259 .nr(4)
28260 .kr(1)
28261 .sr(1)
28262 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080028263 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028264 .k(k)
28265 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080028266 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028267 }
28268 }
28269}
28270
28271TEST(QC8_GEMM_MINMAX_FP32_4X4__SCALAR_FMAGIC, n_gt_4_strided_a) {
28272 for (uint32_t n = 5; n < 8; n++) {
28273 for (size_t k = 1; k <= 5; k += 2) {
28274 GemmMicrokernelTester()
28275 .mr(4)
28276 .nr(4)
28277 .kr(1)
28278 .sr(1)
28279 .m(4)
28280 .n(n)
28281 .k(k)
28282 .a_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080028283 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028284 }
28285 }
28286}
28287
28288TEST(QC8_GEMM_MINMAX_FP32_4X4__SCALAR_FMAGIC, n_gt_4_subtile) {
28289 for (uint32_t n = 5; n < 8; n++) {
28290 for (size_t k = 1; k <= 5; k += 2) {
28291 for (uint32_t m = 1; m <= 4; m++) {
28292 GemmMicrokernelTester()
28293 .mr(4)
28294 .nr(4)
28295 .kr(1)
28296 .sr(1)
28297 .m(m)
28298 .n(n)
28299 .k(k)
28300 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080028301 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028302 }
28303 }
28304 }
28305}
28306
28307TEST(QC8_GEMM_MINMAX_FP32_4X4__SCALAR_FMAGIC, n_div_4) {
28308 for (uint32_t n = 8; n <= 12; n += 4) {
28309 for (size_t k = 1; k <= 5; k += 2) {
28310 GemmMicrokernelTester()
28311 .mr(4)
28312 .nr(4)
28313 .kr(1)
28314 .sr(1)
28315 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080028316 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028317 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080028318 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028319 }
28320 }
28321}
28322
28323TEST(QC8_GEMM_MINMAX_FP32_4X4__SCALAR_FMAGIC, n_div_4_strided_cn) {
28324 for (uint32_t n = 8; n <= 12; n += 4) {
28325 for (size_t k = 1; k <= 5; k += 2) {
28326 GemmMicrokernelTester()
28327 .mr(4)
28328 .nr(4)
28329 .kr(1)
28330 .sr(1)
28331 .m(4)
28332 .n(n)
28333 .k(k)
28334 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080028335 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028336 }
28337 }
28338}
28339
28340TEST(QC8_GEMM_MINMAX_FP32_4X4__SCALAR_FMAGIC, n_div_4_strided_a) {
28341 for (uint32_t n = 8; n <= 12; n += 4) {
28342 for (size_t k = 1; k <= 5; k += 2) {
28343 GemmMicrokernelTester()
28344 .mr(4)
28345 .nr(4)
28346 .kr(1)
28347 .sr(1)
28348 .m(4)
28349 .n(n)
28350 .k(k)
28351 .a_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080028352 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028353 }
28354 }
28355}
28356
28357TEST(QC8_GEMM_MINMAX_FP32_4X4__SCALAR_FMAGIC, n_div_4_subtile) {
28358 for (uint32_t n = 8; n <= 12; n += 4) {
28359 for (size_t k = 1; k <= 5; k += 2) {
28360 for (uint32_t m = 1; m <= 4; m++) {
28361 GemmMicrokernelTester()
28362 .mr(4)
28363 .nr(4)
28364 .kr(1)
28365 .sr(1)
28366 .m(m)
28367 .n(n)
28368 .k(k)
28369 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080028370 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028371 }
28372 }
28373 }
28374}
28375
28376TEST(QC8_GEMM_MINMAX_FP32_4X4__SCALAR_FMAGIC, strided_cm_subtile) {
28377 for (size_t k = 1; k <= 5; k += 2) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080028378 for (uint32_t n = 1; n <= 4; n++) {
28379 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028380 GemmMicrokernelTester()
28381 .mr(4)
28382 .nr(4)
28383 .kr(1)
28384 .sr(1)
28385 .m(m)
28386 .n(n)
28387 .k(k)
28388 .cm_stride(7)
28389 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080028390 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028391 }
28392 }
28393 }
28394}
28395
28396TEST(QC8_GEMM_MINMAX_FP32_4X4__SCALAR_FMAGIC, qmin) {
28397 GemmMicrokernelTester()
28398 .mr(4)
28399 .nr(4)
28400 .kr(1)
28401 .sr(1)
28402 .m(4)
28403 .n(4)
28404 .k(1)
28405 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080028406 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028407}
28408
28409TEST(QC8_GEMM_MINMAX_FP32_4X4__SCALAR_FMAGIC, qmax) {
28410 GemmMicrokernelTester()
28411 .mr(4)
28412 .nr(4)
28413 .kr(1)
28414 .sr(1)
28415 .m(4)
28416 .n(4)
28417 .k(1)
28418 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080028419 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028420}
28421
28422TEST(QC8_GEMM_MINMAX_FP32_4X4__SCALAR_FMAGIC, strided_cm) {
28423 GemmMicrokernelTester()
28424 .mr(4)
28425 .nr(4)
28426 .kr(1)
28427 .sr(1)
28428 .m(4)
28429 .n(4)
28430 .k(1)
28431 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080028432 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic, xnn_init_qs8_minmax_scalar_fmagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028433}
28434
28435
28436TEST(QC8_GEMM_MINMAX_FP32_2X2__SCALAR_IMAGIC, k_eq_1) {
28437 GemmMicrokernelTester()
28438 .mr(2)
28439 .nr(2)
28440 .kr(1)
28441 .sr(1)
28442 .m(2)
28443 .n(2)
28444 .k(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080028445 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028446}
28447
28448TEST(QC8_GEMM_MINMAX_FP32_2X2__SCALAR_IMAGIC, strided_cn) {
28449 GemmMicrokernelTester()
28450 .mr(2)
28451 .nr(2)
28452 .kr(1)
28453 .sr(1)
28454 .m(2)
28455 .n(2)
28456 .k(1)
28457 .cn_stride(5)
Marat Dukhan50323b82022-01-11 00:12:01 -080028458 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028459}
28460
28461TEST(QC8_GEMM_MINMAX_FP32_2X2__SCALAR_IMAGIC, k_eq_1_strided_a) {
28462 GemmMicrokernelTester()
28463 .mr(2)
28464 .nr(2)
28465 .kr(1)
28466 .sr(1)
28467 .m(2)
28468 .n(2)
28469 .k(1)
28470 .a_stride(3)
Marat Dukhan50323b82022-01-11 00:12:01 -080028471 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028472}
28473
28474TEST(QC8_GEMM_MINMAX_FP32_2X2__SCALAR_IMAGIC, k_eq_1_subtile) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080028475 for (uint32_t n = 1; n <= 2; n++) {
28476 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028477 GemmMicrokernelTester()
28478 .mr(2)
28479 .nr(2)
28480 .kr(1)
28481 .sr(1)
28482 .m(m)
28483 .n(n)
28484 .k(1)
28485 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080028486 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028487 }
28488 }
28489}
28490
28491TEST(QC8_GEMM_MINMAX_FP32_2X2__SCALAR_IMAGIC, k_eq_1_subtile_m) {
28492 for (uint32_t m = 1; m <= 2; m++) {
28493 GemmMicrokernelTester()
28494 .mr(2)
28495 .nr(2)
28496 .kr(1)
28497 .sr(1)
28498 .m(m)
28499 .n(2)
28500 .k(1)
28501 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080028502 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028503 }
28504}
28505
28506TEST(QC8_GEMM_MINMAX_FP32_2X2__SCALAR_IMAGIC, k_eq_1_subtile_n) {
28507 for (uint32_t n = 1; n <= 2; n++) {
28508 GemmMicrokernelTester()
28509 .mr(2)
28510 .nr(2)
28511 .kr(1)
28512 .sr(1)
28513 .m(2)
28514 .n(n)
28515 .k(1)
28516 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080028517 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028518 }
28519}
28520
28521TEST(QC8_GEMM_MINMAX_FP32_2X2__SCALAR_IMAGIC, k_gt_1) {
28522 for (size_t k = 2; k < 10; k++) {
28523 GemmMicrokernelTester()
28524 .mr(2)
28525 .nr(2)
28526 .kr(1)
28527 .sr(1)
28528 .m(2)
28529 .n(2)
28530 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080028531 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028532 }
28533}
28534
28535TEST(QC8_GEMM_MINMAX_FP32_2X2__SCALAR_IMAGIC, k_gt_1_strided_a) {
28536 for (size_t k = 2; k < 10; k++) {
28537 GemmMicrokernelTester()
28538 .mr(2)
28539 .nr(2)
28540 .kr(1)
28541 .sr(1)
28542 .m(2)
28543 .n(2)
28544 .k(k)
28545 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080028546 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028547 }
28548}
28549
28550TEST(QC8_GEMM_MINMAX_FP32_2X2__SCALAR_IMAGIC, k_gt_1_subtile) {
28551 for (size_t k = 2; k < 10; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080028552 for (uint32_t n = 1; n <= 2; n++) {
28553 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028554 GemmMicrokernelTester()
28555 .mr(2)
28556 .nr(2)
28557 .kr(1)
28558 .sr(1)
28559 .m(m)
28560 .n(n)
28561 .k(k)
28562 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080028563 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028564 }
28565 }
28566 }
28567}
28568
28569TEST(QC8_GEMM_MINMAX_FP32_2X2__SCALAR_IMAGIC, n_gt_2) {
28570 for (uint32_t n = 3; n < 4; n++) {
28571 for (size_t k = 1; k <= 5; k += 2) {
28572 GemmMicrokernelTester()
28573 .mr(2)
28574 .nr(2)
28575 .kr(1)
28576 .sr(1)
28577 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080028578 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028579 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080028580 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028581 }
28582 }
28583}
28584
28585TEST(QC8_GEMM_MINMAX_FP32_2X2__SCALAR_IMAGIC, n_gt_2_strided_cn) {
28586 for (uint32_t n = 3; n < 4; n++) {
28587 for (size_t k = 1; k <= 5; k += 2) {
28588 GemmMicrokernelTester()
28589 .mr(2)
28590 .nr(2)
28591 .kr(1)
28592 .sr(1)
28593 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080028594 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028595 .k(k)
28596 .cn_stride(5)
Marat Dukhan50323b82022-01-11 00:12:01 -080028597 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028598 }
28599 }
28600}
28601
28602TEST(QC8_GEMM_MINMAX_FP32_2X2__SCALAR_IMAGIC, n_gt_2_strided_a) {
28603 for (uint32_t n = 3; n < 4; n++) {
28604 for (size_t k = 1; k <= 5; k += 2) {
28605 GemmMicrokernelTester()
28606 .mr(2)
28607 .nr(2)
28608 .kr(1)
28609 .sr(1)
28610 .m(2)
28611 .n(n)
28612 .k(k)
28613 .a_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080028614 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028615 }
28616 }
28617}
28618
28619TEST(QC8_GEMM_MINMAX_FP32_2X2__SCALAR_IMAGIC, n_gt_2_subtile) {
28620 for (uint32_t n = 3; n < 4; n++) {
28621 for (size_t k = 1; k <= 5; k += 2) {
28622 for (uint32_t m = 1; m <= 2; m++) {
28623 GemmMicrokernelTester()
28624 .mr(2)
28625 .nr(2)
28626 .kr(1)
28627 .sr(1)
28628 .m(m)
28629 .n(n)
28630 .k(k)
28631 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080028632 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028633 }
28634 }
28635 }
28636}
28637
28638TEST(QC8_GEMM_MINMAX_FP32_2X2__SCALAR_IMAGIC, n_div_2) {
28639 for (uint32_t n = 4; n <= 6; n += 2) {
28640 for (size_t k = 1; k <= 5; k += 2) {
28641 GemmMicrokernelTester()
28642 .mr(2)
28643 .nr(2)
28644 .kr(1)
28645 .sr(1)
28646 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080028647 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028648 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080028649 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028650 }
28651 }
28652}
28653
28654TEST(QC8_GEMM_MINMAX_FP32_2X2__SCALAR_IMAGIC, n_div_2_strided_cn) {
28655 for (uint32_t n = 4; n <= 6; n += 2) {
28656 for (size_t k = 1; k <= 5; k += 2) {
28657 GemmMicrokernelTester()
28658 .mr(2)
28659 .nr(2)
28660 .kr(1)
28661 .sr(1)
28662 .m(2)
28663 .n(n)
28664 .k(k)
28665 .cn_stride(5)
Marat Dukhan50323b82022-01-11 00:12:01 -080028666 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028667 }
28668 }
28669}
28670
28671TEST(QC8_GEMM_MINMAX_FP32_2X2__SCALAR_IMAGIC, n_div_2_strided_a) {
28672 for (uint32_t n = 4; n <= 6; n += 2) {
28673 for (size_t k = 1; k <= 5; k += 2) {
28674 GemmMicrokernelTester()
28675 .mr(2)
28676 .nr(2)
28677 .kr(1)
28678 .sr(1)
28679 .m(2)
28680 .n(n)
28681 .k(k)
28682 .a_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080028683 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028684 }
28685 }
28686}
28687
28688TEST(QC8_GEMM_MINMAX_FP32_2X2__SCALAR_IMAGIC, n_div_2_subtile) {
28689 for (uint32_t n = 4; n <= 6; n += 2) {
28690 for (size_t k = 1; k <= 5; k += 2) {
28691 for (uint32_t m = 1; m <= 2; m++) {
28692 GemmMicrokernelTester()
28693 .mr(2)
28694 .nr(2)
28695 .kr(1)
28696 .sr(1)
28697 .m(m)
28698 .n(n)
28699 .k(k)
28700 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080028701 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028702 }
28703 }
28704 }
28705}
28706
28707TEST(QC8_GEMM_MINMAX_FP32_2X2__SCALAR_IMAGIC, strided_cm_subtile) {
28708 for (size_t k = 1; k <= 5; k += 2) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080028709 for (uint32_t n = 1; n <= 2; n++) {
28710 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028711 GemmMicrokernelTester()
28712 .mr(2)
28713 .nr(2)
28714 .kr(1)
28715 .sr(1)
28716 .m(m)
28717 .n(n)
28718 .k(k)
28719 .cm_stride(5)
28720 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080028721 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028722 }
28723 }
28724 }
28725}
28726
28727TEST(QC8_GEMM_MINMAX_FP32_2X2__SCALAR_IMAGIC, qmin) {
28728 GemmMicrokernelTester()
28729 .mr(2)
28730 .nr(2)
28731 .kr(1)
28732 .sr(1)
28733 .m(2)
28734 .n(2)
28735 .k(1)
28736 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080028737 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028738}
28739
28740TEST(QC8_GEMM_MINMAX_FP32_2X2__SCALAR_IMAGIC, qmax) {
28741 GemmMicrokernelTester()
28742 .mr(2)
28743 .nr(2)
28744 .kr(1)
28745 .sr(1)
28746 .m(2)
28747 .n(2)
28748 .k(1)
28749 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080028750 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028751}
28752
28753TEST(QC8_GEMM_MINMAX_FP32_2X2__SCALAR_IMAGIC, strided_cm) {
28754 GemmMicrokernelTester()
28755 .mr(2)
28756 .nr(2)
28757 .kr(1)
28758 .sr(1)
28759 .m(2)
28760 .n(2)
28761 .k(1)
28762 .cm_stride(5)
Marat Dukhan50323b82022-01-11 00:12:01 -080028763 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028764}
28765
28766
28767TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_IMAGIC, k_eq_1) {
28768 GemmMicrokernelTester()
28769 .mr(2)
28770 .nr(4)
28771 .kr(1)
28772 .sr(1)
28773 .m(2)
28774 .n(4)
28775 .k(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080028776 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028777}
28778
28779TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_IMAGIC, strided_cn) {
28780 GemmMicrokernelTester()
28781 .mr(2)
28782 .nr(4)
28783 .kr(1)
28784 .sr(1)
28785 .m(2)
28786 .n(4)
28787 .k(1)
28788 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080028789 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028790}
28791
28792TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_IMAGIC, k_eq_1_strided_a) {
28793 GemmMicrokernelTester()
28794 .mr(2)
28795 .nr(4)
28796 .kr(1)
28797 .sr(1)
28798 .m(2)
28799 .n(4)
28800 .k(1)
28801 .a_stride(3)
Marat Dukhan50323b82022-01-11 00:12:01 -080028802 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028803}
28804
28805TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_IMAGIC, k_eq_1_subtile) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080028806 for (uint32_t n = 1; n <= 4; n++) {
28807 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028808 GemmMicrokernelTester()
28809 .mr(2)
28810 .nr(4)
28811 .kr(1)
28812 .sr(1)
28813 .m(m)
28814 .n(n)
28815 .k(1)
28816 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080028817 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028818 }
28819 }
28820}
28821
28822TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_IMAGIC, k_eq_1_subtile_m) {
28823 for (uint32_t m = 1; m <= 2; m++) {
28824 GemmMicrokernelTester()
28825 .mr(2)
28826 .nr(4)
28827 .kr(1)
28828 .sr(1)
28829 .m(m)
28830 .n(4)
28831 .k(1)
28832 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080028833 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028834 }
28835}
28836
28837TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_IMAGIC, k_eq_1_subtile_n) {
28838 for (uint32_t n = 1; n <= 4; n++) {
28839 GemmMicrokernelTester()
28840 .mr(2)
28841 .nr(4)
28842 .kr(1)
28843 .sr(1)
28844 .m(2)
28845 .n(n)
28846 .k(1)
28847 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080028848 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028849 }
28850}
28851
28852TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_IMAGIC, k_gt_1) {
28853 for (size_t k = 2; k < 10; k++) {
28854 GemmMicrokernelTester()
28855 .mr(2)
28856 .nr(4)
28857 .kr(1)
28858 .sr(1)
28859 .m(2)
28860 .n(4)
28861 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080028862 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028863 }
28864}
28865
28866TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_IMAGIC, k_gt_1_strided_a) {
28867 for (size_t k = 2; k < 10; k++) {
28868 GemmMicrokernelTester()
28869 .mr(2)
28870 .nr(4)
28871 .kr(1)
28872 .sr(1)
28873 .m(2)
28874 .n(4)
28875 .k(k)
28876 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080028877 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028878 }
28879}
28880
28881TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_IMAGIC, k_gt_1_subtile) {
28882 for (size_t k = 2; k < 10; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080028883 for (uint32_t n = 1; n <= 4; n++) {
28884 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028885 GemmMicrokernelTester()
28886 .mr(2)
28887 .nr(4)
28888 .kr(1)
28889 .sr(1)
28890 .m(m)
28891 .n(n)
28892 .k(k)
28893 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080028894 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028895 }
28896 }
28897 }
28898}
28899
28900TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_IMAGIC, n_gt_4) {
28901 for (uint32_t n = 5; n < 8; n++) {
28902 for (size_t k = 1; k <= 5; k += 2) {
28903 GemmMicrokernelTester()
28904 .mr(2)
28905 .nr(4)
28906 .kr(1)
28907 .sr(1)
28908 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080028909 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028910 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080028911 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028912 }
28913 }
28914}
28915
28916TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_IMAGIC, n_gt_4_strided_cn) {
28917 for (uint32_t n = 5; n < 8; n++) {
28918 for (size_t k = 1; k <= 5; k += 2) {
28919 GemmMicrokernelTester()
28920 .mr(2)
28921 .nr(4)
28922 .kr(1)
28923 .sr(1)
28924 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080028925 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028926 .k(k)
28927 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080028928 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028929 }
28930 }
28931}
28932
28933TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_IMAGIC, n_gt_4_strided_a) {
28934 for (uint32_t n = 5; n < 8; n++) {
28935 for (size_t k = 1; k <= 5; k += 2) {
28936 GemmMicrokernelTester()
28937 .mr(2)
28938 .nr(4)
28939 .kr(1)
28940 .sr(1)
28941 .m(2)
28942 .n(n)
28943 .k(k)
28944 .a_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080028945 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028946 }
28947 }
28948}
28949
28950TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_IMAGIC, n_gt_4_subtile) {
28951 for (uint32_t n = 5; n < 8; n++) {
28952 for (size_t k = 1; k <= 5; k += 2) {
28953 for (uint32_t m = 1; m <= 2; m++) {
28954 GemmMicrokernelTester()
28955 .mr(2)
28956 .nr(4)
28957 .kr(1)
28958 .sr(1)
28959 .m(m)
28960 .n(n)
28961 .k(k)
28962 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080028963 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028964 }
28965 }
28966 }
28967}
28968
28969TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_IMAGIC, n_div_4) {
28970 for (uint32_t n = 8; n <= 12; n += 4) {
28971 for (size_t k = 1; k <= 5; k += 2) {
28972 GemmMicrokernelTester()
28973 .mr(2)
28974 .nr(4)
28975 .kr(1)
28976 .sr(1)
28977 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080028978 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028979 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080028980 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028981 }
28982 }
28983}
28984
28985TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_IMAGIC, n_div_4_strided_cn) {
28986 for (uint32_t n = 8; n <= 12; n += 4) {
28987 for (size_t k = 1; k <= 5; k += 2) {
28988 GemmMicrokernelTester()
28989 .mr(2)
28990 .nr(4)
28991 .kr(1)
28992 .sr(1)
28993 .m(2)
28994 .n(n)
28995 .k(k)
28996 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080028997 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080028998 }
28999 }
29000}
29001
29002TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_IMAGIC, n_div_4_strided_a) {
29003 for (uint32_t n = 8; n <= 12; n += 4) {
29004 for (size_t k = 1; k <= 5; k += 2) {
29005 GemmMicrokernelTester()
29006 .mr(2)
29007 .nr(4)
29008 .kr(1)
29009 .sr(1)
29010 .m(2)
29011 .n(n)
29012 .k(k)
29013 .a_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080029014 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029015 }
29016 }
29017}
29018
29019TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_IMAGIC, n_div_4_subtile) {
29020 for (uint32_t n = 8; n <= 12; n += 4) {
29021 for (size_t k = 1; k <= 5; k += 2) {
29022 for (uint32_t m = 1; m <= 2; m++) {
29023 GemmMicrokernelTester()
29024 .mr(2)
29025 .nr(4)
29026 .kr(1)
29027 .sr(1)
29028 .m(m)
29029 .n(n)
29030 .k(k)
29031 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080029032 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029033 }
29034 }
29035 }
29036}
29037
29038TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_IMAGIC, strided_cm_subtile) {
29039 for (size_t k = 1; k <= 5; k += 2) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080029040 for (uint32_t n = 1; n <= 4; n++) {
29041 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029042 GemmMicrokernelTester()
29043 .mr(2)
29044 .nr(4)
29045 .kr(1)
29046 .sr(1)
29047 .m(m)
29048 .n(n)
29049 .k(k)
29050 .cm_stride(7)
29051 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080029052 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029053 }
29054 }
29055 }
29056}
29057
29058TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_IMAGIC, qmin) {
29059 GemmMicrokernelTester()
29060 .mr(2)
29061 .nr(4)
29062 .kr(1)
29063 .sr(1)
29064 .m(2)
29065 .n(4)
29066 .k(1)
29067 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080029068 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029069}
29070
29071TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_IMAGIC, qmax) {
29072 GemmMicrokernelTester()
29073 .mr(2)
29074 .nr(4)
29075 .kr(1)
29076 .sr(1)
29077 .m(2)
29078 .n(4)
29079 .k(1)
29080 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080029081 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029082}
29083
29084TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_IMAGIC, strided_cm) {
29085 GemmMicrokernelTester()
29086 .mr(2)
29087 .nr(4)
29088 .kr(1)
29089 .sr(1)
29090 .m(2)
29091 .n(4)
29092 .k(1)
29093 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080029094 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029095}
29096
29097
29098TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_IMAGIC, k_eq_1) {
29099 GemmMicrokernelTester()
29100 .mr(3)
29101 .nr(4)
29102 .kr(1)
29103 .sr(1)
29104 .m(3)
29105 .n(4)
29106 .k(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080029107 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029108}
29109
29110TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_IMAGIC, strided_cn) {
29111 GemmMicrokernelTester()
29112 .mr(3)
29113 .nr(4)
29114 .kr(1)
29115 .sr(1)
29116 .m(3)
29117 .n(4)
29118 .k(1)
29119 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080029120 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029121}
29122
29123TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_IMAGIC, k_eq_1_strided_a) {
29124 GemmMicrokernelTester()
29125 .mr(3)
29126 .nr(4)
29127 .kr(1)
29128 .sr(1)
29129 .m(3)
29130 .n(4)
29131 .k(1)
29132 .a_stride(3)
Marat Dukhan50323b82022-01-11 00:12:01 -080029133 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029134}
29135
29136TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_IMAGIC, k_eq_1_subtile) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080029137 for (uint32_t n = 1; n <= 4; n++) {
29138 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029139 GemmMicrokernelTester()
29140 .mr(3)
29141 .nr(4)
29142 .kr(1)
29143 .sr(1)
29144 .m(m)
29145 .n(n)
29146 .k(1)
29147 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080029148 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029149 }
29150 }
29151}
29152
29153TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_IMAGIC, k_eq_1_subtile_m) {
29154 for (uint32_t m = 1; m <= 3; m++) {
29155 GemmMicrokernelTester()
29156 .mr(3)
29157 .nr(4)
29158 .kr(1)
29159 .sr(1)
29160 .m(m)
29161 .n(4)
29162 .k(1)
29163 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080029164 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029165 }
29166}
29167
29168TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_IMAGIC, k_eq_1_subtile_n) {
29169 for (uint32_t n = 1; n <= 4; n++) {
29170 GemmMicrokernelTester()
29171 .mr(3)
29172 .nr(4)
29173 .kr(1)
29174 .sr(1)
29175 .m(3)
29176 .n(n)
29177 .k(1)
29178 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080029179 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029180 }
29181}
29182
29183TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_IMAGIC, k_gt_1) {
29184 for (size_t k = 2; k < 10; k++) {
29185 GemmMicrokernelTester()
29186 .mr(3)
29187 .nr(4)
29188 .kr(1)
29189 .sr(1)
29190 .m(3)
29191 .n(4)
29192 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080029193 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029194 }
29195}
29196
29197TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_IMAGIC, k_gt_1_strided_a) {
29198 for (size_t k = 2; k < 10; k++) {
29199 GemmMicrokernelTester()
29200 .mr(3)
29201 .nr(4)
29202 .kr(1)
29203 .sr(1)
29204 .m(3)
29205 .n(4)
29206 .k(k)
29207 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080029208 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029209 }
29210}
29211
29212TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_IMAGIC, k_gt_1_subtile) {
29213 for (size_t k = 2; k < 10; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080029214 for (uint32_t n = 1; n <= 4; n++) {
29215 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029216 GemmMicrokernelTester()
29217 .mr(3)
29218 .nr(4)
29219 .kr(1)
29220 .sr(1)
29221 .m(m)
29222 .n(n)
29223 .k(k)
29224 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080029225 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029226 }
29227 }
29228 }
29229}
29230
29231TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_IMAGIC, n_gt_4) {
29232 for (uint32_t n = 5; n < 8; n++) {
29233 for (size_t k = 1; k <= 5; k += 2) {
29234 GemmMicrokernelTester()
29235 .mr(3)
29236 .nr(4)
29237 .kr(1)
29238 .sr(1)
29239 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080029240 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029241 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080029242 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029243 }
29244 }
29245}
29246
29247TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_IMAGIC, n_gt_4_strided_cn) {
29248 for (uint32_t n = 5; n < 8; n++) {
29249 for (size_t k = 1; k <= 5; k += 2) {
29250 GemmMicrokernelTester()
29251 .mr(3)
29252 .nr(4)
29253 .kr(1)
29254 .sr(1)
29255 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080029256 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029257 .k(k)
29258 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080029259 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029260 }
29261 }
29262}
29263
29264TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_IMAGIC, n_gt_4_strided_a) {
29265 for (uint32_t n = 5; n < 8; n++) {
29266 for (size_t k = 1; k <= 5; k += 2) {
29267 GemmMicrokernelTester()
29268 .mr(3)
29269 .nr(4)
29270 .kr(1)
29271 .sr(1)
29272 .m(3)
29273 .n(n)
29274 .k(k)
29275 .a_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080029276 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029277 }
29278 }
29279}
29280
29281TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_IMAGIC, n_gt_4_subtile) {
29282 for (uint32_t n = 5; n < 8; n++) {
29283 for (size_t k = 1; k <= 5; k += 2) {
29284 for (uint32_t m = 1; m <= 3; m++) {
29285 GemmMicrokernelTester()
29286 .mr(3)
29287 .nr(4)
29288 .kr(1)
29289 .sr(1)
29290 .m(m)
29291 .n(n)
29292 .k(k)
29293 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080029294 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029295 }
29296 }
29297 }
29298}
29299
29300TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_IMAGIC, n_div_4) {
29301 for (uint32_t n = 8; n <= 12; n += 4) {
29302 for (size_t k = 1; k <= 5; k += 2) {
29303 GemmMicrokernelTester()
29304 .mr(3)
29305 .nr(4)
29306 .kr(1)
29307 .sr(1)
29308 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080029309 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029310 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080029311 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029312 }
29313 }
29314}
29315
29316TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_IMAGIC, n_div_4_strided_cn) {
29317 for (uint32_t n = 8; n <= 12; n += 4) {
29318 for (size_t k = 1; k <= 5; k += 2) {
29319 GemmMicrokernelTester()
29320 .mr(3)
29321 .nr(4)
29322 .kr(1)
29323 .sr(1)
29324 .m(3)
29325 .n(n)
29326 .k(k)
29327 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080029328 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029329 }
29330 }
29331}
29332
29333TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_IMAGIC, n_div_4_strided_a) {
29334 for (uint32_t n = 8; n <= 12; n += 4) {
29335 for (size_t k = 1; k <= 5; k += 2) {
29336 GemmMicrokernelTester()
29337 .mr(3)
29338 .nr(4)
29339 .kr(1)
29340 .sr(1)
29341 .m(3)
29342 .n(n)
29343 .k(k)
29344 .a_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080029345 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029346 }
29347 }
29348}
29349
29350TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_IMAGIC, n_div_4_subtile) {
29351 for (uint32_t n = 8; n <= 12; n += 4) {
29352 for (size_t k = 1; k <= 5; k += 2) {
29353 for (uint32_t m = 1; m <= 3; m++) {
29354 GemmMicrokernelTester()
29355 .mr(3)
29356 .nr(4)
29357 .kr(1)
29358 .sr(1)
29359 .m(m)
29360 .n(n)
29361 .k(k)
29362 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080029363 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029364 }
29365 }
29366 }
29367}
29368
29369TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_IMAGIC, strided_cm_subtile) {
29370 for (size_t k = 1; k <= 5; k += 2) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080029371 for (uint32_t n = 1; n <= 4; n++) {
29372 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029373 GemmMicrokernelTester()
29374 .mr(3)
29375 .nr(4)
29376 .kr(1)
29377 .sr(1)
29378 .m(m)
29379 .n(n)
29380 .k(k)
29381 .cm_stride(7)
29382 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080029383 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029384 }
29385 }
29386 }
29387}
29388
29389TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_IMAGIC, qmin) {
29390 GemmMicrokernelTester()
29391 .mr(3)
29392 .nr(4)
29393 .kr(1)
29394 .sr(1)
29395 .m(3)
29396 .n(4)
29397 .k(1)
29398 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080029399 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029400}
29401
29402TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_IMAGIC, qmax) {
29403 GemmMicrokernelTester()
29404 .mr(3)
29405 .nr(4)
29406 .kr(1)
29407 .sr(1)
29408 .m(3)
29409 .n(4)
29410 .k(1)
29411 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080029412 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029413}
29414
29415TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_IMAGIC, strided_cm) {
29416 GemmMicrokernelTester()
29417 .mr(3)
29418 .nr(4)
29419 .kr(1)
29420 .sr(1)
29421 .m(3)
29422 .n(4)
29423 .k(1)
29424 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080029425 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_imagic, xnn_init_qs8_minmax_scalar_imagic_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029426}
29427
29428
29429TEST(QC8_GEMM_MINMAX_FP32_1X2__SCALAR_LRINTF, k_eq_1) {
29430 GemmMicrokernelTester()
29431 .mr(1)
29432 .nr(2)
29433 .kr(1)
29434 .sr(1)
29435 .m(1)
29436 .n(2)
29437 .k(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080029438 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x2__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029439}
29440
29441TEST(QC8_GEMM_MINMAX_FP32_1X2__SCALAR_LRINTF, strided_cn) {
29442 GemmMicrokernelTester()
29443 .mr(1)
29444 .nr(2)
29445 .kr(1)
29446 .sr(1)
29447 .m(1)
29448 .n(2)
29449 .k(1)
29450 .cn_stride(5)
Marat Dukhan50323b82022-01-11 00:12:01 -080029451 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x2__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029452}
29453
29454TEST(QC8_GEMM_MINMAX_FP32_1X2__SCALAR_LRINTF, k_eq_1_strided_a) {
29455 GemmMicrokernelTester()
29456 .mr(1)
29457 .nr(2)
29458 .kr(1)
29459 .sr(1)
29460 .m(1)
29461 .n(2)
29462 .k(1)
29463 .a_stride(3)
Marat Dukhan50323b82022-01-11 00:12:01 -080029464 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x2__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029465}
29466
29467TEST(QC8_GEMM_MINMAX_FP32_1X2__SCALAR_LRINTF, k_eq_1_subtile) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080029468 for (uint32_t n = 1; n <= 2; n++) {
29469 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029470 GemmMicrokernelTester()
29471 .mr(1)
29472 .nr(2)
29473 .kr(1)
29474 .sr(1)
29475 .m(m)
29476 .n(n)
29477 .k(1)
29478 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080029479 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x2__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029480 }
29481 }
29482}
29483
29484TEST(QC8_GEMM_MINMAX_FP32_1X2__SCALAR_LRINTF, k_eq_1_subtile_m) {
29485 for (uint32_t m = 1; m <= 1; m++) {
29486 GemmMicrokernelTester()
29487 .mr(1)
29488 .nr(2)
29489 .kr(1)
29490 .sr(1)
29491 .m(m)
29492 .n(2)
29493 .k(1)
29494 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080029495 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x2__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029496 }
29497}
29498
29499TEST(QC8_GEMM_MINMAX_FP32_1X2__SCALAR_LRINTF, k_eq_1_subtile_n) {
29500 for (uint32_t n = 1; n <= 2; n++) {
29501 GemmMicrokernelTester()
29502 .mr(1)
29503 .nr(2)
29504 .kr(1)
29505 .sr(1)
29506 .m(1)
29507 .n(n)
29508 .k(1)
29509 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080029510 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x2__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029511 }
29512}
29513
29514TEST(QC8_GEMM_MINMAX_FP32_1X2__SCALAR_LRINTF, k_gt_1) {
29515 for (size_t k = 2; k < 10; k++) {
29516 GemmMicrokernelTester()
29517 .mr(1)
29518 .nr(2)
29519 .kr(1)
29520 .sr(1)
29521 .m(1)
29522 .n(2)
29523 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080029524 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x2__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029525 }
29526}
29527
29528TEST(QC8_GEMM_MINMAX_FP32_1X2__SCALAR_LRINTF, k_gt_1_strided_a) {
29529 for (size_t k = 2; k < 10; k++) {
29530 GemmMicrokernelTester()
29531 .mr(1)
29532 .nr(2)
29533 .kr(1)
29534 .sr(1)
29535 .m(1)
29536 .n(2)
29537 .k(k)
29538 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080029539 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x2__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029540 }
29541}
29542
29543TEST(QC8_GEMM_MINMAX_FP32_1X2__SCALAR_LRINTF, k_gt_1_subtile) {
29544 for (size_t k = 2; k < 10; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080029545 for (uint32_t n = 1; n <= 2; n++) {
29546 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029547 GemmMicrokernelTester()
29548 .mr(1)
29549 .nr(2)
29550 .kr(1)
29551 .sr(1)
29552 .m(m)
29553 .n(n)
29554 .k(k)
29555 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080029556 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x2__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029557 }
29558 }
29559 }
29560}
29561
29562TEST(QC8_GEMM_MINMAX_FP32_1X2__SCALAR_LRINTF, n_gt_2) {
29563 for (uint32_t n = 3; n < 4; n++) {
29564 for (size_t k = 1; k <= 5; k += 2) {
29565 GemmMicrokernelTester()
29566 .mr(1)
29567 .nr(2)
29568 .kr(1)
29569 .sr(1)
29570 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080029571 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029572 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080029573 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x2__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029574 }
29575 }
29576}
29577
29578TEST(QC8_GEMM_MINMAX_FP32_1X2__SCALAR_LRINTF, n_gt_2_strided_cn) {
29579 for (uint32_t n = 3; n < 4; n++) {
29580 for (size_t k = 1; k <= 5; k += 2) {
29581 GemmMicrokernelTester()
29582 .mr(1)
29583 .nr(2)
29584 .kr(1)
29585 .sr(1)
29586 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080029587 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029588 .k(k)
29589 .cn_stride(5)
Marat Dukhan50323b82022-01-11 00:12:01 -080029590 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x2__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029591 }
29592 }
29593}
29594
29595TEST(QC8_GEMM_MINMAX_FP32_1X2__SCALAR_LRINTF, n_gt_2_strided_a) {
29596 for (uint32_t n = 3; n < 4; n++) {
29597 for (size_t k = 1; k <= 5; k += 2) {
29598 GemmMicrokernelTester()
29599 .mr(1)
29600 .nr(2)
29601 .kr(1)
29602 .sr(1)
29603 .m(1)
29604 .n(n)
29605 .k(k)
29606 .a_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080029607 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x2__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029608 }
29609 }
29610}
29611
29612TEST(QC8_GEMM_MINMAX_FP32_1X2__SCALAR_LRINTF, n_gt_2_subtile) {
29613 for (uint32_t n = 3; n < 4; n++) {
29614 for (size_t k = 1; k <= 5; k += 2) {
29615 for (uint32_t m = 1; m <= 1; m++) {
29616 GemmMicrokernelTester()
29617 .mr(1)
29618 .nr(2)
29619 .kr(1)
29620 .sr(1)
29621 .m(m)
29622 .n(n)
29623 .k(k)
29624 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080029625 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x2__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029626 }
29627 }
29628 }
29629}
29630
29631TEST(QC8_GEMM_MINMAX_FP32_1X2__SCALAR_LRINTF, n_div_2) {
29632 for (uint32_t n = 4; n <= 6; n += 2) {
29633 for (size_t k = 1; k <= 5; k += 2) {
29634 GemmMicrokernelTester()
29635 .mr(1)
29636 .nr(2)
29637 .kr(1)
29638 .sr(1)
29639 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080029640 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029641 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080029642 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x2__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029643 }
29644 }
29645}
29646
29647TEST(QC8_GEMM_MINMAX_FP32_1X2__SCALAR_LRINTF, n_div_2_strided_cn) {
29648 for (uint32_t n = 4; n <= 6; n += 2) {
29649 for (size_t k = 1; k <= 5; k += 2) {
29650 GemmMicrokernelTester()
29651 .mr(1)
29652 .nr(2)
29653 .kr(1)
29654 .sr(1)
29655 .m(1)
29656 .n(n)
29657 .k(k)
29658 .cn_stride(5)
Marat Dukhan50323b82022-01-11 00:12:01 -080029659 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x2__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029660 }
29661 }
29662}
29663
29664TEST(QC8_GEMM_MINMAX_FP32_1X2__SCALAR_LRINTF, n_div_2_strided_a) {
29665 for (uint32_t n = 4; n <= 6; n += 2) {
29666 for (size_t k = 1; k <= 5; k += 2) {
29667 GemmMicrokernelTester()
29668 .mr(1)
29669 .nr(2)
29670 .kr(1)
29671 .sr(1)
29672 .m(1)
29673 .n(n)
29674 .k(k)
29675 .a_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080029676 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x2__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029677 }
29678 }
29679}
29680
29681TEST(QC8_GEMM_MINMAX_FP32_1X2__SCALAR_LRINTF, n_div_2_subtile) {
29682 for (uint32_t n = 4; n <= 6; n += 2) {
29683 for (size_t k = 1; k <= 5; k += 2) {
29684 for (uint32_t m = 1; m <= 1; m++) {
29685 GemmMicrokernelTester()
29686 .mr(1)
29687 .nr(2)
29688 .kr(1)
29689 .sr(1)
29690 .m(m)
29691 .n(n)
29692 .k(k)
29693 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080029694 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x2__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029695 }
29696 }
29697 }
29698}
29699
29700TEST(QC8_GEMM_MINMAX_FP32_1X2__SCALAR_LRINTF, strided_cm_subtile) {
29701 for (size_t k = 1; k <= 5; k += 2) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080029702 for (uint32_t n = 1; n <= 2; n++) {
29703 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029704 GemmMicrokernelTester()
29705 .mr(1)
29706 .nr(2)
29707 .kr(1)
29708 .sr(1)
29709 .m(m)
29710 .n(n)
29711 .k(k)
29712 .cm_stride(5)
29713 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080029714 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x2__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029715 }
29716 }
29717 }
29718}
29719
29720TEST(QC8_GEMM_MINMAX_FP32_1X2__SCALAR_LRINTF, qmin) {
29721 GemmMicrokernelTester()
29722 .mr(1)
29723 .nr(2)
29724 .kr(1)
29725 .sr(1)
29726 .m(1)
29727 .n(2)
29728 .k(1)
29729 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080029730 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x2__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029731}
29732
29733TEST(QC8_GEMM_MINMAX_FP32_1X2__SCALAR_LRINTF, qmax) {
29734 GemmMicrokernelTester()
29735 .mr(1)
29736 .nr(2)
29737 .kr(1)
29738 .sr(1)
29739 .m(1)
29740 .n(2)
29741 .k(1)
29742 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080029743 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x2__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029744}
29745
29746TEST(QC8_GEMM_MINMAX_FP32_1X2__SCALAR_LRINTF, strided_cm) {
29747 GemmMicrokernelTester()
29748 .mr(1)
29749 .nr(2)
29750 .kr(1)
29751 .sr(1)
29752 .m(1)
29753 .n(2)
29754 .k(1)
29755 .cm_stride(5)
Marat Dukhan50323b82022-01-11 00:12:01 -080029756 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x2__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029757}
29758
29759
29760TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_LRINTF, k_eq_1) {
29761 GemmMicrokernelTester()
29762 .mr(1)
29763 .nr(4)
29764 .kr(1)
29765 .sr(1)
29766 .m(1)
29767 .n(4)
29768 .k(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080029769 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029770}
29771
29772TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_LRINTF, strided_cn) {
29773 GemmMicrokernelTester()
29774 .mr(1)
29775 .nr(4)
29776 .kr(1)
29777 .sr(1)
29778 .m(1)
29779 .n(4)
29780 .k(1)
29781 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080029782 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029783}
29784
29785TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_LRINTF, k_eq_1_strided_a) {
29786 GemmMicrokernelTester()
29787 .mr(1)
29788 .nr(4)
29789 .kr(1)
29790 .sr(1)
29791 .m(1)
29792 .n(4)
29793 .k(1)
29794 .a_stride(3)
Marat Dukhan50323b82022-01-11 00:12:01 -080029795 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029796}
29797
29798TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_LRINTF, k_eq_1_subtile) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080029799 for (uint32_t n = 1; n <= 4; n++) {
29800 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029801 GemmMicrokernelTester()
29802 .mr(1)
29803 .nr(4)
29804 .kr(1)
29805 .sr(1)
29806 .m(m)
29807 .n(n)
29808 .k(1)
29809 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080029810 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029811 }
29812 }
29813}
29814
29815TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_LRINTF, k_eq_1_subtile_m) {
29816 for (uint32_t m = 1; m <= 1; m++) {
29817 GemmMicrokernelTester()
29818 .mr(1)
29819 .nr(4)
29820 .kr(1)
29821 .sr(1)
29822 .m(m)
29823 .n(4)
29824 .k(1)
29825 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080029826 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029827 }
29828}
29829
29830TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_LRINTF, k_eq_1_subtile_n) {
29831 for (uint32_t n = 1; n <= 4; n++) {
29832 GemmMicrokernelTester()
29833 .mr(1)
29834 .nr(4)
29835 .kr(1)
29836 .sr(1)
29837 .m(1)
29838 .n(n)
29839 .k(1)
29840 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080029841 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029842 }
29843}
29844
29845TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_LRINTF, k_gt_1) {
29846 for (size_t k = 2; k < 10; k++) {
29847 GemmMicrokernelTester()
29848 .mr(1)
29849 .nr(4)
29850 .kr(1)
29851 .sr(1)
29852 .m(1)
29853 .n(4)
29854 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080029855 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029856 }
29857}
29858
29859TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_LRINTF, k_gt_1_strided_a) {
29860 for (size_t k = 2; k < 10; k++) {
29861 GemmMicrokernelTester()
29862 .mr(1)
29863 .nr(4)
29864 .kr(1)
29865 .sr(1)
29866 .m(1)
29867 .n(4)
29868 .k(k)
29869 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080029870 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029871 }
29872}
29873
29874TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_LRINTF, k_gt_1_subtile) {
29875 for (size_t k = 2; k < 10; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080029876 for (uint32_t n = 1; n <= 4; n++) {
29877 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029878 GemmMicrokernelTester()
29879 .mr(1)
29880 .nr(4)
29881 .kr(1)
29882 .sr(1)
29883 .m(m)
29884 .n(n)
29885 .k(k)
29886 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080029887 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029888 }
29889 }
29890 }
29891}
29892
29893TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_LRINTF, n_gt_4) {
29894 for (uint32_t n = 5; n < 8; n++) {
29895 for (size_t k = 1; k <= 5; k += 2) {
29896 GemmMicrokernelTester()
29897 .mr(1)
29898 .nr(4)
29899 .kr(1)
29900 .sr(1)
29901 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080029902 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029903 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080029904 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029905 }
29906 }
29907}
29908
29909TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_LRINTF, n_gt_4_strided_cn) {
29910 for (uint32_t n = 5; n < 8; n++) {
29911 for (size_t k = 1; k <= 5; k += 2) {
29912 GemmMicrokernelTester()
29913 .mr(1)
29914 .nr(4)
29915 .kr(1)
29916 .sr(1)
29917 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080029918 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029919 .k(k)
29920 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080029921 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029922 }
29923 }
29924}
29925
29926TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_LRINTF, n_gt_4_strided_a) {
29927 for (uint32_t n = 5; n < 8; n++) {
29928 for (size_t k = 1; k <= 5; k += 2) {
29929 GemmMicrokernelTester()
29930 .mr(1)
29931 .nr(4)
29932 .kr(1)
29933 .sr(1)
29934 .m(1)
29935 .n(n)
29936 .k(k)
29937 .a_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080029938 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029939 }
29940 }
29941}
29942
29943TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_LRINTF, n_gt_4_subtile) {
29944 for (uint32_t n = 5; n < 8; n++) {
29945 for (size_t k = 1; k <= 5; k += 2) {
29946 for (uint32_t m = 1; m <= 1; m++) {
29947 GemmMicrokernelTester()
29948 .mr(1)
29949 .nr(4)
29950 .kr(1)
29951 .sr(1)
29952 .m(m)
29953 .n(n)
29954 .k(k)
29955 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080029956 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029957 }
29958 }
29959 }
29960}
29961
29962TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_LRINTF, n_div_4) {
29963 for (uint32_t n = 8; n <= 12; n += 4) {
29964 for (size_t k = 1; k <= 5; k += 2) {
29965 GemmMicrokernelTester()
29966 .mr(1)
29967 .nr(4)
29968 .kr(1)
29969 .sr(1)
29970 .m(1)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080029971 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029972 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080029973 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029974 }
29975 }
29976}
29977
29978TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_LRINTF, n_div_4_strided_cn) {
29979 for (uint32_t n = 8; n <= 12; n += 4) {
29980 for (size_t k = 1; k <= 5; k += 2) {
29981 GemmMicrokernelTester()
29982 .mr(1)
29983 .nr(4)
29984 .kr(1)
29985 .sr(1)
29986 .m(1)
29987 .n(n)
29988 .k(k)
29989 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080029990 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080029991 }
29992 }
29993}
29994
29995TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_LRINTF, n_div_4_strided_a) {
29996 for (uint32_t n = 8; n <= 12; n += 4) {
29997 for (size_t k = 1; k <= 5; k += 2) {
29998 GemmMicrokernelTester()
29999 .mr(1)
30000 .nr(4)
30001 .kr(1)
30002 .sr(1)
30003 .m(1)
30004 .n(n)
30005 .k(k)
30006 .a_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080030007 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030008 }
30009 }
30010}
30011
30012TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_LRINTF, n_div_4_subtile) {
30013 for (uint32_t n = 8; n <= 12; n += 4) {
30014 for (size_t k = 1; k <= 5; k += 2) {
30015 for (uint32_t m = 1; m <= 1; m++) {
30016 GemmMicrokernelTester()
30017 .mr(1)
30018 .nr(4)
30019 .kr(1)
30020 .sr(1)
30021 .m(m)
30022 .n(n)
30023 .k(k)
30024 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080030025 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030026 }
30027 }
30028 }
30029}
30030
30031TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_LRINTF, strided_cm_subtile) {
30032 for (size_t k = 1; k <= 5; k += 2) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080030033 for (uint32_t n = 1; n <= 4; n++) {
30034 for (uint32_t m = 1; m <= 1; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030035 GemmMicrokernelTester()
30036 .mr(1)
30037 .nr(4)
30038 .kr(1)
30039 .sr(1)
30040 .m(m)
30041 .n(n)
30042 .k(k)
30043 .cm_stride(7)
30044 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080030045 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030046 }
30047 }
30048 }
30049}
30050
30051TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_LRINTF, qmin) {
30052 GemmMicrokernelTester()
30053 .mr(1)
30054 .nr(4)
30055 .kr(1)
30056 .sr(1)
30057 .m(1)
30058 .n(4)
30059 .k(1)
30060 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080030061 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030062}
30063
30064TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_LRINTF, qmax) {
30065 GemmMicrokernelTester()
30066 .mr(1)
30067 .nr(4)
30068 .kr(1)
30069 .sr(1)
30070 .m(1)
30071 .n(4)
30072 .k(1)
30073 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080030074 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030075}
30076
30077TEST(QC8_GEMM_MINMAX_FP32_1X4__SCALAR_LRINTF, strided_cm) {
30078 GemmMicrokernelTester()
30079 .mr(1)
30080 .nr(4)
30081 .kr(1)
30082 .sr(1)
30083 .m(1)
30084 .n(4)
30085 .k(1)
30086 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080030087 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030088}
30089
30090
30091TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_LRINTF, k_eq_1) {
30092 GemmMicrokernelTester()
30093 .mr(2)
30094 .nr(4)
30095 .kr(1)
30096 .sr(1)
30097 .m(2)
30098 .n(4)
30099 .k(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080030100 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030101}
30102
30103TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_LRINTF, strided_cn) {
30104 GemmMicrokernelTester()
30105 .mr(2)
30106 .nr(4)
30107 .kr(1)
30108 .sr(1)
30109 .m(2)
30110 .n(4)
30111 .k(1)
30112 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080030113 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030114}
30115
30116TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_LRINTF, k_eq_1_strided_a) {
30117 GemmMicrokernelTester()
30118 .mr(2)
30119 .nr(4)
30120 .kr(1)
30121 .sr(1)
30122 .m(2)
30123 .n(4)
30124 .k(1)
30125 .a_stride(3)
Marat Dukhan50323b82022-01-11 00:12:01 -080030126 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030127}
30128
30129TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_LRINTF, k_eq_1_subtile) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080030130 for (uint32_t n = 1; n <= 4; n++) {
30131 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030132 GemmMicrokernelTester()
30133 .mr(2)
30134 .nr(4)
30135 .kr(1)
30136 .sr(1)
30137 .m(m)
30138 .n(n)
30139 .k(1)
30140 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080030141 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030142 }
30143 }
30144}
30145
30146TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_LRINTF, k_eq_1_subtile_m) {
30147 for (uint32_t m = 1; m <= 2; m++) {
30148 GemmMicrokernelTester()
30149 .mr(2)
30150 .nr(4)
30151 .kr(1)
30152 .sr(1)
30153 .m(m)
30154 .n(4)
30155 .k(1)
30156 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080030157 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030158 }
30159}
30160
30161TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_LRINTF, k_eq_1_subtile_n) {
30162 for (uint32_t n = 1; n <= 4; n++) {
30163 GemmMicrokernelTester()
30164 .mr(2)
30165 .nr(4)
30166 .kr(1)
30167 .sr(1)
30168 .m(2)
30169 .n(n)
30170 .k(1)
30171 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080030172 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030173 }
30174}
30175
30176TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_LRINTF, k_gt_1) {
30177 for (size_t k = 2; k < 10; k++) {
30178 GemmMicrokernelTester()
30179 .mr(2)
30180 .nr(4)
30181 .kr(1)
30182 .sr(1)
30183 .m(2)
30184 .n(4)
30185 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080030186 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030187 }
30188}
30189
30190TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_LRINTF, k_gt_1_strided_a) {
30191 for (size_t k = 2; k < 10; k++) {
30192 GemmMicrokernelTester()
30193 .mr(2)
30194 .nr(4)
30195 .kr(1)
30196 .sr(1)
30197 .m(2)
30198 .n(4)
30199 .k(k)
30200 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080030201 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030202 }
30203}
30204
30205TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_LRINTF, k_gt_1_subtile) {
30206 for (size_t k = 2; k < 10; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080030207 for (uint32_t n = 1; n <= 4; n++) {
30208 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030209 GemmMicrokernelTester()
30210 .mr(2)
30211 .nr(4)
30212 .kr(1)
30213 .sr(1)
30214 .m(m)
30215 .n(n)
30216 .k(k)
30217 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080030218 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030219 }
30220 }
30221 }
30222}
30223
30224TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_LRINTF, n_gt_4) {
30225 for (uint32_t n = 5; n < 8; n++) {
30226 for (size_t k = 1; k <= 5; k += 2) {
30227 GemmMicrokernelTester()
30228 .mr(2)
30229 .nr(4)
30230 .kr(1)
30231 .sr(1)
30232 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080030233 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030234 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080030235 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030236 }
30237 }
30238}
30239
30240TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_LRINTF, n_gt_4_strided_cn) {
30241 for (uint32_t n = 5; n < 8; n++) {
30242 for (size_t k = 1; k <= 5; k += 2) {
30243 GemmMicrokernelTester()
30244 .mr(2)
30245 .nr(4)
30246 .kr(1)
30247 .sr(1)
30248 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080030249 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030250 .k(k)
30251 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080030252 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030253 }
30254 }
30255}
30256
30257TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_LRINTF, n_gt_4_strided_a) {
30258 for (uint32_t n = 5; n < 8; n++) {
30259 for (size_t k = 1; k <= 5; k += 2) {
30260 GemmMicrokernelTester()
30261 .mr(2)
30262 .nr(4)
30263 .kr(1)
30264 .sr(1)
30265 .m(2)
30266 .n(n)
30267 .k(k)
30268 .a_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080030269 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030270 }
30271 }
30272}
30273
30274TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_LRINTF, n_gt_4_subtile) {
30275 for (uint32_t n = 5; n < 8; n++) {
30276 for (size_t k = 1; k <= 5; k += 2) {
30277 for (uint32_t m = 1; m <= 2; m++) {
30278 GemmMicrokernelTester()
30279 .mr(2)
30280 .nr(4)
30281 .kr(1)
30282 .sr(1)
30283 .m(m)
30284 .n(n)
30285 .k(k)
30286 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080030287 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030288 }
30289 }
30290 }
30291}
30292
30293TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_LRINTF, n_div_4) {
30294 for (uint32_t n = 8; n <= 12; n += 4) {
30295 for (size_t k = 1; k <= 5; k += 2) {
30296 GemmMicrokernelTester()
30297 .mr(2)
30298 .nr(4)
30299 .kr(1)
30300 .sr(1)
30301 .m(2)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080030302 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030303 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080030304 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030305 }
30306 }
30307}
30308
30309TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_LRINTF, n_div_4_strided_cn) {
30310 for (uint32_t n = 8; n <= 12; n += 4) {
30311 for (size_t k = 1; k <= 5; k += 2) {
30312 GemmMicrokernelTester()
30313 .mr(2)
30314 .nr(4)
30315 .kr(1)
30316 .sr(1)
30317 .m(2)
30318 .n(n)
30319 .k(k)
30320 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080030321 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030322 }
30323 }
30324}
30325
30326TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_LRINTF, n_div_4_strided_a) {
30327 for (uint32_t n = 8; n <= 12; n += 4) {
30328 for (size_t k = 1; k <= 5; k += 2) {
30329 GemmMicrokernelTester()
30330 .mr(2)
30331 .nr(4)
30332 .kr(1)
30333 .sr(1)
30334 .m(2)
30335 .n(n)
30336 .k(k)
30337 .a_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080030338 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030339 }
30340 }
30341}
30342
30343TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_LRINTF, n_div_4_subtile) {
30344 for (uint32_t n = 8; n <= 12; n += 4) {
30345 for (size_t k = 1; k <= 5; k += 2) {
30346 for (uint32_t m = 1; m <= 2; m++) {
30347 GemmMicrokernelTester()
30348 .mr(2)
30349 .nr(4)
30350 .kr(1)
30351 .sr(1)
30352 .m(m)
30353 .n(n)
30354 .k(k)
30355 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080030356 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030357 }
30358 }
30359 }
30360}
30361
30362TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_LRINTF, strided_cm_subtile) {
30363 for (size_t k = 1; k <= 5; k += 2) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080030364 for (uint32_t n = 1; n <= 4; n++) {
30365 for (uint32_t m = 1; m <= 2; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030366 GemmMicrokernelTester()
30367 .mr(2)
30368 .nr(4)
30369 .kr(1)
30370 .sr(1)
30371 .m(m)
30372 .n(n)
30373 .k(k)
30374 .cm_stride(7)
30375 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080030376 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030377 }
30378 }
30379 }
30380}
30381
30382TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_LRINTF, qmin) {
30383 GemmMicrokernelTester()
30384 .mr(2)
30385 .nr(4)
30386 .kr(1)
30387 .sr(1)
30388 .m(2)
30389 .n(4)
30390 .k(1)
30391 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080030392 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030393}
30394
30395TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_LRINTF, qmax) {
30396 GemmMicrokernelTester()
30397 .mr(2)
30398 .nr(4)
30399 .kr(1)
30400 .sr(1)
30401 .m(2)
30402 .n(4)
30403 .k(1)
30404 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080030405 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030406}
30407
30408TEST(QC8_GEMM_MINMAX_FP32_2X4__SCALAR_LRINTF, strided_cm) {
30409 GemmMicrokernelTester()
30410 .mr(2)
30411 .nr(4)
30412 .kr(1)
30413 .sr(1)
30414 .m(2)
30415 .n(4)
30416 .k(1)
30417 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080030418 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030419}
30420
30421
30422TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_LRINTF, k_eq_1) {
30423 GemmMicrokernelTester()
30424 .mr(3)
30425 .nr(4)
30426 .kr(1)
30427 .sr(1)
30428 .m(3)
30429 .n(4)
30430 .k(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080030431 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030432}
30433
30434TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_LRINTF, strided_cn) {
30435 GemmMicrokernelTester()
30436 .mr(3)
30437 .nr(4)
30438 .kr(1)
30439 .sr(1)
30440 .m(3)
30441 .n(4)
30442 .k(1)
30443 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080030444 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030445}
30446
30447TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_LRINTF, k_eq_1_strided_a) {
30448 GemmMicrokernelTester()
30449 .mr(3)
30450 .nr(4)
30451 .kr(1)
30452 .sr(1)
30453 .m(3)
30454 .n(4)
30455 .k(1)
30456 .a_stride(3)
Marat Dukhan50323b82022-01-11 00:12:01 -080030457 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030458}
30459
30460TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_LRINTF, k_eq_1_subtile) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080030461 for (uint32_t n = 1; n <= 4; n++) {
30462 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030463 GemmMicrokernelTester()
30464 .mr(3)
30465 .nr(4)
30466 .kr(1)
30467 .sr(1)
30468 .m(m)
30469 .n(n)
30470 .k(1)
30471 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080030472 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030473 }
30474 }
30475}
30476
30477TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_LRINTF, k_eq_1_subtile_m) {
30478 for (uint32_t m = 1; m <= 3; m++) {
30479 GemmMicrokernelTester()
30480 .mr(3)
30481 .nr(4)
30482 .kr(1)
30483 .sr(1)
30484 .m(m)
30485 .n(4)
30486 .k(1)
30487 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080030488 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030489 }
30490}
30491
30492TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_LRINTF, k_eq_1_subtile_n) {
30493 for (uint32_t n = 1; n <= 4; n++) {
30494 GemmMicrokernelTester()
30495 .mr(3)
30496 .nr(4)
30497 .kr(1)
30498 .sr(1)
30499 .m(3)
30500 .n(n)
30501 .k(1)
30502 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080030503 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030504 }
30505}
30506
30507TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_LRINTF, k_gt_1) {
30508 for (size_t k = 2; k < 10; k++) {
30509 GemmMicrokernelTester()
30510 .mr(3)
30511 .nr(4)
30512 .kr(1)
30513 .sr(1)
30514 .m(3)
30515 .n(4)
30516 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080030517 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030518 }
30519}
30520
30521TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_LRINTF, k_gt_1_strided_a) {
30522 for (size_t k = 2; k < 10; k++) {
30523 GemmMicrokernelTester()
30524 .mr(3)
30525 .nr(4)
30526 .kr(1)
30527 .sr(1)
30528 .m(3)
30529 .n(4)
30530 .k(k)
30531 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080030532 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030533 }
30534}
30535
30536TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_LRINTF, k_gt_1_subtile) {
30537 for (size_t k = 2; k < 10; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080030538 for (uint32_t n = 1; n <= 4; n++) {
30539 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030540 GemmMicrokernelTester()
30541 .mr(3)
30542 .nr(4)
30543 .kr(1)
30544 .sr(1)
30545 .m(m)
30546 .n(n)
30547 .k(k)
30548 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080030549 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030550 }
30551 }
30552 }
30553}
30554
30555TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_LRINTF, n_gt_4) {
30556 for (uint32_t n = 5; n < 8; n++) {
30557 for (size_t k = 1; k <= 5; k += 2) {
30558 GemmMicrokernelTester()
30559 .mr(3)
30560 .nr(4)
30561 .kr(1)
30562 .sr(1)
30563 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080030564 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030565 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080030566 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030567 }
30568 }
30569}
30570
30571TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_LRINTF, n_gt_4_strided_cn) {
30572 for (uint32_t n = 5; n < 8; n++) {
30573 for (size_t k = 1; k <= 5; k += 2) {
30574 GemmMicrokernelTester()
30575 .mr(3)
30576 .nr(4)
30577 .kr(1)
30578 .sr(1)
30579 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080030580 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030581 .k(k)
30582 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080030583 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030584 }
30585 }
30586}
30587
30588TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_LRINTF, n_gt_4_strided_a) {
30589 for (uint32_t n = 5; n < 8; n++) {
30590 for (size_t k = 1; k <= 5; k += 2) {
30591 GemmMicrokernelTester()
30592 .mr(3)
30593 .nr(4)
30594 .kr(1)
30595 .sr(1)
30596 .m(3)
30597 .n(n)
30598 .k(k)
30599 .a_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080030600 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030601 }
30602 }
30603}
30604
30605TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_LRINTF, n_gt_4_subtile) {
30606 for (uint32_t n = 5; n < 8; n++) {
30607 for (size_t k = 1; k <= 5; k += 2) {
30608 for (uint32_t m = 1; m <= 3; m++) {
30609 GemmMicrokernelTester()
30610 .mr(3)
30611 .nr(4)
30612 .kr(1)
30613 .sr(1)
30614 .m(m)
30615 .n(n)
30616 .k(k)
30617 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080030618 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030619 }
30620 }
30621 }
30622}
30623
30624TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_LRINTF, n_div_4) {
30625 for (uint32_t n = 8; n <= 12; n += 4) {
30626 for (size_t k = 1; k <= 5; k += 2) {
30627 GemmMicrokernelTester()
30628 .mr(3)
30629 .nr(4)
30630 .kr(1)
30631 .sr(1)
30632 .m(3)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080030633 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030634 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080030635 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030636 }
30637 }
30638}
30639
30640TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_LRINTF, n_div_4_strided_cn) {
30641 for (uint32_t n = 8; n <= 12; n += 4) {
30642 for (size_t k = 1; k <= 5; k += 2) {
30643 GemmMicrokernelTester()
30644 .mr(3)
30645 .nr(4)
30646 .kr(1)
30647 .sr(1)
30648 .m(3)
30649 .n(n)
30650 .k(k)
30651 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080030652 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030653 }
30654 }
30655}
30656
30657TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_LRINTF, n_div_4_strided_a) {
30658 for (uint32_t n = 8; n <= 12; n += 4) {
30659 for (size_t k = 1; k <= 5; k += 2) {
30660 GemmMicrokernelTester()
30661 .mr(3)
30662 .nr(4)
30663 .kr(1)
30664 .sr(1)
30665 .m(3)
30666 .n(n)
30667 .k(k)
30668 .a_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080030669 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030670 }
30671 }
30672}
30673
30674TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_LRINTF, n_div_4_subtile) {
30675 for (uint32_t n = 8; n <= 12; n += 4) {
30676 for (size_t k = 1; k <= 5; k += 2) {
30677 for (uint32_t m = 1; m <= 3; m++) {
30678 GemmMicrokernelTester()
30679 .mr(3)
30680 .nr(4)
30681 .kr(1)
30682 .sr(1)
30683 .m(m)
30684 .n(n)
30685 .k(k)
30686 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080030687 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030688 }
30689 }
30690 }
30691}
30692
30693TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_LRINTF, strided_cm_subtile) {
30694 for (size_t k = 1; k <= 5; k += 2) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080030695 for (uint32_t n = 1; n <= 4; n++) {
30696 for (uint32_t m = 1; m <= 3; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030697 GemmMicrokernelTester()
30698 .mr(3)
30699 .nr(4)
30700 .kr(1)
30701 .sr(1)
30702 .m(m)
30703 .n(n)
30704 .k(k)
30705 .cm_stride(7)
30706 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080030707 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030708 }
30709 }
30710 }
30711}
30712
30713TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_LRINTF, qmin) {
30714 GemmMicrokernelTester()
30715 .mr(3)
30716 .nr(4)
30717 .kr(1)
30718 .sr(1)
30719 .m(3)
30720 .n(4)
30721 .k(1)
30722 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080030723 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030724}
30725
30726TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_LRINTF, qmax) {
30727 GemmMicrokernelTester()
30728 .mr(3)
30729 .nr(4)
30730 .kr(1)
30731 .sr(1)
30732 .m(3)
30733 .n(4)
30734 .k(1)
30735 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080030736 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030737}
30738
30739TEST(QC8_GEMM_MINMAX_FP32_3X4__SCALAR_LRINTF, strided_cm) {
30740 GemmMicrokernelTester()
30741 .mr(3)
30742 .nr(4)
30743 .kr(1)
30744 .sr(1)
30745 .m(3)
30746 .n(4)
30747 .k(1)
30748 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080030749 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030750}
30751
30752
30753TEST(QC8_GEMM_MINMAX_FP32_4X4__SCALAR_LRINTF, k_eq_1) {
30754 GemmMicrokernelTester()
30755 .mr(4)
30756 .nr(4)
30757 .kr(1)
30758 .sr(1)
30759 .m(4)
30760 .n(4)
30761 .k(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080030762 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030763}
30764
30765TEST(QC8_GEMM_MINMAX_FP32_4X4__SCALAR_LRINTF, strided_cn) {
30766 GemmMicrokernelTester()
30767 .mr(4)
30768 .nr(4)
30769 .kr(1)
30770 .sr(1)
30771 .m(4)
30772 .n(4)
30773 .k(1)
30774 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080030775 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030776}
30777
30778TEST(QC8_GEMM_MINMAX_FP32_4X4__SCALAR_LRINTF, k_eq_1_strided_a) {
30779 GemmMicrokernelTester()
30780 .mr(4)
30781 .nr(4)
30782 .kr(1)
30783 .sr(1)
30784 .m(4)
30785 .n(4)
30786 .k(1)
30787 .a_stride(3)
Marat Dukhan50323b82022-01-11 00:12:01 -080030788 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030789}
30790
30791TEST(QC8_GEMM_MINMAX_FP32_4X4__SCALAR_LRINTF, k_eq_1_subtile) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080030792 for (uint32_t n = 1; n <= 4; n++) {
30793 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030794 GemmMicrokernelTester()
30795 .mr(4)
30796 .nr(4)
30797 .kr(1)
30798 .sr(1)
30799 .m(m)
30800 .n(n)
30801 .k(1)
30802 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080030803 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030804 }
30805 }
30806}
30807
30808TEST(QC8_GEMM_MINMAX_FP32_4X4__SCALAR_LRINTF, k_eq_1_subtile_m) {
30809 for (uint32_t m = 1; m <= 4; m++) {
30810 GemmMicrokernelTester()
30811 .mr(4)
30812 .nr(4)
30813 .kr(1)
30814 .sr(1)
30815 .m(m)
30816 .n(4)
30817 .k(1)
30818 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080030819 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030820 }
30821}
30822
30823TEST(QC8_GEMM_MINMAX_FP32_4X4__SCALAR_LRINTF, k_eq_1_subtile_n) {
30824 for (uint32_t n = 1; n <= 4; n++) {
30825 GemmMicrokernelTester()
30826 .mr(4)
30827 .nr(4)
30828 .kr(1)
30829 .sr(1)
30830 .m(4)
30831 .n(n)
30832 .k(1)
30833 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080030834 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030835 }
30836}
30837
30838TEST(QC8_GEMM_MINMAX_FP32_4X4__SCALAR_LRINTF, k_gt_1) {
30839 for (size_t k = 2; k < 10; k++) {
30840 GemmMicrokernelTester()
30841 .mr(4)
30842 .nr(4)
30843 .kr(1)
30844 .sr(1)
30845 .m(4)
30846 .n(4)
30847 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080030848 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030849 }
30850}
30851
30852TEST(QC8_GEMM_MINMAX_FP32_4X4__SCALAR_LRINTF, k_gt_1_strided_a) {
30853 for (size_t k = 2; k < 10; k++) {
30854 GemmMicrokernelTester()
30855 .mr(4)
30856 .nr(4)
30857 .kr(1)
30858 .sr(1)
30859 .m(4)
30860 .n(4)
30861 .k(k)
30862 .a_stride(11)
Marat Dukhan50323b82022-01-11 00:12:01 -080030863 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030864 }
30865}
30866
30867TEST(QC8_GEMM_MINMAX_FP32_4X4__SCALAR_LRINTF, k_gt_1_subtile) {
30868 for (size_t k = 2; k < 10; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080030869 for (uint32_t n = 1; n <= 4; n++) {
30870 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030871 GemmMicrokernelTester()
30872 .mr(4)
30873 .nr(4)
30874 .kr(1)
30875 .sr(1)
30876 .m(m)
30877 .n(n)
30878 .k(k)
30879 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080030880 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030881 }
30882 }
30883 }
30884}
30885
30886TEST(QC8_GEMM_MINMAX_FP32_4X4__SCALAR_LRINTF, n_gt_4) {
30887 for (uint32_t n = 5; n < 8; n++) {
30888 for (size_t k = 1; k <= 5; k += 2) {
30889 GemmMicrokernelTester()
30890 .mr(4)
30891 .nr(4)
30892 .kr(1)
30893 .sr(1)
30894 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080030895 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030896 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080030897 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030898 }
30899 }
30900}
30901
30902TEST(QC8_GEMM_MINMAX_FP32_4X4__SCALAR_LRINTF, n_gt_4_strided_cn) {
30903 for (uint32_t n = 5; n < 8; n++) {
30904 for (size_t k = 1; k <= 5; k += 2) {
30905 GemmMicrokernelTester()
30906 .mr(4)
30907 .nr(4)
30908 .kr(1)
30909 .sr(1)
30910 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080030911 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030912 .k(k)
30913 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080030914 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030915 }
30916 }
30917}
30918
30919TEST(QC8_GEMM_MINMAX_FP32_4X4__SCALAR_LRINTF, n_gt_4_strided_a) {
30920 for (uint32_t n = 5; n < 8; n++) {
30921 for (size_t k = 1; k <= 5; k += 2) {
30922 GemmMicrokernelTester()
30923 .mr(4)
30924 .nr(4)
30925 .kr(1)
30926 .sr(1)
30927 .m(4)
30928 .n(n)
30929 .k(k)
30930 .a_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080030931 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030932 }
30933 }
30934}
30935
30936TEST(QC8_GEMM_MINMAX_FP32_4X4__SCALAR_LRINTF, n_gt_4_subtile) {
30937 for (uint32_t n = 5; n < 8; n++) {
30938 for (size_t k = 1; k <= 5; k += 2) {
30939 for (uint32_t m = 1; m <= 4; m++) {
30940 GemmMicrokernelTester()
30941 .mr(4)
30942 .nr(4)
30943 .kr(1)
30944 .sr(1)
30945 .m(m)
30946 .n(n)
30947 .k(k)
30948 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080030949 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030950 }
30951 }
30952 }
30953}
30954
30955TEST(QC8_GEMM_MINMAX_FP32_4X4__SCALAR_LRINTF, n_div_4) {
30956 for (uint32_t n = 8; n <= 12; n += 4) {
30957 for (size_t k = 1; k <= 5; k += 2) {
30958 GemmMicrokernelTester()
30959 .mr(4)
30960 .nr(4)
30961 .kr(1)
30962 .sr(1)
30963 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080030964 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030965 .k(k)
Marat Dukhan50323b82022-01-11 00:12:01 -080030966 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030967 }
30968 }
30969}
30970
30971TEST(QC8_GEMM_MINMAX_FP32_4X4__SCALAR_LRINTF, n_div_4_strided_cn) {
30972 for (uint32_t n = 8; n <= 12; n += 4) {
30973 for (size_t k = 1; k <= 5; k += 2) {
30974 GemmMicrokernelTester()
30975 .mr(4)
30976 .nr(4)
30977 .kr(1)
30978 .sr(1)
30979 .m(4)
30980 .n(n)
30981 .k(k)
30982 .cn_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080030983 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080030984 }
30985 }
30986}
30987
30988TEST(QC8_GEMM_MINMAX_FP32_4X4__SCALAR_LRINTF, n_div_4_strided_a) {
30989 for (uint32_t n = 8; n <= 12; n += 4) {
30990 for (size_t k = 1; k <= 5; k += 2) {
30991 GemmMicrokernelTester()
30992 .mr(4)
30993 .nr(4)
30994 .kr(1)
30995 .sr(1)
30996 .m(4)
30997 .n(n)
30998 .k(k)
30999 .a_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080031000 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031001 }
31002 }
31003}
31004
31005TEST(QC8_GEMM_MINMAX_FP32_4X4__SCALAR_LRINTF, n_div_4_subtile) {
31006 for (uint32_t n = 8; n <= 12; n += 4) {
31007 for (size_t k = 1; k <= 5; k += 2) {
31008 for (uint32_t m = 1; m <= 4; m++) {
31009 GemmMicrokernelTester()
31010 .mr(4)
31011 .nr(4)
31012 .kr(1)
31013 .sr(1)
31014 .m(m)
31015 .n(n)
31016 .k(k)
31017 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080031018 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031019 }
31020 }
31021 }
31022}
31023
31024TEST(QC8_GEMM_MINMAX_FP32_4X4__SCALAR_LRINTF, strided_cm_subtile) {
31025 for (size_t k = 1; k <= 5; k += 2) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080031026 for (uint32_t n = 1; n <= 4; n++) {
31027 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031028 GemmMicrokernelTester()
31029 .mr(4)
31030 .nr(4)
31031 .kr(1)
31032 .sr(1)
31033 .m(m)
31034 .n(n)
31035 .k(k)
31036 .cm_stride(7)
31037 .iterations(1)
Marat Dukhan50323b82022-01-11 00:12:01 -080031038 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031039 }
31040 }
31041 }
31042}
31043
31044TEST(QC8_GEMM_MINMAX_FP32_4X4__SCALAR_LRINTF, qmin) {
31045 GemmMicrokernelTester()
31046 .mr(4)
31047 .nr(4)
31048 .kr(1)
31049 .sr(1)
31050 .m(4)
31051 .n(4)
31052 .k(1)
31053 .qmin(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080031054 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031055}
31056
31057TEST(QC8_GEMM_MINMAX_FP32_4X4__SCALAR_LRINTF, qmax) {
31058 GemmMicrokernelTester()
31059 .mr(4)
31060 .nr(4)
31061 .kr(1)
31062 .sr(1)
31063 .m(4)
31064 .n(4)
31065 .k(1)
31066 .qmax(128)
Marat Dukhan50323b82022-01-11 00:12:01 -080031067 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031068}
31069
31070TEST(QC8_GEMM_MINMAX_FP32_4X4__SCALAR_LRINTF, strided_cm) {
31071 GemmMicrokernelTester()
31072 .mr(4)
31073 .nr(4)
31074 .kr(1)
31075 .sr(1)
31076 .m(4)
31077 .n(4)
31078 .k(1)
31079 .cm_stride(7)
Marat Dukhan50323b82022-01-11 00:12:01 -080031080 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4__scalar_lrintf, xnn_init_qs8_minmax_scalar_lrintf_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031081}
31082
31083
31084#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY && XNN_PLATFORM_JIT
31085 TEST(GENERATE_QC8_GEMM_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_LD64, k_eq_8) {
31086 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031087 GemmMicrokernelTester()
31088 .mr(4)
31089 .nr(8)
31090 .kr(1)
31091 .sr(1)
31092 .m(4)
31093 .n(8)
31094 .k(8)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080031095 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031096 }
31097
31098 TEST(GENERATE_QC8_GEMM_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_LD64, strided_cn) {
31099 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031100 GemmMicrokernelTester()
31101 .mr(4)
31102 .nr(8)
31103 .kr(1)
31104 .sr(1)
31105 .m(4)
31106 .n(8)
31107 .k(8)
31108 .cn_stride(11)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080031109 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031110 }
31111
31112 TEST(GENERATE_QC8_GEMM_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_LD64, k_eq_8_strided_a) {
31113 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031114 GemmMicrokernelTester()
31115 .mr(4)
31116 .nr(8)
31117 .kr(1)
31118 .sr(1)
31119 .m(4)
31120 .n(8)
31121 .k(8)
31122 .a_stride(11)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080031123 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031124 }
31125
31126 TEST(GENERATE_QC8_GEMM_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_LD64, k_eq_8_subtile) {
31127 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng83844ae2022-01-14 09:52:25 -080031128 for (uint32_t n = 1; n <= 8; n++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080031129 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031130 GemmMicrokernelTester()
31131 .mr(4)
31132 .nr(8)
31133 .kr(1)
31134 .sr(1)
31135 .m(m)
31136 .n(n)
31137 .k(8)
31138 .iterations(1)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080031139 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031140 }
31141 }
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031142 }
31143
31144 TEST(GENERATE_QC8_GEMM_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_LD64, k_eq_8_subtile_m) {
31145 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031146 for (uint32_t m = 1; m <= 4; m++) {
31147 GemmMicrokernelTester()
31148 .mr(4)
31149 .nr(8)
31150 .kr(1)
31151 .sr(1)
31152 .m(m)
31153 .n(8)
31154 .k(8)
31155 .iterations(1)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080031156 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031157 }
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031158 }
31159
31160 TEST(GENERATE_QC8_GEMM_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_LD64, k_eq_8_subtile_n) {
31161 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031162 for (uint32_t n = 1; n <= 8; n++) {
31163 GemmMicrokernelTester()
31164 .mr(4)
31165 .nr(8)
31166 .kr(1)
31167 .sr(1)
31168 .m(4)
31169 .n(n)
31170 .k(8)
31171 .iterations(1)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080031172 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031173 }
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031174 }
31175
31176 TEST(GENERATE_QC8_GEMM_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_LD64, k_lt_8) {
31177 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031178 for (size_t k = 1; k < 8; k++) {
31179 GemmMicrokernelTester()
31180 .mr(4)
31181 .nr(8)
31182 .kr(1)
31183 .sr(1)
31184 .m(4)
31185 .n(8)
31186 .k(k)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080031187 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031188 }
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031189 }
31190
31191 TEST(GENERATE_QC8_GEMM_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_LD64, k_lt_8_strided_a) {
31192 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031193 for (size_t k = 1; k < 8; k++) {
31194 GemmMicrokernelTester()
31195 .mr(4)
31196 .nr(8)
31197 .kr(1)
31198 .sr(1)
31199 .m(4)
31200 .n(8)
31201 .k(k)
31202 .a_stride(11)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080031203 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031204 }
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031205 }
31206
31207 TEST(GENERATE_QC8_GEMM_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_LD64, k_lt_8_subtile) {
31208 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031209 for (size_t k = 1; k < 8; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080031210 for (uint32_t n = 1; n <= 8; n++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080031211 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031212 GemmMicrokernelTester()
31213 .mr(4)
31214 .nr(8)
31215 .kr(1)
31216 .sr(1)
31217 .m(m)
31218 .n(n)
31219 .k(k)
31220 .iterations(1)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080031221 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031222 }
31223 }
31224 }
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031225 }
31226
31227 TEST(GENERATE_QC8_GEMM_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_LD64, k_gt_8) {
31228 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031229 for (size_t k = 9; k < 16; k++) {
31230 GemmMicrokernelTester()
31231 .mr(4)
31232 .nr(8)
31233 .kr(1)
31234 .sr(1)
31235 .m(4)
31236 .n(8)
31237 .k(k)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080031238 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031239 }
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031240 }
31241
31242 TEST(GENERATE_QC8_GEMM_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_LD64, k_gt_8_strided_a) {
31243 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031244 for (size_t k = 9; k < 16; k++) {
31245 GemmMicrokernelTester()
31246 .mr(4)
31247 .nr(8)
31248 .kr(1)
31249 .sr(1)
31250 .m(4)
31251 .n(8)
31252 .k(k)
31253 .a_stride(19)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080031254 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031255 }
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031256 }
31257
31258 TEST(GENERATE_QC8_GEMM_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_LD64, k_gt_8_subtile) {
31259 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031260 for (size_t k = 9; k < 16; k++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080031261 for (uint32_t n = 1; n <= 8; n++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080031262 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031263 GemmMicrokernelTester()
31264 .mr(4)
31265 .nr(8)
31266 .kr(1)
31267 .sr(1)
31268 .m(m)
31269 .n(n)
31270 .k(k)
31271 .iterations(1)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080031272 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031273 }
31274 }
31275 }
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031276 }
31277
31278 TEST(GENERATE_QC8_GEMM_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_LD64, k_div_8) {
31279 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031280 for (size_t k = 16; k <= 80; k += 8) {
31281 GemmMicrokernelTester()
31282 .mr(4)
31283 .nr(8)
31284 .kr(1)
31285 .sr(1)
31286 .m(4)
31287 .n(8)
31288 .k(k)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080031289 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031290 }
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031291 }
31292
31293 TEST(GENERATE_QC8_GEMM_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_LD64, k_div_8_strided_a) {
31294 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031295 for (size_t k = 16; k <= 80; k += 8) {
31296 GemmMicrokernelTester()
31297 .mr(4)
31298 .nr(8)
31299 .kr(1)
31300 .sr(1)
31301 .m(4)
31302 .n(8)
31303 .k(k)
31304 .a_stride(83)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080031305 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031306 }
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031307 }
31308
31309 TEST(GENERATE_QC8_GEMM_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_LD64, k_div_8_subtile) {
31310 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031311 for (size_t k = 16; k <= 80; k += 8) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080031312 for (uint32_t n = 1; n <= 8; n++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080031313 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031314 GemmMicrokernelTester()
31315 .mr(4)
31316 .nr(8)
31317 .kr(1)
31318 .sr(1)
31319 .m(m)
31320 .n(n)
31321 .k(k)
31322 .iterations(1)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080031323 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031324 }
31325 }
31326 }
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031327 }
31328
31329 TEST(GENERATE_QC8_GEMM_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_LD64, n_gt_8) {
31330 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031331 for (uint32_t n = 9; n < 16; n++) {
31332 for (size_t k = 1; k <= 40; k += 9) {
31333 GemmMicrokernelTester()
31334 .mr(4)
31335 .nr(8)
31336 .kr(1)
31337 .sr(1)
31338 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080031339 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031340 .k(k)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080031341 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031342 }
31343 }
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031344 }
31345
31346 TEST(GENERATE_QC8_GEMM_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_LD64, n_gt_8_strided_cn) {
31347 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031348 for (uint32_t n = 9; n < 16; n++) {
31349 for (size_t k = 1; k <= 40; k += 9) {
31350 GemmMicrokernelTester()
31351 .mr(4)
31352 .nr(8)
31353 .kr(1)
31354 .sr(1)
31355 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080031356 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031357 .k(k)
31358 .cn_stride(11)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080031359 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031360 }
31361 }
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031362 }
31363
31364 TEST(GENERATE_QC8_GEMM_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_LD64, n_gt_8_strided_a) {
31365 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031366 for (uint32_t n = 9; n < 16; n++) {
31367 for (size_t k = 1; k <= 40; k += 9) {
31368 GemmMicrokernelTester()
31369 .mr(4)
31370 .nr(8)
31371 .kr(1)
31372 .sr(1)
31373 .m(4)
31374 .n(n)
31375 .k(k)
31376 .a_stride(43)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080031377 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031378 }
31379 }
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031380 }
31381
31382 TEST(GENERATE_QC8_GEMM_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_LD64, n_gt_8_subtile) {
31383 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031384 for (uint32_t n = 9; n < 16; n++) {
31385 for (size_t k = 1; k <= 40; k += 9) {
31386 for (uint32_t m = 1; m <= 4; m++) {
31387 GemmMicrokernelTester()
31388 .mr(4)
31389 .nr(8)
31390 .kr(1)
31391 .sr(1)
31392 .m(m)
31393 .n(n)
31394 .k(k)
31395 .iterations(1)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080031396 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031397 }
31398 }
31399 }
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031400 }
31401
31402 TEST(GENERATE_QC8_GEMM_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_LD64, n_div_8) {
31403 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031404 for (uint32_t n = 16; n <= 24; n += 8) {
31405 for (size_t k = 1; k <= 40; k += 9) {
31406 GemmMicrokernelTester()
31407 .mr(4)
31408 .nr(8)
31409 .kr(1)
31410 .sr(1)
31411 .m(4)
Zhi An Ngaf9ff852022-01-13 10:48:37 -080031412 .n(n)
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031413 .k(k)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080031414 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031415 }
31416 }
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031417 }
31418
31419 TEST(GENERATE_QC8_GEMM_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_LD64, n_div_8_strided_cn) {
31420 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031421 for (uint32_t n = 16; n <= 24; n += 8) {
31422 for (size_t k = 1; k <= 40; k += 9) {
31423 GemmMicrokernelTester()
31424 .mr(4)
31425 .nr(8)
31426 .kr(1)
31427 .sr(1)
31428 .m(4)
31429 .n(n)
31430 .k(k)
31431 .cn_stride(11)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080031432 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031433 }
31434 }
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031435 }
31436
31437 TEST(GENERATE_QC8_GEMM_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_LD64, n_div_8_strided_a) {
31438 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031439 for (uint32_t n = 16; n <= 24; n += 8) {
31440 for (size_t k = 1; k <= 40; k += 9) {
31441 GemmMicrokernelTester()
31442 .mr(4)
31443 .nr(8)
31444 .kr(1)
31445 .sr(1)
31446 .m(4)
31447 .n(n)
31448 .k(k)
31449 .a_stride(43)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080031450 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031451 }
31452 }
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031453 }
31454
31455 TEST(GENERATE_QC8_GEMM_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_LD64, n_div_8_subtile) {
31456 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031457 for (uint32_t n = 16; n <= 24; n += 8) {
31458 for (size_t k = 1; k <= 40; k += 9) {
31459 for (uint32_t m = 1; m <= 4; m++) {
31460 GemmMicrokernelTester()
31461 .mr(4)
31462 .nr(8)
31463 .kr(1)
31464 .sr(1)
31465 .m(m)
31466 .n(n)
31467 .k(k)
31468 .iterations(1)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080031469 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031470 }
31471 }
31472 }
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031473 }
31474
31475 TEST(GENERATE_QC8_GEMM_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_LD64, strided_cm_subtile) {
31476 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031477 for (size_t k = 1; k <= 40; k += 9) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080031478 for (uint32_t n = 1; n <= 8; n++) {
Zhi An Ng83844ae2022-01-14 09:52:25 -080031479 for (uint32_t m = 1; m <= 4; m++) {
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031480 GemmMicrokernelTester()
31481 .mr(4)
31482 .nr(8)
31483 .kr(1)
31484 .sr(1)
31485 .m(m)
31486 .n(n)
31487 .k(k)
31488 .cm_stride(11)
31489 .iterations(1)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080031490 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031491 }
31492 }
31493 }
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031494 }
31495
31496 TEST(GENERATE_QC8_GEMM_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_LD64, qmin) {
31497 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031498 GemmMicrokernelTester()
31499 .mr(4)
31500 .nr(8)
31501 .kr(1)
31502 .sr(1)
31503 .m(4)
31504 .n(8)
31505 .k(8)
31506 .qmin(128)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080031507 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031508 }
31509
31510 TEST(GENERATE_QC8_GEMM_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_LD64, qmax) {
31511 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031512 GemmMicrokernelTester()
31513 .mr(4)
31514 .nr(8)
31515 .kr(1)
31516 .sr(1)
31517 .m(4)
31518 .n(8)
31519 .k(8)
31520 .qmax(128)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080031521 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031522 }
31523
31524 TEST(GENERATE_QC8_GEMM_FP32_4X8__AARCH32_NEONV8_MLAL_LANE_LD64, strided_cm) {
31525 TEST_REQUIRES_ARM_NEON_V8;
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031526 GemmMicrokernelTester()
31527 .mr(4)
31528 .nr(8)
31529 .kr(1)
31530 .sr(1)
31531 .m(4)
31532 .n(8)
31533 .k(8)
31534 .cm_stride(11)
Zhi An Ng0ec25cf2022-01-19 11:38:55 -080031535 .Test(xnn_generate_qc8_gemm_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
Zhi An Ng4c1fd6f2022-01-10 19:35:06 -080031536 }
31537#endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY && XNN_PLATFORM_JIT