// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
//
// Auto-generated file. Do not edit!
//   Specification: test/qc8-gemm-minmax-fp32.yaml
//   Generator: tools/generate-gemm-test.py
12
13
14#include <gtest/gtest.h>
15
16#include <xnnpack/common.h>
17#include <xnnpack/isa-checks.h>
18
19#include <xnnpack/gemm.h>
20#include <xnnpack/igemm.h>
21#include <xnnpack/ppmm.h>
22#include "gemm-microkernel-tester.h"
23
24
25#if XNN_ARCH_X86 || XNN_ARCH_X86_64
Marat Dukhanfc188ed2021-06-03 12:21:22 -070026 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_eq_8) {
27 TEST_REQUIRES_X86_SSE2;
28 GemmMicrokernelTester()
29 .mr(1)
30 .nr(4)
31 .kr(2)
32 .sr(1)
33 .m(1)
34 .n(4)
35 .k(8)
36 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
37 }
38
39 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, strided_cn) {
40 TEST_REQUIRES_X86_SSE2;
41 GemmMicrokernelTester()
42 .mr(1)
43 .nr(4)
44 .kr(2)
45 .sr(1)
46 .m(1)
47 .n(4)
48 .k(8)
49 .cn_stride(7)
50 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
51 }
52
53 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_eq_8_strided_a) {
54 TEST_REQUIRES_X86_SSE2;
55 GemmMicrokernelTester()
56 .mr(1)
57 .nr(4)
58 .kr(2)
59 .sr(1)
60 .m(1)
61 .n(4)
62 .k(8)
63 .a_stride(11)
64 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
65 }
66
67 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_eq_8_subtile) {
68 TEST_REQUIRES_X86_SSE2;
69 for (uint32_t m = 1; m <= 1; m++) {
70 for (uint32_t n = 1; n <= 4; n++) {
71 GemmMicrokernelTester()
72 .mr(1)
73 .nr(4)
74 .kr(2)
75 .sr(1)
76 .m(m)
77 .n(n)
78 .k(8)
79 .iterations(1)
80 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
81 }
82 }
83 }
84
85 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_eq_8_subtile_m) {
86 TEST_REQUIRES_X86_SSE2;
87 for (uint32_t m = 1; m <= 1; m++) {
88 GemmMicrokernelTester()
89 .mr(1)
90 .nr(4)
91 .kr(2)
92 .sr(1)
93 .m(m)
94 .n(4)
95 .k(8)
96 .iterations(1)
97 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
98 }
99 }
100
101 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_eq_8_subtile_n) {
102 TEST_REQUIRES_X86_SSE2;
103 for (uint32_t n = 1; n <= 4; n++) {
104 GemmMicrokernelTester()
105 .mr(1)
106 .nr(4)
107 .kr(2)
108 .sr(1)
109 .m(1)
110 .n(n)
111 .k(8)
112 .iterations(1)
113 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
114 }
115 }
116
117 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_lt_8) {
118 TEST_REQUIRES_X86_SSE2;
119 for (size_t k = 1; k < 8; k++) {
120 GemmMicrokernelTester()
121 .mr(1)
122 .nr(4)
123 .kr(2)
124 .sr(1)
125 .m(1)
126 .n(4)
127 .k(k)
128 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
129 }
130 }
131
132 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_lt_8_strided_a) {
133 TEST_REQUIRES_X86_SSE2;
134 for (size_t k = 1; k < 8; k++) {
135 GemmMicrokernelTester()
136 .mr(1)
137 .nr(4)
138 .kr(2)
139 .sr(1)
140 .m(1)
141 .n(4)
142 .k(k)
143 .a_stride(11)
144 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
145 }
146 }
147
148 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_lt_8_subtile) {
149 TEST_REQUIRES_X86_SSE2;
150 for (size_t k = 1; k < 8; k++) {
151 for (uint32_t m = 1; m <= 1; m++) {
152 for (uint32_t n = 1; n <= 4; n++) {
153 GemmMicrokernelTester()
154 .mr(1)
155 .nr(4)
156 .kr(2)
157 .sr(1)
158 .m(m)
159 .n(n)
160 .k(k)
161 .iterations(1)
162 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
163 }
164 }
165 }
166 }
167
168 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_gt_8) {
169 TEST_REQUIRES_X86_SSE2;
170 for (size_t k = 9; k < 16; k++) {
171 GemmMicrokernelTester()
172 .mr(1)
173 .nr(4)
174 .kr(2)
175 .sr(1)
176 .m(1)
177 .n(4)
178 .k(k)
179 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
180 }
181 }
182
183 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_gt_8_strided_a) {
184 TEST_REQUIRES_X86_SSE2;
185 for (size_t k = 9; k < 16; k++) {
186 GemmMicrokernelTester()
187 .mr(1)
188 .nr(4)
189 .kr(2)
190 .sr(1)
191 .m(1)
192 .n(4)
193 .k(k)
194 .a_stride(19)
195 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
196 }
197 }
198
199 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_gt_8_subtile) {
200 TEST_REQUIRES_X86_SSE2;
201 for (size_t k = 9; k < 16; k++) {
202 for (uint32_t m = 1; m <= 1; m++) {
203 for (uint32_t n = 1; n <= 4; n++) {
204 GemmMicrokernelTester()
205 .mr(1)
206 .nr(4)
207 .kr(2)
208 .sr(1)
209 .m(m)
210 .n(n)
211 .k(k)
212 .iterations(1)
213 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
214 }
215 }
216 }
217 }
218
219 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_div_8) {
220 TEST_REQUIRES_X86_SSE2;
221 for (size_t k = 16; k <= 80; k += 8) {
222 GemmMicrokernelTester()
223 .mr(1)
224 .nr(4)
225 .kr(2)
226 .sr(1)
227 .m(1)
228 .n(4)
229 .k(k)
230 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
231 }
232 }
233
234 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_div_8_strided_a) {
235 TEST_REQUIRES_X86_SSE2;
236 for (size_t k = 16; k <= 80; k += 8) {
237 GemmMicrokernelTester()
238 .mr(1)
239 .nr(4)
240 .kr(2)
241 .sr(1)
242 .m(1)
243 .n(4)
244 .k(k)
245 .a_stride(83)
246 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
247 }
248 }
249
250 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_div_8_subtile) {
251 TEST_REQUIRES_X86_SSE2;
252 for (size_t k = 16; k <= 80; k += 8) {
253 for (uint32_t m = 1; m <= 1; m++) {
254 for (uint32_t n = 1; n <= 4; n++) {
255 GemmMicrokernelTester()
256 .mr(1)
257 .nr(4)
258 .kr(2)
259 .sr(1)
260 .m(m)
261 .n(n)
262 .k(k)
263 .iterations(1)
264 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
265 }
266 }
267 }
268 }
269
270 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, n_gt_4) {
271 TEST_REQUIRES_X86_SSE2;
272 for (uint32_t n = 5; n < 8; n++) {
273 for (size_t k = 1; k <= 40; k += 9) {
274 GemmMicrokernelTester()
275 .mr(1)
276 .nr(4)
277 .kr(2)
278 .sr(1)
279 .m(1)
280 .n(4)
281 .k(k)
282 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
283 }
284 }
285 }
286
287 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, n_gt_4_strided_cn) {
288 TEST_REQUIRES_X86_SSE2;
289 for (uint32_t n = 5; n < 8; n++) {
290 for (size_t k = 1; k <= 40; k += 9) {
291 GemmMicrokernelTester()
292 .mr(1)
293 .nr(4)
294 .kr(2)
295 .sr(1)
296 .m(1)
297 .n(4)
298 .k(k)
299 .cn_stride(7)
300 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
301 }
302 }
303 }
304
305 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, n_gt_4_strided_a) {
306 TEST_REQUIRES_X86_SSE2;
307 for (uint32_t n = 5; n < 8; n++) {
308 for (size_t k = 1; k <= 40; k += 9) {
309 GemmMicrokernelTester()
310 .mr(1)
311 .nr(4)
312 .kr(2)
313 .sr(1)
314 .m(1)
315 .n(n)
316 .k(k)
317 .a_stride(43)
318 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
319 }
320 }
321 }
322
323 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, n_gt_4_subtile) {
324 TEST_REQUIRES_X86_SSE2;
325 for (uint32_t n = 5; n < 8; n++) {
326 for (size_t k = 1; k <= 40; k += 9) {
327 for (uint32_t m = 1; m <= 1; m++) {
328 GemmMicrokernelTester()
329 .mr(1)
330 .nr(4)
331 .kr(2)
332 .sr(1)
333 .m(m)
334 .n(n)
335 .k(k)
336 .iterations(1)
337 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
338 }
339 }
340 }
341 }
342
343 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, n_div_4) {
344 TEST_REQUIRES_X86_SSE2;
345 for (uint32_t n = 8; n <= 12; n += 4) {
346 for (size_t k = 1; k <= 40; k += 9) {
347 GemmMicrokernelTester()
348 .mr(1)
349 .nr(4)
350 .kr(2)
351 .sr(1)
352 .m(1)
353 .n(4)
354 .k(k)
355 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
356 }
357 }
358 }
359
360 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, n_div_4_strided_cn) {
361 TEST_REQUIRES_X86_SSE2;
362 for (uint32_t n = 8; n <= 12; n += 4) {
363 for (size_t k = 1; k <= 40; k += 9) {
364 GemmMicrokernelTester()
365 .mr(1)
366 .nr(4)
367 .kr(2)
368 .sr(1)
369 .m(1)
370 .n(n)
371 .k(k)
372 .cn_stride(7)
373 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
374 }
375 }
376 }
377
378 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, n_div_4_strided_a) {
379 TEST_REQUIRES_X86_SSE2;
380 for (uint32_t n = 8; n <= 12; n += 4) {
381 for (size_t k = 1; k <= 40; k += 9) {
382 GemmMicrokernelTester()
383 .mr(1)
384 .nr(4)
385 .kr(2)
386 .sr(1)
387 .m(1)
388 .n(n)
389 .k(k)
390 .a_stride(43)
391 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
392 }
393 }
394 }
395
396 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, n_div_4_subtile) {
397 TEST_REQUIRES_X86_SSE2;
398 for (uint32_t n = 8; n <= 12; n += 4) {
399 for (size_t k = 1; k <= 40; k += 9) {
400 for (uint32_t m = 1; m <= 1; m++) {
401 GemmMicrokernelTester()
402 .mr(1)
403 .nr(4)
404 .kr(2)
405 .sr(1)
406 .m(m)
407 .n(n)
408 .k(k)
409 .iterations(1)
410 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
411 }
412 }
413 }
414 }
415
416 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, strided_cm_subtile) {
417 TEST_REQUIRES_X86_SSE2;
418 for (size_t k = 1; k <= 40; k += 9) {
419 for (uint32_t m = 1; m <= 1; m++) {
420 for (uint32_t n = 1; n <= 4; n++) {
421 GemmMicrokernelTester()
422 .mr(1)
423 .nr(4)
424 .kr(2)
425 .sr(1)
426 .m(m)
427 .n(n)
428 .k(k)
429 .cm_stride(7)
430 .iterations(1)
431 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
432 }
433 }
434 }
435 }
436
437 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, qmin) {
438 TEST_REQUIRES_X86_SSE2;
439 GemmMicrokernelTester()
440 .mr(1)
441 .nr(4)
442 .kr(2)
443 .sr(1)
444 .m(1)
445 .n(4)
446 .k(8)
447 .qmin(128)
448 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
449 }
450
451 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, qmax) {
452 TEST_REQUIRES_X86_SSE2;
453 GemmMicrokernelTester()
454 .mr(1)
455 .nr(4)
456 .kr(2)
457 .sr(1)
458 .m(1)
459 .n(4)
460 .k(8)
461 .qmax(128)
462 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
463 }
464
465 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, strided_cm) {
466 TEST_REQUIRES_X86_SSE2;
467 GemmMicrokernelTester()
468 .mr(1)
469 .nr(4)
470 .kr(2)
471 .sr(1)
472 .m(1)
473 .n(4)
474 .k(8)
475 .cm_stride(7)
476 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
477 }
478#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
479
480
481#if XNN_ARCH_X86 || XNN_ARCH_X86_64
482 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_eq_8) {
483 TEST_REQUIRES_X86_SSE2;
484 GemmMicrokernelTester()
485 .mr(2)
486 .nr(4)
487 .kr(2)
488 .sr(1)
489 .m(2)
490 .n(4)
491 .k(8)
492 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
493 }
494
495 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, strided_cn) {
496 TEST_REQUIRES_X86_SSE2;
497 GemmMicrokernelTester()
498 .mr(2)
499 .nr(4)
500 .kr(2)
501 .sr(1)
502 .m(2)
503 .n(4)
504 .k(8)
505 .cn_stride(7)
506 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
507 }
508
509 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_eq_8_strided_a) {
510 TEST_REQUIRES_X86_SSE2;
511 GemmMicrokernelTester()
512 .mr(2)
513 .nr(4)
514 .kr(2)
515 .sr(1)
516 .m(2)
517 .n(4)
518 .k(8)
519 .a_stride(11)
520 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
521 }
522
523 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_eq_8_subtile) {
524 TEST_REQUIRES_X86_SSE2;
525 for (uint32_t m = 1; m <= 2; m++) {
526 for (uint32_t n = 1; n <= 4; n++) {
527 GemmMicrokernelTester()
528 .mr(2)
529 .nr(4)
530 .kr(2)
531 .sr(1)
532 .m(m)
533 .n(n)
534 .k(8)
535 .iterations(1)
536 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
537 }
538 }
539 }
540
541 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_eq_8_subtile_m) {
542 TEST_REQUIRES_X86_SSE2;
543 for (uint32_t m = 1; m <= 2; m++) {
544 GemmMicrokernelTester()
545 .mr(2)
546 .nr(4)
547 .kr(2)
548 .sr(1)
549 .m(m)
550 .n(4)
551 .k(8)
552 .iterations(1)
553 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
554 }
555 }
556
557 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_eq_8_subtile_n) {
558 TEST_REQUIRES_X86_SSE2;
559 for (uint32_t n = 1; n <= 4; n++) {
560 GemmMicrokernelTester()
561 .mr(2)
562 .nr(4)
563 .kr(2)
564 .sr(1)
565 .m(2)
566 .n(n)
567 .k(8)
568 .iterations(1)
569 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
570 }
571 }
572
573 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_lt_8) {
574 TEST_REQUIRES_X86_SSE2;
575 for (size_t k = 1; k < 8; k++) {
576 GemmMicrokernelTester()
577 .mr(2)
578 .nr(4)
579 .kr(2)
580 .sr(1)
581 .m(2)
582 .n(4)
583 .k(k)
584 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
585 }
586 }
587
588 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_lt_8_strided_a) {
589 TEST_REQUIRES_X86_SSE2;
590 for (size_t k = 1; k < 8; k++) {
591 GemmMicrokernelTester()
592 .mr(2)
593 .nr(4)
594 .kr(2)
595 .sr(1)
596 .m(2)
597 .n(4)
598 .k(k)
599 .a_stride(11)
600 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
601 }
602 }
603
604 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_lt_8_subtile) {
605 TEST_REQUIRES_X86_SSE2;
606 for (size_t k = 1; k < 8; k++) {
607 for (uint32_t m = 1; m <= 2; m++) {
608 for (uint32_t n = 1; n <= 4; n++) {
609 GemmMicrokernelTester()
610 .mr(2)
611 .nr(4)
612 .kr(2)
613 .sr(1)
614 .m(m)
615 .n(n)
616 .k(k)
617 .iterations(1)
618 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
619 }
620 }
621 }
622 }
623
624 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_gt_8) {
625 TEST_REQUIRES_X86_SSE2;
626 for (size_t k = 9; k < 16; k++) {
627 GemmMicrokernelTester()
628 .mr(2)
629 .nr(4)
630 .kr(2)
631 .sr(1)
632 .m(2)
633 .n(4)
634 .k(k)
635 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
636 }
637 }
638
639 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_gt_8_strided_a) {
640 TEST_REQUIRES_X86_SSE2;
641 for (size_t k = 9; k < 16; k++) {
642 GemmMicrokernelTester()
643 .mr(2)
644 .nr(4)
645 .kr(2)
646 .sr(1)
647 .m(2)
648 .n(4)
649 .k(k)
650 .a_stride(19)
651 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
652 }
653 }
654
655 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_gt_8_subtile) {
656 TEST_REQUIRES_X86_SSE2;
657 for (size_t k = 9; k < 16; k++) {
658 for (uint32_t m = 1; m <= 2; m++) {
659 for (uint32_t n = 1; n <= 4; n++) {
660 GemmMicrokernelTester()
661 .mr(2)
662 .nr(4)
663 .kr(2)
664 .sr(1)
665 .m(m)
666 .n(n)
667 .k(k)
668 .iterations(1)
669 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
670 }
671 }
672 }
673 }
674
675 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_div_8) {
676 TEST_REQUIRES_X86_SSE2;
677 for (size_t k = 16; k <= 80; k += 8) {
678 GemmMicrokernelTester()
679 .mr(2)
680 .nr(4)
681 .kr(2)
682 .sr(1)
683 .m(2)
684 .n(4)
685 .k(k)
686 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
687 }
688 }
689
690 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_div_8_strided_a) {
691 TEST_REQUIRES_X86_SSE2;
692 for (size_t k = 16; k <= 80; k += 8) {
693 GemmMicrokernelTester()
694 .mr(2)
695 .nr(4)
696 .kr(2)
697 .sr(1)
698 .m(2)
699 .n(4)
700 .k(k)
701 .a_stride(83)
702 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
703 }
704 }
705
706 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_div_8_subtile) {
707 TEST_REQUIRES_X86_SSE2;
708 for (size_t k = 16; k <= 80; k += 8) {
709 for (uint32_t m = 1; m <= 2; m++) {
710 for (uint32_t n = 1; n <= 4; n++) {
711 GemmMicrokernelTester()
712 .mr(2)
713 .nr(4)
714 .kr(2)
715 .sr(1)
716 .m(m)
717 .n(n)
718 .k(k)
719 .iterations(1)
720 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
721 }
722 }
723 }
724 }
725
726 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, n_gt_4) {
727 TEST_REQUIRES_X86_SSE2;
728 for (uint32_t n = 5; n < 8; n++) {
729 for (size_t k = 1; k <= 40; k += 9) {
730 GemmMicrokernelTester()
731 .mr(2)
732 .nr(4)
733 .kr(2)
734 .sr(1)
735 .m(2)
736 .n(4)
737 .k(k)
738 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
739 }
740 }
741 }
742
743 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, n_gt_4_strided_cn) {
744 TEST_REQUIRES_X86_SSE2;
745 for (uint32_t n = 5; n < 8; n++) {
746 for (size_t k = 1; k <= 40; k += 9) {
747 GemmMicrokernelTester()
748 .mr(2)
749 .nr(4)
750 .kr(2)
751 .sr(1)
752 .m(2)
753 .n(4)
754 .k(k)
755 .cn_stride(7)
756 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
757 }
758 }
759 }
760
761 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, n_gt_4_strided_a) {
762 TEST_REQUIRES_X86_SSE2;
763 for (uint32_t n = 5; n < 8; n++) {
764 for (size_t k = 1; k <= 40; k += 9) {
765 GemmMicrokernelTester()
766 .mr(2)
767 .nr(4)
768 .kr(2)
769 .sr(1)
770 .m(2)
771 .n(n)
772 .k(k)
773 .a_stride(43)
774 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
775 }
776 }
777 }
778
779 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, n_gt_4_subtile) {
780 TEST_REQUIRES_X86_SSE2;
781 for (uint32_t n = 5; n < 8; n++) {
782 for (size_t k = 1; k <= 40; k += 9) {
783 for (uint32_t m = 1; m <= 2; m++) {
784 GemmMicrokernelTester()
785 .mr(2)
786 .nr(4)
787 .kr(2)
788 .sr(1)
789 .m(m)
790 .n(n)
791 .k(k)
792 .iterations(1)
793 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
794 }
795 }
796 }
797 }
798
799 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, n_div_4) {
800 TEST_REQUIRES_X86_SSE2;
801 for (uint32_t n = 8; n <= 12; n += 4) {
802 for (size_t k = 1; k <= 40; k += 9) {
803 GemmMicrokernelTester()
804 .mr(2)
805 .nr(4)
806 .kr(2)
807 .sr(1)
808 .m(2)
809 .n(4)
810 .k(k)
811 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
812 }
813 }
814 }
815
816 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, n_div_4_strided_cn) {
817 TEST_REQUIRES_X86_SSE2;
818 for (uint32_t n = 8; n <= 12; n += 4) {
819 for (size_t k = 1; k <= 40; k += 9) {
820 GemmMicrokernelTester()
821 .mr(2)
822 .nr(4)
823 .kr(2)
824 .sr(1)
825 .m(2)
826 .n(n)
827 .k(k)
828 .cn_stride(7)
829 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
830 }
831 }
832 }
833
834 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, n_div_4_strided_a) {
835 TEST_REQUIRES_X86_SSE2;
836 for (uint32_t n = 8; n <= 12; n += 4) {
837 for (size_t k = 1; k <= 40; k += 9) {
838 GemmMicrokernelTester()
839 .mr(2)
840 .nr(4)
841 .kr(2)
842 .sr(1)
843 .m(2)
844 .n(n)
845 .k(k)
846 .a_stride(43)
847 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
848 }
849 }
850 }
851
852 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, n_div_4_subtile) {
853 TEST_REQUIRES_X86_SSE2;
854 for (uint32_t n = 8; n <= 12; n += 4) {
855 for (size_t k = 1; k <= 40; k += 9) {
856 for (uint32_t m = 1; m <= 2; m++) {
857 GemmMicrokernelTester()
858 .mr(2)
859 .nr(4)
860 .kr(2)
861 .sr(1)
862 .m(m)
863 .n(n)
864 .k(k)
865 .iterations(1)
866 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
867 }
868 }
869 }
870 }
871
872 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, strided_cm_subtile) {
873 TEST_REQUIRES_X86_SSE2;
874 for (size_t k = 1; k <= 40; k += 9) {
875 for (uint32_t m = 1; m <= 2; m++) {
876 for (uint32_t n = 1; n <= 4; n++) {
877 GemmMicrokernelTester()
878 .mr(2)
879 .nr(4)
880 .kr(2)
881 .sr(1)
882 .m(m)
883 .n(n)
884 .k(k)
885 .cm_stride(7)
886 .iterations(1)
887 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
888 }
889 }
890 }
891 }
892
893 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, qmin) {
894 TEST_REQUIRES_X86_SSE2;
895 GemmMicrokernelTester()
896 .mr(2)
897 .nr(4)
898 .kr(2)
899 .sr(1)
900 .m(2)
901 .n(4)
902 .k(8)
903 .qmin(128)
904 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
905 }
906
907 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, qmax) {
908 TEST_REQUIRES_X86_SSE2;
909 GemmMicrokernelTester()
910 .mr(2)
911 .nr(4)
912 .kr(2)
913 .sr(1)
914 .m(2)
915 .n(4)
916 .k(8)
917 .qmax(128)
918 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
919 }
920
921 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, strided_cm) {
922 TEST_REQUIRES_X86_SSE2;
923 GemmMicrokernelTester()
924 .mr(2)
925 .nr(4)
926 .kr(2)
927 .sr(1)
928 .m(2)
929 .n(4)
930 .k(8)
931 .cm_stride(7)
932 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
933 }
934#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
935
936
937#if XNN_ARCH_X86 || XNN_ARCH_X86_64
938 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_eq_8) {
939 TEST_REQUIRES_X86_SSE2;
940 GemmMicrokernelTester()
941 .mr(3)
942 .nr(4)
943 .kr(2)
944 .sr(1)
945 .m(3)
946 .n(4)
947 .k(8)
948 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
949 }
950
951 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, strided_cn) {
952 TEST_REQUIRES_X86_SSE2;
953 GemmMicrokernelTester()
954 .mr(3)
955 .nr(4)
956 .kr(2)
957 .sr(1)
958 .m(3)
959 .n(4)
960 .k(8)
961 .cn_stride(7)
962 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
963 }
964
965 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_eq_8_strided_a) {
966 TEST_REQUIRES_X86_SSE2;
967 GemmMicrokernelTester()
968 .mr(3)
969 .nr(4)
970 .kr(2)
971 .sr(1)
972 .m(3)
973 .n(4)
974 .k(8)
975 .a_stride(11)
976 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
977 }
978
979 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_eq_8_subtile) {
980 TEST_REQUIRES_X86_SSE2;
981 for (uint32_t m = 1; m <= 3; m++) {
982 for (uint32_t n = 1; n <= 4; n++) {
983 GemmMicrokernelTester()
984 .mr(3)
985 .nr(4)
986 .kr(2)
987 .sr(1)
988 .m(m)
989 .n(n)
990 .k(8)
991 .iterations(1)
992 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
993 }
994 }
995 }
996
997 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_eq_8_subtile_m) {
998 TEST_REQUIRES_X86_SSE2;
999 for (uint32_t m = 1; m <= 3; m++) {
1000 GemmMicrokernelTester()
1001 .mr(3)
1002 .nr(4)
1003 .kr(2)
1004 .sr(1)
1005 .m(m)
1006 .n(4)
1007 .k(8)
1008 .iterations(1)
1009 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
1010 }
1011 }
1012
1013 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_eq_8_subtile_n) {
1014 TEST_REQUIRES_X86_SSE2;
1015 for (uint32_t n = 1; n <= 4; n++) {
1016 GemmMicrokernelTester()
1017 .mr(3)
1018 .nr(4)
1019 .kr(2)
1020 .sr(1)
1021 .m(3)
1022 .n(n)
1023 .k(8)
1024 .iterations(1)
1025 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
1026 }
1027 }
1028
1029 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_lt_8) {
1030 TEST_REQUIRES_X86_SSE2;
1031 for (size_t k = 1; k < 8; k++) {
1032 GemmMicrokernelTester()
1033 .mr(3)
1034 .nr(4)
1035 .kr(2)
1036 .sr(1)
1037 .m(3)
1038 .n(4)
1039 .k(k)
1040 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
1041 }
1042 }
1043
1044 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_lt_8_strided_a) {
1045 TEST_REQUIRES_X86_SSE2;
1046 for (size_t k = 1; k < 8; k++) {
1047 GemmMicrokernelTester()
1048 .mr(3)
1049 .nr(4)
1050 .kr(2)
1051 .sr(1)
1052 .m(3)
1053 .n(4)
1054 .k(k)
1055 .a_stride(11)
1056 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
1057 }
1058 }
1059
1060 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_lt_8_subtile) {
1061 TEST_REQUIRES_X86_SSE2;
1062 for (size_t k = 1; k < 8; k++) {
1063 for (uint32_t m = 1; m <= 3; m++) {
1064 for (uint32_t n = 1; n <= 4; n++) {
1065 GemmMicrokernelTester()
1066 .mr(3)
1067 .nr(4)
1068 .kr(2)
1069 .sr(1)
1070 .m(m)
1071 .n(n)
1072 .k(k)
1073 .iterations(1)
1074 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
1075 }
1076 }
1077 }
1078 }
1079
1080 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_gt_8) {
1081 TEST_REQUIRES_X86_SSE2;
1082 for (size_t k = 9; k < 16; k++) {
1083 GemmMicrokernelTester()
1084 .mr(3)
1085 .nr(4)
1086 .kr(2)
1087 .sr(1)
1088 .m(3)
1089 .n(4)
1090 .k(k)
1091 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
1092 }
1093 }
1094
1095 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_gt_8_strided_a) {
1096 TEST_REQUIRES_X86_SSE2;
1097 for (size_t k = 9; k < 16; k++) {
1098 GemmMicrokernelTester()
1099 .mr(3)
1100 .nr(4)
1101 .kr(2)
1102 .sr(1)
1103 .m(3)
1104 .n(4)
1105 .k(k)
1106 .a_stride(19)
1107 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
1108 }
1109 }
1110
1111 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_gt_8_subtile) {
1112 TEST_REQUIRES_X86_SSE2;
1113 for (size_t k = 9; k < 16; k++) {
1114 for (uint32_t m = 1; m <= 3; m++) {
1115 for (uint32_t n = 1; n <= 4; n++) {
1116 GemmMicrokernelTester()
1117 .mr(3)
1118 .nr(4)
1119 .kr(2)
1120 .sr(1)
1121 .m(m)
1122 .n(n)
1123 .k(k)
1124 .iterations(1)
1125 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
1126 }
1127 }
1128 }
1129 }
1130
1131 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_div_8) {
1132 TEST_REQUIRES_X86_SSE2;
1133 for (size_t k = 16; k <= 80; k += 8) {
1134 GemmMicrokernelTester()
1135 .mr(3)
1136 .nr(4)
1137 .kr(2)
1138 .sr(1)
1139 .m(3)
1140 .n(4)
1141 .k(k)
1142 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
1143 }
1144 }
1145
1146 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_div_8_strided_a) {
1147 TEST_REQUIRES_X86_SSE2;
1148 for (size_t k = 16; k <= 80; k += 8) {
1149 GemmMicrokernelTester()
1150 .mr(3)
1151 .nr(4)
1152 .kr(2)
1153 .sr(1)
1154 .m(3)
1155 .n(4)
1156 .k(k)
1157 .a_stride(83)
1158 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
1159 }
1160 }
1161
1162 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_div_8_subtile) {
1163 TEST_REQUIRES_X86_SSE2;
1164 for (size_t k = 16; k <= 80; k += 8) {
1165 for (uint32_t m = 1; m <= 3; m++) {
1166 for (uint32_t n = 1; n <= 4; n++) {
1167 GemmMicrokernelTester()
1168 .mr(3)
1169 .nr(4)
1170 .kr(2)
1171 .sr(1)
1172 .m(m)
1173 .n(n)
1174 .k(k)
1175 .iterations(1)
1176 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
1177 }
1178 }
1179 }
1180 }
1181
1182 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, n_gt_4) {
1183 TEST_REQUIRES_X86_SSE2;
1184 for (uint32_t n = 5; n < 8; n++) {
1185 for (size_t k = 1; k <= 40; k += 9) {
1186 GemmMicrokernelTester()
1187 .mr(3)
1188 .nr(4)
1189 .kr(2)
1190 .sr(1)
1191 .m(3)
1192 .n(4)
1193 .k(k)
1194 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
1195 }
1196 }
1197 }
1198
1199 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, n_gt_4_strided_cn) {
1200 TEST_REQUIRES_X86_SSE2;
1201 for (uint32_t n = 5; n < 8; n++) {
1202 for (size_t k = 1; k <= 40; k += 9) {
1203 GemmMicrokernelTester()
1204 .mr(3)
1205 .nr(4)
1206 .kr(2)
1207 .sr(1)
1208 .m(3)
1209 .n(4)
1210 .k(k)
1211 .cn_stride(7)
1212 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
1213 }
1214 }
1215 }
1216
1217 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, n_gt_4_strided_a) {
1218 TEST_REQUIRES_X86_SSE2;
1219 for (uint32_t n = 5; n < 8; n++) {
1220 for (size_t k = 1; k <= 40; k += 9) {
1221 GemmMicrokernelTester()
1222 .mr(3)
1223 .nr(4)
1224 .kr(2)
1225 .sr(1)
1226 .m(3)
1227 .n(n)
1228 .k(k)
1229 .a_stride(43)
1230 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
1231 }
1232 }
1233 }
1234
1235 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, n_gt_4_subtile) {
1236 TEST_REQUIRES_X86_SSE2;
1237 for (uint32_t n = 5; n < 8; n++) {
1238 for (size_t k = 1; k <= 40; k += 9) {
1239 for (uint32_t m = 1; m <= 3; m++) {
1240 GemmMicrokernelTester()
1241 .mr(3)
1242 .nr(4)
1243 .kr(2)
1244 .sr(1)
1245 .m(m)
1246 .n(n)
1247 .k(k)
1248 .iterations(1)
1249 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
1250 }
1251 }
1252 }
1253 }
1254
1255 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, n_div_4) {
1256 TEST_REQUIRES_X86_SSE2;
1257 for (uint32_t n = 8; n <= 12; n += 4) {
1258 for (size_t k = 1; k <= 40; k += 9) {
1259 GemmMicrokernelTester()
1260 .mr(3)
1261 .nr(4)
1262 .kr(2)
1263 .sr(1)
1264 .m(3)
1265 .n(4)
1266 .k(k)
1267 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
1268 }
1269 }
1270 }
1271
1272 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, n_div_4_strided_cn) {
1273 TEST_REQUIRES_X86_SSE2;
1274 for (uint32_t n = 8; n <= 12; n += 4) {
1275 for (size_t k = 1; k <= 40; k += 9) {
1276 GemmMicrokernelTester()
1277 .mr(3)
1278 .nr(4)
1279 .kr(2)
1280 .sr(1)
1281 .m(3)
1282 .n(n)
1283 .k(k)
1284 .cn_stride(7)
1285 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
1286 }
1287 }
1288 }
1289
1290 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, n_div_4_strided_a) {
1291 TEST_REQUIRES_X86_SSE2;
1292 for (uint32_t n = 8; n <= 12; n += 4) {
1293 for (size_t k = 1; k <= 40; k += 9) {
1294 GemmMicrokernelTester()
1295 .mr(3)
1296 .nr(4)
1297 .kr(2)
1298 .sr(1)
1299 .m(3)
1300 .n(n)
1301 .k(k)
1302 .a_stride(43)
1303 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
1304 }
1305 }
1306 }
1307
1308 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, n_div_4_subtile) {
1309 TEST_REQUIRES_X86_SSE2;
1310 for (uint32_t n = 8; n <= 12; n += 4) {
1311 for (size_t k = 1; k <= 40; k += 9) {
1312 for (uint32_t m = 1; m <= 3; m++) {
1313 GemmMicrokernelTester()
1314 .mr(3)
1315 .nr(4)
1316 .kr(2)
1317 .sr(1)
1318 .m(m)
1319 .n(n)
1320 .k(k)
1321 .iterations(1)
1322 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
1323 }
1324 }
1325 }
1326 }
1327
1328 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, strided_cm_subtile) {
1329 TEST_REQUIRES_X86_SSE2;
1330 for (size_t k = 1; k <= 40; k += 9) {
1331 for (uint32_t m = 1; m <= 3; m++) {
1332 for (uint32_t n = 1; n <= 4; n++) {
1333 GemmMicrokernelTester()
1334 .mr(3)
1335 .nr(4)
1336 .kr(2)
1337 .sr(1)
1338 .m(m)
1339 .n(n)
1340 .k(k)
1341 .cm_stride(7)
1342 .iterations(1)
1343 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
1344 }
1345 }
1346 }
1347 }
1348
1349 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, qmin) {
1350 TEST_REQUIRES_X86_SSE2;
1351 GemmMicrokernelTester()
1352 .mr(3)
1353 .nr(4)
1354 .kr(2)
1355 .sr(1)
1356 .m(3)
1357 .n(4)
1358 .k(8)
1359 .qmin(128)
1360 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
1361 }
1362
1363 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, qmax) {
1364 TEST_REQUIRES_X86_SSE2;
1365 GemmMicrokernelTester()
1366 .mr(3)
1367 .nr(4)
1368 .kr(2)
1369 .sr(1)
1370 .m(3)
1371 .n(4)
1372 .k(8)
1373 .qmax(128)
1374 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
1375 }
1376
1377 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, strided_cm) {
1378 TEST_REQUIRES_X86_SSE2;
1379 GemmMicrokernelTester()
1380 .mr(3)
1381 .nr(4)
1382 .kr(2)
1383 .sr(1)
1384 .m(3)
1385 .n(4)
1386 .k(8)
1387 .cm_stride(7)
1388 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
1389 }
1390#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
1391
1392
1393#if XNN_ARCH_X86 || XNN_ARCH_X86_64
1394 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_eq_8) {
1395 TEST_REQUIRES_X86_SSE2;
1396 GemmMicrokernelTester()
1397 .mr(4)
1398 .nr(4)
1399 .kr(2)
1400 .sr(1)
1401 .m(4)
1402 .n(4)
1403 .k(8)
1404 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
1405 }
1406
1407 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, strided_cn) {
1408 TEST_REQUIRES_X86_SSE2;
1409 GemmMicrokernelTester()
1410 .mr(4)
1411 .nr(4)
1412 .kr(2)
1413 .sr(1)
1414 .m(4)
1415 .n(4)
1416 .k(8)
1417 .cn_stride(7)
1418 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
1419 }
1420
1421 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_eq_8_strided_a) {
1422 TEST_REQUIRES_X86_SSE2;
1423 GemmMicrokernelTester()
1424 .mr(4)
1425 .nr(4)
1426 .kr(2)
1427 .sr(1)
1428 .m(4)
1429 .n(4)
1430 .k(8)
1431 .a_stride(11)
1432 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
1433 }
1434
1435 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_eq_8_subtile) {
1436 TEST_REQUIRES_X86_SSE2;
1437 for (uint32_t m = 1; m <= 4; m++) {
1438 for (uint32_t n = 1; n <= 4; n++) {
1439 GemmMicrokernelTester()
1440 .mr(4)
1441 .nr(4)
1442 .kr(2)
1443 .sr(1)
1444 .m(m)
1445 .n(n)
1446 .k(8)
1447 .iterations(1)
1448 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
1449 }
1450 }
1451 }
1452
1453 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_eq_8_subtile_m) {
1454 TEST_REQUIRES_X86_SSE2;
1455 for (uint32_t m = 1; m <= 4; m++) {
1456 GemmMicrokernelTester()
1457 .mr(4)
1458 .nr(4)
1459 .kr(2)
1460 .sr(1)
1461 .m(m)
1462 .n(4)
1463 .k(8)
1464 .iterations(1)
1465 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
1466 }
1467 }
1468
1469 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_eq_8_subtile_n) {
1470 TEST_REQUIRES_X86_SSE2;
1471 for (uint32_t n = 1; n <= 4; n++) {
1472 GemmMicrokernelTester()
1473 .mr(4)
1474 .nr(4)
1475 .kr(2)
1476 .sr(1)
1477 .m(4)
1478 .n(n)
1479 .k(8)
1480 .iterations(1)
1481 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
1482 }
1483 }
1484
1485 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_lt_8) {
1486 TEST_REQUIRES_X86_SSE2;
1487 for (size_t k = 1; k < 8; k++) {
1488 GemmMicrokernelTester()
1489 .mr(4)
1490 .nr(4)
1491 .kr(2)
1492 .sr(1)
1493 .m(4)
1494 .n(4)
1495 .k(k)
1496 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
1497 }
1498 }
1499
1500 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_lt_8_strided_a) {
1501 TEST_REQUIRES_X86_SSE2;
1502 for (size_t k = 1; k < 8; k++) {
1503 GemmMicrokernelTester()
1504 .mr(4)
1505 .nr(4)
1506 .kr(2)
1507 .sr(1)
1508 .m(4)
1509 .n(4)
1510 .k(k)
1511 .a_stride(11)
1512 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
1513 }
1514 }
1515
1516 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_lt_8_subtile) {
1517 TEST_REQUIRES_X86_SSE2;
1518 for (size_t k = 1; k < 8; k++) {
1519 for (uint32_t m = 1; m <= 4; m++) {
1520 for (uint32_t n = 1; n <= 4; n++) {
1521 GemmMicrokernelTester()
1522 .mr(4)
1523 .nr(4)
1524 .kr(2)
1525 .sr(1)
1526 .m(m)
1527 .n(n)
1528 .k(k)
1529 .iterations(1)
1530 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
1531 }
1532 }
1533 }
1534 }
1535
1536 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_gt_8) {
1537 TEST_REQUIRES_X86_SSE2;
1538 for (size_t k = 9; k < 16; k++) {
1539 GemmMicrokernelTester()
1540 .mr(4)
1541 .nr(4)
1542 .kr(2)
1543 .sr(1)
1544 .m(4)
1545 .n(4)
1546 .k(k)
1547 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
1548 }
1549 }
1550
1551 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_gt_8_strided_a) {
1552 TEST_REQUIRES_X86_SSE2;
1553 for (size_t k = 9; k < 16; k++) {
1554 GemmMicrokernelTester()
1555 .mr(4)
1556 .nr(4)
1557 .kr(2)
1558 .sr(1)
1559 .m(4)
1560 .n(4)
1561 .k(k)
1562 .a_stride(19)
1563 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
1564 }
1565 }
1566
1567 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_gt_8_subtile) {
1568 TEST_REQUIRES_X86_SSE2;
1569 for (size_t k = 9; k < 16; k++) {
1570 for (uint32_t m = 1; m <= 4; m++) {
1571 for (uint32_t n = 1; n <= 4; n++) {
1572 GemmMicrokernelTester()
1573 .mr(4)
1574 .nr(4)
1575 .kr(2)
1576 .sr(1)
1577 .m(m)
1578 .n(n)
1579 .k(k)
1580 .iterations(1)
1581 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
1582 }
1583 }
1584 }
1585 }
1586
1587 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_div_8) {
1588 TEST_REQUIRES_X86_SSE2;
1589 for (size_t k = 16; k <= 80; k += 8) {
1590 GemmMicrokernelTester()
1591 .mr(4)
1592 .nr(4)
1593 .kr(2)
1594 .sr(1)
1595 .m(4)
1596 .n(4)
1597 .k(k)
1598 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
1599 }
1600 }
1601
1602 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_div_8_strided_a) {
1603 TEST_REQUIRES_X86_SSE2;
1604 for (size_t k = 16; k <= 80; k += 8) {
1605 GemmMicrokernelTester()
1606 .mr(4)
1607 .nr(4)
1608 .kr(2)
1609 .sr(1)
1610 .m(4)
1611 .n(4)
1612 .k(k)
1613 .a_stride(83)
1614 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
1615 }
1616 }
1617
1618 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_div_8_subtile) {
1619 TEST_REQUIRES_X86_SSE2;
1620 for (size_t k = 16; k <= 80; k += 8) {
1621 for (uint32_t m = 1; m <= 4; m++) {
1622 for (uint32_t n = 1; n <= 4; n++) {
1623 GemmMicrokernelTester()
1624 .mr(4)
1625 .nr(4)
1626 .kr(2)
1627 .sr(1)
1628 .m(m)
1629 .n(n)
1630 .k(k)
1631 .iterations(1)
1632 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
1633 }
1634 }
1635 }
1636 }
1637
1638 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, n_gt_4) {
1639 TEST_REQUIRES_X86_SSE2;
1640 for (uint32_t n = 5; n < 8; n++) {
1641 for (size_t k = 1; k <= 40; k += 9) {
1642 GemmMicrokernelTester()
1643 .mr(4)
1644 .nr(4)
1645 .kr(2)
1646 .sr(1)
1647 .m(4)
1648 .n(4)
1649 .k(k)
1650 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
1651 }
1652 }
1653 }
1654
1655 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, n_gt_4_strided_cn) {
1656 TEST_REQUIRES_X86_SSE2;
1657 for (uint32_t n = 5; n < 8; n++) {
1658 for (size_t k = 1; k <= 40; k += 9) {
1659 GemmMicrokernelTester()
1660 .mr(4)
1661 .nr(4)
1662 .kr(2)
1663 .sr(1)
1664 .m(4)
1665 .n(4)
1666 .k(k)
1667 .cn_stride(7)
1668 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
1669 }
1670 }
1671 }
1672
1673 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, n_gt_4_strided_a) {
1674 TEST_REQUIRES_X86_SSE2;
1675 for (uint32_t n = 5; n < 8; n++) {
1676 for (size_t k = 1; k <= 40; k += 9) {
1677 GemmMicrokernelTester()
1678 .mr(4)
1679 .nr(4)
1680 .kr(2)
1681 .sr(1)
1682 .m(4)
1683 .n(n)
1684 .k(k)
1685 .a_stride(43)
1686 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
1687 }
1688 }
1689 }
1690
1691 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, n_gt_4_subtile) {
1692 TEST_REQUIRES_X86_SSE2;
1693 for (uint32_t n = 5; n < 8; n++) {
1694 for (size_t k = 1; k <= 40; k += 9) {
1695 for (uint32_t m = 1; m <= 4; m++) {
1696 GemmMicrokernelTester()
1697 .mr(4)
1698 .nr(4)
1699 .kr(2)
1700 .sr(1)
1701 .m(m)
1702 .n(n)
1703 .k(k)
1704 .iterations(1)
1705 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
1706 }
1707 }
1708 }
1709 }
1710
1711 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, n_div_4) {
1712 TEST_REQUIRES_X86_SSE2;
1713 for (uint32_t n = 8; n <= 12; n += 4) {
1714 for (size_t k = 1; k <= 40; k += 9) {
1715 GemmMicrokernelTester()
1716 .mr(4)
1717 .nr(4)
1718 .kr(2)
1719 .sr(1)
1720 .m(4)
1721 .n(4)
1722 .k(k)
1723 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
1724 }
1725 }
1726 }
1727
1728 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, n_div_4_strided_cn) {
1729 TEST_REQUIRES_X86_SSE2;
1730 for (uint32_t n = 8; n <= 12; n += 4) {
1731 for (size_t k = 1; k <= 40; k += 9) {
1732 GemmMicrokernelTester()
1733 .mr(4)
1734 .nr(4)
1735 .kr(2)
1736 .sr(1)
1737 .m(4)
1738 .n(n)
1739 .k(k)
1740 .cn_stride(7)
1741 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
1742 }
1743 }
1744 }
1745
1746 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, n_div_4_strided_a) {
1747 TEST_REQUIRES_X86_SSE2;
1748 for (uint32_t n = 8; n <= 12; n += 4) {
1749 for (size_t k = 1; k <= 40; k += 9) {
1750 GemmMicrokernelTester()
1751 .mr(4)
1752 .nr(4)
1753 .kr(2)
1754 .sr(1)
1755 .m(4)
1756 .n(n)
1757 .k(k)
1758 .a_stride(43)
1759 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
1760 }
1761 }
1762 }
1763
1764 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, n_div_4_subtile) {
1765 TEST_REQUIRES_X86_SSE2;
1766 for (uint32_t n = 8; n <= 12; n += 4) {
1767 for (size_t k = 1; k <= 40; k += 9) {
1768 for (uint32_t m = 1; m <= 4; m++) {
1769 GemmMicrokernelTester()
1770 .mr(4)
1771 .nr(4)
1772 .kr(2)
1773 .sr(1)
1774 .m(m)
1775 .n(n)
1776 .k(k)
1777 .iterations(1)
1778 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
1779 }
1780 }
1781 }
1782 }
1783
1784 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, strided_cm_subtile) {
1785 TEST_REQUIRES_X86_SSE2;
1786 for (size_t k = 1; k <= 40; k += 9) {
1787 for (uint32_t m = 1; m <= 4; m++) {
1788 for (uint32_t n = 1; n <= 4; n++) {
1789 GemmMicrokernelTester()
1790 .mr(4)
1791 .nr(4)
1792 .kr(2)
1793 .sr(1)
1794 .m(m)
1795 .n(n)
1796 .k(k)
1797 .cm_stride(7)
1798 .iterations(1)
1799 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
1800 }
1801 }
1802 }
1803 }
1804
1805 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, qmin) {
1806 TEST_REQUIRES_X86_SSE2;
1807 GemmMicrokernelTester()
1808 .mr(4)
1809 .nr(4)
1810 .kr(2)
1811 .sr(1)
1812 .m(4)
1813 .n(4)
1814 .k(8)
1815 .qmin(128)
1816 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
1817 }
1818
1819 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, qmax) {
1820 TEST_REQUIRES_X86_SSE2;
1821 GemmMicrokernelTester()
1822 .mr(4)
1823 .nr(4)
1824 .kr(2)
1825 .sr(1)
1826 .m(4)
1827 .n(4)
1828 .k(8)
1829 .qmax(128)
1830 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
1831 }
1832
1833 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, strided_cm) {
1834 TEST_REQUIRES_X86_SSE2;
1835 GemmMicrokernelTester()
1836 .mr(4)
1837 .nr(4)
1838 .kr(2)
1839 .sr(1)
1840 .m(4)
1841 .n(4)
1842 .k(8)
1843 .cm_stride(7)
1844 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
1845 }
1846#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
1847
1848
1849#if XNN_ARCH_X86 || XNN_ARCH_X86_64
1850 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_eq_8) {
1851 TEST_REQUIRES_X86_SSE41;
1852 GemmMicrokernelTester()
1853 .mr(1)
1854 .nr(4)
1855 .kr(2)
1856 .sr(1)
1857 .m(1)
1858 .n(4)
1859 .k(8)
1860 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
1861 }
1862
1863 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, strided_cn) {
1864 TEST_REQUIRES_X86_SSE41;
1865 GemmMicrokernelTester()
1866 .mr(1)
1867 .nr(4)
1868 .kr(2)
1869 .sr(1)
1870 .m(1)
1871 .n(4)
1872 .k(8)
1873 .cn_stride(7)
1874 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
1875 }
1876
1877 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_eq_8_strided_a) {
1878 TEST_REQUIRES_X86_SSE41;
1879 GemmMicrokernelTester()
1880 .mr(1)
1881 .nr(4)
1882 .kr(2)
1883 .sr(1)
1884 .m(1)
1885 .n(4)
1886 .k(8)
1887 .a_stride(11)
1888 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
1889 }
1890
1891 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_eq_8_subtile) {
1892 TEST_REQUIRES_X86_SSE41;
1893 for (uint32_t m = 1; m <= 1; m++) {
1894 for (uint32_t n = 1; n <= 4; n++) {
1895 GemmMicrokernelTester()
1896 .mr(1)
1897 .nr(4)
1898 .kr(2)
1899 .sr(1)
1900 .m(m)
1901 .n(n)
1902 .k(8)
1903 .iterations(1)
1904 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
1905 }
1906 }
1907 }
1908
1909 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_eq_8_subtile_m) {
1910 TEST_REQUIRES_X86_SSE41;
1911 for (uint32_t m = 1; m <= 1; m++) {
1912 GemmMicrokernelTester()
1913 .mr(1)
1914 .nr(4)
1915 .kr(2)
1916 .sr(1)
1917 .m(m)
1918 .n(4)
1919 .k(8)
1920 .iterations(1)
1921 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
1922 }
1923 }
1924
1925 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_eq_8_subtile_n) {
1926 TEST_REQUIRES_X86_SSE41;
1927 for (uint32_t n = 1; n <= 4; n++) {
1928 GemmMicrokernelTester()
1929 .mr(1)
1930 .nr(4)
1931 .kr(2)
1932 .sr(1)
1933 .m(1)
1934 .n(n)
1935 .k(8)
1936 .iterations(1)
1937 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
1938 }
1939 }
1940
1941 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_lt_8) {
1942 TEST_REQUIRES_X86_SSE41;
1943 for (size_t k = 1; k < 8; k++) {
1944 GemmMicrokernelTester()
1945 .mr(1)
1946 .nr(4)
1947 .kr(2)
1948 .sr(1)
1949 .m(1)
1950 .n(4)
1951 .k(k)
1952 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
1953 }
1954 }
1955
1956 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_lt_8_strided_a) {
1957 TEST_REQUIRES_X86_SSE41;
1958 for (size_t k = 1; k < 8; k++) {
1959 GemmMicrokernelTester()
1960 .mr(1)
1961 .nr(4)
1962 .kr(2)
1963 .sr(1)
1964 .m(1)
1965 .n(4)
1966 .k(k)
1967 .a_stride(11)
1968 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
1969 }
1970 }
1971
1972 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_lt_8_subtile) {
1973 TEST_REQUIRES_X86_SSE41;
1974 for (size_t k = 1; k < 8; k++) {
1975 for (uint32_t m = 1; m <= 1; m++) {
1976 for (uint32_t n = 1; n <= 4; n++) {
1977 GemmMicrokernelTester()
1978 .mr(1)
1979 .nr(4)
1980 .kr(2)
1981 .sr(1)
1982 .m(m)
1983 .n(n)
1984 .k(k)
1985 .iterations(1)
1986 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
1987 }
1988 }
1989 }
1990 }
1991
1992 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_gt_8) {
1993 TEST_REQUIRES_X86_SSE41;
1994 for (size_t k = 9; k < 16; k++) {
1995 GemmMicrokernelTester()
1996 .mr(1)
1997 .nr(4)
1998 .kr(2)
1999 .sr(1)
2000 .m(1)
2001 .n(4)
2002 .k(k)
2003 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
2004 }
2005 }
2006
2007 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_gt_8_strided_a) {
2008 TEST_REQUIRES_X86_SSE41;
2009 for (size_t k = 9; k < 16; k++) {
2010 GemmMicrokernelTester()
2011 .mr(1)
2012 .nr(4)
2013 .kr(2)
2014 .sr(1)
2015 .m(1)
2016 .n(4)
2017 .k(k)
2018 .a_stride(19)
2019 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
2020 }
2021 }
2022
2023 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_gt_8_subtile) {
2024 TEST_REQUIRES_X86_SSE41;
2025 for (size_t k = 9; k < 16; k++) {
2026 for (uint32_t m = 1; m <= 1; m++) {
2027 for (uint32_t n = 1; n <= 4; n++) {
2028 GemmMicrokernelTester()
2029 .mr(1)
2030 .nr(4)
2031 .kr(2)
2032 .sr(1)
2033 .m(m)
2034 .n(n)
2035 .k(k)
2036 .iterations(1)
2037 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
2038 }
2039 }
2040 }
2041 }
2042
2043 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_div_8) {
2044 TEST_REQUIRES_X86_SSE41;
2045 for (size_t k = 16; k <= 80; k += 8) {
2046 GemmMicrokernelTester()
2047 .mr(1)
2048 .nr(4)
2049 .kr(2)
2050 .sr(1)
2051 .m(1)
2052 .n(4)
2053 .k(k)
2054 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
2055 }
2056 }
2057
2058 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_div_8_strided_a) {
2059 TEST_REQUIRES_X86_SSE41;
2060 for (size_t k = 16; k <= 80; k += 8) {
2061 GemmMicrokernelTester()
2062 .mr(1)
2063 .nr(4)
2064 .kr(2)
2065 .sr(1)
2066 .m(1)
2067 .n(4)
2068 .k(k)
2069 .a_stride(83)
2070 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
2071 }
2072 }
2073
2074 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_div_8_subtile) {
2075 TEST_REQUIRES_X86_SSE41;
2076 for (size_t k = 16; k <= 80; k += 8) {
2077 for (uint32_t m = 1; m <= 1; m++) {
2078 for (uint32_t n = 1; n <= 4; n++) {
2079 GemmMicrokernelTester()
2080 .mr(1)
2081 .nr(4)
2082 .kr(2)
2083 .sr(1)
2084 .m(m)
2085 .n(n)
2086 .k(k)
2087 .iterations(1)
2088 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
2089 }
2090 }
2091 }
2092 }
2093
2094 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, n_gt_4) {
2095 TEST_REQUIRES_X86_SSE41;
2096 for (uint32_t n = 5; n < 8; n++) {
2097 for (size_t k = 1; k <= 40; k += 9) {
2098 GemmMicrokernelTester()
2099 .mr(1)
2100 .nr(4)
2101 .kr(2)
2102 .sr(1)
2103 .m(1)
2104 .n(4)
2105 .k(k)
2106 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
2107 }
2108 }
2109 }
2110
2111 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, n_gt_4_strided_cn) {
2112 TEST_REQUIRES_X86_SSE41;
2113 for (uint32_t n = 5; n < 8; n++) {
2114 for (size_t k = 1; k <= 40; k += 9) {
2115 GemmMicrokernelTester()
2116 .mr(1)
2117 .nr(4)
2118 .kr(2)
2119 .sr(1)
2120 .m(1)
2121 .n(4)
2122 .k(k)
2123 .cn_stride(7)
2124 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
2125 }
2126 }
2127 }
2128
2129 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, n_gt_4_strided_a) {
2130 TEST_REQUIRES_X86_SSE41;
2131 for (uint32_t n = 5; n < 8; n++) {
2132 for (size_t k = 1; k <= 40; k += 9) {
2133 GemmMicrokernelTester()
2134 .mr(1)
2135 .nr(4)
2136 .kr(2)
2137 .sr(1)
2138 .m(1)
2139 .n(n)
2140 .k(k)
2141 .a_stride(43)
2142 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
2143 }
2144 }
2145 }
2146
2147 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, n_gt_4_subtile) {
2148 TEST_REQUIRES_X86_SSE41;
2149 for (uint32_t n = 5; n < 8; n++) {
2150 for (size_t k = 1; k <= 40; k += 9) {
2151 for (uint32_t m = 1; m <= 1; m++) {
2152 GemmMicrokernelTester()
2153 .mr(1)
2154 .nr(4)
2155 .kr(2)
2156 .sr(1)
2157 .m(m)
2158 .n(n)
2159 .k(k)
2160 .iterations(1)
2161 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
2162 }
2163 }
2164 }
2165 }
2166
2167 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, n_div_4) {
2168 TEST_REQUIRES_X86_SSE41;
2169 for (uint32_t n = 8; n <= 12; n += 4) {
2170 for (size_t k = 1; k <= 40; k += 9) {
2171 GemmMicrokernelTester()
2172 .mr(1)
2173 .nr(4)
2174 .kr(2)
2175 .sr(1)
2176 .m(1)
2177 .n(4)
2178 .k(k)
2179 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
2180 }
2181 }
2182 }
2183
2184 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, n_div_4_strided_cn) {
2185 TEST_REQUIRES_X86_SSE41;
2186 for (uint32_t n = 8; n <= 12; n += 4) {
2187 for (size_t k = 1; k <= 40; k += 9) {
2188 GemmMicrokernelTester()
2189 .mr(1)
2190 .nr(4)
2191 .kr(2)
2192 .sr(1)
2193 .m(1)
2194 .n(n)
2195 .k(k)
2196 .cn_stride(7)
2197 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
2198 }
2199 }
2200 }
2201
2202 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, n_div_4_strided_a) {
2203 TEST_REQUIRES_X86_SSE41;
2204 for (uint32_t n = 8; n <= 12; n += 4) {
2205 for (size_t k = 1; k <= 40; k += 9) {
2206 GemmMicrokernelTester()
2207 .mr(1)
2208 .nr(4)
2209 .kr(2)
2210 .sr(1)
2211 .m(1)
2212 .n(n)
2213 .k(k)
2214 .a_stride(43)
2215 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
2216 }
2217 }
2218 }
2219
2220 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, n_div_4_subtile) {
2221 TEST_REQUIRES_X86_SSE41;
2222 for (uint32_t n = 8; n <= 12; n += 4) {
2223 for (size_t k = 1; k <= 40; k += 9) {
2224 for (uint32_t m = 1; m <= 1; m++) {
2225 GemmMicrokernelTester()
2226 .mr(1)
2227 .nr(4)
2228 .kr(2)
2229 .sr(1)
2230 .m(m)
2231 .n(n)
2232 .k(k)
2233 .iterations(1)
2234 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
2235 }
2236 }
2237 }
2238 }
2239
2240 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, strided_cm_subtile) {
2241 TEST_REQUIRES_X86_SSE41;
2242 for (size_t k = 1; k <= 40; k += 9) {
2243 for (uint32_t m = 1; m <= 1; m++) {
2244 for (uint32_t n = 1; n <= 4; n++) {
2245 GemmMicrokernelTester()
2246 .mr(1)
2247 .nr(4)
2248 .kr(2)
2249 .sr(1)
2250 .m(m)
2251 .n(n)
2252 .k(k)
2253 .cm_stride(7)
2254 .iterations(1)
2255 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
2256 }
2257 }
2258 }
2259 }
2260
2261 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, qmin) {
2262 TEST_REQUIRES_X86_SSE41;
2263 GemmMicrokernelTester()
2264 .mr(1)
2265 .nr(4)
2266 .kr(2)
2267 .sr(1)
2268 .m(1)
2269 .n(4)
2270 .k(8)
2271 .qmin(128)
2272 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
2273 }
2274
2275 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, qmax) {
2276 TEST_REQUIRES_X86_SSE41;
2277 GemmMicrokernelTester()
2278 .mr(1)
2279 .nr(4)
2280 .kr(2)
2281 .sr(1)
2282 .m(1)
2283 .n(4)
2284 .k(8)
2285 .qmax(128)
2286 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
2287 }
2288
2289 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, strided_cm) {
2290 TEST_REQUIRES_X86_SSE41;
2291 GemmMicrokernelTester()
2292 .mr(1)
2293 .nr(4)
2294 .kr(2)
2295 .sr(1)
2296 .m(1)
2297 .n(4)
2298 .k(8)
2299 .cm_stride(7)
2300 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
2301 }
2302#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
2303
2304
2305#if XNN_ARCH_X86 || XNN_ARCH_X86_64
2306 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_eq_8) {
2307 TEST_REQUIRES_X86_SSE41;
2308 GemmMicrokernelTester()
2309 .mr(2)
2310 .nr(4)
2311 .kr(2)
2312 .sr(1)
2313 .m(2)
2314 .n(4)
2315 .k(8)
2316 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
2317 }
2318
2319 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, strided_cn) {
2320 TEST_REQUIRES_X86_SSE41;
2321 GemmMicrokernelTester()
2322 .mr(2)
2323 .nr(4)
2324 .kr(2)
2325 .sr(1)
2326 .m(2)
2327 .n(4)
2328 .k(8)
2329 .cn_stride(7)
2330 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
2331 }
2332
2333 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_eq_8_strided_a) {
2334 TEST_REQUIRES_X86_SSE41;
2335 GemmMicrokernelTester()
2336 .mr(2)
2337 .nr(4)
2338 .kr(2)
2339 .sr(1)
2340 .m(2)
2341 .n(4)
2342 .k(8)
2343 .a_stride(11)
2344 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
2345 }
2346
2347 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_eq_8_subtile) {
2348 TEST_REQUIRES_X86_SSE41;
2349 for (uint32_t m = 1; m <= 2; m++) {
2350 for (uint32_t n = 1; n <= 4; n++) {
2351 GemmMicrokernelTester()
2352 .mr(2)
2353 .nr(4)
2354 .kr(2)
2355 .sr(1)
2356 .m(m)
2357 .n(n)
2358 .k(8)
2359 .iterations(1)
2360 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
2361 }
2362 }
2363 }
2364
2365 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_eq_8_subtile_m) {
2366 TEST_REQUIRES_X86_SSE41;
2367 for (uint32_t m = 1; m <= 2; m++) {
2368 GemmMicrokernelTester()
2369 .mr(2)
2370 .nr(4)
2371 .kr(2)
2372 .sr(1)
2373 .m(m)
2374 .n(4)
2375 .k(8)
2376 .iterations(1)
2377 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
2378 }
2379 }
2380
2381 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_eq_8_subtile_n) {
2382 TEST_REQUIRES_X86_SSE41;
2383 for (uint32_t n = 1; n <= 4; n++) {
2384 GemmMicrokernelTester()
2385 .mr(2)
2386 .nr(4)
2387 .kr(2)
2388 .sr(1)
2389 .m(2)
2390 .n(n)
2391 .k(8)
2392 .iterations(1)
2393 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
2394 }
2395 }
2396
2397 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_lt_8) {
2398 TEST_REQUIRES_X86_SSE41;
2399 for (size_t k = 1; k < 8; k++) {
2400 GemmMicrokernelTester()
2401 .mr(2)
2402 .nr(4)
2403 .kr(2)
2404 .sr(1)
2405 .m(2)
2406 .n(4)
2407 .k(k)
2408 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
2409 }
2410 }
2411
2412 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_lt_8_strided_a) {
2413 TEST_REQUIRES_X86_SSE41;
2414 for (size_t k = 1; k < 8; k++) {
2415 GemmMicrokernelTester()
2416 .mr(2)
2417 .nr(4)
2418 .kr(2)
2419 .sr(1)
2420 .m(2)
2421 .n(4)
2422 .k(k)
2423 .a_stride(11)
2424 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
2425 }
2426 }
2427
2428 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_lt_8_subtile) {
2429 TEST_REQUIRES_X86_SSE41;
2430 for (size_t k = 1; k < 8; k++) {
2431 for (uint32_t m = 1; m <= 2; m++) {
2432 for (uint32_t n = 1; n <= 4; n++) {
2433 GemmMicrokernelTester()
2434 .mr(2)
2435 .nr(4)
2436 .kr(2)
2437 .sr(1)
2438 .m(m)
2439 .n(n)
2440 .k(k)
2441 .iterations(1)
2442 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
2443 }
2444 }
2445 }
2446 }
2447
2448 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_gt_8) {
2449 TEST_REQUIRES_X86_SSE41;
2450 for (size_t k = 9; k < 16; k++) {
2451 GemmMicrokernelTester()
2452 .mr(2)
2453 .nr(4)
2454 .kr(2)
2455 .sr(1)
2456 .m(2)
2457 .n(4)
2458 .k(k)
2459 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
2460 }
2461 }
2462
2463 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_gt_8_strided_a) {
2464 TEST_REQUIRES_X86_SSE41;
2465 for (size_t k = 9; k < 16; k++) {
2466 GemmMicrokernelTester()
2467 .mr(2)
2468 .nr(4)
2469 .kr(2)
2470 .sr(1)
2471 .m(2)
2472 .n(4)
2473 .k(k)
2474 .a_stride(19)
2475 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
2476 }
2477 }
2478
2479 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_gt_8_subtile) {
2480 TEST_REQUIRES_X86_SSE41;
2481 for (size_t k = 9; k < 16; k++) {
2482 for (uint32_t m = 1; m <= 2; m++) {
2483 for (uint32_t n = 1; n <= 4; n++) {
2484 GemmMicrokernelTester()
2485 .mr(2)
2486 .nr(4)
2487 .kr(2)
2488 .sr(1)
2489 .m(m)
2490 .n(n)
2491 .k(k)
2492 .iterations(1)
2493 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
2494 }
2495 }
2496 }
2497 }
2498
2499 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_div_8) {
2500 TEST_REQUIRES_X86_SSE41;
2501 for (size_t k = 16; k <= 80; k += 8) {
2502 GemmMicrokernelTester()
2503 .mr(2)
2504 .nr(4)
2505 .kr(2)
2506 .sr(1)
2507 .m(2)
2508 .n(4)
2509 .k(k)
2510 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
2511 }
2512 }
2513
2514 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_div_8_strided_a) {
2515 TEST_REQUIRES_X86_SSE41;
2516 for (size_t k = 16; k <= 80; k += 8) {
2517 GemmMicrokernelTester()
2518 .mr(2)
2519 .nr(4)
2520 .kr(2)
2521 .sr(1)
2522 .m(2)
2523 .n(4)
2524 .k(k)
2525 .a_stride(83)
2526 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
2527 }
2528 }
2529
2530 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_div_8_subtile) {
2531 TEST_REQUIRES_X86_SSE41;
2532 for (size_t k = 16; k <= 80; k += 8) {
2533 for (uint32_t m = 1; m <= 2; m++) {
2534 for (uint32_t n = 1; n <= 4; n++) {
2535 GemmMicrokernelTester()
2536 .mr(2)
2537 .nr(4)
2538 .kr(2)
2539 .sr(1)
2540 .m(m)
2541 .n(n)
2542 .k(k)
2543 .iterations(1)
2544 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
2545 }
2546 }
2547 }
2548 }
2549
2550 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, n_gt_4) {
2551 TEST_REQUIRES_X86_SSE41;
2552 for (uint32_t n = 5; n < 8; n++) {
2553 for (size_t k = 1; k <= 40; k += 9) {
2554 GemmMicrokernelTester()
2555 .mr(2)
2556 .nr(4)
2557 .kr(2)
2558 .sr(1)
2559 .m(2)
2560 .n(4)
2561 .k(k)
2562 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
2563 }
2564 }
2565 }
2566
2567 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, n_gt_4_strided_cn) {
2568 TEST_REQUIRES_X86_SSE41;
2569 for (uint32_t n = 5; n < 8; n++) {
2570 for (size_t k = 1; k <= 40; k += 9) {
2571 GemmMicrokernelTester()
2572 .mr(2)
2573 .nr(4)
2574 .kr(2)
2575 .sr(1)
2576 .m(2)
2577 .n(4)
2578 .k(k)
2579 .cn_stride(7)
2580 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
2581 }
2582 }
2583 }
2584
2585 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, n_gt_4_strided_a) {
2586 TEST_REQUIRES_X86_SSE41;
2587 for (uint32_t n = 5; n < 8; n++) {
2588 for (size_t k = 1; k <= 40; k += 9) {
2589 GemmMicrokernelTester()
2590 .mr(2)
2591 .nr(4)
2592 .kr(2)
2593 .sr(1)
2594 .m(2)
2595 .n(n)
2596 .k(k)
2597 .a_stride(43)
2598 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
2599 }
2600 }
2601 }
2602
2603 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, n_gt_4_subtile) {
2604 TEST_REQUIRES_X86_SSE41;
2605 for (uint32_t n = 5; n < 8; n++) {
2606 for (size_t k = 1; k <= 40; k += 9) {
2607 for (uint32_t m = 1; m <= 2; m++) {
2608 GemmMicrokernelTester()
2609 .mr(2)
2610 .nr(4)
2611 .kr(2)
2612 .sr(1)
2613 .m(m)
2614 .n(n)
2615 .k(k)
2616 .iterations(1)
2617 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
2618 }
2619 }
2620 }
2621 }
2622
2623 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, n_div_4) {
2624 TEST_REQUIRES_X86_SSE41;
2625 for (uint32_t n = 8; n <= 12; n += 4) {
2626 for (size_t k = 1; k <= 40; k += 9) {
2627 GemmMicrokernelTester()
2628 .mr(2)
2629 .nr(4)
2630 .kr(2)
2631 .sr(1)
2632 .m(2)
2633 .n(4)
2634 .k(k)
2635 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
2636 }
2637 }
2638 }
2639
2640 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, n_div_4_strided_cn) {
2641 TEST_REQUIRES_X86_SSE41;
2642 for (uint32_t n = 8; n <= 12; n += 4) {
2643 for (size_t k = 1; k <= 40; k += 9) {
2644 GemmMicrokernelTester()
2645 .mr(2)
2646 .nr(4)
2647 .kr(2)
2648 .sr(1)
2649 .m(2)
2650 .n(n)
2651 .k(k)
2652 .cn_stride(7)
2653 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
2654 }
2655 }
2656 }
2657
2658 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, n_div_4_strided_a) {
2659 TEST_REQUIRES_X86_SSE41;
2660 for (uint32_t n = 8; n <= 12; n += 4) {
2661 for (size_t k = 1; k <= 40; k += 9) {
2662 GemmMicrokernelTester()
2663 .mr(2)
2664 .nr(4)
2665 .kr(2)
2666 .sr(1)
2667 .m(2)
2668 .n(n)
2669 .k(k)
2670 .a_stride(43)
2671 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
2672 }
2673 }
2674 }
2675
2676 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, n_div_4_subtile) {
2677 TEST_REQUIRES_X86_SSE41;
2678 for (uint32_t n = 8; n <= 12; n += 4) {
2679 for (size_t k = 1; k <= 40; k += 9) {
2680 for (uint32_t m = 1; m <= 2; m++) {
2681 GemmMicrokernelTester()
2682 .mr(2)
2683 .nr(4)
2684 .kr(2)
2685 .sr(1)
2686 .m(m)
2687 .n(n)
2688 .k(k)
2689 .iterations(1)
2690 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
2691 }
2692 }
2693 }
2694 }
2695
2696 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, strided_cm_subtile) {
2697 TEST_REQUIRES_X86_SSE41;
2698 for (size_t k = 1; k <= 40; k += 9) {
2699 for (uint32_t m = 1; m <= 2; m++) {
2700 for (uint32_t n = 1; n <= 4; n++) {
2701 GemmMicrokernelTester()
2702 .mr(2)
2703 .nr(4)
2704 .kr(2)
2705 .sr(1)
2706 .m(m)
2707 .n(n)
2708 .k(k)
2709 .cm_stride(7)
2710 .iterations(1)
2711 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
2712 }
2713 }
2714 }
2715 }
2716
2717 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, qmin) {
2718 TEST_REQUIRES_X86_SSE41;
2719 GemmMicrokernelTester()
2720 .mr(2)
2721 .nr(4)
2722 .kr(2)
2723 .sr(1)
2724 .m(2)
2725 .n(4)
2726 .k(8)
2727 .qmin(128)
2728 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
2729 }
2730
2731 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, qmax) {
2732 TEST_REQUIRES_X86_SSE41;
2733 GemmMicrokernelTester()
2734 .mr(2)
2735 .nr(4)
2736 .kr(2)
2737 .sr(1)
2738 .m(2)
2739 .n(4)
2740 .k(8)
2741 .qmax(128)
2742 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
2743 }
2744
2745 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, strided_cm) {
2746 TEST_REQUIRES_X86_SSE41;
2747 GemmMicrokernelTester()
2748 .mr(2)
2749 .nr(4)
2750 .kr(2)
2751 .sr(1)
2752 .m(2)
2753 .n(4)
2754 .k(8)
2755 .cm_stride(7)
2756 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
2757 }
2758#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
2759
2760
2761#if XNN_ARCH_X86 || XNN_ARCH_X86_64
2762 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_eq_8) {
2763 TEST_REQUIRES_X86_SSE41;
2764 GemmMicrokernelTester()
2765 .mr(3)
2766 .nr(4)
2767 .kr(2)
2768 .sr(1)
2769 .m(3)
2770 .n(4)
2771 .k(8)
2772 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
2773 }
2774
2775 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, strided_cn) {
2776 TEST_REQUIRES_X86_SSE41;
2777 GemmMicrokernelTester()
2778 .mr(3)
2779 .nr(4)
2780 .kr(2)
2781 .sr(1)
2782 .m(3)
2783 .n(4)
2784 .k(8)
2785 .cn_stride(7)
2786 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
2787 }
2788
2789 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_eq_8_strided_a) {
2790 TEST_REQUIRES_X86_SSE41;
2791 GemmMicrokernelTester()
2792 .mr(3)
2793 .nr(4)
2794 .kr(2)
2795 .sr(1)
2796 .m(3)
2797 .n(4)
2798 .k(8)
2799 .a_stride(11)
2800 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
2801 }
2802
2803 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_eq_8_subtile) {
2804 TEST_REQUIRES_X86_SSE41;
2805 for (uint32_t m = 1; m <= 3; m++) {
2806 for (uint32_t n = 1; n <= 4; n++) {
2807 GemmMicrokernelTester()
2808 .mr(3)
2809 .nr(4)
2810 .kr(2)
2811 .sr(1)
2812 .m(m)
2813 .n(n)
2814 .k(8)
2815 .iterations(1)
2816 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
2817 }
2818 }
2819 }
2820
2821 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_eq_8_subtile_m) {
2822 TEST_REQUIRES_X86_SSE41;
2823 for (uint32_t m = 1; m <= 3; m++) {
2824 GemmMicrokernelTester()
2825 .mr(3)
2826 .nr(4)
2827 .kr(2)
2828 .sr(1)
2829 .m(m)
2830 .n(4)
2831 .k(8)
2832 .iterations(1)
2833 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
2834 }
2835 }
2836
2837 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_eq_8_subtile_n) {
2838 TEST_REQUIRES_X86_SSE41;
2839 for (uint32_t n = 1; n <= 4; n++) {
2840 GemmMicrokernelTester()
2841 .mr(3)
2842 .nr(4)
2843 .kr(2)
2844 .sr(1)
2845 .m(3)
2846 .n(n)
2847 .k(8)
2848 .iterations(1)
2849 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
2850 }
2851 }
2852
2853 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_lt_8) {
2854 TEST_REQUIRES_X86_SSE41;
2855 for (size_t k = 1; k < 8; k++) {
2856 GemmMicrokernelTester()
2857 .mr(3)
2858 .nr(4)
2859 .kr(2)
2860 .sr(1)
2861 .m(3)
2862 .n(4)
2863 .k(k)
2864 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
2865 }
2866 }
2867
2868 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_lt_8_strided_a) {
2869 TEST_REQUIRES_X86_SSE41;
2870 for (size_t k = 1; k < 8; k++) {
2871 GemmMicrokernelTester()
2872 .mr(3)
2873 .nr(4)
2874 .kr(2)
2875 .sr(1)
2876 .m(3)
2877 .n(4)
2878 .k(k)
2879 .a_stride(11)
2880 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
2881 }
2882 }
2883
2884 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_lt_8_subtile) {
2885 TEST_REQUIRES_X86_SSE41;
2886 for (size_t k = 1; k < 8; k++) {
2887 for (uint32_t m = 1; m <= 3; m++) {
2888 for (uint32_t n = 1; n <= 4; n++) {
2889 GemmMicrokernelTester()
2890 .mr(3)
2891 .nr(4)
2892 .kr(2)
2893 .sr(1)
2894 .m(m)
2895 .n(n)
2896 .k(k)
2897 .iterations(1)
2898 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
2899 }
2900 }
2901 }
2902 }
2903
2904 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_gt_8) {
2905 TEST_REQUIRES_X86_SSE41;
2906 for (size_t k = 9; k < 16; k++) {
2907 GemmMicrokernelTester()
2908 .mr(3)
2909 .nr(4)
2910 .kr(2)
2911 .sr(1)
2912 .m(3)
2913 .n(4)
2914 .k(k)
2915 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
2916 }
2917 }
2918
2919 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_gt_8_strided_a) {
2920 TEST_REQUIRES_X86_SSE41;
2921 for (size_t k = 9; k < 16; k++) {
2922 GemmMicrokernelTester()
2923 .mr(3)
2924 .nr(4)
2925 .kr(2)
2926 .sr(1)
2927 .m(3)
2928 .n(4)
2929 .k(k)
2930 .a_stride(19)
2931 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
2932 }
2933 }
2934
2935 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_gt_8_subtile) {
2936 TEST_REQUIRES_X86_SSE41;
2937 for (size_t k = 9; k < 16; k++) {
2938 for (uint32_t m = 1; m <= 3; m++) {
2939 for (uint32_t n = 1; n <= 4; n++) {
2940 GemmMicrokernelTester()
2941 .mr(3)
2942 .nr(4)
2943 .kr(2)
2944 .sr(1)
2945 .m(m)
2946 .n(n)
2947 .k(k)
2948 .iterations(1)
2949 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
2950 }
2951 }
2952 }
2953 }
2954
2955 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_div_8) {
2956 TEST_REQUIRES_X86_SSE41;
2957 for (size_t k = 16; k <= 80; k += 8) {
2958 GemmMicrokernelTester()
2959 .mr(3)
2960 .nr(4)
2961 .kr(2)
2962 .sr(1)
2963 .m(3)
2964 .n(4)
2965 .k(k)
2966 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
2967 }
2968 }
2969
2970 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_div_8_strided_a) {
2971 TEST_REQUIRES_X86_SSE41;
2972 for (size_t k = 16; k <= 80; k += 8) {
2973 GemmMicrokernelTester()
2974 .mr(3)
2975 .nr(4)
2976 .kr(2)
2977 .sr(1)
2978 .m(3)
2979 .n(4)
2980 .k(k)
2981 .a_stride(83)
2982 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
2983 }
2984 }
2985
2986 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_div_8_subtile) {
2987 TEST_REQUIRES_X86_SSE41;
2988 for (size_t k = 16; k <= 80; k += 8) {
2989 for (uint32_t m = 1; m <= 3; m++) {
2990 for (uint32_t n = 1; n <= 4; n++) {
2991 GemmMicrokernelTester()
2992 .mr(3)
2993 .nr(4)
2994 .kr(2)
2995 .sr(1)
2996 .m(m)
2997 .n(n)
2998 .k(k)
2999 .iterations(1)
3000 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
3001 }
3002 }
3003 }
3004 }
3005
3006 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, n_gt_4) {
3007 TEST_REQUIRES_X86_SSE41;
3008 for (uint32_t n = 5; n < 8; n++) {
3009 for (size_t k = 1; k <= 40; k += 9) {
3010 GemmMicrokernelTester()
3011 .mr(3)
3012 .nr(4)
3013 .kr(2)
3014 .sr(1)
3015 .m(3)
3016 .n(4)
3017 .k(k)
3018 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
3019 }
3020 }
3021 }
3022
3023 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, n_gt_4_strided_cn) {
3024 TEST_REQUIRES_X86_SSE41;
3025 for (uint32_t n = 5; n < 8; n++) {
3026 for (size_t k = 1; k <= 40; k += 9) {
3027 GemmMicrokernelTester()
3028 .mr(3)
3029 .nr(4)
3030 .kr(2)
3031 .sr(1)
3032 .m(3)
3033 .n(4)
3034 .k(k)
3035 .cn_stride(7)
3036 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
3037 }
3038 }
3039 }
3040
3041 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, n_gt_4_strided_a) {
3042 TEST_REQUIRES_X86_SSE41;
3043 for (uint32_t n = 5; n < 8; n++) {
3044 for (size_t k = 1; k <= 40; k += 9) {
3045 GemmMicrokernelTester()
3046 .mr(3)
3047 .nr(4)
3048 .kr(2)
3049 .sr(1)
3050 .m(3)
3051 .n(n)
3052 .k(k)
3053 .a_stride(43)
3054 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
3055 }
3056 }
3057 }
3058
3059 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, n_gt_4_subtile) {
3060 TEST_REQUIRES_X86_SSE41;
3061 for (uint32_t n = 5; n < 8; n++) {
3062 for (size_t k = 1; k <= 40; k += 9) {
3063 for (uint32_t m = 1; m <= 3; m++) {
3064 GemmMicrokernelTester()
3065 .mr(3)
3066 .nr(4)
3067 .kr(2)
3068 .sr(1)
3069 .m(m)
3070 .n(n)
3071 .k(k)
3072 .iterations(1)
3073 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
3074 }
3075 }
3076 }
3077 }
3078
3079 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, n_div_4) {
3080 TEST_REQUIRES_X86_SSE41;
3081 for (uint32_t n = 8; n <= 12; n += 4) {
3082 for (size_t k = 1; k <= 40; k += 9) {
3083 GemmMicrokernelTester()
3084 .mr(3)
3085 .nr(4)
3086 .kr(2)
3087 .sr(1)
3088 .m(3)
3089 .n(4)
3090 .k(k)
3091 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
3092 }
3093 }
3094 }
3095
3096 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, n_div_4_strided_cn) {
3097 TEST_REQUIRES_X86_SSE41;
3098 for (uint32_t n = 8; n <= 12; n += 4) {
3099 for (size_t k = 1; k <= 40; k += 9) {
3100 GemmMicrokernelTester()
3101 .mr(3)
3102 .nr(4)
3103 .kr(2)
3104 .sr(1)
3105 .m(3)
3106 .n(n)
3107 .k(k)
3108 .cn_stride(7)
3109 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
3110 }
3111 }
3112 }
3113
3114 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, n_div_4_strided_a) {
3115 TEST_REQUIRES_X86_SSE41;
3116 for (uint32_t n = 8; n <= 12; n += 4) {
3117 for (size_t k = 1; k <= 40; k += 9) {
3118 GemmMicrokernelTester()
3119 .mr(3)
3120 .nr(4)
3121 .kr(2)
3122 .sr(1)
3123 .m(3)
3124 .n(n)
3125 .k(k)
3126 .a_stride(43)
3127 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
3128 }
3129 }
3130 }
3131
3132 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, n_div_4_subtile) {
3133 TEST_REQUIRES_X86_SSE41;
3134 for (uint32_t n = 8; n <= 12; n += 4) {
3135 for (size_t k = 1; k <= 40; k += 9) {
3136 for (uint32_t m = 1; m <= 3; m++) {
3137 GemmMicrokernelTester()
3138 .mr(3)
3139 .nr(4)
3140 .kr(2)
3141 .sr(1)
3142 .m(m)
3143 .n(n)
3144 .k(k)
3145 .iterations(1)
3146 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
3147 }
3148 }
3149 }
3150 }
3151
3152 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, strided_cm_subtile) {
3153 TEST_REQUIRES_X86_SSE41;
3154 for (size_t k = 1; k <= 40; k += 9) {
3155 for (uint32_t m = 1; m <= 3; m++) {
3156 for (uint32_t n = 1; n <= 4; n++) {
3157 GemmMicrokernelTester()
3158 .mr(3)
3159 .nr(4)
3160 .kr(2)
3161 .sr(1)
3162 .m(m)
3163 .n(n)
3164 .k(k)
3165 .cm_stride(7)
3166 .iterations(1)
3167 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
3168 }
3169 }
3170 }
3171 }
3172
3173 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, qmin) {
3174 TEST_REQUIRES_X86_SSE41;
3175 GemmMicrokernelTester()
3176 .mr(3)
3177 .nr(4)
3178 .kr(2)
3179 .sr(1)
3180 .m(3)
3181 .n(4)
3182 .k(8)
3183 .qmin(128)
3184 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
3185 }
3186
3187 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, qmax) {
3188 TEST_REQUIRES_X86_SSE41;
3189 GemmMicrokernelTester()
3190 .mr(3)
3191 .nr(4)
3192 .kr(2)
3193 .sr(1)
3194 .m(3)
3195 .n(4)
3196 .k(8)
3197 .qmax(128)
3198 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
3199 }
3200
3201 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, strided_cm) {
3202 TEST_REQUIRES_X86_SSE41;
3203 GemmMicrokernelTester()
3204 .mr(3)
3205 .nr(4)
3206 .kr(2)
3207 .sr(1)
3208 .m(3)
3209 .n(4)
3210 .k(8)
3211 .cm_stride(7)
3212 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
3213 }
3214#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
3215
3216
3217#if XNN_ARCH_X86 || XNN_ARCH_X86_64
3218 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_eq_8) {
3219 TEST_REQUIRES_X86_SSE41;
3220 GemmMicrokernelTester()
3221 .mr(4)
3222 .nr(4)
3223 .kr(2)
3224 .sr(1)
3225 .m(4)
3226 .n(4)
3227 .k(8)
3228 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
3229 }
3230
3231 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, strided_cn) {
3232 TEST_REQUIRES_X86_SSE41;
3233 GemmMicrokernelTester()
3234 .mr(4)
3235 .nr(4)
3236 .kr(2)
3237 .sr(1)
3238 .m(4)
3239 .n(4)
3240 .k(8)
3241 .cn_stride(7)
3242 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
3243 }
3244
3245 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_eq_8_strided_a) {
3246 TEST_REQUIRES_X86_SSE41;
3247 GemmMicrokernelTester()
3248 .mr(4)
3249 .nr(4)
3250 .kr(2)
3251 .sr(1)
3252 .m(4)
3253 .n(4)
3254 .k(8)
3255 .a_stride(11)
3256 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
3257 }
3258
3259 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_eq_8_subtile) {
3260 TEST_REQUIRES_X86_SSE41;
3261 for (uint32_t m = 1; m <= 4; m++) {
3262 for (uint32_t n = 1; n <= 4; n++) {
3263 GemmMicrokernelTester()
3264 .mr(4)
3265 .nr(4)
3266 .kr(2)
3267 .sr(1)
3268 .m(m)
3269 .n(n)
3270 .k(8)
3271 .iterations(1)
3272 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
3273 }
3274 }
3275 }
3276
3277 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_eq_8_subtile_m) {
3278 TEST_REQUIRES_X86_SSE41;
3279 for (uint32_t m = 1; m <= 4; m++) {
3280 GemmMicrokernelTester()
3281 .mr(4)
3282 .nr(4)
3283 .kr(2)
3284 .sr(1)
3285 .m(m)
3286 .n(4)
3287 .k(8)
3288 .iterations(1)
3289 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
3290 }
3291 }
3292
3293 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_eq_8_subtile_n) {
3294 TEST_REQUIRES_X86_SSE41;
3295 for (uint32_t n = 1; n <= 4; n++) {
3296 GemmMicrokernelTester()
3297 .mr(4)
3298 .nr(4)
3299 .kr(2)
3300 .sr(1)
3301 .m(4)
3302 .n(n)
3303 .k(8)
3304 .iterations(1)
3305 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
3306 }
3307 }
3308
3309 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_lt_8) {
3310 TEST_REQUIRES_X86_SSE41;
3311 for (size_t k = 1; k < 8; k++) {
3312 GemmMicrokernelTester()
3313 .mr(4)
3314 .nr(4)
3315 .kr(2)
3316 .sr(1)
3317 .m(4)
3318 .n(4)
3319 .k(k)
3320 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
3321 }
3322 }
3323
3324 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_lt_8_strided_a) {
3325 TEST_REQUIRES_X86_SSE41;
3326 for (size_t k = 1; k < 8; k++) {
3327 GemmMicrokernelTester()
3328 .mr(4)
3329 .nr(4)
3330 .kr(2)
3331 .sr(1)
3332 .m(4)
3333 .n(4)
3334 .k(k)
3335 .a_stride(11)
3336 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
3337 }
3338 }
3339
3340 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_lt_8_subtile) {
3341 TEST_REQUIRES_X86_SSE41;
3342 for (size_t k = 1; k < 8; k++) {
3343 for (uint32_t m = 1; m <= 4; m++) {
3344 for (uint32_t n = 1; n <= 4; n++) {
3345 GemmMicrokernelTester()
3346 .mr(4)
3347 .nr(4)
3348 .kr(2)
3349 .sr(1)
3350 .m(m)
3351 .n(n)
3352 .k(k)
3353 .iterations(1)
3354 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
3355 }
3356 }
3357 }
3358 }
3359
3360 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_gt_8) {
3361 TEST_REQUIRES_X86_SSE41;
3362 for (size_t k = 9; k < 16; k++) {
3363 GemmMicrokernelTester()
3364 .mr(4)
3365 .nr(4)
3366 .kr(2)
3367 .sr(1)
3368 .m(4)
3369 .n(4)
3370 .k(k)
3371 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
3372 }
3373 }
3374
3375 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_gt_8_strided_a) {
3376 TEST_REQUIRES_X86_SSE41;
3377 for (size_t k = 9; k < 16; k++) {
3378 GemmMicrokernelTester()
3379 .mr(4)
3380 .nr(4)
3381 .kr(2)
3382 .sr(1)
3383 .m(4)
3384 .n(4)
3385 .k(k)
3386 .a_stride(19)
3387 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
3388 }
3389 }
3390
3391 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_gt_8_subtile) {
3392 TEST_REQUIRES_X86_SSE41;
3393 for (size_t k = 9; k < 16; k++) {
3394 for (uint32_t m = 1; m <= 4; m++) {
3395 for (uint32_t n = 1; n <= 4; n++) {
3396 GemmMicrokernelTester()
3397 .mr(4)
3398 .nr(4)
3399 .kr(2)
3400 .sr(1)
3401 .m(m)
3402 .n(n)
3403 .k(k)
3404 .iterations(1)
3405 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
3406 }
3407 }
3408 }
3409 }
3410
3411 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_div_8) {
3412 TEST_REQUIRES_X86_SSE41;
3413 for (size_t k = 16; k <= 80; k += 8) {
3414 GemmMicrokernelTester()
3415 .mr(4)
3416 .nr(4)
3417 .kr(2)
3418 .sr(1)
3419 .m(4)
3420 .n(4)
3421 .k(k)
3422 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
3423 }
3424 }
3425
3426 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_div_8_strided_a) {
3427 TEST_REQUIRES_X86_SSE41;
3428 for (size_t k = 16; k <= 80; k += 8) {
3429 GemmMicrokernelTester()
3430 .mr(4)
3431 .nr(4)
3432 .kr(2)
3433 .sr(1)
3434 .m(4)
3435 .n(4)
3436 .k(k)
3437 .a_stride(83)
3438 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
3439 }
3440 }
3441
3442 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_div_8_subtile) {
3443 TEST_REQUIRES_X86_SSE41;
3444 for (size_t k = 16; k <= 80; k += 8) {
3445 for (uint32_t m = 1; m <= 4; m++) {
3446 for (uint32_t n = 1; n <= 4; n++) {
3447 GemmMicrokernelTester()
3448 .mr(4)
3449 .nr(4)
3450 .kr(2)
3451 .sr(1)
3452 .m(m)
3453 .n(n)
3454 .k(k)
3455 .iterations(1)
3456 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
3457 }
3458 }
3459 }
3460 }
3461
3462 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, n_gt_4) {
3463 TEST_REQUIRES_X86_SSE41;
3464 for (uint32_t n = 5; n < 8; n++) {
3465 for (size_t k = 1; k <= 40; k += 9) {
3466 GemmMicrokernelTester()
3467 .mr(4)
3468 .nr(4)
3469 .kr(2)
3470 .sr(1)
3471 .m(4)
3472 .n(4)
3473 .k(k)
3474 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
3475 }
3476 }
3477 }
3478
3479 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, n_gt_4_strided_cn) {
3480 TEST_REQUIRES_X86_SSE41;
3481 for (uint32_t n = 5; n < 8; n++) {
3482 for (size_t k = 1; k <= 40; k += 9) {
3483 GemmMicrokernelTester()
3484 .mr(4)
3485 .nr(4)
3486 .kr(2)
3487 .sr(1)
3488 .m(4)
3489 .n(4)
3490 .k(k)
3491 .cn_stride(7)
3492 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
3493 }
3494 }
3495 }
3496
3497 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, n_gt_4_strided_a) {
3498 TEST_REQUIRES_X86_SSE41;
3499 for (uint32_t n = 5; n < 8; n++) {
3500 for (size_t k = 1; k <= 40; k += 9) {
3501 GemmMicrokernelTester()
3502 .mr(4)
3503 .nr(4)
3504 .kr(2)
3505 .sr(1)
3506 .m(4)
3507 .n(n)
3508 .k(k)
3509 .a_stride(43)
3510 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
3511 }
3512 }
3513 }
3514
3515 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, n_gt_4_subtile) {
3516 TEST_REQUIRES_X86_SSE41;
3517 for (uint32_t n = 5; n < 8; n++) {
3518 for (size_t k = 1; k <= 40; k += 9) {
3519 for (uint32_t m = 1; m <= 4; m++) {
3520 GemmMicrokernelTester()
3521 .mr(4)
3522 .nr(4)
3523 .kr(2)
3524 .sr(1)
3525 .m(m)
3526 .n(n)
3527 .k(k)
3528 .iterations(1)
3529 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
3530 }
3531 }
3532 }
3533 }
3534
3535 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, n_div_4) {
3536 TEST_REQUIRES_X86_SSE41;
3537 for (uint32_t n = 8; n <= 12; n += 4) {
3538 for (size_t k = 1; k <= 40; k += 9) {
3539 GemmMicrokernelTester()
3540 .mr(4)
3541 .nr(4)
3542 .kr(2)
3543 .sr(1)
3544 .m(4)
3545 .n(4)
3546 .k(k)
3547 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
3548 }
3549 }
3550 }
3551
3552 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, n_div_4_strided_cn) {
3553 TEST_REQUIRES_X86_SSE41;
3554 for (uint32_t n = 8; n <= 12; n += 4) {
3555 for (size_t k = 1; k <= 40; k += 9) {
3556 GemmMicrokernelTester()
3557 .mr(4)
3558 .nr(4)
3559 .kr(2)
3560 .sr(1)
3561 .m(4)
3562 .n(n)
3563 .k(k)
3564 .cn_stride(7)
3565 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
3566 }
3567 }
3568 }
3569
3570 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, n_div_4_strided_a) {
3571 TEST_REQUIRES_X86_SSE41;
3572 for (uint32_t n = 8; n <= 12; n += 4) {
3573 for (size_t k = 1; k <= 40; k += 9) {
3574 GemmMicrokernelTester()
3575 .mr(4)
3576 .nr(4)
3577 .kr(2)
3578 .sr(1)
3579 .m(4)
3580 .n(n)
3581 .k(k)
3582 .a_stride(43)
3583 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
3584 }
3585 }
3586 }
3587
3588 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, n_div_4_subtile) {
3589 TEST_REQUIRES_X86_SSE41;
3590 for (uint32_t n = 8; n <= 12; n += 4) {
3591 for (size_t k = 1; k <= 40; k += 9) {
3592 for (uint32_t m = 1; m <= 4; m++) {
3593 GemmMicrokernelTester()
3594 .mr(4)
3595 .nr(4)
3596 .kr(2)
3597 .sr(1)
3598 .m(m)
3599 .n(n)
3600 .k(k)
3601 .iterations(1)
3602 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
3603 }
3604 }
3605 }
3606 }
3607
3608 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, strided_cm_subtile) {
3609 TEST_REQUIRES_X86_SSE41;
3610 for (size_t k = 1; k <= 40; k += 9) {
3611 for (uint32_t m = 1; m <= 4; m++) {
3612 for (uint32_t n = 1; n <= 4; n++) {
3613 GemmMicrokernelTester()
3614 .mr(4)
3615 .nr(4)
3616 .kr(2)
3617 .sr(1)
3618 .m(m)
3619 .n(n)
3620 .k(k)
3621 .cm_stride(7)
3622 .iterations(1)
3623 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
3624 }
3625 }
3626 }
3627 }
3628
3629 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, qmin) {
3630 TEST_REQUIRES_X86_SSE41;
3631 GemmMicrokernelTester()
3632 .mr(4)
3633 .nr(4)
3634 .kr(2)
3635 .sr(1)
3636 .m(4)
3637 .n(4)
3638 .k(8)
3639 .qmin(128)
3640 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
3641 }
3642
3643 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, qmax) {
3644 TEST_REQUIRES_X86_SSE41;
3645 GemmMicrokernelTester()
3646 .mr(4)
3647 .nr(4)
3648 .kr(2)
3649 .sr(1)
3650 .m(4)
3651 .n(4)
3652 .k(8)
3653 .qmax(128)
3654 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
3655 }
3656
3657 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, strided_cm) {
3658 TEST_REQUIRES_X86_SSE41;
3659 GemmMicrokernelTester()
3660 .mr(4)
3661 .nr(4)
3662 .kr(2)
3663 .sr(1)
3664 .m(4)
3665 .n(4)
3666 .k(8)
3667 .cm_stride(7)
3668 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
3669 }
3670#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
3671
3672
3673#if XNN_ARCH_X86 || XNN_ARCH_X86_64
3674 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_eq_8) {
3675 TEST_REQUIRES_X86_AVX;
3676 GemmMicrokernelTester()
3677 .mr(1)
3678 .nr(4)
3679 .kr(2)
3680 .sr(1)
3681 .m(1)
3682 .n(4)
3683 .k(8)
3684 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
3685 }
3686
3687 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, strided_cn) {
3688 TEST_REQUIRES_X86_AVX;
3689 GemmMicrokernelTester()
3690 .mr(1)
3691 .nr(4)
3692 .kr(2)
3693 .sr(1)
3694 .m(1)
3695 .n(4)
3696 .k(8)
3697 .cn_stride(7)
3698 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
3699 }
3700
3701 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_eq_8_strided_a) {
3702 TEST_REQUIRES_X86_AVX;
3703 GemmMicrokernelTester()
3704 .mr(1)
3705 .nr(4)
3706 .kr(2)
3707 .sr(1)
3708 .m(1)
3709 .n(4)
3710 .k(8)
3711 .a_stride(11)
3712 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
3713 }
3714
3715 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_eq_8_subtile) {
3716 TEST_REQUIRES_X86_AVX;
3717 for (uint32_t m = 1; m <= 1; m++) {
3718 for (uint32_t n = 1; n <= 4; n++) {
3719 GemmMicrokernelTester()
3720 .mr(1)
3721 .nr(4)
3722 .kr(2)
3723 .sr(1)
3724 .m(m)
3725 .n(n)
3726 .k(8)
3727 .iterations(1)
3728 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
3729 }
3730 }
3731 }
3732
3733 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_eq_8_subtile_m) {
3734 TEST_REQUIRES_X86_AVX;
3735 for (uint32_t m = 1; m <= 1; m++) {
3736 GemmMicrokernelTester()
3737 .mr(1)
3738 .nr(4)
3739 .kr(2)
3740 .sr(1)
3741 .m(m)
3742 .n(4)
3743 .k(8)
3744 .iterations(1)
3745 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
3746 }
3747 }
3748
3749 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_eq_8_subtile_n) {
3750 TEST_REQUIRES_X86_AVX;
3751 for (uint32_t n = 1; n <= 4; n++) {
3752 GemmMicrokernelTester()
3753 .mr(1)
3754 .nr(4)
3755 .kr(2)
3756 .sr(1)
3757 .m(1)
3758 .n(n)
3759 .k(8)
3760 .iterations(1)
3761 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
3762 }
3763 }
3764
3765 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_lt_8) {
3766 TEST_REQUIRES_X86_AVX;
3767 for (size_t k = 1; k < 8; k++) {
3768 GemmMicrokernelTester()
3769 .mr(1)
3770 .nr(4)
3771 .kr(2)
3772 .sr(1)
3773 .m(1)
3774 .n(4)
3775 .k(k)
3776 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
3777 }
3778 }
3779
3780 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_lt_8_strided_a) {
3781 TEST_REQUIRES_X86_AVX;
3782 for (size_t k = 1; k < 8; k++) {
3783 GemmMicrokernelTester()
3784 .mr(1)
3785 .nr(4)
3786 .kr(2)
3787 .sr(1)
3788 .m(1)
3789 .n(4)
3790 .k(k)
3791 .a_stride(11)
3792 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
3793 }
3794 }
3795
3796 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_lt_8_subtile) {
3797 TEST_REQUIRES_X86_AVX;
3798 for (size_t k = 1; k < 8; k++) {
3799 for (uint32_t m = 1; m <= 1; m++) {
3800 for (uint32_t n = 1; n <= 4; n++) {
3801 GemmMicrokernelTester()
3802 .mr(1)
3803 .nr(4)
3804 .kr(2)
3805 .sr(1)
3806 .m(m)
3807 .n(n)
3808 .k(k)
3809 .iterations(1)
3810 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
3811 }
3812 }
3813 }
3814 }
3815
3816 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_gt_8) {
3817 TEST_REQUIRES_X86_AVX;
3818 for (size_t k = 9; k < 16; k++) {
3819 GemmMicrokernelTester()
3820 .mr(1)
3821 .nr(4)
3822 .kr(2)
3823 .sr(1)
3824 .m(1)
3825 .n(4)
3826 .k(k)
3827 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
3828 }
3829 }
3830
3831 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_gt_8_strided_a) {
3832 TEST_REQUIRES_X86_AVX;
3833 for (size_t k = 9; k < 16; k++) {
3834 GemmMicrokernelTester()
3835 .mr(1)
3836 .nr(4)
3837 .kr(2)
3838 .sr(1)
3839 .m(1)
3840 .n(4)
3841 .k(k)
3842 .a_stride(19)
3843 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
3844 }
3845 }
3846
3847 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_gt_8_subtile) {
3848 TEST_REQUIRES_X86_AVX;
3849 for (size_t k = 9; k < 16; k++) {
3850 for (uint32_t m = 1; m <= 1; m++) {
3851 for (uint32_t n = 1; n <= 4; n++) {
3852 GemmMicrokernelTester()
3853 .mr(1)
3854 .nr(4)
3855 .kr(2)
3856 .sr(1)
3857 .m(m)
3858 .n(n)
3859 .k(k)
3860 .iterations(1)
3861 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
3862 }
3863 }
3864 }
3865 }
3866
3867 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_div_8) {
3868 TEST_REQUIRES_X86_AVX;
3869 for (size_t k = 16; k <= 80; k += 8) {
3870 GemmMicrokernelTester()
3871 .mr(1)
3872 .nr(4)
3873 .kr(2)
3874 .sr(1)
3875 .m(1)
3876 .n(4)
3877 .k(k)
3878 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
3879 }
3880 }
3881
3882 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_div_8_strided_a) {
3883 TEST_REQUIRES_X86_AVX;
3884 for (size_t k = 16; k <= 80; k += 8) {
3885 GemmMicrokernelTester()
3886 .mr(1)
3887 .nr(4)
3888 .kr(2)
3889 .sr(1)
3890 .m(1)
3891 .n(4)
3892 .k(k)
3893 .a_stride(83)
3894 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
3895 }
3896 }
3897
3898 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_div_8_subtile) {
3899 TEST_REQUIRES_X86_AVX;
3900 for (size_t k = 16; k <= 80; k += 8) {
3901 for (uint32_t m = 1; m <= 1; m++) {
3902 for (uint32_t n = 1; n <= 4; n++) {
3903 GemmMicrokernelTester()
3904 .mr(1)
3905 .nr(4)
3906 .kr(2)
3907 .sr(1)
3908 .m(m)
3909 .n(n)
3910 .k(k)
3911 .iterations(1)
3912 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
3913 }
3914 }
3915 }
3916 }
3917
3918 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, n_gt_4) {
3919 TEST_REQUIRES_X86_AVX;
3920 for (uint32_t n = 5; n < 8; n++) {
3921 for (size_t k = 1; k <= 40; k += 9) {
3922 GemmMicrokernelTester()
3923 .mr(1)
3924 .nr(4)
3925 .kr(2)
3926 .sr(1)
3927 .m(1)
3928 .n(4)
3929 .k(k)
3930 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
3931 }
3932 }
3933 }
3934
3935 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, n_gt_4_strided_cn) {
3936 TEST_REQUIRES_X86_AVX;
3937 for (uint32_t n = 5; n < 8; n++) {
3938 for (size_t k = 1; k <= 40; k += 9) {
3939 GemmMicrokernelTester()
3940 .mr(1)
3941 .nr(4)
3942 .kr(2)
3943 .sr(1)
3944 .m(1)
3945 .n(4)
3946 .k(k)
3947 .cn_stride(7)
3948 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
3949 }
3950 }
3951 }
3952
3953 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, n_gt_4_strided_a) {
3954 TEST_REQUIRES_X86_AVX;
3955 for (uint32_t n = 5; n < 8; n++) {
3956 for (size_t k = 1; k <= 40; k += 9) {
3957 GemmMicrokernelTester()
3958 .mr(1)
3959 .nr(4)
3960 .kr(2)
3961 .sr(1)
3962 .m(1)
3963 .n(n)
3964 .k(k)
3965 .a_stride(43)
3966 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
3967 }
3968 }
3969 }
3970
3971 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, n_gt_4_subtile) {
3972 TEST_REQUIRES_X86_AVX;
3973 for (uint32_t n = 5; n < 8; n++) {
3974 for (size_t k = 1; k <= 40; k += 9) {
3975 for (uint32_t m = 1; m <= 1; m++) {
3976 GemmMicrokernelTester()
3977 .mr(1)
3978 .nr(4)
3979 .kr(2)
3980 .sr(1)
3981 .m(m)
3982 .n(n)
3983 .k(k)
3984 .iterations(1)
3985 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
3986 }
3987 }
3988 }
3989 }
3990
3991 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, n_div_4) {
3992 TEST_REQUIRES_X86_AVX;
3993 for (uint32_t n = 8; n <= 12; n += 4) {
3994 for (size_t k = 1; k <= 40; k += 9) {
3995 GemmMicrokernelTester()
3996 .mr(1)
3997 .nr(4)
3998 .kr(2)
3999 .sr(1)
4000 .m(1)
4001 .n(4)
4002 .k(k)
4003 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
4004 }
4005 }
4006 }
4007
4008 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, n_div_4_strided_cn) {
4009 TEST_REQUIRES_X86_AVX;
4010 for (uint32_t n = 8; n <= 12; n += 4) {
4011 for (size_t k = 1; k <= 40; k += 9) {
4012 GemmMicrokernelTester()
4013 .mr(1)
4014 .nr(4)
4015 .kr(2)
4016 .sr(1)
4017 .m(1)
4018 .n(n)
4019 .k(k)
4020 .cn_stride(7)
4021 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
4022 }
4023 }
4024 }
4025
4026 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, n_div_4_strided_a) {
4027 TEST_REQUIRES_X86_AVX;
4028 for (uint32_t n = 8; n <= 12; n += 4) {
4029 for (size_t k = 1; k <= 40; k += 9) {
4030 GemmMicrokernelTester()
4031 .mr(1)
4032 .nr(4)
4033 .kr(2)
4034 .sr(1)
4035 .m(1)
4036 .n(n)
4037 .k(k)
4038 .a_stride(43)
4039 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
4040 }
4041 }
4042 }
4043
4044 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, n_div_4_subtile) {
4045 TEST_REQUIRES_X86_AVX;
4046 for (uint32_t n = 8; n <= 12; n += 4) {
4047 for (size_t k = 1; k <= 40; k += 9) {
4048 for (uint32_t m = 1; m <= 1; m++) {
4049 GemmMicrokernelTester()
4050 .mr(1)
4051 .nr(4)
4052 .kr(2)
4053 .sr(1)
4054 .m(m)
4055 .n(n)
4056 .k(k)
4057 .iterations(1)
4058 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
4059 }
4060 }
4061 }
4062 }
4063
4064 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, strided_cm_subtile) {
4065 TEST_REQUIRES_X86_AVX;
4066 for (size_t k = 1; k <= 40; k += 9) {
4067 for (uint32_t m = 1; m <= 1; m++) {
4068 for (uint32_t n = 1; n <= 4; n++) {
4069 GemmMicrokernelTester()
4070 .mr(1)
4071 .nr(4)
4072 .kr(2)
4073 .sr(1)
4074 .m(m)
4075 .n(n)
4076 .k(k)
4077 .cm_stride(7)
4078 .iterations(1)
4079 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
4080 }
4081 }
4082 }
4083 }
4084
4085 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, qmin) {
4086 TEST_REQUIRES_X86_AVX;
4087 GemmMicrokernelTester()
4088 .mr(1)
4089 .nr(4)
4090 .kr(2)
4091 .sr(1)
4092 .m(1)
4093 .n(4)
4094 .k(8)
4095 .qmin(128)
4096 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
4097 }
4098
4099 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, qmax) {
4100 TEST_REQUIRES_X86_AVX;
4101 GemmMicrokernelTester()
4102 .mr(1)
4103 .nr(4)
4104 .kr(2)
4105 .sr(1)
4106 .m(1)
4107 .n(4)
4108 .k(8)
4109 .qmax(128)
4110 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
4111 }
4112
4113 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, strided_cm) {
4114 TEST_REQUIRES_X86_AVX;
4115 GemmMicrokernelTester()
4116 .mr(1)
4117 .nr(4)
4118 .kr(2)
4119 .sr(1)
4120 .m(1)
4121 .n(4)
4122 .k(8)
4123 .cm_stride(7)
4124 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
4125 }
4126#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
4127
4128
4129#if XNN_ARCH_X86 || XNN_ARCH_X86_64
4130 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_eq_8) {
4131 TEST_REQUIRES_X86_AVX;
4132 GemmMicrokernelTester()
4133 .mr(2)
4134 .nr(4)
4135 .kr(2)
4136 .sr(1)
4137 .m(2)
4138 .n(4)
4139 .k(8)
4140 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
4141 }
4142
4143 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, strided_cn) {
4144 TEST_REQUIRES_X86_AVX;
4145 GemmMicrokernelTester()
4146 .mr(2)
4147 .nr(4)
4148 .kr(2)
4149 .sr(1)
4150 .m(2)
4151 .n(4)
4152 .k(8)
4153 .cn_stride(7)
4154 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
4155 }
4156
4157 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_eq_8_strided_a) {
4158 TEST_REQUIRES_X86_AVX;
4159 GemmMicrokernelTester()
4160 .mr(2)
4161 .nr(4)
4162 .kr(2)
4163 .sr(1)
4164 .m(2)
4165 .n(4)
4166 .k(8)
4167 .a_stride(11)
4168 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
4169 }
4170
4171 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_eq_8_subtile) {
4172 TEST_REQUIRES_X86_AVX;
4173 for (uint32_t m = 1; m <= 2; m++) {
4174 for (uint32_t n = 1; n <= 4; n++) {
4175 GemmMicrokernelTester()
4176 .mr(2)
4177 .nr(4)
4178 .kr(2)
4179 .sr(1)
4180 .m(m)
4181 .n(n)
4182 .k(8)
4183 .iterations(1)
4184 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
4185 }
4186 }
4187 }
4188
4189 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_eq_8_subtile_m) {
4190 TEST_REQUIRES_X86_AVX;
4191 for (uint32_t m = 1; m <= 2; m++) {
4192 GemmMicrokernelTester()
4193 .mr(2)
4194 .nr(4)
4195 .kr(2)
4196 .sr(1)
4197 .m(m)
4198 .n(4)
4199 .k(8)
4200 .iterations(1)
4201 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
4202 }
4203 }
4204
4205 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_eq_8_subtile_n) {
4206 TEST_REQUIRES_X86_AVX;
4207 for (uint32_t n = 1; n <= 4; n++) {
4208 GemmMicrokernelTester()
4209 .mr(2)
4210 .nr(4)
4211 .kr(2)
4212 .sr(1)
4213 .m(2)
4214 .n(n)
4215 .k(8)
4216 .iterations(1)
4217 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
4218 }
4219 }
4220
4221 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_lt_8) {
4222 TEST_REQUIRES_X86_AVX;
4223 for (size_t k = 1; k < 8; k++) {
4224 GemmMicrokernelTester()
4225 .mr(2)
4226 .nr(4)
4227 .kr(2)
4228 .sr(1)
4229 .m(2)
4230 .n(4)
4231 .k(k)
4232 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
4233 }
4234 }
4235
4236 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_lt_8_strided_a) {
4237 TEST_REQUIRES_X86_AVX;
4238 for (size_t k = 1; k < 8; k++) {
4239 GemmMicrokernelTester()
4240 .mr(2)
4241 .nr(4)
4242 .kr(2)
4243 .sr(1)
4244 .m(2)
4245 .n(4)
4246 .k(k)
4247 .a_stride(11)
4248 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
4249 }
4250 }
4251
4252 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_lt_8_subtile) {
4253 TEST_REQUIRES_X86_AVX;
4254 for (size_t k = 1; k < 8; k++) {
4255 for (uint32_t m = 1; m <= 2; m++) {
4256 for (uint32_t n = 1; n <= 4; n++) {
4257 GemmMicrokernelTester()
4258 .mr(2)
4259 .nr(4)
4260 .kr(2)
4261 .sr(1)
4262 .m(m)
4263 .n(n)
4264 .k(k)
4265 .iterations(1)
4266 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
4267 }
4268 }
4269 }
4270 }
4271
4272 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_gt_8) {
4273 TEST_REQUIRES_X86_AVX;
4274 for (size_t k = 9; k < 16; k++) {
4275 GemmMicrokernelTester()
4276 .mr(2)
4277 .nr(4)
4278 .kr(2)
4279 .sr(1)
4280 .m(2)
4281 .n(4)
4282 .k(k)
4283 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
4284 }
4285 }
4286
4287 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_gt_8_strided_a) {
4288 TEST_REQUIRES_X86_AVX;
4289 for (size_t k = 9; k < 16; k++) {
4290 GemmMicrokernelTester()
4291 .mr(2)
4292 .nr(4)
4293 .kr(2)
4294 .sr(1)
4295 .m(2)
4296 .n(4)
4297 .k(k)
4298 .a_stride(19)
4299 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
4300 }
4301 }
4302
4303 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_gt_8_subtile) {
4304 TEST_REQUIRES_X86_AVX;
4305 for (size_t k = 9; k < 16; k++) {
4306 for (uint32_t m = 1; m <= 2; m++) {
4307 for (uint32_t n = 1; n <= 4; n++) {
4308 GemmMicrokernelTester()
4309 .mr(2)
4310 .nr(4)
4311 .kr(2)
4312 .sr(1)
4313 .m(m)
4314 .n(n)
4315 .k(k)
4316 .iterations(1)
4317 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
4318 }
4319 }
4320 }
4321 }
4322
4323 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_div_8) {
4324 TEST_REQUIRES_X86_AVX;
4325 for (size_t k = 16; k <= 80; k += 8) {
4326 GemmMicrokernelTester()
4327 .mr(2)
4328 .nr(4)
4329 .kr(2)
4330 .sr(1)
4331 .m(2)
4332 .n(4)
4333 .k(k)
4334 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
4335 }
4336 }
4337
4338 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_div_8_strided_a) {
4339 TEST_REQUIRES_X86_AVX;
4340 for (size_t k = 16; k <= 80; k += 8) {
4341 GemmMicrokernelTester()
4342 .mr(2)
4343 .nr(4)
4344 .kr(2)
4345 .sr(1)
4346 .m(2)
4347 .n(4)
4348 .k(k)
4349 .a_stride(83)
4350 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
4351 }
4352 }
4353
4354 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_div_8_subtile) {
4355 TEST_REQUIRES_X86_AVX;
4356 for (size_t k = 16; k <= 80; k += 8) {
4357 for (uint32_t m = 1; m <= 2; m++) {
4358 for (uint32_t n = 1; n <= 4; n++) {
4359 GemmMicrokernelTester()
4360 .mr(2)
4361 .nr(4)
4362 .kr(2)
4363 .sr(1)
4364 .m(m)
4365 .n(n)
4366 .k(k)
4367 .iterations(1)
4368 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
4369 }
4370 }
4371 }
4372 }
4373
4374 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, n_gt_4) {
4375 TEST_REQUIRES_X86_AVX;
4376 for (uint32_t n = 5; n < 8; n++) {
4377 for (size_t k = 1; k <= 40; k += 9) {
4378 GemmMicrokernelTester()
4379 .mr(2)
4380 .nr(4)
4381 .kr(2)
4382 .sr(1)
4383 .m(2)
4384 .n(4)
4385 .k(k)
4386 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
4387 }
4388 }
4389 }
4390
4391 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, n_gt_4_strided_cn) {
4392 TEST_REQUIRES_X86_AVX;
4393 for (uint32_t n = 5; n < 8; n++) {
4394 for (size_t k = 1; k <= 40; k += 9) {
4395 GemmMicrokernelTester()
4396 .mr(2)
4397 .nr(4)
4398 .kr(2)
4399 .sr(1)
4400 .m(2)
4401 .n(4)
4402 .k(k)
4403 .cn_stride(7)
4404 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
4405 }
4406 }
4407 }
4408
4409 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, n_gt_4_strided_a) {
4410 TEST_REQUIRES_X86_AVX;
4411 for (uint32_t n = 5; n < 8; n++) {
4412 for (size_t k = 1; k <= 40; k += 9) {
4413 GemmMicrokernelTester()
4414 .mr(2)
4415 .nr(4)
4416 .kr(2)
4417 .sr(1)
4418 .m(2)
4419 .n(n)
4420 .k(k)
4421 .a_stride(43)
4422 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
4423 }
4424 }
4425 }
4426
4427 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, n_gt_4_subtile) {
4428 TEST_REQUIRES_X86_AVX;
4429 for (uint32_t n = 5; n < 8; n++) {
4430 for (size_t k = 1; k <= 40; k += 9) {
4431 for (uint32_t m = 1; m <= 2; m++) {
4432 GemmMicrokernelTester()
4433 .mr(2)
4434 .nr(4)
4435 .kr(2)
4436 .sr(1)
4437 .m(m)
4438 .n(n)
4439 .k(k)
4440 .iterations(1)
4441 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
4442 }
4443 }
4444 }
4445 }
4446
4447 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, n_div_4) {
4448 TEST_REQUIRES_X86_AVX;
4449 for (uint32_t n = 8; n <= 12; n += 4) {
4450 for (size_t k = 1; k <= 40; k += 9) {
4451 GemmMicrokernelTester()
4452 .mr(2)
4453 .nr(4)
4454 .kr(2)
4455 .sr(1)
4456 .m(2)
4457 .n(4)
4458 .k(k)
4459 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
4460 }
4461 }
4462 }
4463
4464 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, n_div_4_strided_cn) {
4465 TEST_REQUIRES_X86_AVX;
4466 for (uint32_t n = 8; n <= 12; n += 4) {
4467 for (size_t k = 1; k <= 40; k += 9) {
4468 GemmMicrokernelTester()
4469 .mr(2)
4470 .nr(4)
4471 .kr(2)
4472 .sr(1)
4473 .m(2)
4474 .n(n)
4475 .k(k)
4476 .cn_stride(7)
4477 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
4478 }
4479 }
4480 }
4481
4482 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, n_div_4_strided_a) {
4483 TEST_REQUIRES_X86_AVX;
4484 for (uint32_t n = 8; n <= 12; n += 4) {
4485 for (size_t k = 1; k <= 40; k += 9) {
4486 GemmMicrokernelTester()
4487 .mr(2)
4488 .nr(4)
4489 .kr(2)
4490 .sr(1)
4491 .m(2)
4492 .n(n)
4493 .k(k)
4494 .a_stride(43)
4495 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
4496 }
4497 }
4498 }
4499
4500 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, n_div_4_subtile) {
4501 TEST_REQUIRES_X86_AVX;
4502 for (uint32_t n = 8; n <= 12; n += 4) {
4503 for (size_t k = 1; k <= 40; k += 9) {
4504 for (uint32_t m = 1; m <= 2; m++) {
4505 GemmMicrokernelTester()
4506 .mr(2)
4507 .nr(4)
4508 .kr(2)
4509 .sr(1)
4510 .m(m)
4511 .n(n)
4512 .k(k)
4513 .iterations(1)
4514 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
4515 }
4516 }
4517 }
4518 }
4519
4520 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, strided_cm_subtile) {
4521 TEST_REQUIRES_X86_AVX;
4522 for (size_t k = 1; k <= 40; k += 9) {
4523 for (uint32_t m = 1; m <= 2; m++) {
4524 for (uint32_t n = 1; n <= 4; n++) {
4525 GemmMicrokernelTester()
4526 .mr(2)
4527 .nr(4)
4528 .kr(2)
4529 .sr(1)
4530 .m(m)
4531 .n(n)
4532 .k(k)
4533 .cm_stride(7)
4534 .iterations(1)
4535 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
4536 }
4537 }
4538 }
4539 }
4540
4541 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, qmin) {
4542 TEST_REQUIRES_X86_AVX;
4543 GemmMicrokernelTester()
4544 .mr(2)
4545 .nr(4)
4546 .kr(2)
4547 .sr(1)
4548 .m(2)
4549 .n(4)
4550 .k(8)
4551 .qmin(128)
4552 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
4553 }
4554
4555 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, qmax) {
4556 TEST_REQUIRES_X86_AVX;
4557 GemmMicrokernelTester()
4558 .mr(2)
4559 .nr(4)
4560 .kr(2)
4561 .sr(1)
4562 .m(2)
4563 .n(4)
4564 .k(8)
4565 .qmax(128)
4566 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
4567 }
4568
4569 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, strided_cm) {
4570 TEST_REQUIRES_X86_AVX;
4571 GemmMicrokernelTester()
4572 .mr(2)
4573 .nr(4)
4574 .kr(2)
4575 .sr(1)
4576 .m(2)
4577 .n(4)
4578 .k(8)
4579 .cm_stride(7)
4580 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
4581 }
4582#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
4583
4584
4585#if XNN_ARCH_X86 || XNN_ARCH_X86_64
4586 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_eq_8) {
4587 TEST_REQUIRES_X86_AVX;
4588 GemmMicrokernelTester()
4589 .mr(3)
4590 .nr(4)
4591 .kr(2)
4592 .sr(1)
4593 .m(3)
4594 .n(4)
4595 .k(8)
4596 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
4597 }
4598
4599 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, strided_cn) {
4600 TEST_REQUIRES_X86_AVX;
4601 GemmMicrokernelTester()
4602 .mr(3)
4603 .nr(4)
4604 .kr(2)
4605 .sr(1)
4606 .m(3)
4607 .n(4)
4608 .k(8)
4609 .cn_stride(7)
4610 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
4611 }
4612
4613 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_eq_8_strided_a) {
4614 TEST_REQUIRES_X86_AVX;
4615 GemmMicrokernelTester()
4616 .mr(3)
4617 .nr(4)
4618 .kr(2)
4619 .sr(1)
4620 .m(3)
4621 .n(4)
4622 .k(8)
4623 .a_stride(11)
4624 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
4625 }
4626
4627 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_eq_8_subtile) {
4628 TEST_REQUIRES_X86_AVX;
4629 for (uint32_t m = 1; m <= 3; m++) {
4630 for (uint32_t n = 1; n <= 4; n++) {
4631 GemmMicrokernelTester()
4632 .mr(3)
4633 .nr(4)
4634 .kr(2)
4635 .sr(1)
4636 .m(m)
4637 .n(n)
4638 .k(8)
4639 .iterations(1)
4640 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
4641 }
4642 }
4643 }
4644
4645 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_eq_8_subtile_m) {
4646 TEST_REQUIRES_X86_AVX;
4647 for (uint32_t m = 1; m <= 3; m++) {
4648 GemmMicrokernelTester()
4649 .mr(3)
4650 .nr(4)
4651 .kr(2)
4652 .sr(1)
4653 .m(m)
4654 .n(4)
4655 .k(8)
4656 .iterations(1)
4657 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
4658 }
4659 }
4660
4661 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_eq_8_subtile_n) {
4662 TEST_REQUIRES_X86_AVX;
4663 for (uint32_t n = 1; n <= 4; n++) {
4664 GemmMicrokernelTester()
4665 .mr(3)
4666 .nr(4)
4667 .kr(2)
4668 .sr(1)
4669 .m(3)
4670 .n(n)
4671 .k(8)
4672 .iterations(1)
4673 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
4674 }
4675 }
4676
4677 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_lt_8) {
4678 TEST_REQUIRES_X86_AVX;
4679 for (size_t k = 1; k < 8; k++) {
4680 GemmMicrokernelTester()
4681 .mr(3)
4682 .nr(4)
4683 .kr(2)
4684 .sr(1)
4685 .m(3)
4686 .n(4)
4687 .k(k)
4688 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
4689 }
4690 }
4691
4692 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_lt_8_strided_a) {
4693 TEST_REQUIRES_X86_AVX;
4694 for (size_t k = 1; k < 8; k++) {
4695 GemmMicrokernelTester()
4696 .mr(3)
4697 .nr(4)
4698 .kr(2)
4699 .sr(1)
4700 .m(3)
4701 .n(4)
4702 .k(k)
4703 .a_stride(11)
4704 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
4705 }
4706 }
4707
4708 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_lt_8_subtile) {
4709 TEST_REQUIRES_X86_AVX;
4710 for (size_t k = 1; k < 8; k++) {
4711 for (uint32_t m = 1; m <= 3; m++) {
4712 for (uint32_t n = 1; n <= 4; n++) {
4713 GemmMicrokernelTester()
4714 .mr(3)
4715 .nr(4)
4716 .kr(2)
4717 .sr(1)
4718 .m(m)
4719 .n(n)
4720 .k(k)
4721 .iterations(1)
4722 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
4723 }
4724 }
4725 }
4726 }
4727
4728 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_gt_8) {
4729 TEST_REQUIRES_X86_AVX;
4730 for (size_t k = 9; k < 16; k++) {
4731 GemmMicrokernelTester()
4732 .mr(3)
4733 .nr(4)
4734 .kr(2)
4735 .sr(1)
4736 .m(3)
4737 .n(4)
4738 .k(k)
4739 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
4740 }
4741 }
4742
4743 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_gt_8_strided_a) {
4744 TEST_REQUIRES_X86_AVX;
4745 for (size_t k = 9; k < 16; k++) {
4746 GemmMicrokernelTester()
4747 .mr(3)
4748 .nr(4)
4749 .kr(2)
4750 .sr(1)
4751 .m(3)
4752 .n(4)
4753 .k(k)
4754 .a_stride(19)
4755 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
4756 }
4757 }
4758
4759 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_gt_8_subtile) {
4760 TEST_REQUIRES_X86_AVX;
4761 for (size_t k = 9; k < 16; k++) {
4762 for (uint32_t m = 1; m <= 3; m++) {
4763 for (uint32_t n = 1; n <= 4; n++) {
4764 GemmMicrokernelTester()
4765 .mr(3)
4766 .nr(4)
4767 .kr(2)
4768 .sr(1)
4769 .m(m)
4770 .n(n)
4771 .k(k)
4772 .iterations(1)
4773 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
4774 }
4775 }
4776 }
4777 }
4778
4779 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_div_8) {
4780 TEST_REQUIRES_X86_AVX;
4781 for (size_t k = 16; k <= 80; k += 8) {
4782 GemmMicrokernelTester()
4783 .mr(3)
4784 .nr(4)
4785 .kr(2)
4786 .sr(1)
4787 .m(3)
4788 .n(4)
4789 .k(k)
4790 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
4791 }
4792 }
4793
4794 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_div_8_strided_a) {
4795 TEST_REQUIRES_X86_AVX;
4796 for (size_t k = 16; k <= 80; k += 8) {
4797 GemmMicrokernelTester()
4798 .mr(3)
4799 .nr(4)
4800 .kr(2)
4801 .sr(1)
4802 .m(3)
4803 .n(4)
4804 .k(k)
4805 .a_stride(83)
4806 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
4807 }
4808 }
4809
4810 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_div_8_subtile) {
4811 TEST_REQUIRES_X86_AVX;
4812 for (size_t k = 16; k <= 80; k += 8) {
4813 for (uint32_t m = 1; m <= 3; m++) {
4814 for (uint32_t n = 1; n <= 4; n++) {
4815 GemmMicrokernelTester()
4816 .mr(3)
4817 .nr(4)
4818 .kr(2)
4819 .sr(1)
4820 .m(m)
4821 .n(n)
4822 .k(k)
4823 .iterations(1)
4824 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
4825 }
4826 }
4827 }
4828 }
4829
4830 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, n_gt_4) {
4831 TEST_REQUIRES_X86_AVX;
4832 for (uint32_t n = 5; n < 8; n++) {
4833 for (size_t k = 1; k <= 40; k += 9) {
4834 GemmMicrokernelTester()
4835 .mr(3)
4836 .nr(4)
4837 .kr(2)
4838 .sr(1)
4839 .m(3)
4840 .n(4)
4841 .k(k)
4842 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
4843 }
4844 }
4845 }
4846
4847 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, n_gt_4_strided_cn) {
4848 TEST_REQUIRES_X86_AVX;
4849 for (uint32_t n = 5; n < 8; n++) {
4850 for (size_t k = 1; k <= 40; k += 9) {
4851 GemmMicrokernelTester()
4852 .mr(3)
4853 .nr(4)
4854 .kr(2)
4855 .sr(1)
4856 .m(3)
4857 .n(4)
4858 .k(k)
4859 .cn_stride(7)
4860 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
4861 }
4862 }
4863 }
4864
4865 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, n_gt_4_strided_a) {
4866 TEST_REQUIRES_X86_AVX;
4867 for (uint32_t n = 5; n < 8; n++) {
4868 for (size_t k = 1; k <= 40; k += 9) {
4869 GemmMicrokernelTester()
4870 .mr(3)
4871 .nr(4)
4872 .kr(2)
4873 .sr(1)
4874 .m(3)
4875 .n(n)
4876 .k(k)
4877 .a_stride(43)
4878 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
4879 }
4880 }
4881 }
4882
4883 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, n_gt_4_subtile) {
4884 TEST_REQUIRES_X86_AVX;
4885 for (uint32_t n = 5; n < 8; n++) {
4886 for (size_t k = 1; k <= 40; k += 9) {
4887 for (uint32_t m = 1; m <= 3; m++) {
4888 GemmMicrokernelTester()
4889 .mr(3)
4890 .nr(4)
4891 .kr(2)
4892 .sr(1)
4893 .m(m)
4894 .n(n)
4895 .k(k)
4896 .iterations(1)
4897 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
4898 }
4899 }
4900 }
4901 }
4902
4903 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, n_div_4) {
4904 TEST_REQUIRES_X86_AVX;
4905 for (uint32_t n = 8; n <= 12; n += 4) {
4906 for (size_t k = 1; k <= 40; k += 9) {
4907 GemmMicrokernelTester()
4908 .mr(3)
4909 .nr(4)
4910 .kr(2)
4911 .sr(1)
4912 .m(3)
4913 .n(4)
4914 .k(k)
4915 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
4916 }
4917 }
4918 }
4919
4920 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, n_div_4_strided_cn) {
4921 TEST_REQUIRES_X86_AVX;
4922 for (uint32_t n = 8; n <= 12; n += 4) {
4923 for (size_t k = 1; k <= 40; k += 9) {
4924 GemmMicrokernelTester()
4925 .mr(3)
4926 .nr(4)
4927 .kr(2)
4928 .sr(1)
4929 .m(3)
4930 .n(n)
4931 .k(k)
4932 .cn_stride(7)
4933 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
4934 }
4935 }
4936 }
4937
4938 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, n_div_4_strided_a) {
4939 TEST_REQUIRES_X86_AVX;
4940 for (uint32_t n = 8; n <= 12; n += 4) {
4941 for (size_t k = 1; k <= 40; k += 9) {
4942 GemmMicrokernelTester()
4943 .mr(3)
4944 .nr(4)
4945 .kr(2)
4946 .sr(1)
4947 .m(3)
4948 .n(n)
4949 .k(k)
4950 .a_stride(43)
4951 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
4952 }
4953 }
4954 }
4955
4956 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, n_div_4_subtile) {
4957 TEST_REQUIRES_X86_AVX;
4958 for (uint32_t n = 8; n <= 12; n += 4) {
4959 for (size_t k = 1; k <= 40; k += 9) {
4960 for (uint32_t m = 1; m <= 3; m++) {
4961 GemmMicrokernelTester()
4962 .mr(3)
4963 .nr(4)
4964 .kr(2)
4965 .sr(1)
4966 .m(m)
4967 .n(n)
4968 .k(k)
4969 .iterations(1)
4970 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
4971 }
4972 }
4973 }
4974 }
4975
4976 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, strided_cm_subtile) {
4977 TEST_REQUIRES_X86_AVX;
4978 for (size_t k = 1; k <= 40; k += 9) {
4979 for (uint32_t m = 1; m <= 3; m++) {
4980 for (uint32_t n = 1; n <= 4; n++) {
4981 GemmMicrokernelTester()
4982 .mr(3)
4983 .nr(4)
4984 .kr(2)
4985 .sr(1)
4986 .m(m)
4987 .n(n)
4988 .k(k)
4989 .cm_stride(7)
4990 .iterations(1)
4991 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
4992 }
4993 }
4994 }
4995 }
4996
4997 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, qmin) {
4998 TEST_REQUIRES_X86_AVX;
4999 GemmMicrokernelTester()
5000 .mr(3)
5001 .nr(4)
5002 .kr(2)
5003 .sr(1)
5004 .m(3)
5005 .n(4)
5006 .k(8)
5007 .qmin(128)
5008 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
5009 }
5010
5011 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, qmax) {
5012 TEST_REQUIRES_X86_AVX;
5013 GemmMicrokernelTester()
5014 .mr(3)
5015 .nr(4)
5016 .kr(2)
5017 .sr(1)
5018 .m(3)
5019 .n(4)
5020 .k(8)
5021 .qmax(128)
5022 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
5023 }
5024
5025 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, strided_cm) {
5026 TEST_REQUIRES_X86_AVX;
5027 GemmMicrokernelTester()
5028 .mr(3)
5029 .nr(4)
5030 .kr(2)
5031 .sr(1)
5032 .m(3)
5033 .n(4)
5034 .k(8)
5035 .cm_stride(7)
5036 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
5037 }
5038#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
5039
5040
5041#if XNN_ARCH_X86 || XNN_ARCH_X86_64
5042 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_eq_8) {
5043 TEST_REQUIRES_X86_AVX;
5044 GemmMicrokernelTester()
5045 .mr(4)
5046 .nr(4)
5047 .kr(2)
5048 .sr(1)
5049 .m(4)
5050 .n(4)
5051 .k(8)
5052 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
5053 }
5054
5055 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, strided_cn) {
5056 TEST_REQUIRES_X86_AVX;
5057 GemmMicrokernelTester()
5058 .mr(4)
5059 .nr(4)
5060 .kr(2)
5061 .sr(1)
5062 .m(4)
5063 .n(4)
5064 .k(8)
5065 .cn_stride(7)
5066 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
5067 }
5068
5069 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_eq_8_strided_a) {
5070 TEST_REQUIRES_X86_AVX;
5071 GemmMicrokernelTester()
5072 .mr(4)
5073 .nr(4)
5074 .kr(2)
5075 .sr(1)
5076 .m(4)
5077 .n(4)
5078 .k(8)
5079 .a_stride(11)
5080 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
5081 }
5082
5083 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_eq_8_subtile) {
5084 TEST_REQUIRES_X86_AVX;
5085 for (uint32_t m = 1; m <= 4; m++) {
5086 for (uint32_t n = 1; n <= 4; n++) {
5087 GemmMicrokernelTester()
5088 .mr(4)
5089 .nr(4)
5090 .kr(2)
5091 .sr(1)
5092 .m(m)
5093 .n(n)
5094 .k(8)
5095 .iterations(1)
5096 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
5097 }
5098 }
5099 }
5100
5101 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_eq_8_subtile_m) {
5102 TEST_REQUIRES_X86_AVX;
5103 for (uint32_t m = 1; m <= 4; m++) {
5104 GemmMicrokernelTester()
5105 .mr(4)
5106 .nr(4)
5107 .kr(2)
5108 .sr(1)
5109 .m(m)
5110 .n(4)
5111 .k(8)
5112 .iterations(1)
5113 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
5114 }
5115 }
5116
5117 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_eq_8_subtile_n) {
5118 TEST_REQUIRES_X86_AVX;
5119 for (uint32_t n = 1; n <= 4; n++) {
5120 GemmMicrokernelTester()
5121 .mr(4)
5122 .nr(4)
5123 .kr(2)
5124 .sr(1)
5125 .m(4)
5126 .n(n)
5127 .k(8)
5128 .iterations(1)
5129 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
5130 }
5131 }
5132
5133 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_lt_8) {
5134 TEST_REQUIRES_X86_AVX;
5135 for (size_t k = 1; k < 8; k++) {
5136 GemmMicrokernelTester()
5137 .mr(4)
5138 .nr(4)
5139 .kr(2)
5140 .sr(1)
5141 .m(4)
5142 .n(4)
5143 .k(k)
5144 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
5145 }
5146 }
5147
5148 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_lt_8_strided_a) {
5149 TEST_REQUIRES_X86_AVX;
5150 for (size_t k = 1; k < 8; k++) {
5151 GemmMicrokernelTester()
5152 .mr(4)
5153 .nr(4)
5154 .kr(2)
5155 .sr(1)
5156 .m(4)
5157 .n(4)
5158 .k(k)
5159 .a_stride(11)
5160 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
5161 }
5162 }
5163
5164 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_lt_8_subtile) {
5165 TEST_REQUIRES_X86_AVX;
5166 for (size_t k = 1; k < 8; k++) {
5167 for (uint32_t m = 1; m <= 4; m++) {
5168 for (uint32_t n = 1; n <= 4; n++) {
5169 GemmMicrokernelTester()
5170 .mr(4)
5171 .nr(4)
5172 .kr(2)
5173 .sr(1)
5174 .m(m)
5175 .n(n)
5176 .k(k)
5177 .iterations(1)
5178 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
5179 }
5180 }
5181 }
5182 }
5183
5184 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_gt_8) {
5185 TEST_REQUIRES_X86_AVX;
5186 for (size_t k = 9; k < 16; k++) {
5187 GemmMicrokernelTester()
5188 .mr(4)
5189 .nr(4)
5190 .kr(2)
5191 .sr(1)
5192 .m(4)
5193 .n(4)
5194 .k(k)
5195 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
5196 }
5197 }
5198
5199 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_gt_8_strided_a) {
5200 TEST_REQUIRES_X86_AVX;
5201 for (size_t k = 9; k < 16; k++) {
5202 GemmMicrokernelTester()
5203 .mr(4)
5204 .nr(4)
5205 .kr(2)
5206 .sr(1)
5207 .m(4)
5208 .n(4)
5209 .k(k)
5210 .a_stride(19)
5211 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
5212 }
5213 }
5214
5215 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_gt_8_subtile) {
5216 TEST_REQUIRES_X86_AVX;
5217 for (size_t k = 9; k < 16; k++) {
5218 for (uint32_t m = 1; m <= 4; m++) {
5219 for (uint32_t n = 1; n <= 4; n++) {
5220 GemmMicrokernelTester()
5221 .mr(4)
5222 .nr(4)
5223 .kr(2)
5224 .sr(1)
5225 .m(m)
5226 .n(n)
5227 .k(k)
5228 .iterations(1)
5229 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
5230 }
5231 }
5232 }
5233 }
5234
5235 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_div_8) {
5236 TEST_REQUIRES_X86_AVX;
5237 for (size_t k = 16; k <= 80; k += 8) {
5238 GemmMicrokernelTester()
5239 .mr(4)
5240 .nr(4)
5241 .kr(2)
5242 .sr(1)
5243 .m(4)
5244 .n(4)
5245 .k(k)
5246 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
5247 }
5248 }
5249
5250 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_div_8_strided_a) {
5251 TEST_REQUIRES_X86_AVX;
5252 for (size_t k = 16; k <= 80; k += 8) {
5253 GemmMicrokernelTester()
5254 .mr(4)
5255 .nr(4)
5256 .kr(2)
5257 .sr(1)
5258 .m(4)
5259 .n(4)
5260 .k(k)
5261 .a_stride(83)
5262 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
5263 }
5264 }
5265
5266 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_div_8_subtile) {
5267 TEST_REQUIRES_X86_AVX;
5268 for (size_t k = 16; k <= 80; k += 8) {
5269 for (uint32_t m = 1; m <= 4; m++) {
5270 for (uint32_t n = 1; n <= 4; n++) {
5271 GemmMicrokernelTester()
5272 .mr(4)
5273 .nr(4)
5274 .kr(2)
5275 .sr(1)
5276 .m(m)
5277 .n(n)
5278 .k(k)
5279 .iterations(1)
5280 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
5281 }
5282 }
5283 }
5284 }
5285
5286 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, n_gt_4) {
5287 TEST_REQUIRES_X86_AVX;
5288 for (uint32_t n = 5; n < 8; n++) {
5289 for (size_t k = 1; k <= 40; k += 9) {
5290 GemmMicrokernelTester()
5291 .mr(4)
5292 .nr(4)
5293 .kr(2)
5294 .sr(1)
5295 .m(4)
5296 .n(4)
5297 .k(k)
5298 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
5299 }
5300 }
5301 }
5302
5303 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, n_gt_4_strided_cn) {
5304 TEST_REQUIRES_X86_AVX;
5305 for (uint32_t n = 5; n < 8; n++) {
5306 for (size_t k = 1; k <= 40; k += 9) {
5307 GemmMicrokernelTester()
5308 .mr(4)
5309 .nr(4)
5310 .kr(2)
5311 .sr(1)
5312 .m(4)
5313 .n(4)
5314 .k(k)
5315 .cn_stride(7)
5316 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
5317 }
5318 }
5319 }
5320
5321 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, n_gt_4_strided_a) {
5322 TEST_REQUIRES_X86_AVX;
5323 for (uint32_t n = 5; n < 8; n++) {
5324 for (size_t k = 1; k <= 40; k += 9) {
5325 GemmMicrokernelTester()
5326 .mr(4)
5327 .nr(4)
5328 .kr(2)
5329 .sr(1)
5330 .m(4)
5331 .n(n)
5332 .k(k)
5333 .a_stride(43)
5334 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
5335 }
5336 }
5337 }
5338
5339 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, n_gt_4_subtile) {
5340 TEST_REQUIRES_X86_AVX;
5341 for (uint32_t n = 5; n < 8; n++) {
5342 for (size_t k = 1; k <= 40; k += 9) {
5343 for (uint32_t m = 1; m <= 4; m++) {
5344 GemmMicrokernelTester()
5345 .mr(4)
5346 .nr(4)
5347 .kr(2)
5348 .sr(1)
5349 .m(m)
5350 .n(n)
5351 .k(k)
5352 .iterations(1)
5353 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
5354 }
5355 }
5356 }
5357 }
5358
5359 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, n_div_4) {
5360 TEST_REQUIRES_X86_AVX;
5361 for (uint32_t n = 8; n <= 12; n += 4) {
5362 for (size_t k = 1; k <= 40; k += 9) {
5363 GemmMicrokernelTester()
5364 .mr(4)
5365 .nr(4)
5366 .kr(2)
5367 .sr(1)
5368 .m(4)
5369 .n(4)
5370 .k(k)
5371 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
5372 }
5373 }
5374 }
5375
5376 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, n_div_4_strided_cn) {
5377 TEST_REQUIRES_X86_AVX;
5378 for (uint32_t n = 8; n <= 12; n += 4) {
5379 for (size_t k = 1; k <= 40; k += 9) {
5380 GemmMicrokernelTester()
5381 .mr(4)
5382 .nr(4)
5383 .kr(2)
5384 .sr(1)
5385 .m(4)
5386 .n(n)
5387 .k(k)
5388 .cn_stride(7)
5389 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
5390 }
5391 }
5392 }
5393
5394 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, n_div_4_strided_a) {
5395 TEST_REQUIRES_X86_AVX;
5396 for (uint32_t n = 8; n <= 12; n += 4) {
5397 for (size_t k = 1; k <= 40; k += 9) {
5398 GemmMicrokernelTester()
5399 .mr(4)
5400 .nr(4)
5401 .kr(2)
5402 .sr(1)
5403 .m(4)
5404 .n(n)
5405 .k(k)
5406 .a_stride(43)
5407 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
5408 }
5409 }
5410 }
5411
5412 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, n_div_4_subtile) {
5413 TEST_REQUIRES_X86_AVX;
5414 for (uint32_t n = 8; n <= 12; n += 4) {
5415 for (size_t k = 1; k <= 40; k += 9) {
5416 for (uint32_t m = 1; m <= 4; m++) {
5417 GemmMicrokernelTester()
5418 .mr(4)
5419 .nr(4)
5420 .kr(2)
5421 .sr(1)
5422 .m(m)
5423 .n(n)
5424 .k(k)
5425 .iterations(1)
5426 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
5427 }
5428 }
5429 }
5430 }
5431
5432 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, strided_cm_subtile) {
5433 TEST_REQUIRES_X86_AVX;
5434 for (size_t k = 1; k <= 40; k += 9) {
5435 for (uint32_t m = 1; m <= 4; m++) {
5436 for (uint32_t n = 1; n <= 4; n++) {
5437 GemmMicrokernelTester()
5438 .mr(4)
5439 .nr(4)
5440 .kr(2)
5441 .sr(1)
5442 .m(m)
5443 .n(n)
5444 .k(k)
5445 .cm_stride(7)
5446 .iterations(1)
5447 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
5448 }
5449 }
5450 }
5451 }
5452
5453 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, qmin) {
5454 TEST_REQUIRES_X86_AVX;
5455 GemmMicrokernelTester()
5456 .mr(4)
5457 .nr(4)
5458 .kr(2)
5459 .sr(1)
5460 .m(4)
5461 .n(4)
5462 .k(8)
5463 .qmin(128)
5464 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
5465 }
5466
5467 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, qmax) {
5468 TEST_REQUIRES_X86_AVX;
5469 GemmMicrokernelTester()
5470 .mr(4)
5471 .nr(4)
5472 .kr(2)
5473 .sr(1)
5474 .m(4)
5475 .n(4)
5476 .k(8)
5477 .qmax(128)
5478 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
5479 }
5480
5481 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, strided_cm) {
5482 TEST_REQUIRES_X86_AVX;
5483 GemmMicrokernelTester()
5484 .mr(4)
5485 .nr(4)
5486 .kr(2)
5487 .sr(1)
5488 .m(4)
5489 .n(4)
5490 .k(8)
5491 .cm_stride(7)
5492 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
5493 }
5494#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
5495
5496
5497#if XNN_ARCH_X86 || XNN_ARCH_X86_64
5498 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_eq_8) {
5499 TEST_REQUIRES_X86_XOP;
5500 GemmMicrokernelTester()
5501 .mr(1)
5502 .nr(4)
5503 .kr(2)
5504 .sr(1)
5505 .m(1)
5506 .n(4)
5507 .k(8)
5508 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
5509 }
5510
5511 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, strided_cn) {
5512 TEST_REQUIRES_X86_XOP;
5513 GemmMicrokernelTester()
5514 .mr(1)
5515 .nr(4)
5516 .kr(2)
5517 .sr(1)
5518 .m(1)
5519 .n(4)
5520 .k(8)
5521 .cn_stride(7)
5522 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
5523 }
5524
5525 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_eq_8_strided_a) {
5526 TEST_REQUIRES_X86_XOP;
5527 GemmMicrokernelTester()
5528 .mr(1)
5529 .nr(4)
5530 .kr(2)
5531 .sr(1)
5532 .m(1)
5533 .n(4)
5534 .k(8)
5535 .a_stride(11)
5536 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
5537 }
5538
5539 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_eq_8_subtile) {
5540 TEST_REQUIRES_X86_XOP;
5541 for (uint32_t m = 1; m <= 1; m++) {
5542 for (uint32_t n = 1; n <= 4; n++) {
5543 GemmMicrokernelTester()
5544 .mr(1)
5545 .nr(4)
5546 .kr(2)
5547 .sr(1)
5548 .m(m)
5549 .n(n)
5550 .k(8)
5551 .iterations(1)
5552 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
5553 }
5554 }
5555 }
5556
5557 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_eq_8_subtile_m) {
5558 TEST_REQUIRES_X86_XOP;
5559 for (uint32_t m = 1; m <= 1; m++) {
5560 GemmMicrokernelTester()
5561 .mr(1)
5562 .nr(4)
5563 .kr(2)
5564 .sr(1)
5565 .m(m)
5566 .n(4)
5567 .k(8)
5568 .iterations(1)
5569 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
5570 }
5571 }
5572
5573 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_eq_8_subtile_n) {
5574 TEST_REQUIRES_X86_XOP;
5575 for (uint32_t n = 1; n <= 4; n++) {
5576 GemmMicrokernelTester()
5577 .mr(1)
5578 .nr(4)
5579 .kr(2)
5580 .sr(1)
5581 .m(1)
5582 .n(n)
5583 .k(8)
5584 .iterations(1)
5585 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
5586 }
5587 }
5588
5589 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_lt_8) {
5590 TEST_REQUIRES_X86_XOP;
5591 for (size_t k = 1; k < 8; k++) {
5592 GemmMicrokernelTester()
5593 .mr(1)
5594 .nr(4)
5595 .kr(2)
5596 .sr(1)
5597 .m(1)
5598 .n(4)
5599 .k(k)
5600 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
5601 }
5602 }
5603
5604 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_lt_8_strided_a) {
5605 TEST_REQUIRES_X86_XOP;
5606 for (size_t k = 1; k < 8; k++) {
5607 GemmMicrokernelTester()
5608 .mr(1)
5609 .nr(4)
5610 .kr(2)
5611 .sr(1)
5612 .m(1)
5613 .n(4)
5614 .k(k)
5615 .a_stride(11)
5616 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
5617 }
5618 }
5619
5620 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_lt_8_subtile) {
5621 TEST_REQUIRES_X86_XOP;
5622 for (size_t k = 1; k < 8; k++) {
5623 for (uint32_t m = 1; m <= 1; m++) {
5624 for (uint32_t n = 1; n <= 4; n++) {
5625 GemmMicrokernelTester()
5626 .mr(1)
5627 .nr(4)
5628 .kr(2)
5629 .sr(1)
5630 .m(m)
5631 .n(n)
5632 .k(k)
5633 .iterations(1)
5634 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
5635 }
5636 }
5637 }
5638 }
5639
5640 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_gt_8) {
5641 TEST_REQUIRES_X86_XOP;
5642 for (size_t k = 9; k < 16; k++) {
5643 GemmMicrokernelTester()
5644 .mr(1)
5645 .nr(4)
5646 .kr(2)
5647 .sr(1)
5648 .m(1)
5649 .n(4)
5650 .k(k)
5651 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
5652 }
5653 }
5654
5655 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_gt_8_strided_a) {
5656 TEST_REQUIRES_X86_XOP;
5657 for (size_t k = 9; k < 16; k++) {
5658 GemmMicrokernelTester()
5659 .mr(1)
5660 .nr(4)
5661 .kr(2)
5662 .sr(1)
5663 .m(1)
5664 .n(4)
5665 .k(k)
5666 .a_stride(19)
5667 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
5668 }
5669 }
5670
5671 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_gt_8_subtile) {
5672 TEST_REQUIRES_X86_XOP;
5673 for (size_t k = 9; k < 16; k++) {
5674 for (uint32_t m = 1; m <= 1; m++) {
5675 for (uint32_t n = 1; n <= 4; n++) {
5676 GemmMicrokernelTester()
5677 .mr(1)
5678 .nr(4)
5679 .kr(2)
5680 .sr(1)
5681 .m(m)
5682 .n(n)
5683 .k(k)
5684 .iterations(1)
5685 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
5686 }
5687 }
5688 }
5689 }
5690
5691 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_div_8) {
5692 TEST_REQUIRES_X86_XOP;
5693 for (size_t k = 16; k <= 80; k += 8) {
5694 GemmMicrokernelTester()
5695 .mr(1)
5696 .nr(4)
5697 .kr(2)
5698 .sr(1)
5699 .m(1)
5700 .n(4)
5701 .k(k)
5702 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
5703 }
5704 }
5705
5706 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_div_8_strided_a) {
5707 TEST_REQUIRES_X86_XOP;
5708 for (size_t k = 16; k <= 80; k += 8) {
5709 GemmMicrokernelTester()
5710 .mr(1)
5711 .nr(4)
5712 .kr(2)
5713 .sr(1)
5714 .m(1)
5715 .n(4)
5716 .k(k)
5717 .a_stride(83)
5718 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
5719 }
5720 }
5721
5722 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_div_8_subtile) {
5723 TEST_REQUIRES_X86_XOP;
5724 for (size_t k = 16; k <= 80; k += 8) {
5725 for (uint32_t m = 1; m <= 1; m++) {
5726 for (uint32_t n = 1; n <= 4; n++) {
5727 GemmMicrokernelTester()
5728 .mr(1)
5729 .nr(4)
5730 .kr(2)
5731 .sr(1)
5732 .m(m)
5733 .n(n)
5734 .k(k)
5735 .iterations(1)
5736 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
5737 }
5738 }
5739 }
5740 }
5741
5742 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, n_gt_4) {
5743 TEST_REQUIRES_X86_XOP;
5744 for (uint32_t n = 5; n < 8; n++) {
5745 for (size_t k = 1; k <= 40; k += 9) {
5746 GemmMicrokernelTester()
5747 .mr(1)
5748 .nr(4)
5749 .kr(2)
5750 .sr(1)
5751 .m(1)
5752 .n(4)
5753 .k(k)
5754 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
5755 }
5756 }
5757 }
5758
5759 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, n_gt_4_strided_cn) {
5760 TEST_REQUIRES_X86_XOP;
5761 for (uint32_t n = 5; n < 8; n++) {
5762 for (size_t k = 1; k <= 40; k += 9) {
5763 GemmMicrokernelTester()
5764 .mr(1)
5765 .nr(4)
5766 .kr(2)
5767 .sr(1)
5768 .m(1)
5769 .n(4)
5770 .k(k)
5771 .cn_stride(7)
5772 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
5773 }
5774 }
5775 }
5776
5777 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, n_gt_4_strided_a) {
5778 TEST_REQUIRES_X86_XOP;
5779 for (uint32_t n = 5; n < 8; n++) {
5780 for (size_t k = 1; k <= 40; k += 9) {
5781 GemmMicrokernelTester()
5782 .mr(1)
5783 .nr(4)
5784 .kr(2)
5785 .sr(1)
5786 .m(1)
5787 .n(n)
5788 .k(k)
5789 .a_stride(43)
5790 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
5791 }
5792 }
5793 }
5794
5795 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, n_gt_4_subtile) {
5796 TEST_REQUIRES_X86_XOP;
5797 for (uint32_t n = 5; n < 8; n++) {
5798 for (size_t k = 1; k <= 40; k += 9) {
5799 for (uint32_t m = 1; m <= 1; m++) {
5800 GemmMicrokernelTester()
5801 .mr(1)
5802 .nr(4)
5803 .kr(2)
5804 .sr(1)
5805 .m(m)
5806 .n(n)
5807 .k(k)
5808 .iterations(1)
5809 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
5810 }
5811 }
5812 }
5813 }
5814
5815 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, n_div_4) {
5816 TEST_REQUIRES_X86_XOP;
5817 for (uint32_t n = 8; n <= 12; n += 4) {
5818 for (size_t k = 1; k <= 40; k += 9) {
5819 GemmMicrokernelTester()
5820 .mr(1)
5821 .nr(4)
5822 .kr(2)
5823 .sr(1)
5824 .m(1)
5825 .n(4)
5826 .k(k)
5827 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
5828 }
5829 }
5830 }
5831
5832 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, n_div_4_strided_cn) {
5833 TEST_REQUIRES_X86_XOP;
5834 for (uint32_t n = 8; n <= 12; n += 4) {
5835 for (size_t k = 1; k <= 40; k += 9) {
5836 GemmMicrokernelTester()
5837 .mr(1)
5838 .nr(4)
5839 .kr(2)
5840 .sr(1)
5841 .m(1)
5842 .n(n)
5843 .k(k)
5844 .cn_stride(7)
5845 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
5846 }
5847 }
5848 }
5849
5850 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, n_div_4_strided_a) {
5851 TEST_REQUIRES_X86_XOP;
5852 for (uint32_t n = 8; n <= 12; n += 4) {
5853 for (size_t k = 1; k <= 40; k += 9) {
5854 GemmMicrokernelTester()
5855 .mr(1)
5856 .nr(4)
5857 .kr(2)
5858 .sr(1)
5859 .m(1)
5860 .n(n)
5861 .k(k)
5862 .a_stride(43)
5863 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
5864 }
5865 }
5866 }
5867
5868 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, n_div_4_subtile) {
5869 TEST_REQUIRES_X86_XOP;
5870 for (uint32_t n = 8; n <= 12; n += 4) {
5871 for (size_t k = 1; k <= 40; k += 9) {
5872 for (uint32_t m = 1; m <= 1; m++) {
5873 GemmMicrokernelTester()
5874 .mr(1)
5875 .nr(4)
5876 .kr(2)
5877 .sr(1)
5878 .m(m)
5879 .n(n)
5880 .k(k)
5881 .iterations(1)
5882 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
5883 }
5884 }
5885 }
5886 }
5887
5888 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, strided_cm_subtile) {
5889 TEST_REQUIRES_X86_XOP;
5890 for (size_t k = 1; k <= 40; k += 9) {
5891 for (uint32_t m = 1; m <= 1; m++) {
5892 for (uint32_t n = 1; n <= 4; n++) {
5893 GemmMicrokernelTester()
5894 .mr(1)
5895 .nr(4)
5896 .kr(2)
5897 .sr(1)
5898 .m(m)
5899 .n(n)
5900 .k(k)
5901 .cm_stride(7)
5902 .iterations(1)
5903 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
5904 }
5905 }
5906 }
5907 }
5908
5909 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, qmin) {
5910 TEST_REQUIRES_X86_XOP;
5911 GemmMicrokernelTester()
5912 .mr(1)
5913 .nr(4)
5914 .kr(2)
5915 .sr(1)
5916 .m(1)
5917 .n(4)
5918 .k(8)
5919 .qmin(128)
5920 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
5921 }
5922
5923 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, qmax) {
5924 TEST_REQUIRES_X86_XOP;
5925 GemmMicrokernelTester()
5926 .mr(1)
5927 .nr(4)
5928 .kr(2)
5929 .sr(1)
5930 .m(1)
5931 .n(4)
5932 .k(8)
5933 .qmax(128)
5934 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
5935 }
5936
5937 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, strided_cm) {
5938 TEST_REQUIRES_X86_XOP;
5939 GemmMicrokernelTester()
5940 .mr(1)
5941 .nr(4)
5942 .kr(2)
5943 .sr(1)
5944 .m(1)
5945 .n(4)
5946 .k(8)
5947 .cm_stride(7)
5948 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
5949 }
5950#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
5951
5952
5953#if XNN_ARCH_X86 || XNN_ARCH_X86_64
5954 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_eq_8) {
5955 TEST_REQUIRES_X86_XOP;
5956 GemmMicrokernelTester()
5957 .mr(2)
5958 .nr(4)
5959 .kr(2)
5960 .sr(1)
5961 .m(2)
5962 .n(4)
5963 .k(8)
5964 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
5965 }
5966
5967 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, strided_cn) {
5968 TEST_REQUIRES_X86_XOP;
5969 GemmMicrokernelTester()
5970 .mr(2)
5971 .nr(4)
5972 .kr(2)
5973 .sr(1)
5974 .m(2)
5975 .n(4)
5976 .k(8)
5977 .cn_stride(7)
5978 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
5979 }
5980
5981 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_eq_8_strided_a) {
5982 TEST_REQUIRES_X86_XOP;
5983 GemmMicrokernelTester()
5984 .mr(2)
5985 .nr(4)
5986 .kr(2)
5987 .sr(1)
5988 .m(2)
5989 .n(4)
5990 .k(8)
5991 .a_stride(11)
5992 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
5993 }
5994
5995 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_eq_8_subtile) {
5996 TEST_REQUIRES_X86_XOP;
5997 for (uint32_t m = 1; m <= 2; m++) {
5998 for (uint32_t n = 1; n <= 4; n++) {
5999 GemmMicrokernelTester()
6000 .mr(2)
6001 .nr(4)
6002 .kr(2)
6003 .sr(1)
6004 .m(m)
6005 .n(n)
6006 .k(8)
6007 .iterations(1)
6008 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
6009 }
6010 }
6011 }
6012
6013 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_eq_8_subtile_m) {
6014 TEST_REQUIRES_X86_XOP;
6015 for (uint32_t m = 1; m <= 2; m++) {
6016 GemmMicrokernelTester()
6017 .mr(2)
6018 .nr(4)
6019 .kr(2)
6020 .sr(1)
6021 .m(m)
6022 .n(4)
6023 .k(8)
6024 .iterations(1)
6025 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
6026 }
6027 }
6028
6029 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_eq_8_subtile_n) {
6030 TEST_REQUIRES_X86_XOP;
6031 for (uint32_t n = 1; n <= 4; n++) {
6032 GemmMicrokernelTester()
6033 .mr(2)
6034 .nr(4)
6035 .kr(2)
6036 .sr(1)
6037 .m(2)
6038 .n(n)
6039 .k(8)
6040 .iterations(1)
6041 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
6042 }
6043 }
6044
6045 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_lt_8) {
6046 TEST_REQUIRES_X86_XOP;
6047 for (size_t k = 1; k < 8; k++) {
6048 GemmMicrokernelTester()
6049 .mr(2)
6050 .nr(4)
6051 .kr(2)
6052 .sr(1)
6053 .m(2)
6054 .n(4)
6055 .k(k)
6056 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
6057 }
6058 }
6059
6060 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_lt_8_strided_a) {
6061 TEST_REQUIRES_X86_XOP;
6062 for (size_t k = 1; k < 8; k++) {
6063 GemmMicrokernelTester()
6064 .mr(2)
6065 .nr(4)
6066 .kr(2)
6067 .sr(1)
6068 .m(2)
6069 .n(4)
6070 .k(k)
6071 .a_stride(11)
6072 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
6073 }
6074 }
6075
6076 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_lt_8_subtile) {
6077 TEST_REQUIRES_X86_XOP;
6078 for (size_t k = 1; k < 8; k++) {
6079 for (uint32_t m = 1; m <= 2; m++) {
6080 for (uint32_t n = 1; n <= 4; n++) {
6081 GemmMicrokernelTester()
6082 .mr(2)
6083 .nr(4)
6084 .kr(2)
6085 .sr(1)
6086 .m(m)
6087 .n(n)
6088 .k(k)
6089 .iterations(1)
6090 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
6091 }
6092 }
6093 }
6094 }
6095
6096 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_gt_8) {
6097 TEST_REQUIRES_X86_XOP;
6098 for (size_t k = 9; k < 16; k++) {
6099 GemmMicrokernelTester()
6100 .mr(2)
6101 .nr(4)
6102 .kr(2)
6103 .sr(1)
6104 .m(2)
6105 .n(4)
6106 .k(k)
6107 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
6108 }
6109 }
6110
6111 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_gt_8_strided_a) {
6112 TEST_REQUIRES_X86_XOP;
6113 for (size_t k = 9; k < 16; k++) {
6114 GemmMicrokernelTester()
6115 .mr(2)
6116 .nr(4)
6117 .kr(2)
6118 .sr(1)
6119 .m(2)
6120 .n(4)
6121 .k(k)
6122 .a_stride(19)
6123 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
6124 }
6125 }
6126
6127 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_gt_8_subtile) {
6128 TEST_REQUIRES_X86_XOP;
6129 for (size_t k = 9; k < 16; k++) {
6130 for (uint32_t m = 1; m <= 2; m++) {
6131 for (uint32_t n = 1; n <= 4; n++) {
6132 GemmMicrokernelTester()
6133 .mr(2)
6134 .nr(4)
6135 .kr(2)
6136 .sr(1)
6137 .m(m)
6138 .n(n)
6139 .k(k)
6140 .iterations(1)
6141 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
6142 }
6143 }
6144 }
6145 }
6146
6147 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_div_8) {
6148 TEST_REQUIRES_X86_XOP;
6149 for (size_t k = 16; k <= 80; k += 8) {
6150 GemmMicrokernelTester()
6151 .mr(2)
6152 .nr(4)
6153 .kr(2)
6154 .sr(1)
6155 .m(2)
6156 .n(4)
6157 .k(k)
6158 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
6159 }
6160 }
6161
6162 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_div_8_strided_a) {
6163 TEST_REQUIRES_X86_XOP;
6164 for (size_t k = 16; k <= 80; k += 8) {
6165 GemmMicrokernelTester()
6166 .mr(2)
6167 .nr(4)
6168 .kr(2)
6169 .sr(1)
6170 .m(2)
6171 .n(4)
6172 .k(k)
6173 .a_stride(83)
6174 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
6175 }
6176 }
6177
6178 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_div_8_subtile) {
6179 TEST_REQUIRES_X86_XOP;
6180 for (size_t k = 16; k <= 80; k += 8) {
6181 for (uint32_t m = 1; m <= 2; m++) {
6182 for (uint32_t n = 1; n <= 4; n++) {
6183 GemmMicrokernelTester()
6184 .mr(2)
6185 .nr(4)
6186 .kr(2)
6187 .sr(1)
6188 .m(m)
6189 .n(n)
6190 .k(k)
6191 .iterations(1)
6192 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
6193 }
6194 }
6195 }
6196 }
6197
6198 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, n_gt_4) {
6199 TEST_REQUIRES_X86_XOP;
6200 for (uint32_t n = 5; n < 8; n++) {
6201 for (size_t k = 1; k <= 40; k += 9) {
6202 GemmMicrokernelTester()
6203 .mr(2)
6204 .nr(4)
6205 .kr(2)
6206 .sr(1)
6207 .m(2)
6208 .n(4)
6209 .k(k)
6210 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
6211 }
6212 }
6213 }
6214
6215 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, n_gt_4_strided_cn) {
6216 TEST_REQUIRES_X86_XOP;
6217 for (uint32_t n = 5; n < 8; n++) {
6218 for (size_t k = 1; k <= 40; k += 9) {
6219 GemmMicrokernelTester()
6220 .mr(2)
6221 .nr(4)
6222 .kr(2)
6223 .sr(1)
6224 .m(2)
6225 .n(4)
6226 .k(k)
6227 .cn_stride(7)
6228 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
6229 }
6230 }
6231 }
6232
6233 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, n_gt_4_strided_a) {
6234 TEST_REQUIRES_X86_XOP;
6235 for (uint32_t n = 5; n < 8; n++) {
6236 for (size_t k = 1; k <= 40; k += 9) {
6237 GemmMicrokernelTester()
6238 .mr(2)
6239 .nr(4)
6240 .kr(2)
6241 .sr(1)
6242 .m(2)
6243 .n(n)
6244 .k(k)
6245 .a_stride(43)
6246 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
6247 }
6248 }
6249 }
6250
6251 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, n_gt_4_subtile) {
6252 TEST_REQUIRES_X86_XOP;
6253 for (uint32_t n = 5; n < 8; n++) {
6254 for (size_t k = 1; k <= 40; k += 9) {
6255 for (uint32_t m = 1; m <= 2; m++) {
6256 GemmMicrokernelTester()
6257 .mr(2)
6258 .nr(4)
6259 .kr(2)
6260 .sr(1)
6261 .m(m)
6262 .n(n)
6263 .k(k)
6264 .iterations(1)
6265 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
6266 }
6267 }
6268 }
6269 }
6270
6271 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, n_div_4) {
6272 TEST_REQUIRES_X86_XOP;
6273 for (uint32_t n = 8; n <= 12; n += 4) {
6274 for (size_t k = 1; k <= 40; k += 9) {
6275 GemmMicrokernelTester()
6276 .mr(2)
6277 .nr(4)
6278 .kr(2)
6279 .sr(1)
6280 .m(2)
6281 .n(4)
6282 .k(k)
6283 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
6284 }
6285 }
6286 }
6287
6288 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, n_div_4_strided_cn) {
6289 TEST_REQUIRES_X86_XOP;
6290 for (uint32_t n = 8; n <= 12; n += 4) {
6291 for (size_t k = 1; k <= 40; k += 9) {
6292 GemmMicrokernelTester()
6293 .mr(2)
6294 .nr(4)
6295 .kr(2)
6296 .sr(1)
6297 .m(2)
6298 .n(n)
6299 .k(k)
6300 .cn_stride(7)
6301 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
6302 }
6303 }
6304 }
6305
6306 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, n_div_4_strided_a) {
6307 TEST_REQUIRES_X86_XOP;
6308 for (uint32_t n = 8; n <= 12; n += 4) {
6309 for (size_t k = 1; k <= 40; k += 9) {
6310 GemmMicrokernelTester()
6311 .mr(2)
6312 .nr(4)
6313 .kr(2)
6314 .sr(1)
6315 .m(2)
6316 .n(n)
6317 .k(k)
6318 .a_stride(43)
6319 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
6320 }
6321 }
6322 }
6323
6324 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, n_div_4_subtile) {
6325 TEST_REQUIRES_X86_XOP;
6326 for (uint32_t n = 8; n <= 12; n += 4) {
6327 for (size_t k = 1; k <= 40; k += 9) {
6328 for (uint32_t m = 1; m <= 2; m++) {
6329 GemmMicrokernelTester()
6330 .mr(2)
6331 .nr(4)
6332 .kr(2)
6333 .sr(1)
6334 .m(m)
6335 .n(n)
6336 .k(k)
6337 .iterations(1)
6338 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
6339 }
6340 }
6341 }
6342 }
6343
6344 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, strided_cm_subtile) {
6345 TEST_REQUIRES_X86_XOP;
6346 for (size_t k = 1; k <= 40; k += 9) {
6347 for (uint32_t m = 1; m <= 2; m++) {
6348 for (uint32_t n = 1; n <= 4; n++) {
6349 GemmMicrokernelTester()
6350 .mr(2)
6351 .nr(4)
6352 .kr(2)
6353 .sr(1)
6354 .m(m)
6355 .n(n)
6356 .k(k)
6357 .cm_stride(7)
6358 .iterations(1)
6359 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
6360 }
6361 }
6362 }
6363 }
6364
6365 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, qmin) {
6366 TEST_REQUIRES_X86_XOP;
6367 GemmMicrokernelTester()
6368 .mr(2)
6369 .nr(4)
6370 .kr(2)
6371 .sr(1)
6372 .m(2)
6373 .n(4)
6374 .k(8)
6375 .qmin(128)
6376 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
6377 }
6378
6379 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, qmax) {
6380 TEST_REQUIRES_X86_XOP;
6381 GemmMicrokernelTester()
6382 .mr(2)
6383 .nr(4)
6384 .kr(2)
6385 .sr(1)
6386 .m(2)
6387 .n(4)
6388 .k(8)
6389 .qmax(128)
6390 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
6391 }
6392
6393 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, strided_cm) {
6394 TEST_REQUIRES_X86_XOP;
6395 GemmMicrokernelTester()
6396 .mr(2)
6397 .nr(4)
6398 .kr(2)
6399 .sr(1)
6400 .m(2)
6401 .n(4)
6402 .k(8)
6403 .cm_stride(7)
6404 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
6405 }
6406#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
6407
6408
6409#if XNN_ARCH_X86 || XNN_ARCH_X86_64
6410 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_eq_8) {
6411 TEST_REQUIRES_X86_XOP;
6412 GemmMicrokernelTester()
6413 .mr(3)
6414 .nr(4)
6415 .kr(2)
6416 .sr(1)
6417 .m(3)
6418 .n(4)
6419 .k(8)
6420 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
6421 }
6422
6423 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, strided_cn) {
6424 TEST_REQUIRES_X86_XOP;
6425 GemmMicrokernelTester()
6426 .mr(3)
6427 .nr(4)
6428 .kr(2)
6429 .sr(1)
6430 .m(3)
6431 .n(4)
6432 .k(8)
6433 .cn_stride(7)
6434 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
6435 }
6436
6437 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_eq_8_strided_a) {
6438 TEST_REQUIRES_X86_XOP;
6439 GemmMicrokernelTester()
6440 .mr(3)
6441 .nr(4)
6442 .kr(2)
6443 .sr(1)
6444 .m(3)
6445 .n(4)
6446 .k(8)
6447 .a_stride(11)
6448 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
6449 }
6450
6451 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_eq_8_subtile) {
6452 TEST_REQUIRES_X86_XOP;
6453 for (uint32_t m = 1; m <= 3; m++) {
6454 for (uint32_t n = 1; n <= 4; n++) {
6455 GemmMicrokernelTester()
6456 .mr(3)
6457 .nr(4)
6458 .kr(2)
6459 .sr(1)
6460 .m(m)
6461 .n(n)
6462 .k(8)
6463 .iterations(1)
6464 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
6465 }
6466 }
6467 }
6468
6469 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_eq_8_subtile_m) {
6470 TEST_REQUIRES_X86_XOP;
6471 for (uint32_t m = 1; m <= 3; m++) {
6472 GemmMicrokernelTester()
6473 .mr(3)
6474 .nr(4)
6475 .kr(2)
6476 .sr(1)
6477 .m(m)
6478 .n(4)
6479 .k(8)
6480 .iterations(1)
6481 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
6482 }
6483 }
6484
6485 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_eq_8_subtile_n) {
6486 TEST_REQUIRES_X86_XOP;
6487 for (uint32_t n = 1; n <= 4; n++) {
6488 GemmMicrokernelTester()
6489 .mr(3)
6490 .nr(4)
6491 .kr(2)
6492 .sr(1)
6493 .m(3)
6494 .n(n)
6495 .k(8)
6496 .iterations(1)
6497 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
6498 }
6499 }
6500
6501 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_lt_8) {
6502 TEST_REQUIRES_X86_XOP;
6503 for (size_t k = 1; k < 8; k++) {
6504 GemmMicrokernelTester()
6505 .mr(3)
6506 .nr(4)
6507 .kr(2)
6508 .sr(1)
6509 .m(3)
6510 .n(4)
6511 .k(k)
6512 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
6513 }
6514 }
6515
6516 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_lt_8_strided_a) {
6517 TEST_REQUIRES_X86_XOP;
6518 for (size_t k = 1; k < 8; k++) {
6519 GemmMicrokernelTester()
6520 .mr(3)
6521 .nr(4)
6522 .kr(2)
6523 .sr(1)
6524 .m(3)
6525 .n(4)
6526 .k(k)
6527 .a_stride(11)
6528 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
6529 }
6530 }
6531
6532 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_lt_8_subtile) {
6533 TEST_REQUIRES_X86_XOP;
6534 for (size_t k = 1; k < 8; k++) {
6535 for (uint32_t m = 1; m <= 3; m++) {
6536 for (uint32_t n = 1; n <= 4; n++) {
6537 GemmMicrokernelTester()
6538 .mr(3)
6539 .nr(4)
6540 .kr(2)
6541 .sr(1)
6542 .m(m)
6543 .n(n)
6544 .k(k)
6545 .iterations(1)
6546 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
6547 }
6548 }
6549 }
6550 }
6551
6552 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_gt_8) {
6553 TEST_REQUIRES_X86_XOP;
6554 for (size_t k = 9; k < 16; k++) {
6555 GemmMicrokernelTester()
6556 .mr(3)
6557 .nr(4)
6558 .kr(2)
6559 .sr(1)
6560 .m(3)
6561 .n(4)
6562 .k(k)
6563 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
6564 }
6565 }
6566
6567 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_gt_8_strided_a) {
6568 TEST_REQUIRES_X86_XOP;
6569 for (size_t k = 9; k < 16; k++) {
6570 GemmMicrokernelTester()
6571 .mr(3)
6572 .nr(4)
6573 .kr(2)
6574 .sr(1)
6575 .m(3)
6576 .n(4)
6577 .k(k)
6578 .a_stride(19)
6579 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
6580 }
6581 }
6582
6583 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_gt_8_subtile) {
6584 TEST_REQUIRES_X86_XOP;
6585 for (size_t k = 9; k < 16; k++) {
6586 for (uint32_t m = 1; m <= 3; m++) {
6587 for (uint32_t n = 1; n <= 4; n++) {
6588 GemmMicrokernelTester()
6589 .mr(3)
6590 .nr(4)
6591 .kr(2)
6592 .sr(1)
6593 .m(m)
6594 .n(n)
6595 .k(k)
6596 .iterations(1)
6597 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
6598 }
6599 }
6600 }
6601 }
6602
6603 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_div_8) {
6604 TEST_REQUIRES_X86_XOP;
6605 for (size_t k = 16; k <= 80; k += 8) {
6606 GemmMicrokernelTester()
6607 .mr(3)
6608 .nr(4)
6609 .kr(2)
6610 .sr(1)
6611 .m(3)
6612 .n(4)
6613 .k(k)
6614 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
6615 }
6616 }
6617
6618 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_div_8_strided_a) {
6619 TEST_REQUIRES_X86_XOP;
6620 for (size_t k = 16; k <= 80; k += 8) {
6621 GemmMicrokernelTester()
6622 .mr(3)
6623 .nr(4)
6624 .kr(2)
6625 .sr(1)
6626 .m(3)
6627 .n(4)
6628 .k(k)
6629 .a_stride(83)
6630 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
6631 }
6632 }
6633
6634 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_div_8_subtile) {
6635 TEST_REQUIRES_X86_XOP;
6636 for (size_t k = 16; k <= 80; k += 8) {
6637 for (uint32_t m = 1; m <= 3; m++) {
6638 for (uint32_t n = 1; n <= 4; n++) {
6639 GemmMicrokernelTester()
6640 .mr(3)
6641 .nr(4)
6642 .kr(2)
6643 .sr(1)
6644 .m(m)
6645 .n(n)
6646 .k(k)
6647 .iterations(1)
6648 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
6649 }
6650 }
6651 }
6652 }
6653
6654 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, n_gt_4) {
6655 TEST_REQUIRES_X86_XOP;
6656 for (uint32_t n = 5; n < 8; n++) {
6657 for (size_t k = 1; k <= 40; k += 9) {
6658 GemmMicrokernelTester()
6659 .mr(3)
6660 .nr(4)
6661 .kr(2)
6662 .sr(1)
6663 .m(3)
6664 .n(4)
6665 .k(k)
6666 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
6667 }
6668 }
6669 }
6670
6671 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, n_gt_4_strided_cn) {
6672 TEST_REQUIRES_X86_XOP;
6673 for (uint32_t n = 5; n < 8; n++) {
6674 for (size_t k = 1; k <= 40; k += 9) {
6675 GemmMicrokernelTester()
6676 .mr(3)
6677 .nr(4)
6678 .kr(2)
6679 .sr(1)
6680 .m(3)
6681 .n(4)
6682 .k(k)
6683 .cn_stride(7)
6684 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
6685 }
6686 }
6687 }
6688
6689 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, n_gt_4_strided_a) {
6690 TEST_REQUIRES_X86_XOP;
6691 for (uint32_t n = 5; n < 8; n++) {
6692 for (size_t k = 1; k <= 40; k += 9) {
6693 GemmMicrokernelTester()
6694 .mr(3)
6695 .nr(4)
6696 .kr(2)
6697 .sr(1)
6698 .m(3)
6699 .n(n)
6700 .k(k)
6701 .a_stride(43)
6702 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
6703 }
6704 }
6705 }
6706
6707 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, n_gt_4_subtile) {
6708 TEST_REQUIRES_X86_XOP;
6709 for (uint32_t n = 5; n < 8; n++) {
6710 for (size_t k = 1; k <= 40; k += 9) {
6711 for (uint32_t m = 1; m <= 3; m++) {
6712 GemmMicrokernelTester()
6713 .mr(3)
6714 .nr(4)
6715 .kr(2)
6716 .sr(1)
6717 .m(m)
6718 .n(n)
6719 .k(k)
6720 .iterations(1)
6721 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
6722 }
6723 }
6724 }
6725 }
6726
6727 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, n_div_4) {
6728 TEST_REQUIRES_X86_XOP;
6729 for (uint32_t n = 8; n <= 12; n += 4) {
6730 for (size_t k = 1; k <= 40; k += 9) {
6731 GemmMicrokernelTester()
6732 .mr(3)
6733 .nr(4)
6734 .kr(2)
6735 .sr(1)
6736 .m(3)
6737 .n(4)
6738 .k(k)
6739 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
6740 }
6741 }
6742 }
6743
6744 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, n_div_4_strided_cn) {
6745 TEST_REQUIRES_X86_XOP;
6746 for (uint32_t n = 8; n <= 12; n += 4) {
6747 for (size_t k = 1; k <= 40; k += 9) {
6748 GemmMicrokernelTester()
6749 .mr(3)
6750 .nr(4)
6751 .kr(2)
6752 .sr(1)
6753 .m(3)
6754 .n(n)
6755 .k(k)
6756 .cn_stride(7)
6757 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
6758 }
6759 }
6760 }
6761
6762 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, n_div_4_strided_a) {
6763 TEST_REQUIRES_X86_XOP;
6764 for (uint32_t n = 8; n <= 12; n += 4) {
6765 for (size_t k = 1; k <= 40; k += 9) {
6766 GemmMicrokernelTester()
6767 .mr(3)
6768 .nr(4)
6769 .kr(2)
6770 .sr(1)
6771 .m(3)
6772 .n(n)
6773 .k(k)
6774 .a_stride(43)
6775 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
6776 }
6777 }
6778 }
6779
6780 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, n_div_4_subtile) {
6781 TEST_REQUIRES_X86_XOP;
6782 for (uint32_t n = 8; n <= 12; n += 4) {
6783 for (size_t k = 1; k <= 40; k += 9) {
6784 for (uint32_t m = 1; m <= 3; m++) {
6785 GemmMicrokernelTester()
6786 .mr(3)
6787 .nr(4)
6788 .kr(2)
6789 .sr(1)
6790 .m(m)
6791 .n(n)
6792 .k(k)
6793 .iterations(1)
6794 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
6795 }
6796 }
6797 }
6798 }
6799
6800 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, strided_cm_subtile) {
6801 TEST_REQUIRES_X86_XOP;
6802 for (size_t k = 1; k <= 40; k += 9) {
6803 for (uint32_t m = 1; m <= 3; m++) {
6804 for (uint32_t n = 1; n <= 4; n++) {
6805 GemmMicrokernelTester()
6806 .mr(3)
6807 .nr(4)
6808 .kr(2)
6809 .sr(1)
6810 .m(m)
6811 .n(n)
6812 .k(k)
6813 .cm_stride(7)
6814 .iterations(1)
6815 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
6816 }
6817 }
6818 }
6819 }
6820
6821 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, qmin) {
6822 TEST_REQUIRES_X86_XOP;
6823 GemmMicrokernelTester()
6824 .mr(3)
6825 .nr(4)
6826 .kr(2)
6827 .sr(1)
6828 .m(3)
6829 .n(4)
6830 .k(8)
6831 .qmin(128)
6832 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
6833 }
6834
6835 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, qmax) {
6836 TEST_REQUIRES_X86_XOP;
6837 GemmMicrokernelTester()
6838 .mr(3)
6839 .nr(4)
6840 .kr(2)
6841 .sr(1)
6842 .m(3)
6843 .n(4)
6844 .k(8)
6845 .qmax(128)
6846 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
6847 }
6848
6849 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, strided_cm) {
6850 TEST_REQUIRES_X86_XOP;
6851 GemmMicrokernelTester()
6852 .mr(3)
6853 .nr(4)
6854 .kr(2)
6855 .sr(1)
6856 .m(3)
6857 .n(4)
6858 .k(8)
6859 .cm_stride(7)
6860 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
6861 }
6862#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
6863
6864
6865#if XNN_ARCH_X86 || XNN_ARCH_X86_64
6866 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_eq_8) {
6867 TEST_REQUIRES_X86_XOP;
6868 GemmMicrokernelTester()
6869 .mr(4)
6870 .nr(4)
6871 .kr(2)
6872 .sr(1)
6873 .m(4)
6874 .n(4)
6875 .k(8)
6876 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
6877 }
6878
6879 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, strided_cn) {
6880 TEST_REQUIRES_X86_XOP;
6881 GemmMicrokernelTester()
6882 .mr(4)
6883 .nr(4)
6884 .kr(2)
6885 .sr(1)
6886 .m(4)
6887 .n(4)
6888 .k(8)
6889 .cn_stride(7)
6890 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
6891 }
6892
6893 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_eq_8_strided_a) {
6894 TEST_REQUIRES_X86_XOP;
6895 GemmMicrokernelTester()
6896 .mr(4)
6897 .nr(4)
6898 .kr(2)
6899 .sr(1)
6900 .m(4)
6901 .n(4)
6902 .k(8)
6903 .a_stride(11)
6904 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
6905 }
6906
6907 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_eq_8_subtile) {
6908 TEST_REQUIRES_X86_XOP;
6909 for (uint32_t m = 1; m <= 4; m++) {
6910 for (uint32_t n = 1; n <= 4; n++) {
6911 GemmMicrokernelTester()
6912 .mr(4)
6913 .nr(4)
6914 .kr(2)
6915 .sr(1)
6916 .m(m)
6917 .n(n)
6918 .k(8)
6919 .iterations(1)
6920 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
6921 }
6922 }
6923 }
6924
6925 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_eq_8_subtile_m) {
6926 TEST_REQUIRES_X86_XOP;
6927 for (uint32_t m = 1; m <= 4; m++) {
6928 GemmMicrokernelTester()
6929 .mr(4)
6930 .nr(4)
6931 .kr(2)
6932 .sr(1)
6933 .m(m)
6934 .n(4)
6935 .k(8)
6936 .iterations(1)
6937 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
6938 }
6939 }
6940
6941 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_eq_8_subtile_n) {
6942 TEST_REQUIRES_X86_XOP;
6943 for (uint32_t n = 1; n <= 4; n++) {
6944 GemmMicrokernelTester()
6945 .mr(4)
6946 .nr(4)
6947 .kr(2)
6948 .sr(1)
6949 .m(4)
6950 .n(n)
6951 .k(8)
6952 .iterations(1)
6953 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
6954 }
6955 }
6956
6957 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_lt_8) {
6958 TEST_REQUIRES_X86_XOP;
6959 for (size_t k = 1; k < 8; k++) {
6960 GemmMicrokernelTester()
6961 .mr(4)
6962 .nr(4)
6963 .kr(2)
6964 .sr(1)
6965 .m(4)
6966 .n(4)
6967 .k(k)
6968 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
6969 }
6970 }
6971
6972 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_lt_8_strided_a) {
6973 TEST_REQUIRES_X86_XOP;
6974 for (size_t k = 1; k < 8; k++) {
6975 GemmMicrokernelTester()
6976 .mr(4)
6977 .nr(4)
6978 .kr(2)
6979 .sr(1)
6980 .m(4)
6981 .n(4)
6982 .k(k)
6983 .a_stride(11)
6984 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
6985 }
6986 }
6987
6988 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_lt_8_subtile) {
6989 TEST_REQUIRES_X86_XOP;
6990 for (size_t k = 1; k < 8; k++) {
6991 for (uint32_t m = 1; m <= 4; m++) {
6992 for (uint32_t n = 1; n <= 4; n++) {
6993 GemmMicrokernelTester()
6994 .mr(4)
6995 .nr(4)
6996 .kr(2)
6997 .sr(1)
6998 .m(m)
6999 .n(n)
7000 .k(k)
7001 .iterations(1)
7002 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
7003 }
7004 }
7005 }
7006 }
7007
7008 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_gt_8) {
7009 TEST_REQUIRES_X86_XOP;
7010 for (size_t k = 9; k < 16; k++) {
7011 GemmMicrokernelTester()
7012 .mr(4)
7013 .nr(4)
7014 .kr(2)
7015 .sr(1)
7016 .m(4)
7017 .n(4)
7018 .k(k)
7019 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
7020 }
7021 }
7022
7023 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_gt_8_strided_a) {
7024 TEST_REQUIRES_X86_XOP;
7025 for (size_t k = 9; k < 16; k++) {
7026 GemmMicrokernelTester()
7027 .mr(4)
7028 .nr(4)
7029 .kr(2)
7030 .sr(1)
7031 .m(4)
7032 .n(4)
7033 .k(k)
7034 .a_stride(19)
7035 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
7036 }
7037 }
7038
7039 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_gt_8_subtile) {
7040 TEST_REQUIRES_X86_XOP;
7041 for (size_t k = 9; k < 16; k++) {
7042 for (uint32_t m = 1; m <= 4; m++) {
7043 for (uint32_t n = 1; n <= 4; n++) {
7044 GemmMicrokernelTester()
7045 .mr(4)
7046 .nr(4)
7047 .kr(2)
7048 .sr(1)
7049 .m(m)
7050 .n(n)
7051 .k(k)
7052 .iterations(1)
7053 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
7054 }
7055 }
7056 }
7057 }
7058
7059 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_div_8) {
7060 TEST_REQUIRES_X86_XOP;
7061 for (size_t k = 16; k <= 80; k += 8) {
7062 GemmMicrokernelTester()
7063 .mr(4)
7064 .nr(4)
7065 .kr(2)
7066 .sr(1)
7067 .m(4)
7068 .n(4)
7069 .k(k)
7070 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
7071 }
7072 }
7073
7074 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_div_8_strided_a) {
7075 TEST_REQUIRES_X86_XOP;
7076 for (size_t k = 16; k <= 80; k += 8) {
7077 GemmMicrokernelTester()
7078 .mr(4)
7079 .nr(4)
7080 .kr(2)
7081 .sr(1)
7082 .m(4)
7083 .n(4)
7084 .k(k)
7085 .a_stride(83)
7086 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
7087 }
7088 }
7089
7090 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_div_8_subtile) {
7091 TEST_REQUIRES_X86_XOP;
7092 for (size_t k = 16; k <= 80; k += 8) {
7093 for (uint32_t m = 1; m <= 4; m++) {
7094 for (uint32_t n = 1; n <= 4; n++) {
7095 GemmMicrokernelTester()
7096 .mr(4)
7097 .nr(4)
7098 .kr(2)
7099 .sr(1)
7100 .m(m)
7101 .n(n)
7102 .k(k)
7103 .iterations(1)
7104 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
7105 }
7106 }
7107 }
7108 }
7109
7110 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, n_gt_4) {
7111 TEST_REQUIRES_X86_XOP;
7112 for (uint32_t n = 5; n < 8; n++) {
7113 for (size_t k = 1; k <= 40; k += 9) {
7114 GemmMicrokernelTester()
7115 .mr(4)
7116 .nr(4)
7117 .kr(2)
7118 .sr(1)
7119 .m(4)
7120 .n(4)
7121 .k(k)
7122 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
7123 }
7124 }
7125 }
7126
7127 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, n_gt_4_strided_cn) {
7128 TEST_REQUIRES_X86_XOP;
7129 for (uint32_t n = 5; n < 8; n++) {
7130 for (size_t k = 1; k <= 40; k += 9) {
7131 GemmMicrokernelTester()
7132 .mr(4)
7133 .nr(4)
7134 .kr(2)
7135 .sr(1)
7136 .m(4)
7137 .n(4)
7138 .k(k)
7139 .cn_stride(7)
7140 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
7141 }
7142 }
7143 }
7144
7145 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, n_gt_4_strided_a) {
7146 TEST_REQUIRES_X86_XOP;
7147 for (uint32_t n = 5; n < 8; n++) {
7148 for (size_t k = 1; k <= 40; k += 9) {
7149 GemmMicrokernelTester()
7150 .mr(4)
7151 .nr(4)
7152 .kr(2)
7153 .sr(1)
7154 .m(4)
7155 .n(n)
7156 .k(k)
7157 .a_stride(43)
7158 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
7159 }
7160 }
7161 }
7162
7163 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, n_gt_4_subtile) {
7164 TEST_REQUIRES_X86_XOP;
7165 for (uint32_t n = 5; n < 8; n++) {
7166 for (size_t k = 1; k <= 40; k += 9) {
7167 for (uint32_t m = 1; m <= 4; m++) {
7168 GemmMicrokernelTester()
7169 .mr(4)
7170 .nr(4)
7171 .kr(2)
7172 .sr(1)
7173 .m(m)
7174 .n(n)
7175 .k(k)
7176 .iterations(1)
7177 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
7178 }
7179 }
7180 }
7181 }
7182
7183 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, n_div_4) {
7184 TEST_REQUIRES_X86_XOP;
7185 for (uint32_t n = 8; n <= 12; n += 4) {
7186 for (size_t k = 1; k <= 40; k += 9) {
7187 GemmMicrokernelTester()
7188 .mr(4)
7189 .nr(4)
7190 .kr(2)
7191 .sr(1)
7192 .m(4)
7193 .n(4)
7194 .k(k)
7195 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
7196 }
7197 }
7198 }
7199
7200 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, n_div_4_strided_cn) {
7201 TEST_REQUIRES_X86_XOP;
7202 for (uint32_t n = 8; n <= 12; n += 4) {
7203 for (size_t k = 1; k <= 40; k += 9) {
7204 GemmMicrokernelTester()
7205 .mr(4)
7206 .nr(4)
7207 .kr(2)
7208 .sr(1)
7209 .m(4)
7210 .n(n)
7211 .k(k)
7212 .cn_stride(7)
7213 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
7214 }
7215 }
7216 }
7217
7218 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, n_div_4_strided_a) {
7219 TEST_REQUIRES_X86_XOP;
7220 for (uint32_t n = 8; n <= 12; n += 4) {
7221 for (size_t k = 1; k <= 40; k += 9) {
7222 GemmMicrokernelTester()
7223 .mr(4)
7224 .nr(4)
7225 .kr(2)
7226 .sr(1)
7227 .m(4)
7228 .n(n)
7229 .k(k)
7230 .a_stride(43)
7231 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
7232 }
7233 }
7234 }
7235
7236 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, n_div_4_subtile) {
7237 TEST_REQUIRES_X86_XOP;
7238 for (uint32_t n = 8; n <= 12; n += 4) {
7239 for (size_t k = 1; k <= 40; k += 9) {
7240 for (uint32_t m = 1; m <= 4; m++) {
7241 GemmMicrokernelTester()
7242 .mr(4)
7243 .nr(4)
7244 .kr(2)
7245 .sr(1)
7246 .m(m)
7247 .n(n)
7248 .k(k)
7249 .iterations(1)
7250 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
7251 }
7252 }
7253 }
7254 }
7255
7256 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, strided_cm_subtile) {
7257 TEST_REQUIRES_X86_XOP;
7258 for (size_t k = 1; k <= 40; k += 9) {
7259 for (uint32_t m = 1; m <= 4; m++) {
7260 for (uint32_t n = 1; n <= 4; n++) {
7261 GemmMicrokernelTester()
7262 .mr(4)
7263 .nr(4)
7264 .kr(2)
7265 .sr(1)
7266 .m(m)
7267 .n(n)
7268 .k(k)
7269 .cm_stride(7)
7270 .iterations(1)
7271 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
7272 }
7273 }
7274 }
7275 }
7276
7277 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, qmin) {
7278 TEST_REQUIRES_X86_XOP;
7279 GemmMicrokernelTester()
7280 .mr(4)
7281 .nr(4)
7282 .kr(2)
7283 .sr(1)
7284 .m(4)
7285 .n(4)
7286 .k(8)
7287 .qmin(128)
7288 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
7289 }
7290
7291 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, qmax) {
7292 TEST_REQUIRES_X86_XOP;
7293 GemmMicrokernelTester()
7294 .mr(4)
7295 .nr(4)
7296 .kr(2)
7297 .sr(1)
7298 .m(4)
7299 .n(4)
7300 .k(8)
7301 .qmax(128)
7302 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
7303 }
7304
7305 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, strided_cm) {
7306 TEST_REQUIRES_X86_XOP;
7307 GemmMicrokernelTester()
7308 .mr(4)
7309 .nr(4)
7310 .kr(2)
7311 .sr(1)
7312 .m(4)
7313 .n(4)
7314 .k(8)
7315 .cm_stride(7)
7316 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
7317 }
7318#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
7319
7320
7321#if XNN_ARCH_X86 || XNN_ARCH_X86_64
7322 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_eq_8) {
7323 TEST_REQUIRES_X86_SSE2;
7324 GemmMicrokernelTester()
7325 .mr(1)
7326 .nr(4)
7327 .kr(2)
7328 .sr(1)
7329 .m(1)
7330 .n(4)
7331 .k(8)
7332 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
7333 }
7334
7335 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, strided_cn) {
7336 TEST_REQUIRES_X86_SSE2;
7337 GemmMicrokernelTester()
7338 .mr(1)
7339 .nr(4)
7340 .kr(2)
7341 .sr(1)
7342 .m(1)
7343 .n(4)
7344 .k(8)
7345 .cn_stride(7)
7346 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
7347 }
7348
7349 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_eq_8_strided_a) {
7350 TEST_REQUIRES_X86_SSE2;
7351 GemmMicrokernelTester()
7352 .mr(1)
7353 .nr(4)
7354 .kr(2)
7355 .sr(1)
7356 .m(1)
7357 .n(4)
7358 .k(8)
7359 .a_stride(11)
7360 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
7361 }
7362
7363 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_eq_8_subtile) {
7364 TEST_REQUIRES_X86_SSE2;
7365 for (uint32_t m = 1; m <= 1; m++) {
7366 for (uint32_t n = 1; n <= 4; n++) {
7367 GemmMicrokernelTester()
7368 .mr(1)
7369 .nr(4)
7370 .kr(2)
7371 .sr(1)
7372 .m(m)
7373 .n(n)
7374 .k(8)
7375 .iterations(1)
7376 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
7377 }
7378 }
7379 }
7380
7381 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_eq_8_subtile_m) {
7382 TEST_REQUIRES_X86_SSE2;
7383 for (uint32_t m = 1; m <= 1; m++) {
7384 GemmMicrokernelTester()
7385 .mr(1)
7386 .nr(4)
7387 .kr(2)
7388 .sr(1)
7389 .m(m)
7390 .n(4)
7391 .k(8)
7392 .iterations(1)
7393 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
7394 }
7395 }
7396
7397 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_eq_8_subtile_n) {
7398 TEST_REQUIRES_X86_SSE2;
7399 for (uint32_t n = 1; n <= 4; n++) {
7400 GemmMicrokernelTester()
7401 .mr(1)
7402 .nr(4)
7403 .kr(2)
7404 .sr(1)
7405 .m(1)
7406 .n(n)
7407 .k(8)
7408 .iterations(1)
7409 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
7410 }
7411 }
7412
7413 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_lt_8) {
7414 TEST_REQUIRES_X86_SSE2;
7415 for (size_t k = 1; k < 8; k++) {
7416 GemmMicrokernelTester()
7417 .mr(1)
7418 .nr(4)
7419 .kr(2)
7420 .sr(1)
7421 .m(1)
7422 .n(4)
7423 .k(k)
7424 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
7425 }
7426 }
7427
7428 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_lt_8_strided_a) {
7429 TEST_REQUIRES_X86_SSE2;
7430 for (size_t k = 1; k < 8; k++) {
7431 GemmMicrokernelTester()
7432 .mr(1)
7433 .nr(4)
7434 .kr(2)
7435 .sr(1)
7436 .m(1)
7437 .n(4)
7438 .k(k)
7439 .a_stride(11)
7440 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
7441 }
7442 }
7443
7444 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_lt_8_subtile) {
7445 TEST_REQUIRES_X86_SSE2;
7446 for (size_t k = 1; k < 8; k++) {
7447 for (uint32_t m = 1; m <= 1; m++) {
7448 for (uint32_t n = 1; n <= 4; n++) {
7449 GemmMicrokernelTester()
7450 .mr(1)
7451 .nr(4)
7452 .kr(2)
7453 .sr(1)
7454 .m(m)
7455 .n(n)
7456 .k(k)
7457 .iterations(1)
7458 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
7459 }
7460 }
7461 }
7462 }
7463
7464 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_gt_8) {
7465 TEST_REQUIRES_X86_SSE2;
7466 for (size_t k = 9; k < 16; k++) {
7467 GemmMicrokernelTester()
7468 .mr(1)
7469 .nr(4)
7470 .kr(2)
7471 .sr(1)
7472 .m(1)
7473 .n(4)
7474 .k(k)
7475 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
7476 }
7477 }
7478
7479 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_gt_8_strided_a) {
7480 TEST_REQUIRES_X86_SSE2;
7481 for (size_t k = 9; k < 16; k++) {
7482 GemmMicrokernelTester()
7483 .mr(1)
7484 .nr(4)
7485 .kr(2)
7486 .sr(1)
7487 .m(1)
7488 .n(4)
7489 .k(k)
7490 .a_stride(19)
7491 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
7492 }
7493 }
7494
7495 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_gt_8_subtile) {
7496 TEST_REQUIRES_X86_SSE2;
7497 for (size_t k = 9; k < 16; k++) {
7498 for (uint32_t m = 1; m <= 1; m++) {
7499 for (uint32_t n = 1; n <= 4; n++) {
7500 GemmMicrokernelTester()
7501 .mr(1)
7502 .nr(4)
7503 .kr(2)
7504 .sr(1)
7505 .m(m)
7506 .n(n)
7507 .k(k)
7508 .iterations(1)
7509 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
7510 }
7511 }
7512 }
7513 }
7514
7515 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_div_8) {
7516 TEST_REQUIRES_X86_SSE2;
7517 for (size_t k = 16; k <= 80; k += 8) {
7518 GemmMicrokernelTester()
7519 .mr(1)
7520 .nr(4)
7521 .kr(2)
7522 .sr(1)
7523 .m(1)
7524 .n(4)
7525 .k(k)
7526 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
7527 }
7528 }
7529
7530 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_div_8_strided_a) {
7531 TEST_REQUIRES_X86_SSE2;
7532 for (size_t k = 16; k <= 80; k += 8) {
7533 GemmMicrokernelTester()
7534 .mr(1)
7535 .nr(4)
7536 .kr(2)
7537 .sr(1)
7538 .m(1)
7539 .n(4)
7540 .k(k)
7541 .a_stride(83)
7542 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
7543 }
7544 }
7545
7546 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_div_8_subtile) {
7547 TEST_REQUIRES_X86_SSE2;
7548 for (size_t k = 16; k <= 80; k += 8) {
7549 for (uint32_t m = 1; m <= 1; m++) {
7550 for (uint32_t n = 1; n <= 4; n++) {
7551 GemmMicrokernelTester()
7552 .mr(1)
7553 .nr(4)
7554 .kr(2)
7555 .sr(1)
7556 .m(m)
7557 .n(n)
7558 .k(k)
7559 .iterations(1)
7560 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
7561 }
7562 }
7563 }
7564 }
7565
7566 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, n_gt_4) {
7567 TEST_REQUIRES_X86_SSE2;
7568 for (uint32_t n = 5; n < 8; n++) {
7569 for (size_t k = 1; k <= 40; k += 9) {
7570 GemmMicrokernelTester()
7571 .mr(1)
7572 .nr(4)
7573 .kr(2)
7574 .sr(1)
7575 .m(1)
7576 .n(4)
7577 .k(k)
7578 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
7579 }
7580 }
7581 }
7582
7583 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, n_gt_4_strided_cn) {
7584 TEST_REQUIRES_X86_SSE2;
7585 for (uint32_t n = 5; n < 8; n++) {
7586 for (size_t k = 1; k <= 40; k += 9) {
7587 GemmMicrokernelTester()
7588 .mr(1)
7589 .nr(4)
7590 .kr(2)
7591 .sr(1)
7592 .m(1)
7593 .n(4)
7594 .k(k)
7595 .cn_stride(7)
7596 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
7597 }
7598 }
7599 }
7600
7601 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, n_gt_4_strided_a) {
7602 TEST_REQUIRES_X86_SSE2;
7603 for (uint32_t n = 5; n < 8; n++) {
7604 for (size_t k = 1; k <= 40; k += 9) {
7605 GemmMicrokernelTester()
7606 .mr(1)
7607 .nr(4)
7608 .kr(2)
7609 .sr(1)
7610 .m(1)
7611 .n(n)
7612 .k(k)
7613 .a_stride(43)
7614 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
7615 }
7616 }
7617 }
7618
7619 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, n_gt_4_subtile) {
7620 TEST_REQUIRES_X86_SSE2;
7621 for (uint32_t n = 5; n < 8; n++) {
7622 for (size_t k = 1; k <= 40; k += 9) {
7623 for (uint32_t m = 1; m <= 1; m++) {
7624 GemmMicrokernelTester()
7625 .mr(1)
7626 .nr(4)
7627 .kr(2)
7628 .sr(1)
7629 .m(m)
7630 .n(n)
7631 .k(k)
7632 .iterations(1)
7633 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
7634 }
7635 }
7636 }
7637 }
7638
7639 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, n_div_4) {
7640 TEST_REQUIRES_X86_SSE2;
7641 for (uint32_t n = 8; n <= 12; n += 4) {
7642 for (size_t k = 1; k <= 40; k += 9) {
7643 GemmMicrokernelTester()
7644 .mr(1)
7645 .nr(4)
7646 .kr(2)
7647 .sr(1)
7648 .m(1)
7649 .n(4)
7650 .k(k)
7651 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
7652 }
7653 }
7654 }
7655
7656 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, n_div_4_strided_cn) {
7657 TEST_REQUIRES_X86_SSE2;
7658 for (uint32_t n = 8; n <= 12; n += 4) {
7659 for (size_t k = 1; k <= 40; k += 9) {
7660 GemmMicrokernelTester()
7661 .mr(1)
7662 .nr(4)
7663 .kr(2)
7664 .sr(1)
7665 .m(1)
7666 .n(n)
7667 .k(k)
7668 .cn_stride(7)
7669 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
7670 }
7671 }
7672 }
7673
7674 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, n_div_4_strided_a) {
7675 TEST_REQUIRES_X86_SSE2;
7676 for (uint32_t n = 8; n <= 12; n += 4) {
7677 for (size_t k = 1; k <= 40; k += 9) {
7678 GemmMicrokernelTester()
7679 .mr(1)
7680 .nr(4)
7681 .kr(2)
7682 .sr(1)
7683 .m(1)
7684 .n(n)
7685 .k(k)
7686 .a_stride(43)
7687 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
7688 }
7689 }
7690 }
7691
7692 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, n_div_4_subtile) {
7693 TEST_REQUIRES_X86_SSE2;
7694 for (uint32_t n = 8; n <= 12; n += 4) {
7695 for (size_t k = 1; k <= 40; k += 9) {
7696 for (uint32_t m = 1; m <= 1; m++) {
7697 GemmMicrokernelTester()
7698 .mr(1)
7699 .nr(4)
7700 .kr(2)
7701 .sr(1)
7702 .m(m)
7703 .n(n)
7704 .k(k)
7705 .iterations(1)
7706 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
7707 }
7708 }
7709 }
7710 }
7711
7712 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, strided_cm_subtile) {
7713 TEST_REQUIRES_X86_SSE2;
7714 for (size_t k = 1; k <= 40; k += 9) {
7715 for (uint32_t m = 1; m <= 1; m++) {
7716 for (uint32_t n = 1; n <= 4; n++) {
7717 GemmMicrokernelTester()
7718 .mr(1)
7719 .nr(4)
7720 .kr(2)
7721 .sr(1)
7722 .m(m)
7723 .n(n)
7724 .k(k)
7725 .cm_stride(7)
7726 .iterations(1)
7727 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
7728 }
7729 }
7730 }
7731 }
7732
7733 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, qmin) {
7734 TEST_REQUIRES_X86_SSE2;
7735 GemmMicrokernelTester()
7736 .mr(1)
7737 .nr(4)
7738 .kr(2)
7739 .sr(1)
7740 .m(1)
7741 .n(4)
7742 .k(8)
7743 .qmin(128)
7744 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
7745 }
7746
7747 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, qmax) {
7748 TEST_REQUIRES_X86_SSE2;
7749 GemmMicrokernelTester()
7750 .mr(1)
7751 .nr(4)
7752 .kr(2)
7753 .sr(1)
7754 .m(1)
7755 .n(4)
7756 .k(8)
7757 .qmax(128)
7758 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
7759 }
7760
7761 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, strided_cm) {
7762 TEST_REQUIRES_X86_SSE2;
7763 GemmMicrokernelTester()
7764 .mr(1)
7765 .nr(4)
7766 .kr(2)
7767 .sr(1)
7768 .m(1)
7769 .n(4)
7770 .k(8)
7771 .cm_stride(7)
7772 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
7773 }
7774#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
7775
7776
7777#if XNN_ARCH_X86 || XNN_ARCH_X86_64
7778 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_eq_8) {
7779 TEST_REQUIRES_X86_SSE2;
7780 GemmMicrokernelTester()
7781 .mr(2)
7782 .nr(4)
7783 .kr(2)
7784 .sr(1)
7785 .m(2)
7786 .n(4)
7787 .k(8)
7788 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
7789 }
7790
7791 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, strided_cn) {
7792 TEST_REQUIRES_X86_SSE2;
7793 GemmMicrokernelTester()
7794 .mr(2)
7795 .nr(4)
7796 .kr(2)
7797 .sr(1)
7798 .m(2)
7799 .n(4)
7800 .k(8)
7801 .cn_stride(7)
7802 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
7803 }
7804
7805 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_eq_8_strided_a) {
7806 TEST_REQUIRES_X86_SSE2;
7807 GemmMicrokernelTester()
7808 .mr(2)
7809 .nr(4)
7810 .kr(2)
7811 .sr(1)
7812 .m(2)
7813 .n(4)
7814 .k(8)
7815 .a_stride(11)
7816 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
7817 }
7818
7819 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_eq_8_subtile) {
7820 TEST_REQUIRES_X86_SSE2;
7821 for (uint32_t m = 1; m <= 2; m++) {
7822 for (uint32_t n = 1; n <= 4; n++) {
7823 GemmMicrokernelTester()
7824 .mr(2)
7825 .nr(4)
7826 .kr(2)
7827 .sr(1)
7828 .m(m)
7829 .n(n)
7830 .k(8)
7831 .iterations(1)
7832 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
7833 }
7834 }
7835 }
7836
7837 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_eq_8_subtile_m) {
7838 TEST_REQUIRES_X86_SSE2;
7839 for (uint32_t m = 1; m <= 2; m++) {
7840 GemmMicrokernelTester()
7841 .mr(2)
7842 .nr(4)
7843 .kr(2)
7844 .sr(1)
7845 .m(m)
7846 .n(4)
7847 .k(8)
7848 .iterations(1)
7849 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
7850 }
7851 }
7852
7853 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_eq_8_subtile_n) {
7854 TEST_REQUIRES_X86_SSE2;
7855 for (uint32_t n = 1; n <= 4; n++) {
7856 GemmMicrokernelTester()
7857 .mr(2)
7858 .nr(4)
7859 .kr(2)
7860 .sr(1)
7861 .m(2)
7862 .n(n)
7863 .k(8)
7864 .iterations(1)
7865 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
7866 }
7867 }
7868
7869 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_lt_8) {
7870 TEST_REQUIRES_X86_SSE2;
7871 for (size_t k = 1; k < 8; k++) {
7872 GemmMicrokernelTester()
7873 .mr(2)
7874 .nr(4)
7875 .kr(2)
7876 .sr(1)
7877 .m(2)
7878 .n(4)
7879 .k(k)
7880 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
7881 }
7882 }
7883
7884 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_lt_8_strided_a) {
7885 TEST_REQUIRES_X86_SSE2;
7886 for (size_t k = 1; k < 8; k++) {
7887 GemmMicrokernelTester()
7888 .mr(2)
7889 .nr(4)
7890 .kr(2)
7891 .sr(1)
7892 .m(2)
7893 .n(4)
7894 .k(k)
7895 .a_stride(11)
7896 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
7897 }
7898 }
7899
7900 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_lt_8_subtile) {
7901 TEST_REQUIRES_X86_SSE2;
7902 for (size_t k = 1; k < 8; k++) {
7903 for (uint32_t m = 1; m <= 2; m++) {
7904 for (uint32_t n = 1; n <= 4; n++) {
7905 GemmMicrokernelTester()
7906 .mr(2)
7907 .nr(4)
7908 .kr(2)
7909 .sr(1)
7910 .m(m)
7911 .n(n)
7912 .k(k)
7913 .iterations(1)
7914 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
7915 }
7916 }
7917 }
7918 }
7919
7920 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_gt_8) {
7921 TEST_REQUIRES_X86_SSE2;
7922 for (size_t k = 9; k < 16; k++) {
7923 GemmMicrokernelTester()
7924 .mr(2)
7925 .nr(4)
7926 .kr(2)
7927 .sr(1)
7928 .m(2)
7929 .n(4)
7930 .k(k)
7931 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
7932 }
7933 }
7934
7935 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_gt_8_strided_a) {
7936 TEST_REQUIRES_X86_SSE2;
7937 for (size_t k = 9; k < 16; k++) {
7938 GemmMicrokernelTester()
7939 .mr(2)
7940 .nr(4)
7941 .kr(2)
7942 .sr(1)
7943 .m(2)
7944 .n(4)
7945 .k(k)
7946 .a_stride(19)
7947 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
7948 }
7949 }
7950
7951 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_gt_8_subtile) {
7952 TEST_REQUIRES_X86_SSE2;
7953 for (size_t k = 9; k < 16; k++) {
7954 for (uint32_t m = 1; m <= 2; m++) {
7955 for (uint32_t n = 1; n <= 4; n++) {
7956 GemmMicrokernelTester()
7957 .mr(2)
7958 .nr(4)
7959 .kr(2)
7960 .sr(1)
7961 .m(m)
7962 .n(n)
7963 .k(k)
7964 .iterations(1)
7965 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
7966 }
7967 }
7968 }
7969 }
7970
7971 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_div_8) {
7972 TEST_REQUIRES_X86_SSE2;
7973 for (size_t k = 16; k <= 80; k += 8) {
7974 GemmMicrokernelTester()
7975 .mr(2)
7976 .nr(4)
7977 .kr(2)
7978 .sr(1)
7979 .m(2)
7980 .n(4)
7981 .k(k)
7982 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
7983 }
7984 }
7985
7986 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_div_8_strided_a) {
7987 TEST_REQUIRES_X86_SSE2;
7988 for (size_t k = 16; k <= 80; k += 8) {
7989 GemmMicrokernelTester()
7990 .mr(2)
7991 .nr(4)
7992 .kr(2)
7993 .sr(1)
7994 .m(2)
7995 .n(4)
7996 .k(k)
7997 .a_stride(83)
7998 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
7999 }
8000 }
8001
8002 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_div_8_subtile) {
8003 TEST_REQUIRES_X86_SSE2;
8004 for (size_t k = 16; k <= 80; k += 8) {
8005 for (uint32_t m = 1; m <= 2; m++) {
8006 for (uint32_t n = 1; n <= 4; n++) {
8007 GemmMicrokernelTester()
8008 .mr(2)
8009 .nr(4)
8010 .kr(2)
8011 .sr(1)
8012 .m(m)
8013 .n(n)
8014 .k(k)
8015 .iterations(1)
8016 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
8017 }
8018 }
8019 }
8020 }
8021
8022 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, n_gt_4) {
8023 TEST_REQUIRES_X86_SSE2;
8024 for (uint32_t n = 5; n < 8; n++) {
8025 for (size_t k = 1; k <= 40; k += 9) {
8026 GemmMicrokernelTester()
8027 .mr(2)
8028 .nr(4)
8029 .kr(2)
8030 .sr(1)
8031 .m(2)
8032 .n(4)
8033 .k(k)
8034 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
8035 }
8036 }
8037 }
8038
8039 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, n_gt_4_strided_cn) {
8040 TEST_REQUIRES_X86_SSE2;
8041 for (uint32_t n = 5; n < 8; n++) {
8042 for (size_t k = 1; k <= 40; k += 9) {
8043 GemmMicrokernelTester()
8044 .mr(2)
8045 .nr(4)
8046 .kr(2)
8047 .sr(1)
8048 .m(2)
8049 .n(4)
8050 .k(k)
8051 .cn_stride(7)
8052 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
8053 }
8054 }
8055 }
8056
8057 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, n_gt_4_strided_a) {
8058 TEST_REQUIRES_X86_SSE2;
8059 for (uint32_t n = 5; n < 8; n++) {
8060 for (size_t k = 1; k <= 40; k += 9) {
8061 GemmMicrokernelTester()
8062 .mr(2)
8063 .nr(4)
8064 .kr(2)
8065 .sr(1)
8066 .m(2)
8067 .n(n)
8068 .k(k)
8069 .a_stride(43)
8070 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
8071 }
8072 }
8073 }
8074
8075 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, n_gt_4_subtile) {
8076 TEST_REQUIRES_X86_SSE2;
8077 for (uint32_t n = 5; n < 8; n++) {
8078 for (size_t k = 1; k <= 40; k += 9) {
8079 for (uint32_t m = 1; m <= 2; m++) {
8080 GemmMicrokernelTester()
8081 .mr(2)
8082 .nr(4)
8083 .kr(2)
8084 .sr(1)
8085 .m(m)
8086 .n(n)
8087 .k(k)
8088 .iterations(1)
8089 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
8090 }
8091 }
8092 }
8093 }
8094
8095 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, n_div_4) {
8096 TEST_REQUIRES_X86_SSE2;
8097 for (uint32_t n = 8; n <= 12; n += 4) {
8098 for (size_t k = 1; k <= 40; k += 9) {
8099 GemmMicrokernelTester()
8100 .mr(2)
8101 .nr(4)
8102 .kr(2)
8103 .sr(1)
8104 .m(2)
8105 .n(4)
8106 .k(k)
8107 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
8108 }
8109 }
8110 }
8111
8112 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, n_div_4_strided_cn) {
8113 TEST_REQUIRES_X86_SSE2;
8114 for (uint32_t n = 8; n <= 12; n += 4) {
8115 for (size_t k = 1; k <= 40; k += 9) {
8116 GemmMicrokernelTester()
8117 .mr(2)
8118 .nr(4)
8119 .kr(2)
8120 .sr(1)
8121 .m(2)
8122 .n(n)
8123 .k(k)
8124 .cn_stride(7)
8125 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
8126 }
8127 }
8128 }
8129
8130 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, n_div_4_strided_a) {
8131 TEST_REQUIRES_X86_SSE2;
8132 for (uint32_t n = 8; n <= 12; n += 4) {
8133 for (size_t k = 1; k <= 40; k += 9) {
8134 GemmMicrokernelTester()
8135 .mr(2)
8136 .nr(4)
8137 .kr(2)
8138 .sr(1)
8139 .m(2)
8140 .n(n)
8141 .k(k)
8142 .a_stride(43)
8143 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
8144 }
8145 }
8146 }
8147
8148 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, n_div_4_subtile) {
8149 TEST_REQUIRES_X86_SSE2;
8150 for (uint32_t n = 8; n <= 12; n += 4) {
8151 for (size_t k = 1; k <= 40; k += 9) {
8152 for (uint32_t m = 1; m <= 2; m++) {
8153 GemmMicrokernelTester()
8154 .mr(2)
8155 .nr(4)
8156 .kr(2)
8157 .sr(1)
8158 .m(m)
8159 .n(n)
8160 .k(k)
8161 .iterations(1)
8162 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
8163 }
8164 }
8165 }
8166 }
8167
8168 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, strided_cm_subtile) {
8169 TEST_REQUIRES_X86_SSE2;
8170 for (size_t k = 1; k <= 40; k += 9) {
8171 for (uint32_t m = 1; m <= 2; m++) {
8172 for (uint32_t n = 1; n <= 4; n++) {
8173 GemmMicrokernelTester()
8174 .mr(2)
8175 .nr(4)
8176 .kr(2)
8177 .sr(1)
8178 .m(m)
8179 .n(n)
8180 .k(k)
8181 .cm_stride(7)
8182 .iterations(1)
8183 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
8184 }
8185 }
8186 }
8187 }
8188
8189 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, qmin) {
8190 TEST_REQUIRES_X86_SSE2;
8191 GemmMicrokernelTester()
8192 .mr(2)
8193 .nr(4)
8194 .kr(2)
8195 .sr(1)
8196 .m(2)
8197 .n(4)
8198 .k(8)
8199 .qmin(128)
8200 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
8201 }
8202
8203 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, qmax) {
8204 TEST_REQUIRES_X86_SSE2;
8205 GemmMicrokernelTester()
8206 .mr(2)
8207 .nr(4)
8208 .kr(2)
8209 .sr(1)
8210 .m(2)
8211 .n(4)
8212 .k(8)
8213 .qmax(128)
8214 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
8215 }
8216
8217 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, strided_cm) {
8218 TEST_REQUIRES_X86_SSE2;
8219 GemmMicrokernelTester()
8220 .mr(2)
8221 .nr(4)
8222 .kr(2)
8223 .sr(1)
8224 .m(2)
8225 .n(4)
8226 .k(8)
8227 .cm_stride(7)
8228 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
8229 }
8230#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
8231
8232
8233#if XNN_ARCH_X86 || XNN_ARCH_X86_64
8234 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_eq_8) {
8235 TEST_REQUIRES_X86_SSE2;
8236 GemmMicrokernelTester()
8237 .mr(3)
8238 .nr(4)
8239 .kr(2)
8240 .sr(1)
8241 .m(3)
8242 .n(4)
8243 .k(8)
8244 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
8245 }
8246
8247 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, strided_cn) {
8248 TEST_REQUIRES_X86_SSE2;
8249 GemmMicrokernelTester()
8250 .mr(3)
8251 .nr(4)
8252 .kr(2)
8253 .sr(1)
8254 .m(3)
8255 .n(4)
8256 .k(8)
8257 .cn_stride(7)
8258 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
8259 }
8260
8261 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_eq_8_strided_a) {
8262 TEST_REQUIRES_X86_SSE2;
8263 GemmMicrokernelTester()
8264 .mr(3)
8265 .nr(4)
8266 .kr(2)
8267 .sr(1)
8268 .m(3)
8269 .n(4)
8270 .k(8)
8271 .a_stride(11)
8272 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
8273 }
8274
8275 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_eq_8_subtile) {
8276 TEST_REQUIRES_X86_SSE2;
8277 for (uint32_t m = 1; m <= 3; m++) {
8278 for (uint32_t n = 1; n <= 4; n++) {
8279 GemmMicrokernelTester()
8280 .mr(3)
8281 .nr(4)
8282 .kr(2)
8283 .sr(1)
8284 .m(m)
8285 .n(n)
8286 .k(8)
8287 .iterations(1)
8288 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
8289 }
8290 }
8291 }
8292
8293 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_eq_8_subtile_m) {
8294 TEST_REQUIRES_X86_SSE2;
8295 for (uint32_t m = 1; m <= 3; m++) {
8296 GemmMicrokernelTester()
8297 .mr(3)
8298 .nr(4)
8299 .kr(2)
8300 .sr(1)
8301 .m(m)
8302 .n(4)
8303 .k(8)
8304 .iterations(1)
8305 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
8306 }
8307 }
8308
8309 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_eq_8_subtile_n) {
8310 TEST_REQUIRES_X86_SSE2;
8311 for (uint32_t n = 1; n <= 4; n++) {
8312 GemmMicrokernelTester()
8313 .mr(3)
8314 .nr(4)
8315 .kr(2)
8316 .sr(1)
8317 .m(3)
8318 .n(n)
8319 .k(8)
8320 .iterations(1)
8321 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
8322 }
8323 }
8324
8325 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_lt_8) {
8326 TEST_REQUIRES_X86_SSE2;
8327 for (size_t k = 1; k < 8; k++) {
8328 GemmMicrokernelTester()
8329 .mr(3)
8330 .nr(4)
8331 .kr(2)
8332 .sr(1)
8333 .m(3)
8334 .n(4)
8335 .k(k)
8336 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
8337 }
8338 }
8339
8340 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_lt_8_strided_a) {
8341 TEST_REQUIRES_X86_SSE2;
8342 for (size_t k = 1; k < 8; k++) {
8343 GemmMicrokernelTester()
8344 .mr(3)
8345 .nr(4)
8346 .kr(2)
8347 .sr(1)
8348 .m(3)
8349 .n(4)
8350 .k(k)
8351 .a_stride(11)
8352 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
8353 }
8354 }
8355
8356 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_lt_8_subtile) {
8357 TEST_REQUIRES_X86_SSE2;
8358 for (size_t k = 1; k < 8; k++) {
8359 for (uint32_t m = 1; m <= 3; m++) {
8360 for (uint32_t n = 1; n <= 4; n++) {
8361 GemmMicrokernelTester()
8362 .mr(3)
8363 .nr(4)
8364 .kr(2)
8365 .sr(1)
8366 .m(m)
8367 .n(n)
8368 .k(k)
8369 .iterations(1)
8370 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
8371 }
8372 }
8373 }
8374 }
8375
8376 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_gt_8) {
8377 TEST_REQUIRES_X86_SSE2;
8378 for (size_t k = 9; k < 16; k++) {
8379 GemmMicrokernelTester()
8380 .mr(3)
8381 .nr(4)
8382 .kr(2)
8383 .sr(1)
8384 .m(3)
8385 .n(4)
8386 .k(k)
8387 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
8388 }
8389 }
8390
8391 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_gt_8_strided_a) {
8392 TEST_REQUIRES_X86_SSE2;
8393 for (size_t k = 9; k < 16; k++) {
8394 GemmMicrokernelTester()
8395 .mr(3)
8396 .nr(4)
8397 .kr(2)
8398 .sr(1)
8399 .m(3)
8400 .n(4)
8401 .k(k)
8402 .a_stride(19)
8403 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
8404 }
8405 }
8406
8407 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_gt_8_subtile) {
8408 TEST_REQUIRES_X86_SSE2;
8409 for (size_t k = 9; k < 16; k++) {
8410 for (uint32_t m = 1; m <= 3; m++) {
8411 for (uint32_t n = 1; n <= 4; n++) {
8412 GemmMicrokernelTester()
8413 .mr(3)
8414 .nr(4)
8415 .kr(2)
8416 .sr(1)
8417 .m(m)
8418 .n(n)
8419 .k(k)
8420 .iterations(1)
8421 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
8422 }
8423 }
8424 }
8425 }
8426
8427 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_div_8) {
8428 TEST_REQUIRES_X86_SSE2;
8429 for (size_t k = 16; k <= 80; k += 8) {
8430 GemmMicrokernelTester()
8431 .mr(3)
8432 .nr(4)
8433 .kr(2)
8434 .sr(1)
8435 .m(3)
8436 .n(4)
8437 .k(k)
8438 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
8439 }
8440 }
8441
8442 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_div_8_strided_a) {
8443 TEST_REQUIRES_X86_SSE2;
8444 for (size_t k = 16; k <= 80; k += 8) {
8445 GemmMicrokernelTester()
8446 .mr(3)
8447 .nr(4)
8448 .kr(2)
8449 .sr(1)
8450 .m(3)
8451 .n(4)
8452 .k(k)
8453 .a_stride(83)
8454 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
8455 }
8456 }
8457
8458 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_div_8_subtile) {
8459 TEST_REQUIRES_X86_SSE2;
8460 for (size_t k = 16; k <= 80; k += 8) {
8461 for (uint32_t m = 1; m <= 3; m++) {
8462 for (uint32_t n = 1; n <= 4; n++) {
8463 GemmMicrokernelTester()
8464 .mr(3)
8465 .nr(4)
8466 .kr(2)
8467 .sr(1)
8468 .m(m)
8469 .n(n)
8470 .k(k)
8471 .iterations(1)
8472 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
8473 }
8474 }
8475 }
8476 }
8477
8478 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, n_gt_4) {
8479 TEST_REQUIRES_X86_SSE2;
8480 for (uint32_t n = 5; n < 8; n++) {
8481 for (size_t k = 1; k <= 40; k += 9) {
8482 GemmMicrokernelTester()
8483 .mr(3)
8484 .nr(4)
8485 .kr(2)
8486 .sr(1)
8487 .m(3)
8488 .n(4)
8489 .k(k)
8490 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
8491 }
8492 }
8493 }
8494
8495 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, n_gt_4_strided_cn) {
8496 TEST_REQUIRES_X86_SSE2;
8497 for (uint32_t n = 5; n < 8; n++) {
8498 for (size_t k = 1; k <= 40; k += 9) {
8499 GemmMicrokernelTester()
8500 .mr(3)
8501 .nr(4)
8502 .kr(2)
8503 .sr(1)
8504 .m(3)
8505 .n(4)
8506 .k(k)
8507 .cn_stride(7)
8508 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
8509 }
8510 }
8511 }
8512
8513 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, n_gt_4_strided_a) {
8514 TEST_REQUIRES_X86_SSE2;
8515 for (uint32_t n = 5; n < 8; n++) {
8516 for (size_t k = 1; k <= 40; k += 9) {
8517 GemmMicrokernelTester()
8518 .mr(3)
8519 .nr(4)
8520 .kr(2)
8521 .sr(1)
8522 .m(3)
8523 .n(n)
8524 .k(k)
8525 .a_stride(43)
8526 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
8527 }
8528 }
8529 }
8530
8531 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, n_gt_4_subtile) {
8532 TEST_REQUIRES_X86_SSE2;
8533 for (uint32_t n = 5; n < 8; n++) {
8534 for (size_t k = 1; k <= 40; k += 9) {
8535 for (uint32_t m = 1; m <= 3; m++) {
8536 GemmMicrokernelTester()
8537 .mr(3)
8538 .nr(4)
8539 .kr(2)
8540 .sr(1)
8541 .m(m)
8542 .n(n)
8543 .k(k)
8544 .iterations(1)
8545 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
8546 }
8547 }
8548 }
8549 }
8550
8551 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, n_div_4) {
8552 TEST_REQUIRES_X86_SSE2;
8553 for (uint32_t n = 8; n <= 12; n += 4) {
8554 for (size_t k = 1; k <= 40; k += 9) {
8555 GemmMicrokernelTester()
8556 .mr(3)
8557 .nr(4)
8558 .kr(2)
8559 .sr(1)
8560 .m(3)
8561 .n(4)
8562 .k(k)
8563 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
8564 }
8565 }
8566 }
8567
8568 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, n_div_4_strided_cn) {
8569 TEST_REQUIRES_X86_SSE2;
8570 for (uint32_t n = 8; n <= 12; n += 4) {
8571 for (size_t k = 1; k <= 40; k += 9) {
8572 GemmMicrokernelTester()
8573 .mr(3)
8574 .nr(4)
8575 .kr(2)
8576 .sr(1)
8577 .m(3)
8578 .n(n)
8579 .k(k)
8580 .cn_stride(7)
8581 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
8582 }
8583 }
8584 }
8585
8586 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, n_div_4_strided_a) {
8587 TEST_REQUIRES_X86_SSE2;
8588 for (uint32_t n = 8; n <= 12; n += 4) {
8589 for (size_t k = 1; k <= 40; k += 9) {
8590 GemmMicrokernelTester()
8591 .mr(3)
8592 .nr(4)
8593 .kr(2)
8594 .sr(1)
8595 .m(3)
8596 .n(n)
8597 .k(k)
8598 .a_stride(43)
8599 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
8600 }
8601 }
8602 }
8603
8604 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, n_div_4_subtile) {
8605 TEST_REQUIRES_X86_SSE2;
8606 for (uint32_t n = 8; n <= 12; n += 4) {
8607 for (size_t k = 1; k <= 40; k += 9) {
8608 for (uint32_t m = 1; m <= 3; m++) {
8609 GemmMicrokernelTester()
8610 .mr(3)
8611 .nr(4)
8612 .kr(2)
8613 .sr(1)
8614 .m(m)
8615 .n(n)
8616 .k(k)
8617 .iterations(1)
8618 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
8619 }
8620 }
8621 }
8622 }
8623
8624 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, strided_cm_subtile) {
8625 TEST_REQUIRES_X86_SSE2;
8626 for (size_t k = 1; k <= 40; k += 9) {
8627 for (uint32_t m = 1; m <= 3; m++) {
8628 for (uint32_t n = 1; n <= 4; n++) {
8629 GemmMicrokernelTester()
8630 .mr(3)
8631 .nr(4)
8632 .kr(2)
8633 .sr(1)
8634 .m(m)
8635 .n(n)
8636 .k(k)
8637 .cm_stride(7)
8638 .iterations(1)
8639 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
8640 }
8641 }
8642 }
8643 }
8644
8645 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, qmin) {
8646 TEST_REQUIRES_X86_SSE2;
8647 GemmMicrokernelTester()
8648 .mr(3)
8649 .nr(4)
8650 .kr(2)
8651 .sr(1)
8652 .m(3)
8653 .n(4)
8654 .k(8)
8655 .qmin(128)
8656 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
8657 }
8658
8659 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, qmax) {
8660 TEST_REQUIRES_X86_SSE2;
8661 GemmMicrokernelTester()
8662 .mr(3)
8663 .nr(4)
8664 .kr(2)
8665 .sr(1)
8666 .m(3)
8667 .n(4)
8668 .k(8)
8669 .qmax(128)
8670 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
8671 }
8672
8673 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, strided_cm) {
8674 TEST_REQUIRES_X86_SSE2;
8675 GemmMicrokernelTester()
8676 .mr(3)
8677 .nr(4)
8678 .kr(2)
8679 .sr(1)
8680 .m(3)
8681 .n(4)
8682 .k(8)
8683 .cm_stride(7)
8684 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
8685 }
8686#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
8687
8688
8689#if XNN_ARCH_X86 || XNN_ARCH_X86_64
8690 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_eq_8) {
8691 TEST_REQUIRES_X86_SSE2;
8692 GemmMicrokernelTester()
8693 .mr(4)
8694 .nr(4)
8695 .kr(2)
8696 .sr(1)
8697 .m(4)
8698 .n(4)
8699 .k(8)
8700 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
8701 }
8702
8703 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, strided_cn) {
8704 TEST_REQUIRES_X86_SSE2;
8705 GemmMicrokernelTester()
8706 .mr(4)
8707 .nr(4)
8708 .kr(2)
8709 .sr(1)
8710 .m(4)
8711 .n(4)
8712 .k(8)
8713 .cn_stride(7)
8714 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
8715 }
8716
8717 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_eq_8_strided_a) {
8718 TEST_REQUIRES_X86_SSE2;
8719 GemmMicrokernelTester()
8720 .mr(4)
8721 .nr(4)
8722 .kr(2)
8723 .sr(1)
8724 .m(4)
8725 .n(4)
8726 .k(8)
8727 .a_stride(11)
8728 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
8729 }
8730
8731 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_eq_8_subtile) {
8732 TEST_REQUIRES_X86_SSE2;
8733 for (uint32_t m = 1; m <= 4; m++) {
8734 for (uint32_t n = 1; n <= 4; n++) {
8735 GemmMicrokernelTester()
8736 .mr(4)
8737 .nr(4)
8738 .kr(2)
8739 .sr(1)
8740 .m(m)
8741 .n(n)
8742 .k(8)
8743 .iterations(1)
8744 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
8745 }
8746 }
8747 }
8748
8749 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_eq_8_subtile_m) {
8750 TEST_REQUIRES_X86_SSE2;
8751 for (uint32_t m = 1; m <= 4; m++) {
8752 GemmMicrokernelTester()
8753 .mr(4)
8754 .nr(4)
8755 .kr(2)
8756 .sr(1)
8757 .m(m)
8758 .n(4)
8759 .k(8)
8760 .iterations(1)
8761 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
8762 }
8763 }
8764
8765 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_eq_8_subtile_n) {
8766 TEST_REQUIRES_X86_SSE2;
8767 for (uint32_t n = 1; n <= 4; n++) {
8768 GemmMicrokernelTester()
8769 .mr(4)
8770 .nr(4)
8771 .kr(2)
8772 .sr(1)
8773 .m(4)
8774 .n(n)
8775 .k(8)
8776 .iterations(1)
8777 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
8778 }
8779 }
8780
8781 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_lt_8) {
8782 TEST_REQUIRES_X86_SSE2;
8783 for (size_t k = 1; k < 8; k++) {
8784 GemmMicrokernelTester()
8785 .mr(4)
8786 .nr(4)
8787 .kr(2)
8788 .sr(1)
8789 .m(4)
8790 .n(4)
8791 .k(k)
8792 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
8793 }
8794 }
8795
8796 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_lt_8_strided_a) {
8797 TEST_REQUIRES_X86_SSE2;
8798 for (size_t k = 1; k < 8; k++) {
8799 GemmMicrokernelTester()
8800 .mr(4)
8801 .nr(4)
8802 .kr(2)
8803 .sr(1)
8804 .m(4)
8805 .n(4)
8806 .k(k)
8807 .a_stride(11)
8808 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
8809 }
8810 }
8811
8812 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_lt_8_subtile) {
8813 TEST_REQUIRES_X86_SSE2;
8814 for (size_t k = 1; k < 8; k++) {
8815 for (uint32_t m = 1; m <= 4; m++) {
8816 for (uint32_t n = 1; n <= 4; n++) {
8817 GemmMicrokernelTester()
8818 .mr(4)
8819 .nr(4)
8820 .kr(2)
8821 .sr(1)
8822 .m(m)
8823 .n(n)
8824 .k(k)
8825 .iterations(1)
8826 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
8827 }
8828 }
8829 }
8830 }
8831
8832 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_gt_8) {
8833 TEST_REQUIRES_X86_SSE2;
8834 for (size_t k = 9; k < 16; k++) {
8835 GemmMicrokernelTester()
8836 .mr(4)
8837 .nr(4)
8838 .kr(2)
8839 .sr(1)
8840 .m(4)
8841 .n(4)
8842 .k(k)
8843 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
8844 }
8845 }
8846
8847 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_gt_8_strided_a) {
8848 TEST_REQUIRES_X86_SSE2;
8849 for (size_t k = 9; k < 16; k++) {
8850 GemmMicrokernelTester()
8851 .mr(4)
8852 .nr(4)
8853 .kr(2)
8854 .sr(1)
8855 .m(4)
8856 .n(4)
8857 .k(k)
8858 .a_stride(19)
8859 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
8860 }
8861 }
8862
8863 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_gt_8_subtile) {
8864 TEST_REQUIRES_X86_SSE2;
8865 for (size_t k = 9; k < 16; k++) {
8866 for (uint32_t m = 1; m <= 4; m++) {
8867 for (uint32_t n = 1; n <= 4; n++) {
8868 GemmMicrokernelTester()
8869 .mr(4)
8870 .nr(4)
8871 .kr(2)
8872 .sr(1)
8873 .m(m)
8874 .n(n)
8875 .k(k)
8876 .iterations(1)
8877 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
8878 }
8879 }
8880 }
8881 }
8882
8883 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_div_8) {
8884 TEST_REQUIRES_X86_SSE2;
8885 for (size_t k = 16; k <= 80; k += 8) {
8886 GemmMicrokernelTester()
8887 .mr(4)
8888 .nr(4)
8889 .kr(2)
8890 .sr(1)
8891 .m(4)
8892 .n(4)
8893 .k(k)
8894 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
8895 }
8896 }
8897
8898 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_div_8_strided_a) {
8899 TEST_REQUIRES_X86_SSE2;
8900 for (size_t k = 16; k <= 80; k += 8) {
8901 GemmMicrokernelTester()
8902 .mr(4)
8903 .nr(4)
8904 .kr(2)
8905 .sr(1)
8906 .m(4)
8907 .n(4)
8908 .k(k)
8909 .a_stride(83)
8910 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
8911 }
8912 }
8913
8914 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_div_8_subtile) {
8915 TEST_REQUIRES_X86_SSE2;
8916 for (size_t k = 16; k <= 80; k += 8) {
8917 for (uint32_t m = 1; m <= 4; m++) {
8918 for (uint32_t n = 1; n <= 4; n++) {
8919 GemmMicrokernelTester()
8920 .mr(4)
8921 .nr(4)
8922 .kr(2)
8923 .sr(1)
8924 .m(m)
8925 .n(n)
8926 .k(k)
8927 .iterations(1)
8928 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
8929 }
8930 }
8931 }
8932 }
8933
8934 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, n_gt_4) {
8935 TEST_REQUIRES_X86_SSE2;
8936 for (uint32_t n = 5; n < 8; n++) {
8937 for (size_t k = 1; k <= 40; k += 9) {
8938 GemmMicrokernelTester()
8939 .mr(4)
8940 .nr(4)
8941 .kr(2)
8942 .sr(1)
8943 .m(4)
8944 .n(4)
8945 .k(k)
8946 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
8947 }
8948 }
8949 }
8950
8951 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, n_gt_4_strided_cn) {
8952 TEST_REQUIRES_X86_SSE2;
8953 for (uint32_t n = 5; n < 8; n++) {
8954 for (size_t k = 1; k <= 40; k += 9) {
8955 GemmMicrokernelTester()
8956 .mr(4)
8957 .nr(4)
8958 .kr(2)
8959 .sr(1)
8960 .m(4)
8961 .n(4)
8962 .k(k)
8963 .cn_stride(7)
8964 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
8965 }
8966 }
8967 }
8968
8969 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, n_gt_4_strided_a) {
8970 TEST_REQUIRES_X86_SSE2;
8971 for (uint32_t n = 5; n < 8; n++) {
8972 for (size_t k = 1; k <= 40; k += 9) {
8973 GemmMicrokernelTester()
8974 .mr(4)
8975 .nr(4)
8976 .kr(2)
8977 .sr(1)
8978 .m(4)
8979 .n(n)
8980 .k(k)
8981 .a_stride(43)
8982 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
8983 }
8984 }
8985 }
8986
8987 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, n_gt_4_subtile) {
8988 TEST_REQUIRES_X86_SSE2;
8989 for (uint32_t n = 5; n < 8; n++) {
8990 for (size_t k = 1; k <= 40; k += 9) {
8991 for (uint32_t m = 1; m <= 4; m++) {
8992 GemmMicrokernelTester()
8993 .mr(4)
8994 .nr(4)
8995 .kr(2)
8996 .sr(1)
8997 .m(m)
8998 .n(n)
8999 .k(k)
9000 .iterations(1)
9001 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
9002 }
9003 }
9004 }
9005 }
9006
9007 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, n_div_4) {
9008 TEST_REQUIRES_X86_SSE2;
9009 for (uint32_t n = 8; n <= 12; n += 4) {
9010 for (size_t k = 1; k <= 40; k += 9) {
9011 GemmMicrokernelTester()
9012 .mr(4)
9013 .nr(4)
9014 .kr(2)
9015 .sr(1)
9016 .m(4)
9017 .n(4)
9018 .k(k)
9019 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
9020 }
9021 }
9022 }
9023
9024 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, n_div_4_strided_cn) {
9025 TEST_REQUIRES_X86_SSE2;
9026 for (uint32_t n = 8; n <= 12; n += 4) {
9027 for (size_t k = 1; k <= 40; k += 9) {
9028 GemmMicrokernelTester()
9029 .mr(4)
9030 .nr(4)
9031 .kr(2)
9032 .sr(1)
9033 .m(4)
9034 .n(n)
9035 .k(k)
9036 .cn_stride(7)
9037 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
9038 }
9039 }
9040 }
9041
9042 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, n_div_4_strided_a) {
9043 TEST_REQUIRES_X86_SSE2;
9044 for (uint32_t n = 8; n <= 12; n += 4) {
9045 for (size_t k = 1; k <= 40; k += 9) {
9046 GemmMicrokernelTester()
9047 .mr(4)
9048 .nr(4)
9049 .kr(2)
9050 .sr(1)
9051 .m(4)
9052 .n(n)
9053 .k(k)
9054 .a_stride(43)
9055 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
9056 }
9057 }
9058 }
9059
9060 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, n_div_4_subtile) {
9061 TEST_REQUIRES_X86_SSE2;
9062 for (uint32_t n = 8; n <= 12; n += 4) {
9063 for (size_t k = 1; k <= 40; k += 9) {
9064 for (uint32_t m = 1; m <= 4; m++) {
9065 GemmMicrokernelTester()
9066 .mr(4)
9067 .nr(4)
9068 .kr(2)
9069 .sr(1)
9070 .m(m)
9071 .n(n)
9072 .k(k)
9073 .iterations(1)
9074 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
9075 }
9076 }
9077 }
9078 }
9079
9080 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, strided_cm_subtile) {
9081 TEST_REQUIRES_X86_SSE2;
9082 for (size_t k = 1; k <= 40; k += 9) {
9083 for (uint32_t m = 1; m <= 4; m++) {
9084 for (uint32_t n = 1; n <= 4; n++) {
9085 GemmMicrokernelTester()
9086 .mr(4)
9087 .nr(4)
9088 .kr(2)
9089 .sr(1)
9090 .m(m)
9091 .n(n)
9092 .k(k)
9093 .cm_stride(7)
9094 .iterations(1)
9095 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
9096 }
9097 }
9098 }
9099 }
9100
9101 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, qmin) {
9102 TEST_REQUIRES_X86_SSE2;
9103 GemmMicrokernelTester()
9104 .mr(4)
9105 .nr(4)
9106 .kr(2)
9107 .sr(1)
9108 .m(4)
9109 .n(4)
9110 .k(8)
9111 .qmin(128)
9112 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
9113 }
9114
9115 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, qmax) {
9116 TEST_REQUIRES_X86_SSE2;
9117 GemmMicrokernelTester()
9118 .mr(4)
9119 .nr(4)
9120 .kr(2)
9121 .sr(1)
9122 .m(4)
9123 .n(4)
9124 .k(8)
9125 .qmax(128)
9126 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
9127 }
9128
9129 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, strided_cm) {
9130 TEST_REQUIRES_X86_SSE2;
9131 GemmMicrokernelTester()
9132 .mr(4)
9133 .nr(4)
9134 .kr(2)
9135 .sr(1)
9136 .m(4)
9137 .n(4)
9138 .k(8)
9139 .cm_stride(7)
9140 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
9141 }
9142#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
9143
9144
9145#if XNN_ARCH_X86 || XNN_ARCH_X86_64
9146 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_eq_8) {
9147 TEST_REQUIRES_X86_SSE41;
9148 GemmMicrokernelTester()
9149 .mr(1)
9150 .nr(4)
9151 .kr(2)
9152 .sr(1)
9153 .m(1)
9154 .n(4)
9155 .k(8)
9156 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
9157 }
9158
9159 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, strided_cn) {
9160 TEST_REQUIRES_X86_SSE41;
9161 GemmMicrokernelTester()
9162 .mr(1)
9163 .nr(4)
9164 .kr(2)
9165 .sr(1)
9166 .m(1)
9167 .n(4)
9168 .k(8)
9169 .cn_stride(7)
9170 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
9171 }
9172
9173 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_eq_8_strided_a) {
9174 TEST_REQUIRES_X86_SSE41;
9175 GemmMicrokernelTester()
9176 .mr(1)
9177 .nr(4)
9178 .kr(2)
9179 .sr(1)
9180 .m(1)
9181 .n(4)
9182 .k(8)
9183 .a_stride(11)
9184 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
9185 }
9186
9187 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_eq_8_subtile) {
9188 TEST_REQUIRES_X86_SSE41;
9189 for (uint32_t m = 1; m <= 1; m++) {
9190 for (uint32_t n = 1; n <= 4; n++) {
9191 GemmMicrokernelTester()
9192 .mr(1)
9193 .nr(4)
9194 .kr(2)
9195 .sr(1)
9196 .m(m)
9197 .n(n)
9198 .k(8)
9199 .iterations(1)
9200 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
9201 }
9202 }
9203 }
9204
9205 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_eq_8_subtile_m) {
9206 TEST_REQUIRES_X86_SSE41;
9207 for (uint32_t m = 1; m <= 1; m++) {
9208 GemmMicrokernelTester()
9209 .mr(1)
9210 .nr(4)
9211 .kr(2)
9212 .sr(1)
9213 .m(m)
9214 .n(4)
9215 .k(8)
9216 .iterations(1)
9217 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
9218 }
9219 }
9220
9221 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_eq_8_subtile_n) {
9222 TEST_REQUIRES_X86_SSE41;
9223 for (uint32_t n = 1; n <= 4; n++) {
9224 GemmMicrokernelTester()
9225 .mr(1)
9226 .nr(4)
9227 .kr(2)
9228 .sr(1)
9229 .m(1)
9230 .n(n)
9231 .k(8)
9232 .iterations(1)
9233 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
9234 }
9235 }
9236
9237 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_lt_8) {
9238 TEST_REQUIRES_X86_SSE41;
9239 for (size_t k = 1; k < 8; k++) {
9240 GemmMicrokernelTester()
9241 .mr(1)
9242 .nr(4)
9243 .kr(2)
9244 .sr(1)
9245 .m(1)
9246 .n(4)
9247 .k(k)
9248 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
9249 }
9250 }
9251
9252 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_lt_8_strided_a) {
9253 TEST_REQUIRES_X86_SSE41;
9254 for (size_t k = 1; k < 8; k++) {
9255 GemmMicrokernelTester()
9256 .mr(1)
9257 .nr(4)
9258 .kr(2)
9259 .sr(1)
9260 .m(1)
9261 .n(4)
9262 .k(k)
9263 .a_stride(11)
9264 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
9265 }
9266 }
9267
9268 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_lt_8_subtile) {
9269 TEST_REQUIRES_X86_SSE41;
9270 for (size_t k = 1; k < 8; k++) {
9271 for (uint32_t m = 1; m <= 1; m++) {
9272 for (uint32_t n = 1; n <= 4; n++) {
9273 GemmMicrokernelTester()
9274 .mr(1)
9275 .nr(4)
9276 .kr(2)
9277 .sr(1)
9278 .m(m)
9279 .n(n)
9280 .k(k)
9281 .iterations(1)
9282 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
9283 }
9284 }
9285 }
9286 }
9287
9288 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_gt_8) {
9289 TEST_REQUIRES_X86_SSE41;
9290 for (size_t k = 9; k < 16; k++) {
9291 GemmMicrokernelTester()
9292 .mr(1)
9293 .nr(4)
9294 .kr(2)
9295 .sr(1)
9296 .m(1)
9297 .n(4)
9298 .k(k)
9299 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
9300 }
9301 }
9302
9303 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_gt_8_strided_a) {
9304 TEST_REQUIRES_X86_SSE41;
9305 for (size_t k = 9; k < 16; k++) {
9306 GemmMicrokernelTester()
9307 .mr(1)
9308 .nr(4)
9309 .kr(2)
9310 .sr(1)
9311 .m(1)
9312 .n(4)
9313 .k(k)
9314 .a_stride(19)
9315 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
9316 }
9317 }
9318
9319 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_gt_8_subtile) {
9320 TEST_REQUIRES_X86_SSE41;
9321 for (size_t k = 9; k < 16; k++) {
9322 for (uint32_t m = 1; m <= 1; m++) {
9323 for (uint32_t n = 1; n <= 4; n++) {
9324 GemmMicrokernelTester()
9325 .mr(1)
9326 .nr(4)
9327 .kr(2)
9328 .sr(1)
9329 .m(m)
9330 .n(n)
9331 .k(k)
9332 .iterations(1)
9333 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
9334 }
9335 }
9336 }
9337 }
9338
9339 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_div_8) {
9340 TEST_REQUIRES_X86_SSE41;
9341 for (size_t k = 16; k <= 80; k += 8) {
9342 GemmMicrokernelTester()
9343 .mr(1)
9344 .nr(4)
9345 .kr(2)
9346 .sr(1)
9347 .m(1)
9348 .n(4)
9349 .k(k)
9350 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
9351 }
9352 }
9353
9354 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_div_8_strided_a) {
9355 TEST_REQUIRES_X86_SSE41;
9356 for (size_t k = 16; k <= 80; k += 8) {
9357 GemmMicrokernelTester()
9358 .mr(1)
9359 .nr(4)
9360 .kr(2)
9361 .sr(1)
9362 .m(1)
9363 .n(4)
9364 .k(k)
9365 .a_stride(83)
9366 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
9367 }
9368 }
9369
9370 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_div_8_subtile) {
9371 TEST_REQUIRES_X86_SSE41;
9372 for (size_t k = 16; k <= 80; k += 8) {
9373 for (uint32_t m = 1; m <= 1; m++) {
9374 for (uint32_t n = 1; n <= 4; n++) {
9375 GemmMicrokernelTester()
9376 .mr(1)
9377 .nr(4)
9378 .kr(2)
9379 .sr(1)
9380 .m(m)
9381 .n(n)
9382 .k(k)
9383 .iterations(1)
9384 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
9385 }
9386 }
9387 }
9388 }
9389
9390 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, n_gt_4) {
9391 TEST_REQUIRES_X86_SSE41;
9392 for (uint32_t n = 5; n < 8; n++) {
9393 for (size_t k = 1; k <= 40; k += 9) {
9394 GemmMicrokernelTester()
9395 .mr(1)
9396 .nr(4)
9397 .kr(2)
9398 .sr(1)
9399 .m(1)
9400 .n(4)
9401 .k(k)
9402 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
9403 }
9404 }
9405 }
9406
9407 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, n_gt_4_strided_cn) {
9408 TEST_REQUIRES_X86_SSE41;
9409 for (uint32_t n = 5; n < 8; n++) {
9410 for (size_t k = 1; k <= 40; k += 9) {
9411 GemmMicrokernelTester()
9412 .mr(1)
9413 .nr(4)
9414 .kr(2)
9415 .sr(1)
9416 .m(1)
9417 .n(4)
9418 .k(k)
9419 .cn_stride(7)
9420 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
9421 }
9422 }
9423 }
9424
9425 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, n_gt_4_strided_a) {
9426 TEST_REQUIRES_X86_SSE41;
9427 for (uint32_t n = 5; n < 8; n++) {
9428 for (size_t k = 1; k <= 40; k += 9) {
9429 GemmMicrokernelTester()
9430 .mr(1)
9431 .nr(4)
9432 .kr(2)
9433 .sr(1)
9434 .m(1)
9435 .n(n)
9436 .k(k)
9437 .a_stride(43)
9438 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
9439 }
9440 }
9441 }
9442
9443 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, n_gt_4_subtile) {
9444 TEST_REQUIRES_X86_SSE41;
9445 for (uint32_t n = 5; n < 8; n++) {
9446 for (size_t k = 1; k <= 40; k += 9) {
9447 for (uint32_t m = 1; m <= 1; m++) {
9448 GemmMicrokernelTester()
9449 .mr(1)
9450 .nr(4)
9451 .kr(2)
9452 .sr(1)
9453 .m(m)
9454 .n(n)
9455 .k(k)
9456 .iterations(1)
9457 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
9458 }
9459 }
9460 }
9461 }
9462
9463 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, n_div_4) {
9464 TEST_REQUIRES_X86_SSE41;
9465 for (uint32_t n = 8; n <= 12; n += 4) {
9466 for (size_t k = 1; k <= 40; k += 9) {
9467 GemmMicrokernelTester()
9468 .mr(1)
9469 .nr(4)
9470 .kr(2)
9471 .sr(1)
9472 .m(1)
9473 .n(4)
9474 .k(k)
9475 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
9476 }
9477 }
9478 }
9479
9480 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, n_div_4_strided_cn) {
9481 TEST_REQUIRES_X86_SSE41;
9482 for (uint32_t n = 8; n <= 12; n += 4) {
9483 for (size_t k = 1; k <= 40; k += 9) {
9484 GemmMicrokernelTester()
9485 .mr(1)
9486 .nr(4)
9487 .kr(2)
9488 .sr(1)
9489 .m(1)
9490 .n(n)
9491 .k(k)
9492 .cn_stride(7)
9493 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
9494 }
9495 }
9496 }
9497
9498 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, n_div_4_strided_a) {
9499 TEST_REQUIRES_X86_SSE41;
9500 for (uint32_t n = 8; n <= 12; n += 4) {
9501 for (size_t k = 1; k <= 40; k += 9) {
9502 GemmMicrokernelTester()
9503 .mr(1)
9504 .nr(4)
9505 .kr(2)
9506 .sr(1)
9507 .m(1)
9508 .n(n)
9509 .k(k)
9510 .a_stride(43)
9511 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
9512 }
9513 }
9514 }
9515
9516 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, n_div_4_subtile) {
9517 TEST_REQUIRES_X86_SSE41;
9518 for (uint32_t n = 8; n <= 12; n += 4) {
9519 for (size_t k = 1; k <= 40; k += 9) {
9520 for (uint32_t m = 1; m <= 1; m++) {
9521 GemmMicrokernelTester()
9522 .mr(1)
9523 .nr(4)
9524 .kr(2)
9525 .sr(1)
9526 .m(m)
9527 .n(n)
9528 .k(k)
9529 .iterations(1)
9530 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
9531 }
9532 }
9533 }
9534 }
9535
9536 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, strided_cm_subtile) {
9537 TEST_REQUIRES_X86_SSE41;
9538 for (size_t k = 1; k <= 40; k += 9) {
9539 for (uint32_t m = 1; m <= 1; m++) {
9540 for (uint32_t n = 1; n <= 4; n++) {
9541 GemmMicrokernelTester()
9542 .mr(1)
9543 .nr(4)
9544 .kr(2)
9545 .sr(1)
9546 .m(m)
9547 .n(n)
9548 .k(k)
9549 .cm_stride(7)
9550 .iterations(1)
9551 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
9552 }
9553 }
9554 }
9555 }
9556
9557 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, qmin) {
9558 TEST_REQUIRES_X86_SSE41;
9559 GemmMicrokernelTester()
9560 .mr(1)
9561 .nr(4)
9562 .kr(2)
9563 .sr(1)
9564 .m(1)
9565 .n(4)
9566 .k(8)
9567 .qmin(128)
9568 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
9569 }
9570
9571 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, qmax) {
9572 TEST_REQUIRES_X86_SSE41;
9573 GemmMicrokernelTester()
9574 .mr(1)
9575 .nr(4)
9576 .kr(2)
9577 .sr(1)
9578 .m(1)
9579 .n(4)
9580 .k(8)
9581 .qmax(128)
9582 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
9583 }
9584
9585 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, strided_cm) {
9586 TEST_REQUIRES_X86_SSE41;
9587 GemmMicrokernelTester()
9588 .mr(1)
9589 .nr(4)
9590 .kr(2)
9591 .sr(1)
9592 .m(1)
9593 .n(4)
9594 .k(8)
9595 .cm_stride(7)
9596 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
9597 }
9598#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
9599
9600
9601#if XNN_ARCH_X86 || XNN_ARCH_X86_64
9602 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_eq_8) {
9603 TEST_REQUIRES_X86_SSE41;
9604 GemmMicrokernelTester()
9605 .mr(2)
9606 .nr(4)
9607 .kr(2)
9608 .sr(1)
9609 .m(2)
9610 .n(4)
9611 .k(8)
9612 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
9613 }
9614
9615 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, strided_cn) {
9616 TEST_REQUIRES_X86_SSE41;
9617 GemmMicrokernelTester()
9618 .mr(2)
9619 .nr(4)
9620 .kr(2)
9621 .sr(1)
9622 .m(2)
9623 .n(4)
9624 .k(8)
9625 .cn_stride(7)
9626 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
9627 }
9628
9629 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_eq_8_strided_a) {
9630 TEST_REQUIRES_X86_SSE41;
9631 GemmMicrokernelTester()
9632 .mr(2)
9633 .nr(4)
9634 .kr(2)
9635 .sr(1)
9636 .m(2)
9637 .n(4)
9638 .k(8)
9639 .a_stride(11)
9640 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
9641 }
9642
9643 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_eq_8_subtile) {
9644 TEST_REQUIRES_X86_SSE41;
9645 for (uint32_t m = 1; m <= 2; m++) {
9646 for (uint32_t n = 1; n <= 4; n++) {
9647 GemmMicrokernelTester()
9648 .mr(2)
9649 .nr(4)
9650 .kr(2)
9651 .sr(1)
9652 .m(m)
9653 .n(n)
9654 .k(8)
9655 .iterations(1)
9656 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
9657 }
9658 }
9659 }
9660
9661 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_eq_8_subtile_m) {
9662 TEST_REQUIRES_X86_SSE41;
9663 for (uint32_t m = 1; m <= 2; m++) {
9664 GemmMicrokernelTester()
9665 .mr(2)
9666 .nr(4)
9667 .kr(2)
9668 .sr(1)
9669 .m(m)
9670 .n(4)
9671 .k(8)
9672 .iterations(1)
9673 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
9674 }
9675 }
9676
9677 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_eq_8_subtile_n) {
9678 TEST_REQUIRES_X86_SSE41;
9679 for (uint32_t n = 1; n <= 4; n++) {
9680 GemmMicrokernelTester()
9681 .mr(2)
9682 .nr(4)
9683 .kr(2)
9684 .sr(1)
9685 .m(2)
9686 .n(n)
9687 .k(8)
9688 .iterations(1)
9689 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
9690 }
9691 }
9692
9693 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_lt_8) {
9694 TEST_REQUIRES_X86_SSE41;
9695 for (size_t k = 1; k < 8; k++) {
9696 GemmMicrokernelTester()
9697 .mr(2)
9698 .nr(4)
9699 .kr(2)
9700 .sr(1)
9701 .m(2)
9702 .n(4)
9703 .k(k)
9704 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
9705 }
9706 }
9707
9708 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_lt_8_strided_a) {
9709 TEST_REQUIRES_X86_SSE41;
9710 for (size_t k = 1; k < 8; k++) {
9711 GemmMicrokernelTester()
9712 .mr(2)
9713 .nr(4)
9714 .kr(2)
9715 .sr(1)
9716 .m(2)
9717 .n(4)
9718 .k(k)
9719 .a_stride(11)
9720 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
9721 }
9722 }
9723
9724 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_lt_8_subtile) {
9725 TEST_REQUIRES_X86_SSE41;
9726 for (size_t k = 1; k < 8; k++) {
9727 for (uint32_t m = 1; m <= 2; m++) {
9728 for (uint32_t n = 1; n <= 4; n++) {
9729 GemmMicrokernelTester()
9730 .mr(2)
9731 .nr(4)
9732 .kr(2)
9733 .sr(1)
9734 .m(m)
9735 .n(n)
9736 .k(k)
9737 .iterations(1)
9738 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
9739 }
9740 }
9741 }
9742 }
9743
9744 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_gt_8) {
9745 TEST_REQUIRES_X86_SSE41;
9746 for (size_t k = 9; k < 16; k++) {
9747 GemmMicrokernelTester()
9748 .mr(2)
9749 .nr(4)
9750 .kr(2)
9751 .sr(1)
9752 .m(2)
9753 .n(4)
9754 .k(k)
9755 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
9756 }
9757 }
9758
9759 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_gt_8_strided_a) {
9760 TEST_REQUIRES_X86_SSE41;
9761 for (size_t k = 9; k < 16; k++) {
9762 GemmMicrokernelTester()
9763 .mr(2)
9764 .nr(4)
9765 .kr(2)
9766 .sr(1)
9767 .m(2)
9768 .n(4)
9769 .k(k)
9770 .a_stride(19)
9771 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
9772 }
9773 }
9774
9775 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_gt_8_subtile) {
9776 TEST_REQUIRES_X86_SSE41;
9777 for (size_t k = 9; k < 16; k++) {
9778 for (uint32_t m = 1; m <= 2; m++) {
9779 for (uint32_t n = 1; n <= 4; n++) {
9780 GemmMicrokernelTester()
9781 .mr(2)
9782 .nr(4)
9783 .kr(2)
9784 .sr(1)
9785 .m(m)
9786 .n(n)
9787 .k(k)
9788 .iterations(1)
9789 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
9790 }
9791 }
9792 }
9793 }
9794
9795 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_div_8) {
9796 TEST_REQUIRES_X86_SSE41;
9797 for (size_t k = 16; k <= 80; k += 8) {
9798 GemmMicrokernelTester()
9799 .mr(2)
9800 .nr(4)
9801 .kr(2)
9802 .sr(1)
9803 .m(2)
9804 .n(4)
9805 .k(k)
9806 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
9807 }
9808 }
9809
9810 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_div_8_strided_a) {
9811 TEST_REQUIRES_X86_SSE41;
9812 for (size_t k = 16; k <= 80; k += 8) {
9813 GemmMicrokernelTester()
9814 .mr(2)
9815 .nr(4)
9816 .kr(2)
9817 .sr(1)
9818 .m(2)
9819 .n(4)
9820 .k(k)
9821 .a_stride(83)
9822 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
9823 }
9824 }
9825
9826 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_div_8_subtile) {
9827 TEST_REQUIRES_X86_SSE41;
9828 for (size_t k = 16; k <= 80; k += 8) {
9829 for (uint32_t m = 1; m <= 2; m++) {
9830 for (uint32_t n = 1; n <= 4; n++) {
9831 GemmMicrokernelTester()
9832 .mr(2)
9833 .nr(4)
9834 .kr(2)
9835 .sr(1)
9836 .m(m)
9837 .n(n)
9838 .k(k)
9839 .iterations(1)
9840 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
9841 }
9842 }
9843 }
9844 }
9845
9846 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, n_gt_4) {
9847 TEST_REQUIRES_X86_SSE41;
9848 for (uint32_t n = 5; n < 8; n++) {
9849 for (size_t k = 1; k <= 40; k += 9) {
9850 GemmMicrokernelTester()
9851 .mr(2)
9852 .nr(4)
9853 .kr(2)
9854 .sr(1)
9855 .m(2)
9856 .n(4)
9857 .k(k)
9858 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
9859 }
9860 }
9861 }
9862
9863 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, n_gt_4_strided_cn) {
9864 TEST_REQUIRES_X86_SSE41;
9865 for (uint32_t n = 5; n < 8; n++) {
9866 for (size_t k = 1; k <= 40; k += 9) {
9867 GemmMicrokernelTester()
9868 .mr(2)
9869 .nr(4)
9870 .kr(2)
9871 .sr(1)
9872 .m(2)
9873 .n(4)
9874 .k(k)
9875 .cn_stride(7)
9876 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
9877 }
9878 }
9879 }
9880
9881 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, n_gt_4_strided_a) {
9882 TEST_REQUIRES_X86_SSE41;
9883 for (uint32_t n = 5; n < 8; n++) {
9884 for (size_t k = 1; k <= 40; k += 9) {
9885 GemmMicrokernelTester()
9886 .mr(2)
9887 .nr(4)
9888 .kr(2)
9889 .sr(1)
9890 .m(2)
9891 .n(n)
9892 .k(k)
9893 .a_stride(43)
9894 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
9895 }
9896 }
9897 }
9898
9899 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, n_gt_4_subtile) {
9900 TEST_REQUIRES_X86_SSE41;
9901 for (uint32_t n = 5; n < 8; n++) {
9902 for (size_t k = 1; k <= 40; k += 9) {
9903 for (uint32_t m = 1; m <= 2; m++) {
9904 GemmMicrokernelTester()
9905 .mr(2)
9906 .nr(4)
9907 .kr(2)
9908 .sr(1)
9909 .m(m)
9910 .n(n)
9911 .k(k)
9912 .iterations(1)
9913 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
9914 }
9915 }
9916 }
9917 }
9918
9919 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, n_div_4) {
9920 TEST_REQUIRES_X86_SSE41;
9921 for (uint32_t n = 8; n <= 12; n += 4) {
9922 for (size_t k = 1; k <= 40; k += 9) {
9923 GemmMicrokernelTester()
9924 .mr(2)
9925 .nr(4)
9926 .kr(2)
9927 .sr(1)
9928 .m(2)
9929 .n(4)
9930 .k(k)
9931 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
9932 }
9933 }
9934 }
9935
9936 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, n_div_4_strided_cn) {
9937 TEST_REQUIRES_X86_SSE41;
9938 for (uint32_t n = 8; n <= 12; n += 4) {
9939 for (size_t k = 1; k <= 40; k += 9) {
9940 GemmMicrokernelTester()
9941 .mr(2)
9942 .nr(4)
9943 .kr(2)
9944 .sr(1)
9945 .m(2)
9946 .n(n)
9947 .k(k)
9948 .cn_stride(7)
9949 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
9950 }
9951 }
9952 }
9953
9954 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, n_div_4_strided_a) {
9955 TEST_REQUIRES_X86_SSE41;
9956 for (uint32_t n = 8; n <= 12; n += 4) {
9957 for (size_t k = 1; k <= 40; k += 9) {
9958 GemmMicrokernelTester()
9959 .mr(2)
9960 .nr(4)
9961 .kr(2)
9962 .sr(1)
9963 .m(2)
9964 .n(n)
9965 .k(k)
9966 .a_stride(43)
9967 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
9968 }
9969 }
9970 }
9971
9972 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, n_div_4_subtile) {
9973 TEST_REQUIRES_X86_SSE41;
9974 for (uint32_t n = 8; n <= 12; n += 4) {
9975 for (size_t k = 1; k <= 40; k += 9) {
9976 for (uint32_t m = 1; m <= 2; m++) {
9977 GemmMicrokernelTester()
9978 .mr(2)
9979 .nr(4)
9980 .kr(2)
9981 .sr(1)
9982 .m(m)
9983 .n(n)
9984 .k(k)
9985 .iterations(1)
9986 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
9987 }
9988 }
9989 }
9990 }
9991
9992 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, strided_cm_subtile) {
9993 TEST_REQUIRES_X86_SSE41;
9994 for (size_t k = 1; k <= 40; k += 9) {
9995 for (uint32_t m = 1; m <= 2; m++) {
9996 for (uint32_t n = 1; n <= 4; n++) {
9997 GemmMicrokernelTester()
9998 .mr(2)
9999 .nr(4)
10000 .kr(2)
10001 .sr(1)
10002 .m(m)
10003 .n(n)
10004 .k(k)
10005 .cm_stride(7)
10006 .iterations(1)
10007 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
10008 }
10009 }
10010 }
10011 }
10012
10013 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, qmin) {
10014 TEST_REQUIRES_X86_SSE41;
10015 GemmMicrokernelTester()
10016 .mr(2)
10017 .nr(4)
10018 .kr(2)
10019 .sr(1)
10020 .m(2)
10021 .n(4)
10022 .k(8)
10023 .qmin(128)
10024 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
10025 }
10026
10027 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, qmax) {
10028 TEST_REQUIRES_X86_SSE41;
10029 GemmMicrokernelTester()
10030 .mr(2)
10031 .nr(4)
10032 .kr(2)
10033 .sr(1)
10034 .m(2)
10035 .n(4)
10036 .k(8)
10037 .qmax(128)
10038 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
10039 }
10040
10041 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, strided_cm) {
10042 TEST_REQUIRES_X86_SSE41;
10043 GemmMicrokernelTester()
10044 .mr(2)
10045 .nr(4)
10046 .kr(2)
10047 .sr(1)
10048 .m(2)
10049 .n(4)
10050 .k(8)
10051 .cm_stride(7)
10052 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
10053 }
10054#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
10055
10056
10057#if XNN_ARCH_X86 || XNN_ARCH_X86_64
10058 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_eq_8) {
10059 TEST_REQUIRES_X86_SSE41;
10060 GemmMicrokernelTester()
10061 .mr(3)
10062 .nr(4)
10063 .kr(2)
10064 .sr(1)
10065 .m(3)
10066 .n(4)
10067 .k(8)
10068 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
10069 }
10070
10071 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, strided_cn) {
10072 TEST_REQUIRES_X86_SSE41;
10073 GemmMicrokernelTester()
10074 .mr(3)
10075 .nr(4)
10076 .kr(2)
10077 .sr(1)
10078 .m(3)
10079 .n(4)
10080 .k(8)
10081 .cn_stride(7)
10082 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
10083 }
10084
10085 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_eq_8_strided_a) {
10086 TEST_REQUIRES_X86_SSE41;
10087 GemmMicrokernelTester()
10088 .mr(3)
10089 .nr(4)
10090 .kr(2)
10091 .sr(1)
10092 .m(3)
10093 .n(4)
10094 .k(8)
10095 .a_stride(11)
10096 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
10097 }
10098
10099 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_eq_8_subtile) {
10100 TEST_REQUIRES_X86_SSE41;
10101 for (uint32_t m = 1; m <= 3; m++) {
10102 for (uint32_t n = 1; n <= 4; n++) {
10103 GemmMicrokernelTester()
10104 .mr(3)
10105 .nr(4)
10106 .kr(2)
10107 .sr(1)
10108 .m(m)
10109 .n(n)
10110 .k(8)
10111 .iterations(1)
10112 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
10113 }
10114 }
10115 }
10116
10117 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_eq_8_subtile_m) {
10118 TEST_REQUIRES_X86_SSE41;
10119 for (uint32_t m = 1; m <= 3; m++) {
10120 GemmMicrokernelTester()
10121 .mr(3)
10122 .nr(4)
10123 .kr(2)
10124 .sr(1)
10125 .m(m)
10126 .n(4)
10127 .k(8)
10128 .iterations(1)
10129 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
10130 }
10131 }
10132
10133 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_eq_8_subtile_n) {
10134 TEST_REQUIRES_X86_SSE41;
10135 for (uint32_t n = 1; n <= 4; n++) {
10136 GemmMicrokernelTester()
10137 .mr(3)
10138 .nr(4)
10139 .kr(2)
10140 .sr(1)
10141 .m(3)
10142 .n(n)
10143 .k(8)
10144 .iterations(1)
10145 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
10146 }
10147 }
10148
10149 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_lt_8) {
10150 TEST_REQUIRES_X86_SSE41;
10151 for (size_t k = 1; k < 8; k++) {
10152 GemmMicrokernelTester()
10153 .mr(3)
10154 .nr(4)
10155 .kr(2)
10156 .sr(1)
10157 .m(3)
10158 .n(4)
10159 .k(k)
10160 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
10161 }
10162 }
10163
10164 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_lt_8_strided_a) {
10165 TEST_REQUIRES_X86_SSE41;
10166 for (size_t k = 1; k < 8; k++) {
10167 GemmMicrokernelTester()
10168 .mr(3)
10169 .nr(4)
10170 .kr(2)
10171 .sr(1)
10172 .m(3)
10173 .n(4)
10174 .k(k)
10175 .a_stride(11)
10176 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
10177 }
10178 }
10179
10180 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_lt_8_subtile) {
10181 TEST_REQUIRES_X86_SSE41;
10182 for (size_t k = 1; k < 8; k++) {
10183 for (uint32_t m = 1; m <= 3; m++) {
10184 for (uint32_t n = 1; n <= 4; n++) {
10185 GemmMicrokernelTester()
10186 .mr(3)
10187 .nr(4)
10188 .kr(2)
10189 .sr(1)
10190 .m(m)
10191 .n(n)
10192 .k(k)
10193 .iterations(1)
10194 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
10195 }
10196 }
10197 }
10198 }
10199
10200 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_gt_8) {
10201 TEST_REQUIRES_X86_SSE41;
10202 for (size_t k = 9; k < 16; k++) {
10203 GemmMicrokernelTester()
10204 .mr(3)
10205 .nr(4)
10206 .kr(2)
10207 .sr(1)
10208 .m(3)
10209 .n(4)
10210 .k(k)
10211 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
10212 }
10213 }
10214
10215 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_gt_8_strided_a) {
10216 TEST_REQUIRES_X86_SSE41;
10217 for (size_t k = 9; k < 16; k++) {
10218 GemmMicrokernelTester()
10219 .mr(3)
10220 .nr(4)
10221 .kr(2)
10222 .sr(1)
10223 .m(3)
10224 .n(4)
10225 .k(k)
10226 .a_stride(19)
10227 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
10228 }
10229 }
10230
10231 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_gt_8_subtile) {
10232 TEST_REQUIRES_X86_SSE41;
10233 for (size_t k = 9; k < 16; k++) {
10234 for (uint32_t m = 1; m <= 3; m++) {
10235 for (uint32_t n = 1; n <= 4; n++) {
10236 GemmMicrokernelTester()
10237 .mr(3)
10238 .nr(4)
10239 .kr(2)
10240 .sr(1)
10241 .m(m)
10242 .n(n)
10243 .k(k)
10244 .iterations(1)
10245 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
10246 }
10247 }
10248 }
10249 }
10250
10251 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_div_8) {
10252 TEST_REQUIRES_X86_SSE41;
10253 for (size_t k = 16; k <= 80; k += 8) {
10254 GemmMicrokernelTester()
10255 .mr(3)
10256 .nr(4)
10257 .kr(2)
10258 .sr(1)
10259 .m(3)
10260 .n(4)
10261 .k(k)
10262 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
10263 }
10264 }
10265
10266 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_div_8_strided_a) {
10267 TEST_REQUIRES_X86_SSE41;
10268 for (size_t k = 16; k <= 80; k += 8) {
10269 GemmMicrokernelTester()
10270 .mr(3)
10271 .nr(4)
10272 .kr(2)
10273 .sr(1)
10274 .m(3)
10275 .n(4)
10276 .k(k)
10277 .a_stride(83)
10278 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
10279 }
10280 }
10281
10282 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_div_8_subtile) {
10283 TEST_REQUIRES_X86_SSE41;
10284 for (size_t k = 16; k <= 80; k += 8) {
10285 for (uint32_t m = 1; m <= 3; m++) {
10286 for (uint32_t n = 1; n <= 4; n++) {
10287 GemmMicrokernelTester()
10288 .mr(3)
10289 .nr(4)
10290 .kr(2)
10291 .sr(1)
10292 .m(m)
10293 .n(n)
10294 .k(k)
10295 .iterations(1)
10296 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
10297 }
10298 }
10299 }
10300 }
10301
10302 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, n_gt_4) {
10303 TEST_REQUIRES_X86_SSE41;
10304 for (uint32_t n = 5; n < 8; n++) {
10305 for (size_t k = 1; k <= 40; k += 9) {
10306 GemmMicrokernelTester()
10307 .mr(3)
10308 .nr(4)
10309 .kr(2)
10310 .sr(1)
10311 .m(3)
10312 .n(4)
10313 .k(k)
10314 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
10315 }
10316 }
10317 }
10318
10319 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, n_gt_4_strided_cn) {
10320 TEST_REQUIRES_X86_SSE41;
10321 for (uint32_t n = 5; n < 8; n++) {
10322 for (size_t k = 1; k <= 40; k += 9) {
10323 GemmMicrokernelTester()
10324 .mr(3)
10325 .nr(4)
10326 .kr(2)
10327 .sr(1)
10328 .m(3)
10329 .n(4)
10330 .k(k)
10331 .cn_stride(7)
10332 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
10333 }
10334 }
10335 }
10336
10337 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, n_gt_4_strided_a) {
10338 TEST_REQUIRES_X86_SSE41;
10339 for (uint32_t n = 5; n < 8; n++) {
10340 for (size_t k = 1; k <= 40; k += 9) {
10341 GemmMicrokernelTester()
10342 .mr(3)
10343 .nr(4)
10344 .kr(2)
10345 .sr(1)
10346 .m(3)
10347 .n(n)
10348 .k(k)
10349 .a_stride(43)
10350 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
10351 }
10352 }
10353 }
10354
10355 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, n_gt_4_subtile) {
10356 TEST_REQUIRES_X86_SSE41;
10357 for (uint32_t n = 5; n < 8; n++) {
10358 for (size_t k = 1; k <= 40; k += 9) {
10359 for (uint32_t m = 1; m <= 3; m++) {
10360 GemmMicrokernelTester()
10361 .mr(3)
10362 .nr(4)
10363 .kr(2)
10364 .sr(1)
10365 .m(m)
10366 .n(n)
10367 .k(k)
10368 .iterations(1)
10369 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
10370 }
10371 }
10372 }
10373 }
10374
10375 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, n_div_4) {
10376 TEST_REQUIRES_X86_SSE41;
10377 for (uint32_t n = 8; n <= 12; n += 4) {
10378 for (size_t k = 1; k <= 40; k += 9) {
10379 GemmMicrokernelTester()
10380 .mr(3)
10381 .nr(4)
10382 .kr(2)
10383 .sr(1)
10384 .m(3)
10385 .n(4)
10386 .k(k)
10387 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
10388 }
10389 }
10390 }
10391
10392 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, n_div_4_strided_cn) {
10393 TEST_REQUIRES_X86_SSE41;
10394 for (uint32_t n = 8; n <= 12; n += 4) {
10395 for (size_t k = 1; k <= 40; k += 9) {
10396 GemmMicrokernelTester()
10397 .mr(3)
10398 .nr(4)
10399 .kr(2)
10400 .sr(1)
10401 .m(3)
10402 .n(n)
10403 .k(k)
10404 .cn_stride(7)
10405 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
10406 }
10407 }
10408 }
10409
10410 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, n_div_4_strided_a) {
10411 TEST_REQUIRES_X86_SSE41;
10412 for (uint32_t n = 8; n <= 12; n += 4) {
10413 for (size_t k = 1; k <= 40; k += 9) {
10414 GemmMicrokernelTester()
10415 .mr(3)
10416 .nr(4)
10417 .kr(2)
10418 .sr(1)
10419 .m(3)
10420 .n(n)
10421 .k(k)
10422 .a_stride(43)
10423 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
10424 }
10425 }
10426 }
10427
10428 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, n_div_4_subtile) {
10429 TEST_REQUIRES_X86_SSE41;
10430 for (uint32_t n = 8; n <= 12; n += 4) {
10431 for (size_t k = 1; k <= 40; k += 9) {
10432 for (uint32_t m = 1; m <= 3; m++) {
10433 GemmMicrokernelTester()
10434 .mr(3)
10435 .nr(4)
10436 .kr(2)
10437 .sr(1)
10438 .m(m)
10439 .n(n)
10440 .k(k)
10441 .iterations(1)
10442 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
10443 }
10444 }
10445 }
10446 }
10447
10448 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, strided_cm_subtile) {
10449 TEST_REQUIRES_X86_SSE41;
10450 for (size_t k = 1; k <= 40; k += 9) {
10451 for (uint32_t m = 1; m <= 3; m++) {
10452 for (uint32_t n = 1; n <= 4; n++) {
10453 GemmMicrokernelTester()
10454 .mr(3)
10455 .nr(4)
10456 .kr(2)
10457 .sr(1)
10458 .m(m)
10459 .n(n)
10460 .k(k)
10461 .cm_stride(7)
10462 .iterations(1)
10463 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
10464 }
10465 }
10466 }
10467 }
10468
10469 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, qmin) {
10470 TEST_REQUIRES_X86_SSE41;
10471 GemmMicrokernelTester()
10472 .mr(3)
10473 .nr(4)
10474 .kr(2)
10475 .sr(1)
10476 .m(3)
10477 .n(4)
10478 .k(8)
10479 .qmin(128)
10480 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
10481 }
10482
10483 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, qmax) {
10484 TEST_REQUIRES_X86_SSE41;
10485 GemmMicrokernelTester()
10486 .mr(3)
10487 .nr(4)
10488 .kr(2)
10489 .sr(1)
10490 .m(3)
10491 .n(4)
10492 .k(8)
10493 .qmax(128)
10494 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
10495 }
10496
10497 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, strided_cm) {
10498 TEST_REQUIRES_X86_SSE41;
10499 GemmMicrokernelTester()
10500 .mr(3)
10501 .nr(4)
10502 .kr(2)
10503 .sr(1)
10504 .m(3)
10505 .n(4)
10506 .k(8)
10507 .cm_stride(7)
10508 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
10509 }
10510#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
10511
10512
10513#if XNN_ARCH_X86 || XNN_ARCH_X86_64
10514 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_eq_8) {
10515 TEST_REQUIRES_X86_SSE41;
10516 GemmMicrokernelTester()
10517 .mr(4)
10518 .nr(4)
10519 .kr(2)
10520 .sr(1)
10521 .m(4)
10522 .n(4)
10523 .k(8)
10524 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
10525 }
10526
10527 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, strided_cn) {
10528 TEST_REQUIRES_X86_SSE41;
10529 GemmMicrokernelTester()
10530 .mr(4)
10531 .nr(4)
10532 .kr(2)
10533 .sr(1)
10534 .m(4)
10535 .n(4)
10536 .k(8)
10537 .cn_stride(7)
10538 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
10539 }
10540
10541 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_eq_8_strided_a) {
10542 TEST_REQUIRES_X86_SSE41;
10543 GemmMicrokernelTester()
10544 .mr(4)
10545 .nr(4)
10546 .kr(2)
10547 .sr(1)
10548 .m(4)
10549 .n(4)
10550 .k(8)
10551 .a_stride(11)
10552 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
10553 }
10554
10555 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_eq_8_subtile) {
10556 TEST_REQUIRES_X86_SSE41;
10557 for (uint32_t m = 1; m <= 4; m++) {
10558 for (uint32_t n = 1; n <= 4; n++) {
10559 GemmMicrokernelTester()
10560 .mr(4)
10561 .nr(4)
10562 .kr(2)
10563 .sr(1)
10564 .m(m)
10565 .n(n)
10566 .k(8)
10567 .iterations(1)
10568 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
10569 }
10570 }
10571 }
10572
10573 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_eq_8_subtile_m) {
10574 TEST_REQUIRES_X86_SSE41;
10575 for (uint32_t m = 1; m <= 4; m++) {
10576 GemmMicrokernelTester()
10577 .mr(4)
10578 .nr(4)
10579 .kr(2)
10580 .sr(1)
10581 .m(m)
10582 .n(4)
10583 .k(8)
10584 .iterations(1)
10585 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
10586 }
10587 }
10588
10589 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_eq_8_subtile_n) {
10590 TEST_REQUIRES_X86_SSE41;
10591 for (uint32_t n = 1; n <= 4; n++) {
10592 GemmMicrokernelTester()
10593 .mr(4)
10594 .nr(4)
10595 .kr(2)
10596 .sr(1)
10597 .m(4)
10598 .n(n)
10599 .k(8)
10600 .iterations(1)
10601 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
10602 }
10603 }
10604
10605 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_lt_8) {
10606 TEST_REQUIRES_X86_SSE41;
10607 for (size_t k = 1; k < 8; k++) {
10608 GemmMicrokernelTester()
10609 .mr(4)
10610 .nr(4)
10611 .kr(2)
10612 .sr(1)
10613 .m(4)
10614 .n(4)
10615 .k(k)
10616 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
10617 }
10618 }
10619
10620 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_lt_8_strided_a) {
10621 TEST_REQUIRES_X86_SSE41;
10622 for (size_t k = 1; k < 8; k++) {
10623 GemmMicrokernelTester()
10624 .mr(4)
10625 .nr(4)
10626 .kr(2)
10627 .sr(1)
10628 .m(4)
10629 .n(4)
10630 .k(k)
10631 .a_stride(11)
10632 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
10633 }
10634 }
10635
10636 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_lt_8_subtile) {
10637 TEST_REQUIRES_X86_SSE41;
10638 for (size_t k = 1; k < 8; k++) {
10639 for (uint32_t m = 1; m <= 4; m++) {
10640 for (uint32_t n = 1; n <= 4; n++) {
10641 GemmMicrokernelTester()
10642 .mr(4)
10643 .nr(4)
10644 .kr(2)
10645 .sr(1)
10646 .m(m)
10647 .n(n)
10648 .k(k)
10649 .iterations(1)
10650 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
10651 }
10652 }
10653 }
10654 }
10655
10656 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_gt_8) {
10657 TEST_REQUIRES_X86_SSE41;
10658 for (size_t k = 9; k < 16; k++) {
10659 GemmMicrokernelTester()
10660 .mr(4)
10661 .nr(4)
10662 .kr(2)
10663 .sr(1)
10664 .m(4)
10665 .n(4)
10666 .k(k)
10667 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
10668 }
10669 }
10670
10671 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_gt_8_strided_a) {
10672 TEST_REQUIRES_X86_SSE41;
10673 for (size_t k = 9; k < 16; k++) {
10674 GemmMicrokernelTester()
10675 .mr(4)
10676 .nr(4)
10677 .kr(2)
10678 .sr(1)
10679 .m(4)
10680 .n(4)
10681 .k(k)
10682 .a_stride(19)
10683 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
10684 }
10685 }
10686
10687 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_gt_8_subtile) {
10688 TEST_REQUIRES_X86_SSE41;
10689 for (size_t k = 9; k < 16; k++) {
10690 for (uint32_t m = 1; m <= 4; m++) {
10691 for (uint32_t n = 1; n <= 4; n++) {
10692 GemmMicrokernelTester()
10693 .mr(4)
10694 .nr(4)
10695 .kr(2)
10696 .sr(1)
10697 .m(m)
10698 .n(n)
10699 .k(k)
10700 .iterations(1)
10701 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
10702 }
10703 }
10704 }
10705 }
10706
10707 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_div_8) {
10708 TEST_REQUIRES_X86_SSE41;
10709 for (size_t k = 16; k <= 80; k += 8) {
10710 GemmMicrokernelTester()
10711 .mr(4)
10712 .nr(4)
10713 .kr(2)
10714 .sr(1)
10715 .m(4)
10716 .n(4)
10717 .k(k)
10718 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
10719 }
10720 }
10721
10722 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_div_8_strided_a) {
10723 TEST_REQUIRES_X86_SSE41;
10724 for (size_t k = 16; k <= 80; k += 8) {
10725 GemmMicrokernelTester()
10726 .mr(4)
10727 .nr(4)
10728 .kr(2)
10729 .sr(1)
10730 .m(4)
10731 .n(4)
10732 .k(k)
10733 .a_stride(83)
10734 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
10735 }
10736 }
10737
10738 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_div_8_subtile) {
10739 TEST_REQUIRES_X86_SSE41;
10740 for (size_t k = 16; k <= 80; k += 8) {
10741 for (uint32_t m = 1; m <= 4; m++) {
10742 for (uint32_t n = 1; n <= 4; n++) {
10743 GemmMicrokernelTester()
10744 .mr(4)
10745 .nr(4)
10746 .kr(2)
10747 .sr(1)
10748 .m(m)
10749 .n(n)
10750 .k(k)
10751 .iterations(1)
10752 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
10753 }
10754 }
10755 }
10756 }
10757
10758 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, n_gt_4) {
10759 TEST_REQUIRES_X86_SSE41;
10760 for (uint32_t n = 5; n < 8; n++) {
10761 for (size_t k = 1; k <= 40; k += 9) {
10762 GemmMicrokernelTester()
10763 .mr(4)
10764 .nr(4)
10765 .kr(2)
10766 .sr(1)
10767 .m(4)
10768 .n(4)
10769 .k(k)
10770 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
10771 }
10772 }
10773 }
10774
10775 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, n_gt_4_strided_cn) {
10776 TEST_REQUIRES_X86_SSE41;
10777 for (uint32_t n = 5; n < 8; n++) {
10778 for (size_t k = 1; k <= 40; k += 9) {
10779 GemmMicrokernelTester()
10780 .mr(4)
10781 .nr(4)
10782 .kr(2)
10783 .sr(1)
10784 .m(4)
10785 .n(4)
10786 .k(k)
10787 .cn_stride(7)
10788 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
10789 }
10790 }
10791 }
10792
10793 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, n_gt_4_strided_a) {
10794 TEST_REQUIRES_X86_SSE41;
10795 for (uint32_t n = 5; n < 8; n++) {
10796 for (size_t k = 1; k <= 40; k += 9) {
10797 GemmMicrokernelTester()
10798 .mr(4)
10799 .nr(4)
10800 .kr(2)
10801 .sr(1)
10802 .m(4)
10803 .n(n)
10804 .k(k)
10805 .a_stride(43)
10806 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
10807 }
10808 }
10809 }
10810
10811 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, n_gt_4_subtile) {
10812 TEST_REQUIRES_X86_SSE41;
10813 for (uint32_t n = 5; n < 8; n++) {
10814 for (size_t k = 1; k <= 40; k += 9) {
10815 for (uint32_t m = 1; m <= 4; m++) {
10816 GemmMicrokernelTester()
10817 .mr(4)
10818 .nr(4)
10819 .kr(2)
10820 .sr(1)
10821 .m(m)
10822 .n(n)
10823 .k(k)
10824 .iterations(1)
10825 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
10826 }
10827 }
10828 }
10829 }
10830
10831 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, n_div_4) {
10832 TEST_REQUIRES_X86_SSE41;
10833 for (uint32_t n = 8; n <= 12; n += 4) {
10834 for (size_t k = 1; k <= 40; k += 9) {
10835 GemmMicrokernelTester()
10836 .mr(4)
10837 .nr(4)
10838 .kr(2)
10839 .sr(1)
10840 .m(4)
10841 .n(4)
10842 .k(k)
10843 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
10844 }
10845 }
10846 }
10847
10848 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, n_div_4_strided_cn) {
10849 TEST_REQUIRES_X86_SSE41;
10850 for (uint32_t n = 8; n <= 12; n += 4) {
10851 for (size_t k = 1; k <= 40; k += 9) {
10852 GemmMicrokernelTester()
10853 .mr(4)
10854 .nr(4)
10855 .kr(2)
10856 .sr(1)
10857 .m(4)
10858 .n(n)
10859 .k(k)
10860 .cn_stride(7)
10861 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
10862 }
10863 }
10864 }
10865
10866 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, n_div_4_strided_a) {
10867 TEST_REQUIRES_X86_SSE41;
10868 for (uint32_t n = 8; n <= 12; n += 4) {
10869 for (size_t k = 1; k <= 40; k += 9) {
10870 GemmMicrokernelTester()
10871 .mr(4)
10872 .nr(4)
10873 .kr(2)
10874 .sr(1)
10875 .m(4)
10876 .n(n)
10877 .k(k)
10878 .a_stride(43)
10879 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
10880 }
10881 }
10882 }
10883
10884 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, n_div_4_subtile) {
10885 TEST_REQUIRES_X86_SSE41;
10886 for (uint32_t n = 8; n <= 12; n += 4) {
10887 for (size_t k = 1; k <= 40; k += 9) {
10888 for (uint32_t m = 1; m <= 4; m++) {
10889 GemmMicrokernelTester()
10890 .mr(4)
10891 .nr(4)
10892 .kr(2)
10893 .sr(1)
10894 .m(m)
10895 .n(n)
10896 .k(k)
10897 .iterations(1)
10898 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
10899 }
10900 }
10901 }
10902 }
10903
10904 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, strided_cm_subtile) {
10905 TEST_REQUIRES_X86_SSE41;
10906 for (size_t k = 1; k <= 40; k += 9) {
10907 for (uint32_t m = 1; m <= 4; m++) {
10908 for (uint32_t n = 1; n <= 4; n++) {
10909 GemmMicrokernelTester()
10910 .mr(4)
10911 .nr(4)
10912 .kr(2)
10913 .sr(1)
10914 .m(m)
10915 .n(n)
10916 .k(k)
10917 .cm_stride(7)
10918 .iterations(1)
10919 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
10920 }
10921 }
10922 }
10923 }
10924
10925 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, qmin) {
10926 TEST_REQUIRES_X86_SSE41;
10927 GemmMicrokernelTester()
10928 .mr(4)
10929 .nr(4)
10930 .kr(2)
10931 .sr(1)
10932 .m(4)
10933 .n(4)
10934 .k(8)
10935 .qmin(128)
10936 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
10937 }
10938
10939 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, qmax) {
10940 TEST_REQUIRES_X86_SSE41;
10941 GemmMicrokernelTester()
10942 .mr(4)
10943 .nr(4)
10944 .kr(2)
10945 .sr(1)
10946 .m(4)
10947 .n(4)
10948 .k(8)
10949 .qmax(128)
10950 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
10951 }
10952
10953 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, strided_cm) {
10954 TEST_REQUIRES_X86_SSE41;
10955 GemmMicrokernelTester()
10956 .mr(4)
10957 .nr(4)
10958 .kr(2)
10959 .sr(1)
10960 .m(4)
10961 .n(4)
10962 .k(8)
10963 .cm_stride(7)
10964 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
10965 }
10966#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
10967
10968
10969#if XNN_ARCH_X86 || XNN_ARCH_X86_64
10970 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_eq_8) {
10971 TEST_REQUIRES_X86_AVX;
10972 GemmMicrokernelTester()
10973 .mr(1)
10974 .nr(4)
10975 .kr(2)
10976 .sr(1)
10977 .m(1)
10978 .n(4)
10979 .k(8)
10980 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
10981 }
10982
10983 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, strided_cn) {
10984 TEST_REQUIRES_X86_AVX;
10985 GemmMicrokernelTester()
10986 .mr(1)
10987 .nr(4)
10988 .kr(2)
10989 .sr(1)
10990 .m(1)
10991 .n(4)
10992 .k(8)
10993 .cn_stride(7)
10994 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
10995 }
10996
10997 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_eq_8_strided_a) {
10998 TEST_REQUIRES_X86_AVX;
10999 GemmMicrokernelTester()
11000 .mr(1)
11001 .nr(4)
11002 .kr(2)
11003 .sr(1)
11004 .m(1)
11005 .n(4)
11006 .k(8)
11007 .a_stride(11)
11008 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
11009 }
11010
11011 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_eq_8_subtile) {
11012 TEST_REQUIRES_X86_AVX;
11013 for (uint32_t m = 1; m <= 1; m++) {
11014 for (uint32_t n = 1; n <= 4; n++) {
11015 GemmMicrokernelTester()
11016 .mr(1)
11017 .nr(4)
11018 .kr(2)
11019 .sr(1)
11020 .m(m)
11021 .n(n)
11022 .k(8)
11023 .iterations(1)
11024 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
11025 }
11026 }
11027 }
11028
11029 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_eq_8_subtile_m) {
11030 TEST_REQUIRES_X86_AVX;
11031 for (uint32_t m = 1; m <= 1; m++) {
11032 GemmMicrokernelTester()
11033 .mr(1)
11034 .nr(4)
11035 .kr(2)
11036 .sr(1)
11037 .m(m)
11038 .n(4)
11039 .k(8)
11040 .iterations(1)
11041 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
11042 }
11043 }
11044
11045 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_eq_8_subtile_n) {
11046 TEST_REQUIRES_X86_AVX;
11047 for (uint32_t n = 1; n <= 4; n++) {
11048 GemmMicrokernelTester()
11049 .mr(1)
11050 .nr(4)
11051 .kr(2)
11052 .sr(1)
11053 .m(1)
11054 .n(n)
11055 .k(8)
11056 .iterations(1)
11057 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
11058 }
11059 }
11060
11061 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_lt_8) {
11062 TEST_REQUIRES_X86_AVX;
11063 for (size_t k = 1; k < 8; k++) {
11064 GemmMicrokernelTester()
11065 .mr(1)
11066 .nr(4)
11067 .kr(2)
11068 .sr(1)
11069 .m(1)
11070 .n(4)
11071 .k(k)
11072 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
11073 }
11074 }
11075
11076 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_lt_8_strided_a) {
11077 TEST_REQUIRES_X86_AVX;
11078 for (size_t k = 1; k < 8; k++) {
11079 GemmMicrokernelTester()
11080 .mr(1)
11081 .nr(4)
11082 .kr(2)
11083 .sr(1)
11084 .m(1)
11085 .n(4)
11086 .k(k)
11087 .a_stride(11)
11088 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
11089 }
11090 }
11091
11092 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_lt_8_subtile) {
11093 TEST_REQUIRES_X86_AVX;
11094 for (size_t k = 1; k < 8; k++) {
11095 for (uint32_t m = 1; m <= 1; m++) {
11096 for (uint32_t n = 1; n <= 4; n++) {
11097 GemmMicrokernelTester()
11098 .mr(1)
11099 .nr(4)
11100 .kr(2)
11101 .sr(1)
11102 .m(m)
11103 .n(n)
11104 .k(k)
11105 .iterations(1)
11106 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
11107 }
11108 }
11109 }
11110 }
11111
11112 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_gt_8) {
11113 TEST_REQUIRES_X86_AVX;
11114 for (size_t k = 9; k < 16; k++) {
11115 GemmMicrokernelTester()
11116 .mr(1)
11117 .nr(4)
11118 .kr(2)
11119 .sr(1)
11120 .m(1)
11121 .n(4)
11122 .k(k)
11123 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
11124 }
11125 }
11126
11127 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_gt_8_strided_a) {
11128 TEST_REQUIRES_X86_AVX;
11129 for (size_t k = 9; k < 16; k++) {
11130 GemmMicrokernelTester()
11131 .mr(1)
11132 .nr(4)
11133 .kr(2)
11134 .sr(1)
11135 .m(1)
11136 .n(4)
11137 .k(k)
11138 .a_stride(19)
11139 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
11140 }
11141 }
11142
11143 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_gt_8_subtile) {
11144 TEST_REQUIRES_X86_AVX;
11145 for (size_t k = 9; k < 16; k++) {
11146 for (uint32_t m = 1; m <= 1; m++) {
11147 for (uint32_t n = 1; n <= 4; n++) {
11148 GemmMicrokernelTester()
11149 .mr(1)
11150 .nr(4)
11151 .kr(2)
11152 .sr(1)
11153 .m(m)
11154 .n(n)
11155 .k(k)
11156 .iterations(1)
11157 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
11158 }
11159 }
11160 }
11161 }
11162
11163 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_div_8) {
11164 TEST_REQUIRES_X86_AVX;
11165 for (size_t k = 16; k <= 80; k += 8) {
11166 GemmMicrokernelTester()
11167 .mr(1)
11168 .nr(4)
11169 .kr(2)
11170 .sr(1)
11171 .m(1)
11172 .n(4)
11173 .k(k)
11174 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
11175 }
11176 }
11177
11178 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_div_8_strided_a) {
11179 TEST_REQUIRES_X86_AVX;
11180 for (size_t k = 16; k <= 80; k += 8) {
11181 GemmMicrokernelTester()
11182 .mr(1)
11183 .nr(4)
11184 .kr(2)
11185 .sr(1)
11186 .m(1)
11187 .n(4)
11188 .k(k)
11189 .a_stride(83)
11190 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
11191 }
11192 }
11193
11194 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_div_8_subtile) {
11195 TEST_REQUIRES_X86_AVX;
11196 for (size_t k = 16; k <= 80; k += 8) {
11197 for (uint32_t m = 1; m <= 1; m++) {
11198 for (uint32_t n = 1; n <= 4; n++) {
11199 GemmMicrokernelTester()
11200 .mr(1)
11201 .nr(4)
11202 .kr(2)
11203 .sr(1)
11204 .m(m)
11205 .n(n)
11206 .k(k)
11207 .iterations(1)
11208 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
11209 }
11210 }
11211 }
11212 }
11213
11214 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, n_gt_4) {
11215 TEST_REQUIRES_X86_AVX;
11216 for (uint32_t n = 5; n < 8; n++) {
11217 for (size_t k = 1; k <= 40; k += 9) {
11218 GemmMicrokernelTester()
11219 .mr(1)
11220 .nr(4)
11221 .kr(2)
11222 .sr(1)
11223 .m(1)
11224 .n(4)
11225 .k(k)
11226 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
11227 }
11228 }
11229 }
11230
11231 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, n_gt_4_strided_cn) {
11232 TEST_REQUIRES_X86_AVX;
11233 for (uint32_t n = 5; n < 8; n++) {
11234 for (size_t k = 1; k <= 40; k += 9) {
11235 GemmMicrokernelTester()
11236 .mr(1)
11237 .nr(4)
11238 .kr(2)
11239 .sr(1)
11240 .m(1)
11241 .n(4)
11242 .k(k)
11243 .cn_stride(7)
11244 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
11245 }
11246 }
11247 }
11248
11249 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, n_gt_4_strided_a) {
11250 TEST_REQUIRES_X86_AVX;
11251 for (uint32_t n = 5; n < 8; n++) {
11252 for (size_t k = 1; k <= 40; k += 9) {
11253 GemmMicrokernelTester()
11254 .mr(1)
11255 .nr(4)
11256 .kr(2)
11257 .sr(1)
11258 .m(1)
11259 .n(n)
11260 .k(k)
11261 .a_stride(43)
11262 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
11263 }
11264 }
11265 }
11266
11267 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, n_gt_4_subtile) {
11268 TEST_REQUIRES_X86_AVX;
11269 for (uint32_t n = 5; n < 8; n++) {
11270 for (size_t k = 1; k <= 40; k += 9) {
11271 for (uint32_t m = 1; m <= 1; m++) {
11272 GemmMicrokernelTester()
11273 .mr(1)
11274 .nr(4)
11275 .kr(2)
11276 .sr(1)
11277 .m(m)
11278 .n(n)
11279 .k(k)
11280 .iterations(1)
11281 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
11282 }
11283 }
11284 }
11285 }
11286
11287 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, n_div_4) {
11288 TEST_REQUIRES_X86_AVX;
11289 for (uint32_t n = 8; n <= 12; n += 4) {
11290 for (size_t k = 1; k <= 40; k += 9) {
11291 GemmMicrokernelTester()
11292 .mr(1)
11293 .nr(4)
11294 .kr(2)
11295 .sr(1)
11296 .m(1)
11297 .n(4)
11298 .k(k)
11299 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
11300 }
11301 }
11302 }
11303
11304 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, n_div_4_strided_cn) {
11305 TEST_REQUIRES_X86_AVX;
11306 for (uint32_t n = 8; n <= 12; n += 4) {
11307 for (size_t k = 1; k <= 40; k += 9) {
11308 GemmMicrokernelTester()
11309 .mr(1)
11310 .nr(4)
11311 .kr(2)
11312 .sr(1)
11313 .m(1)
11314 .n(n)
11315 .k(k)
11316 .cn_stride(7)
11317 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
11318 }
11319 }
11320 }
11321
11322 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, n_div_4_strided_a) {
11323 TEST_REQUIRES_X86_AVX;
11324 for (uint32_t n = 8; n <= 12; n += 4) {
11325 for (size_t k = 1; k <= 40; k += 9) {
11326 GemmMicrokernelTester()
11327 .mr(1)
11328 .nr(4)
11329 .kr(2)
11330 .sr(1)
11331 .m(1)
11332 .n(n)
11333 .k(k)
11334 .a_stride(43)
11335 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
11336 }
11337 }
11338 }
11339
11340 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, n_div_4_subtile) {
11341 TEST_REQUIRES_X86_AVX;
11342 for (uint32_t n = 8; n <= 12; n += 4) {
11343 for (size_t k = 1; k <= 40; k += 9) {
11344 for (uint32_t m = 1; m <= 1; m++) {
11345 GemmMicrokernelTester()
11346 .mr(1)
11347 .nr(4)
11348 .kr(2)
11349 .sr(1)
11350 .m(m)
11351 .n(n)
11352 .k(k)
11353 .iterations(1)
11354 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
11355 }
11356 }
11357 }
11358 }
11359
11360 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, strided_cm_subtile) {
11361 TEST_REQUIRES_X86_AVX;
11362 for (size_t k = 1; k <= 40; k += 9) {
11363 for (uint32_t m = 1; m <= 1; m++) {
11364 for (uint32_t n = 1; n <= 4; n++) {
11365 GemmMicrokernelTester()
11366 .mr(1)
11367 .nr(4)
11368 .kr(2)
11369 .sr(1)
11370 .m(m)
11371 .n(n)
11372 .k(k)
11373 .cm_stride(7)
11374 .iterations(1)
11375 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
11376 }
11377 }
11378 }
11379 }
11380
11381 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, qmin) {
11382 TEST_REQUIRES_X86_AVX;
11383 GemmMicrokernelTester()
11384 .mr(1)
11385 .nr(4)
11386 .kr(2)
11387 .sr(1)
11388 .m(1)
11389 .n(4)
11390 .k(8)
11391 .qmin(128)
11392 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
11393 }
11394
11395 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, qmax) {
11396 TEST_REQUIRES_X86_AVX;
11397 GemmMicrokernelTester()
11398 .mr(1)
11399 .nr(4)
11400 .kr(2)
11401 .sr(1)
11402 .m(1)
11403 .n(4)
11404 .k(8)
11405 .qmax(128)
11406 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
11407 }
11408
11409 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, strided_cm) {
11410 TEST_REQUIRES_X86_AVX;
11411 GemmMicrokernelTester()
11412 .mr(1)
11413 .nr(4)
11414 .kr(2)
11415 .sr(1)
11416 .m(1)
11417 .n(4)
11418 .k(8)
11419 .cm_stride(7)
11420 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
11421 }
11422#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
11423
11424
11425#if XNN_ARCH_X86 || XNN_ARCH_X86_64
11426 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_eq_8) {
11427 TEST_REQUIRES_X86_AVX;
11428 GemmMicrokernelTester()
11429 .mr(2)
11430 .nr(4)
11431 .kr(2)
11432 .sr(1)
11433 .m(2)
11434 .n(4)
11435 .k(8)
11436 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
11437 }
11438
11439 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, strided_cn) {
11440 TEST_REQUIRES_X86_AVX;
11441 GemmMicrokernelTester()
11442 .mr(2)
11443 .nr(4)
11444 .kr(2)
11445 .sr(1)
11446 .m(2)
11447 .n(4)
11448 .k(8)
11449 .cn_stride(7)
11450 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
11451 }
11452
11453 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_eq_8_strided_a) {
11454 TEST_REQUIRES_X86_AVX;
11455 GemmMicrokernelTester()
11456 .mr(2)
11457 .nr(4)
11458 .kr(2)
11459 .sr(1)
11460 .m(2)
11461 .n(4)
11462 .k(8)
11463 .a_stride(11)
11464 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
11465 }
11466
11467 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_eq_8_subtile) {
11468 TEST_REQUIRES_X86_AVX;
11469 for (uint32_t m = 1; m <= 2; m++) {
11470 for (uint32_t n = 1; n <= 4; n++) {
11471 GemmMicrokernelTester()
11472 .mr(2)
11473 .nr(4)
11474 .kr(2)
11475 .sr(1)
11476 .m(m)
11477 .n(n)
11478 .k(8)
11479 .iterations(1)
11480 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
11481 }
11482 }
11483 }
11484
11485 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_eq_8_subtile_m) {
11486 TEST_REQUIRES_X86_AVX;
11487 for (uint32_t m = 1; m <= 2; m++) {
11488 GemmMicrokernelTester()
11489 .mr(2)
11490 .nr(4)
11491 .kr(2)
11492 .sr(1)
11493 .m(m)
11494 .n(4)
11495 .k(8)
11496 .iterations(1)
11497 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
11498 }
11499 }
11500
11501 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_eq_8_subtile_n) {
11502 TEST_REQUIRES_X86_AVX;
11503 for (uint32_t n = 1; n <= 4; n++) {
11504 GemmMicrokernelTester()
11505 .mr(2)
11506 .nr(4)
11507 .kr(2)
11508 .sr(1)
11509 .m(2)
11510 .n(n)
11511 .k(8)
11512 .iterations(1)
11513 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
11514 }
11515 }
11516
11517 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_lt_8) {
11518 TEST_REQUIRES_X86_AVX;
11519 for (size_t k = 1; k < 8; k++) {
11520 GemmMicrokernelTester()
11521 .mr(2)
11522 .nr(4)
11523 .kr(2)
11524 .sr(1)
11525 .m(2)
11526 .n(4)
11527 .k(k)
11528 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
11529 }
11530 }
11531
11532 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_lt_8_strided_a) {
11533 TEST_REQUIRES_X86_AVX;
11534 for (size_t k = 1; k < 8; k++) {
11535 GemmMicrokernelTester()
11536 .mr(2)
11537 .nr(4)
11538 .kr(2)
11539 .sr(1)
11540 .m(2)
11541 .n(4)
11542 .k(k)
11543 .a_stride(11)
11544 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
11545 }
11546 }
11547
11548 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_lt_8_subtile) {
11549 TEST_REQUIRES_X86_AVX;
11550 for (size_t k = 1; k < 8; k++) {
11551 for (uint32_t m = 1; m <= 2; m++) {
11552 for (uint32_t n = 1; n <= 4; n++) {
11553 GemmMicrokernelTester()
11554 .mr(2)
11555 .nr(4)
11556 .kr(2)
11557 .sr(1)
11558 .m(m)
11559 .n(n)
11560 .k(k)
11561 .iterations(1)
11562 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
11563 }
11564 }
11565 }
11566 }
11567
11568 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_gt_8) {
11569 TEST_REQUIRES_X86_AVX;
11570 for (size_t k = 9; k < 16; k++) {
11571 GemmMicrokernelTester()
11572 .mr(2)
11573 .nr(4)
11574 .kr(2)
11575 .sr(1)
11576 .m(2)
11577 .n(4)
11578 .k(k)
11579 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
11580 }
11581 }
11582
11583 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_gt_8_strided_a) {
11584 TEST_REQUIRES_X86_AVX;
11585 for (size_t k = 9; k < 16; k++) {
11586 GemmMicrokernelTester()
11587 .mr(2)
11588 .nr(4)
11589 .kr(2)
11590 .sr(1)
11591 .m(2)
11592 .n(4)
11593 .k(k)
11594 .a_stride(19)
11595 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
11596 }
11597 }
11598
11599 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_gt_8_subtile) {
11600 TEST_REQUIRES_X86_AVX;
11601 for (size_t k = 9; k < 16; k++) {
11602 for (uint32_t m = 1; m <= 2; m++) {
11603 for (uint32_t n = 1; n <= 4; n++) {
11604 GemmMicrokernelTester()
11605 .mr(2)
11606 .nr(4)
11607 .kr(2)
11608 .sr(1)
11609 .m(m)
11610 .n(n)
11611 .k(k)
11612 .iterations(1)
11613 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
11614 }
11615 }
11616 }
11617 }
11618
11619 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_div_8) {
11620 TEST_REQUIRES_X86_AVX;
11621 for (size_t k = 16; k <= 80; k += 8) {
11622 GemmMicrokernelTester()
11623 .mr(2)
11624 .nr(4)
11625 .kr(2)
11626 .sr(1)
11627 .m(2)
11628 .n(4)
11629 .k(k)
11630 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
11631 }
11632 }
11633
11634 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_div_8_strided_a) {
11635 TEST_REQUIRES_X86_AVX;
11636 for (size_t k = 16; k <= 80; k += 8) {
11637 GemmMicrokernelTester()
11638 .mr(2)
11639 .nr(4)
11640 .kr(2)
11641 .sr(1)
11642 .m(2)
11643 .n(4)
11644 .k(k)
11645 .a_stride(83)
11646 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
11647 }
11648 }
11649
11650 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_div_8_subtile) {
11651 TEST_REQUIRES_X86_AVX;
11652 for (size_t k = 16; k <= 80; k += 8) {
11653 for (uint32_t m = 1; m <= 2; m++) {
11654 for (uint32_t n = 1; n <= 4; n++) {
11655 GemmMicrokernelTester()
11656 .mr(2)
11657 .nr(4)
11658 .kr(2)
11659 .sr(1)
11660 .m(m)
11661 .n(n)
11662 .k(k)
11663 .iterations(1)
11664 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
11665 }
11666 }
11667 }
11668 }
11669
11670 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, n_gt_4) {
11671 TEST_REQUIRES_X86_AVX;
11672 for (uint32_t n = 5; n < 8; n++) {
11673 for (size_t k = 1; k <= 40; k += 9) {
11674 GemmMicrokernelTester()
11675 .mr(2)
11676 .nr(4)
11677 .kr(2)
11678 .sr(1)
11679 .m(2)
11680 .n(4)
11681 .k(k)
11682 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
11683 }
11684 }
11685 }
11686
11687 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, n_gt_4_strided_cn) {
11688 TEST_REQUIRES_X86_AVX;
11689 for (uint32_t n = 5; n < 8; n++) {
11690 for (size_t k = 1; k <= 40; k += 9) {
11691 GemmMicrokernelTester()
11692 .mr(2)
11693 .nr(4)
11694 .kr(2)
11695 .sr(1)
11696 .m(2)
11697 .n(4)
11698 .k(k)
11699 .cn_stride(7)
11700 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
11701 }
11702 }
11703 }
11704
11705 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, n_gt_4_strided_a) {
11706 TEST_REQUIRES_X86_AVX;
11707 for (uint32_t n = 5; n < 8; n++) {
11708 for (size_t k = 1; k <= 40; k += 9) {
11709 GemmMicrokernelTester()
11710 .mr(2)
11711 .nr(4)
11712 .kr(2)
11713 .sr(1)
11714 .m(2)
11715 .n(n)
11716 .k(k)
11717 .a_stride(43)
11718 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
11719 }
11720 }
11721 }
11722
11723 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, n_gt_4_subtile) {
11724 TEST_REQUIRES_X86_AVX;
11725 for (uint32_t n = 5; n < 8; n++) {
11726 for (size_t k = 1; k <= 40; k += 9) {
11727 for (uint32_t m = 1; m <= 2; m++) {
11728 GemmMicrokernelTester()
11729 .mr(2)
11730 .nr(4)
11731 .kr(2)
11732 .sr(1)
11733 .m(m)
11734 .n(n)
11735 .k(k)
11736 .iterations(1)
11737 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
11738 }
11739 }
11740 }
11741 }
11742
11743 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, n_div_4) {
11744 TEST_REQUIRES_X86_AVX;
11745 for (uint32_t n = 8; n <= 12; n += 4) {
11746 for (size_t k = 1; k <= 40; k += 9) {
11747 GemmMicrokernelTester()
11748 .mr(2)
11749 .nr(4)
11750 .kr(2)
11751 .sr(1)
11752 .m(2)
11753 .n(4)
11754 .k(k)
11755 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
11756 }
11757 }
11758 }
11759
11760 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, n_div_4_strided_cn) {
11761 TEST_REQUIRES_X86_AVX;
11762 for (uint32_t n = 8; n <= 12; n += 4) {
11763 for (size_t k = 1; k <= 40; k += 9) {
11764 GemmMicrokernelTester()
11765 .mr(2)
11766 .nr(4)
11767 .kr(2)
11768 .sr(1)
11769 .m(2)
11770 .n(n)
11771 .k(k)
11772 .cn_stride(7)
11773 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
11774 }
11775 }
11776 }
11777
11778 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, n_div_4_strided_a) {
11779 TEST_REQUIRES_X86_AVX;
11780 for (uint32_t n = 8; n <= 12; n += 4) {
11781 for (size_t k = 1; k <= 40; k += 9) {
11782 GemmMicrokernelTester()
11783 .mr(2)
11784 .nr(4)
11785 .kr(2)
11786 .sr(1)
11787 .m(2)
11788 .n(n)
11789 .k(k)
11790 .a_stride(43)
11791 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
11792 }
11793 }
11794 }
11795
11796 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, n_div_4_subtile) {
11797 TEST_REQUIRES_X86_AVX;
11798 for (uint32_t n = 8; n <= 12; n += 4) {
11799 for (size_t k = 1; k <= 40; k += 9) {
11800 for (uint32_t m = 1; m <= 2; m++) {
11801 GemmMicrokernelTester()
11802 .mr(2)
11803 .nr(4)
11804 .kr(2)
11805 .sr(1)
11806 .m(m)
11807 .n(n)
11808 .k(k)
11809 .iterations(1)
11810 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
11811 }
11812 }
11813 }
11814 }
11815
11816 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, strided_cm_subtile) {
11817 TEST_REQUIRES_X86_AVX;
11818 for (size_t k = 1; k <= 40; k += 9) {
11819 for (uint32_t m = 1; m <= 2; m++) {
11820 for (uint32_t n = 1; n <= 4; n++) {
11821 GemmMicrokernelTester()
11822 .mr(2)
11823 .nr(4)
11824 .kr(2)
11825 .sr(1)
11826 .m(m)
11827 .n(n)
11828 .k(k)
11829 .cm_stride(7)
11830 .iterations(1)
11831 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
11832 }
11833 }
11834 }
11835 }
11836
11837 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, qmin) {
11838 TEST_REQUIRES_X86_AVX;
11839 GemmMicrokernelTester()
11840 .mr(2)
11841 .nr(4)
11842 .kr(2)
11843 .sr(1)
11844 .m(2)
11845 .n(4)
11846 .k(8)
11847 .qmin(128)
11848 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
11849 }
11850
11851 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, qmax) {
11852 TEST_REQUIRES_X86_AVX;
11853 GemmMicrokernelTester()
11854 .mr(2)
11855 .nr(4)
11856 .kr(2)
11857 .sr(1)
11858 .m(2)
11859 .n(4)
11860 .k(8)
11861 .qmax(128)
11862 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
11863 }
11864
11865 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, strided_cm) {
11866 TEST_REQUIRES_X86_AVX;
11867 GemmMicrokernelTester()
11868 .mr(2)
11869 .nr(4)
11870 .kr(2)
11871 .sr(1)
11872 .m(2)
11873 .n(4)
11874 .k(8)
11875 .cm_stride(7)
11876 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
11877 }
11878#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
11879
11880
11881#if XNN_ARCH_X86 || XNN_ARCH_X86_64
11882 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_eq_8) {
11883 TEST_REQUIRES_X86_AVX;
11884 GemmMicrokernelTester()
11885 .mr(3)
11886 .nr(4)
11887 .kr(2)
11888 .sr(1)
11889 .m(3)
11890 .n(4)
11891 .k(8)
11892 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
11893 }
11894
11895 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, strided_cn) {
11896 TEST_REQUIRES_X86_AVX;
11897 GemmMicrokernelTester()
11898 .mr(3)
11899 .nr(4)
11900 .kr(2)
11901 .sr(1)
11902 .m(3)
11903 .n(4)
11904 .k(8)
11905 .cn_stride(7)
11906 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
11907 }
11908
11909 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_eq_8_strided_a) {
11910 TEST_REQUIRES_X86_AVX;
11911 GemmMicrokernelTester()
11912 .mr(3)
11913 .nr(4)
11914 .kr(2)
11915 .sr(1)
11916 .m(3)
11917 .n(4)
11918 .k(8)
11919 .a_stride(11)
11920 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
11921 }
11922
11923 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_eq_8_subtile) {
11924 TEST_REQUIRES_X86_AVX;
11925 for (uint32_t m = 1; m <= 3; m++) {
11926 for (uint32_t n = 1; n <= 4; n++) {
11927 GemmMicrokernelTester()
11928 .mr(3)
11929 .nr(4)
11930 .kr(2)
11931 .sr(1)
11932 .m(m)
11933 .n(n)
11934 .k(8)
11935 .iterations(1)
11936 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
11937 }
11938 }
11939 }
11940
11941 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_eq_8_subtile_m) {
11942 TEST_REQUIRES_X86_AVX;
11943 for (uint32_t m = 1; m <= 3; m++) {
11944 GemmMicrokernelTester()
11945 .mr(3)
11946 .nr(4)
11947 .kr(2)
11948 .sr(1)
11949 .m(m)
11950 .n(4)
11951 .k(8)
11952 .iterations(1)
11953 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
11954 }
11955 }
11956
11957 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_eq_8_subtile_n) {
11958 TEST_REQUIRES_X86_AVX;
11959 for (uint32_t n = 1; n <= 4; n++) {
11960 GemmMicrokernelTester()
11961 .mr(3)
11962 .nr(4)
11963 .kr(2)
11964 .sr(1)
11965 .m(3)
11966 .n(n)
11967 .k(8)
11968 .iterations(1)
11969 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
11970 }
11971 }
11972
11973 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_lt_8) {
11974 TEST_REQUIRES_X86_AVX;
11975 for (size_t k = 1; k < 8; k++) {
11976 GemmMicrokernelTester()
11977 .mr(3)
11978 .nr(4)
11979 .kr(2)
11980 .sr(1)
11981 .m(3)
11982 .n(4)
11983 .k(k)
11984 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
11985 }
11986 }
11987
11988 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_lt_8_strided_a) {
11989 TEST_REQUIRES_X86_AVX;
11990 for (size_t k = 1; k < 8; k++) {
11991 GemmMicrokernelTester()
11992 .mr(3)
11993 .nr(4)
11994 .kr(2)
11995 .sr(1)
11996 .m(3)
11997 .n(4)
11998 .k(k)
11999 .a_stride(11)
12000 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
12001 }
12002 }
12003
12004 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_lt_8_subtile) {
12005 TEST_REQUIRES_X86_AVX;
12006 for (size_t k = 1; k < 8; k++) {
12007 for (uint32_t m = 1; m <= 3; m++) {
12008 for (uint32_t n = 1; n <= 4; n++) {
12009 GemmMicrokernelTester()
12010 .mr(3)
12011 .nr(4)
12012 .kr(2)
12013 .sr(1)
12014 .m(m)
12015 .n(n)
12016 .k(k)
12017 .iterations(1)
12018 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
12019 }
12020 }
12021 }
12022 }
12023
12024 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_gt_8) {
12025 TEST_REQUIRES_X86_AVX;
12026 for (size_t k = 9; k < 16; k++) {
12027 GemmMicrokernelTester()
12028 .mr(3)
12029 .nr(4)
12030 .kr(2)
12031 .sr(1)
12032 .m(3)
12033 .n(4)
12034 .k(k)
12035 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
12036 }
12037 }
12038
12039 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_gt_8_strided_a) {
12040 TEST_REQUIRES_X86_AVX;
12041 for (size_t k = 9; k < 16; k++) {
12042 GemmMicrokernelTester()
12043 .mr(3)
12044 .nr(4)
12045 .kr(2)
12046 .sr(1)
12047 .m(3)
12048 .n(4)
12049 .k(k)
12050 .a_stride(19)
12051 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
12052 }
12053 }
12054
12055 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_gt_8_subtile) {
12056 TEST_REQUIRES_X86_AVX;
12057 for (size_t k = 9; k < 16; k++) {
12058 for (uint32_t m = 1; m <= 3; m++) {
12059 for (uint32_t n = 1; n <= 4; n++) {
12060 GemmMicrokernelTester()
12061 .mr(3)
12062 .nr(4)
12063 .kr(2)
12064 .sr(1)
12065 .m(m)
12066 .n(n)
12067 .k(k)
12068 .iterations(1)
12069 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
12070 }
12071 }
12072 }
12073 }
12074
12075 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_div_8) {
12076 TEST_REQUIRES_X86_AVX;
12077 for (size_t k = 16; k <= 80; k += 8) {
12078 GemmMicrokernelTester()
12079 .mr(3)
12080 .nr(4)
12081 .kr(2)
12082 .sr(1)
12083 .m(3)
12084 .n(4)
12085 .k(k)
12086 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
12087 }
12088 }
12089
12090 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_div_8_strided_a) {
12091 TEST_REQUIRES_X86_AVX;
12092 for (size_t k = 16; k <= 80; k += 8) {
12093 GemmMicrokernelTester()
12094 .mr(3)
12095 .nr(4)
12096 .kr(2)
12097 .sr(1)
12098 .m(3)
12099 .n(4)
12100 .k(k)
12101 .a_stride(83)
12102 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
12103 }
12104 }
12105
12106 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_div_8_subtile) {
12107 TEST_REQUIRES_X86_AVX;
12108 for (size_t k = 16; k <= 80; k += 8) {
12109 for (uint32_t m = 1; m <= 3; m++) {
12110 for (uint32_t n = 1; n <= 4; n++) {
12111 GemmMicrokernelTester()
12112 .mr(3)
12113 .nr(4)
12114 .kr(2)
12115 .sr(1)
12116 .m(m)
12117 .n(n)
12118 .k(k)
12119 .iterations(1)
12120 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
12121 }
12122 }
12123 }
12124 }
12125
12126 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, n_gt_4) {
12127 TEST_REQUIRES_X86_AVX;
12128 for (uint32_t n = 5; n < 8; n++) {
12129 for (size_t k = 1; k <= 40; k += 9) {
12130 GemmMicrokernelTester()
12131 .mr(3)
12132 .nr(4)
12133 .kr(2)
12134 .sr(1)
12135 .m(3)
12136 .n(4)
12137 .k(k)
12138 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
12139 }
12140 }
12141 }
12142
12143 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, n_gt_4_strided_cn) {
12144 TEST_REQUIRES_X86_AVX;
12145 for (uint32_t n = 5; n < 8; n++) {
12146 for (size_t k = 1; k <= 40; k += 9) {
12147 GemmMicrokernelTester()
12148 .mr(3)
12149 .nr(4)
12150 .kr(2)
12151 .sr(1)
12152 .m(3)
12153 .n(4)
12154 .k(k)
12155 .cn_stride(7)
12156 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
12157 }
12158 }
12159 }
12160
12161 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, n_gt_4_strided_a) {
12162 TEST_REQUIRES_X86_AVX;
12163 for (uint32_t n = 5; n < 8; n++) {
12164 for (size_t k = 1; k <= 40; k += 9) {
12165 GemmMicrokernelTester()
12166 .mr(3)
12167 .nr(4)
12168 .kr(2)
12169 .sr(1)
12170 .m(3)
12171 .n(n)
12172 .k(k)
12173 .a_stride(43)
12174 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
12175 }
12176 }
12177 }
12178
12179 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, n_gt_4_subtile) {
12180 TEST_REQUIRES_X86_AVX;
12181 for (uint32_t n = 5; n < 8; n++) {
12182 for (size_t k = 1; k <= 40; k += 9) {
12183 for (uint32_t m = 1; m <= 3; m++) {
12184 GemmMicrokernelTester()
12185 .mr(3)
12186 .nr(4)
12187 .kr(2)
12188 .sr(1)
12189 .m(m)
12190 .n(n)
12191 .k(k)
12192 .iterations(1)
12193 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
12194 }
12195 }
12196 }
12197 }
12198
12199 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, n_div_4) {
12200 TEST_REQUIRES_X86_AVX;
12201 for (uint32_t n = 8; n <= 12; n += 4) {
12202 for (size_t k = 1; k <= 40; k += 9) {
12203 GemmMicrokernelTester()
12204 .mr(3)
12205 .nr(4)
12206 .kr(2)
12207 .sr(1)
12208 .m(3)
12209 .n(4)
12210 .k(k)
12211 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
12212 }
12213 }
12214 }
12215
12216 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, n_div_4_strided_cn) {
12217 TEST_REQUIRES_X86_AVX;
12218 for (uint32_t n = 8; n <= 12; n += 4) {
12219 for (size_t k = 1; k <= 40; k += 9) {
12220 GemmMicrokernelTester()
12221 .mr(3)
12222 .nr(4)
12223 .kr(2)
12224 .sr(1)
12225 .m(3)
12226 .n(n)
12227 .k(k)
12228 .cn_stride(7)
12229 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
12230 }
12231 }
12232 }
12233
12234 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, n_div_4_strided_a) {
12235 TEST_REQUIRES_X86_AVX;
12236 for (uint32_t n = 8; n <= 12; n += 4) {
12237 for (size_t k = 1; k <= 40; k += 9) {
12238 GemmMicrokernelTester()
12239 .mr(3)
12240 .nr(4)
12241 .kr(2)
12242 .sr(1)
12243 .m(3)
12244 .n(n)
12245 .k(k)
12246 .a_stride(43)
12247 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
12248 }
12249 }
12250 }
12251
12252 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, n_div_4_subtile) {
12253 TEST_REQUIRES_X86_AVX;
12254 for (uint32_t n = 8; n <= 12; n += 4) {
12255 for (size_t k = 1; k <= 40; k += 9) {
12256 for (uint32_t m = 1; m <= 3; m++) {
12257 GemmMicrokernelTester()
12258 .mr(3)
12259 .nr(4)
12260 .kr(2)
12261 .sr(1)
12262 .m(m)
12263 .n(n)
12264 .k(k)
12265 .iterations(1)
12266 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
12267 }
12268 }
12269 }
12270 }
12271
12272 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, strided_cm_subtile) {
12273 TEST_REQUIRES_X86_AVX;
12274 for (size_t k = 1; k <= 40; k += 9) {
12275 for (uint32_t m = 1; m <= 3; m++) {
12276 for (uint32_t n = 1; n <= 4; n++) {
12277 GemmMicrokernelTester()
12278 .mr(3)
12279 .nr(4)
12280 .kr(2)
12281 .sr(1)
12282 .m(m)
12283 .n(n)
12284 .k(k)
12285 .cm_stride(7)
12286 .iterations(1)
12287 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
12288 }
12289 }
12290 }
12291 }
12292
12293 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, qmin) {
12294 TEST_REQUIRES_X86_AVX;
12295 GemmMicrokernelTester()
12296 .mr(3)
12297 .nr(4)
12298 .kr(2)
12299 .sr(1)
12300 .m(3)
12301 .n(4)
12302 .k(8)
12303 .qmin(128)
12304 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
12305 }
12306
12307 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, qmax) {
12308 TEST_REQUIRES_X86_AVX;
12309 GemmMicrokernelTester()
12310 .mr(3)
12311 .nr(4)
12312 .kr(2)
12313 .sr(1)
12314 .m(3)
12315 .n(4)
12316 .k(8)
12317 .qmax(128)
12318 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
12319 }
12320
12321 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, strided_cm) {
12322 TEST_REQUIRES_X86_AVX;
12323 GemmMicrokernelTester()
12324 .mr(3)
12325 .nr(4)
12326 .kr(2)
12327 .sr(1)
12328 .m(3)
12329 .n(4)
12330 .k(8)
12331 .cm_stride(7)
12332 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
12333 }
12334#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
12335
12336
12337#if XNN_ARCH_X86 || XNN_ARCH_X86_64
12338 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_eq_8) {
12339 TEST_REQUIRES_X86_AVX;
12340 GemmMicrokernelTester()
12341 .mr(4)
12342 .nr(4)
12343 .kr(2)
12344 .sr(1)
12345 .m(4)
12346 .n(4)
12347 .k(8)
12348 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
12349 }
12350
12351 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, strided_cn) {
12352 TEST_REQUIRES_X86_AVX;
12353 GemmMicrokernelTester()
12354 .mr(4)
12355 .nr(4)
12356 .kr(2)
12357 .sr(1)
12358 .m(4)
12359 .n(4)
12360 .k(8)
12361 .cn_stride(7)
12362 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
12363 }
12364
12365 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_eq_8_strided_a) {
12366 TEST_REQUIRES_X86_AVX;
12367 GemmMicrokernelTester()
12368 .mr(4)
12369 .nr(4)
12370 .kr(2)
12371 .sr(1)
12372 .m(4)
12373 .n(4)
12374 .k(8)
12375 .a_stride(11)
12376 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
12377 }
12378
12379 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_eq_8_subtile) {
12380 TEST_REQUIRES_X86_AVX;
12381 for (uint32_t m = 1; m <= 4; m++) {
12382 for (uint32_t n = 1; n <= 4; n++) {
12383 GemmMicrokernelTester()
12384 .mr(4)
12385 .nr(4)
12386 .kr(2)
12387 .sr(1)
12388 .m(m)
12389 .n(n)
12390 .k(8)
12391 .iterations(1)
12392 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
12393 }
12394 }
12395 }
12396
12397 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_eq_8_subtile_m) {
12398 TEST_REQUIRES_X86_AVX;
12399 for (uint32_t m = 1; m <= 4; m++) {
12400 GemmMicrokernelTester()
12401 .mr(4)
12402 .nr(4)
12403 .kr(2)
12404 .sr(1)
12405 .m(m)
12406 .n(4)
12407 .k(8)
12408 .iterations(1)
12409 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
12410 }
12411 }
12412
12413 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_eq_8_subtile_n) {
12414 TEST_REQUIRES_X86_AVX;
12415 for (uint32_t n = 1; n <= 4; n++) {
12416 GemmMicrokernelTester()
12417 .mr(4)
12418 .nr(4)
12419 .kr(2)
12420 .sr(1)
12421 .m(4)
12422 .n(n)
12423 .k(8)
12424 .iterations(1)
12425 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
12426 }
12427 }
12428
12429 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_lt_8) {
12430 TEST_REQUIRES_X86_AVX;
12431 for (size_t k = 1; k < 8; k++) {
12432 GemmMicrokernelTester()
12433 .mr(4)
12434 .nr(4)
12435 .kr(2)
12436 .sr(1)
12437 .m(4)
12438 .n(4)
12439 .k(k)
12440 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
12441 }
12442 }
12443
12444 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_lt_8_strided_a) {
12445 TEST_REQUIRES_X86_AVX;
12446 for (size_t k = 1; k < 8; k++) {
12447 GemmMicrokernelTester()
12448 .mr(4)
12449 .nr(4)
12450 .kr(2)
12451 .sr(1)
12452 .m(4)
12453 .n(4)
12454 .k(k)
12455 .a_stride(11)
12456 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
12457 }
12458 }
12459
12460 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_lt_8_subtile) {
12461 TEST_REQUIRES_X86_AVX;
12462 for (size_t k = 1; k < 8; k++) {
12463 for (uint32_t m = 1; m <= 4; m++) {
12464 for (uint32_t n = 1; n <= 4; n++) {
12465 GemmMicrokernelTester()
12466 .mr(4)
12467 .nr(4)
12468 .kr(2)
12469 .sr(1)
12470 .m(m)
12471 .n(n)
12472 .k(k)
12473 .iterations(1)
12474 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
12475 }
12476 }
12477 }
12478 }
12479
12480 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_gt_8) {
12481 TEST_REQUIRES_X86_AVX;
12482 for (size_t k = 9; k < 16; k++) {
12483 GemmMicrokernelTester()
12484 .mr(4)
12485 .nr(4)
12486 .kr(2)
12487 .sr(1)
12488 .m(4)
12489 .n(4)
12490 .k(k)
12491 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
12492 }
12493 }
12494
12495 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_gt_8_strided_a) {
12496 TEST_REQUIRES_X86_AVX;
12497 for (size_t k = 9; k < 16; k++) {
12498 GemmMicrokernelTester()
12499 .mr(4)
12500 .nr(4)
12501 .kr(2)
12502 .sr(1)
12503 .m(4)
12504 .n(4)
12505 .k(k)
12506 .a_stride(19)
12507 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
12508 }
12509 }
12510
12511 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_gt_8_subtile) {
12512 TEST_REQUIRES_X86_AVX;
12513 for (size_t k = 9; k < 16; k++) {
12514 for (uint32_t m = 1; m <= 4; m++) {
12515 for (uint32_t n = 1; n <= 4; n++) {
12516 GemmMicrokernelTester()
12517 .mr(4)
12518 .nr(4)
12519 .kr(2)
12520 .sr(1)
12521 .m(m)
12522 .n(n)
12523 .k(k)
12524 .iterations(1)
12525 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
12526 }
12527 }
12528 }
12529 }
12530
12531 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_div_8) {
12532 TEST_REQUIRES_X86_AVX;
12533 for (size_t k = 16; k <= 80; k += 8) {
12534 GemmMicrokernelTester()
12535 .mr(4)
12536 .nr(4)
12537 .kr(2)
12538 .sr(1)
12539 .m(4)
12540 .n(4)
12541 .k(k)
12542 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
12543 }
12544 }
12545
12546 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_div_8_strided_a) {
12547 TEST_REQUIRES_X86_AVX;
12548 for (size_t k = 16; k <= 80; k += 8) {
12549 GemmMicrokernelTester()
12550 .mr(4)
12551 .nr(4)
12552 .kr(2)
12553 .sr(1)
12554 .m(4)
12555 .n(4)
12556 .k(k)
12557 .a_stride(83)
12558 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
12559 }
12560 }
12561
12562 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_div_8_subtile) {
12563 TEST_REQUIRES_X86_AVX;
12564 for (size_t k = 16; k <= 80; k += 8) {
12565 for (uint32_t m = 1; m <= 4; m++) {
12566 for (uint32_t n = 1; n <= 4; n++) {
12567 GemmMicrokernelTester()
12568 .mr(4)
12569 .nr(4)
12570 .kr(2)
12571 .sr(1)
12572 .m(m)
12573 .n(n)
12574 .k(k)
12575 .iterations(1)
12576 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
12577 }
12578 }
12579 }
12580 }
12581
12582 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, n_gt_4) {
12583 TEST_REQUIRES_X86_AVX;
12584 for (uint32_t n = 5; n < 8; n++) {
12585 for (size_t k = 1; k <= 40; k += 9) {
12586 GemmMicrokernelTester()
12587 .mr(4)
12588 .nr(4)
12589 .kr(2)
12590 .sr(1)
12591 .m(4)
12592 .n(4)
12593 .k(k)
12594 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
12595 }
12596 }
12597 }
12598
12599 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, n_gt_4_strided_cn) {
12600 TEST_REQUIRES_X86_AVX;
12601 for (uint32_t n = 5; n < 8; n++) {
12602 for (size_t k = 1; k <= 40; k += 9) {
12603 GemmMicrokernelTester()
12604 .mr(4)
12605 .nr(4)
12606 .kr(2)
12607 .sr(1)
12608 .m(4)
12609 .n(4)
12610 .k(k)
12611 .cn_stride(7)
12612 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
12613 }
12614 }
12615 }
12616
12617 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, n_gt_4_strided_a) {
12618 TEST_REQUIRES_X86_AVX;
12619 for (uint32_t n = 5; n < 8; n++) {
12620 for (size_t k = 1; k <= 40; k += 9) {
12621 GemmMicrokernelTester()
12622 .mr(4)
12623 .nr(4)
12624 .kr(2)
12625 .sr(1)
12626 .m(4)
12627 .n(n)
12628 .k(k)
12629 .a_stride(43)
12630 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
12631 }
12632 }
12633 }
12634
12635 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, n_gt_4_subtile) {
12636 TEST_REQUIRES_X86_AVX;
12637 for (uint32_t n = 5; n < 8; n++) {
12638 for (size_t k = 1; k <= 40; k += 9) {
12639 for (uint32_t m = 1; m <= 4; m++) {
12640 GemmMicrokernelTester()
12641 .mr(4)
12642 .nr(4)
12643 .kr(2)
12644 .sr(1)
12645 .m(m)
12646 .n(n)
12647 .k(k)
12648 .iterations(1)
12649 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
12650 }
12651 }
12652 }
12653 }
12654
12655 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, n_div_4) {
12656 TEST_REQUIRES_X86_AVX;
12657 for (uint32_t n = 8; n <= 12; n += 4) {
12658 for (size_t k = 1; k <= 40; k += 9) {
12659 GemmMicrokernelTester()
12660 .mr(4)
12661 .nr(4)
12662 .kr(2)
12663 .sr(1)
12664 .m(4)
12665 .n(4)
12666 .k(k)
12667 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
12668 }
12669 }
12670 }
12671
12672 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, n_div_4_strided_cn) {
12673 TEST_REQUIRES_X86_AVX;
12674 for (uint32_t n = 8; n <= 12; n += 4) {
12675 for (size_t k = 1; k <= 40; k += 9) {
12676 GemmMicrokernelTester()
12677 .mr(4)
12678 .nr(4)
12679 .kr(2)
12680 .sr(1)
12681 .m(4)
12682 .n(n)
12683 .k(k)
12684 .cn_stride(7)
12685 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
12686 }
12687 }
12688 }
12689
12690 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, n_div_4_strided_a) {
12691 TEST_REQUIRES_X86_AVX;
12692 for (uint32_t n = 8; n <= 12; n += 4) {
12693 for (size_t k = 1; k <= 40; k += 9) {
12694 GemmMicrokernelTester()
12695 .mr(4)
12696 .nr(4)
12697 .kr(2)
12698 .sr(1)
12699 .m(4)
12700 .n(n)
12701 .k(k)
12702 .a_stride(43)
12703 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
12704 }
12705 }
12706 }
12707
12708 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, n_div_4_subtile) {
12709 TEST_REQUIRES_X86_AVX;
12710 for (uint32_t n = 8; n <= 12; n += 4) {
12711 for (size_t k = 1; k <= 40; k += 9) {
12712 for (uint32_t m = 1; m <= 4; m++) {
12713 GemmMicrokernelTester()
12714 .mr(4)
12715 .nr(4)
12716 .kr(2)
12717 .sr(1)
12718 .m(m)
12719 .n(n)
12720 .k(k)
12721 .iterations(1)
12722 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
12723 }
12724 }
12725 }
12726 }
12727
12728 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, strided_cm_subtile) {
12729 TEST_REQUIRES_X86_AVX;
12730 for (size_t k = 1; k <= 40; k += 9) {
12731 for (uint32_t m = 1; m <= 4; m++) {
12732 for (uint32_t n = 1; n <= 4; n++) {
12733 GemmMicrokernelTester()
12734 .mr(4)
12735 .nr(4)
12736 .kr(2)
12737 .sr(1)
12738 .m(m)
12739 .n(n)
12740 .k(k)
12741 .cm_stride(7)
12742 .iterations(1)
12743 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
12744 }
12745 }
12746 }
12747 }
12748
12749 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, qmin) {
12750 TEST_REQUIRES_X86_AVX;
12751 GemmMicrokernelTester()
12752 .mr(4)
12753 .nr(4)
12754 .kr(2)
12755 .sr(1)
12756 .m(4)
12757 .n(4)
12758 .k(8)
12759 .qmin(128)
12760 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
12761 }
12762
12763 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, qmax) {
12764 TEST_REQUIRES_X86_AVX;
12765 GemmMicrokernelTester()
12766 .mr(4)
12767 .nr(4)
12768 .kr(2)
12769 .sr(1)
12770 .m(4)
12771 .n(4)
12772 .k(8)
12773 .qmax(128)
12774 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
12775 }
12776
12777 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, strided_cm) {
12778 TEST_REQUIRES_X86_AVX;
12779 GemmMicrokernelTester()
12780 .mr(4)
12781 .nr(4)
12782 .kr(2)
12783 .sr(1)
12784 .m(4)
12785 .n(4)
12786 .k(8)
12787 .cm_stride(7)
12788 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
12789 }
12790#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
12791
12792
12793#if XNN_ARCH_X86 || XNN_ARCH_X86_64
12794 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_eq_8) {
12795 TEST_REQUIRES_X86_XOP;
12796 GemmMicrokernelTester()
12797 .mr(1)
12798 .nr(4)
12799 .kr(2)
12800 .sr(1)
12801 .m(1)
12802 .n(4)
12803 .k(8)
12804 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
12805 }
12806
12807 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, strided_cn) {
12808 TEST_REQUIRES_X86_XOP;
12809 GemmMicrokernelTester()
12810 .mr(1)
12811 .nr(4)
12812 .kr(2)
12813 .sr(1)
12814 .m(1)
12815 .n(4)
12816 .k(8)
12817 .cn_stride(7)
12818 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
12819 }
12820
12821 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_eq_8_strided_a) {
12822 TEST_REQUIRES_X86_XOP;
12823 GemmMicrokernelTester()
12824 .mr(1)
12825 .nr(4)
12826 .kr(2)
12827 .sr(1)
12828 .m(1)
12829 .n(4)
12830 .k(8)
12831 .a_stride(11)
12832 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
12833 }
12834
12835 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_eq_8_subtile) {
12836 TEST_REQUIRES_X86_XOP;
12837 for (uint32_t m = 1; m <= 1; m++) {
12838 for (uint32_t n = 1; n <= 4; n++) {
12839 GemmMicrokernelTester()
12840 .mr(1)
12841 .nr(4)
12842 .kr(2)
12843 .sr(1)
12844 .m(m)
12845 .n(n)
12846 .k(8)
12847 .iterations(1)
12848 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
12849 }
12850 }
12851 }
12852
12853 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_eq_8_subtile_m) {
12854 TEST_REQUIRES_X86_XOP;
12855 for (uint32_t m = 1; m <= 1; m++) {
12856 GemmMicrokernelTester()
12857 .mr(1)
12858 .nr(4)
12859 .kr(2)
12860 .sr(1)
12861 .m(m)
12862 .n(4)
12863 .k(8)
12864 .iterations(1)
12865 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
12866 }
12867 }
12868
12869 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_eq_8_subtile_n) {
12870 TEST_REQUIRES_X86_XOP;
12871 for (uint32_t n = 1; n <= 4; n++) {
12872 GemmMicrokernelTester()
12873 .mr(1)
12874 .nr(4)
12875 .kr(2)
12876 .sr(1)
12877 .m(1)
12878 .n(n)
12879 .k(8)
12880 .iterations(1)
12881 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
12882 }
12883 }
12884
12885 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_lt_8) {
12886 TEST_REQUIRES_X86_XOP;
12887 for (size_t k = 1; k < 8; k++) {
12888 GemmMicrokernelTester()
12889 .mr(1)
12890 .nr(4)
12891 .kr(2)
12892 .sr(1)
12893 .m(1)
12894 .n(4)
12895 .k(k)
12896 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
12897 }
12898 }
12899
12900 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_lt_8_strided_a) {
12901 TEST_REQUIRES_X86_XOP;
12902 for (size_t k = 1; k < 8; k++) {
12903 GemmMicrokernelTester()
12904 .mr(1)
12905 .nr(4)
12906 .kr(2)
12907 .sr(1)
12908 .m(1)
12909 .n(4)
12910 .k(k)
12911 .a_stride(11)
12912 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
12913 }
12914 }
12915
12916 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_lt_8_subtile) {
12917 TEST_REQUIRES_X86_XOP;
12918 for (size_t k = 1; k < 8; k++) {
12919 for (uint32_t m = 1; m <= 1; m++) {
12920 for (uint32_t n = 1; n <= 4; n++) {
12921 GemmMicrokernelTester()
12922 .mr(1)
12923 .nr(4)
12924 .kr(2)
12925 .sr(1)
12926 .m(m)
12927 .n(n)
12928 .k(k)
12929 .iterations(1)
12930 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
12931 }
12932 }
12933 }
12934 }
12935
12936 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_gt_8) {
12937 TEST_REQUIRES_X86_XOP;
12938 for (size_t k = 9; k < 16; k++) {
12939 GemmMicrokernelTester()
12940 .mr(1)
12941 .nr(4)
12942 .kr(2)
12943 .sr(1)
12944 .m(1)
12945 .n(4)
12946 .k(k)
12947 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
12948 }
12949 }
12950
12951 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_gt_8_strided_a) {
12952 TEST_REQUIRES_X86_XOP;
12953 for (size_t k = 9; k < 16; k++) {
12954 GemmMicrokernelTester()
12955 .mr(1)
12956 .nr(4)
12957 .kr(2)
12958 .sr(1)
12959 .m(1)
12960 .n(4)
12961 .k(k)
12962 .a_stride(19)
12963 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
12964 }
12965 }
12966
12967 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_gt_8_subtile) {
12968 TEST_REQUIRES_X86_XOP;
12969 for (size_t k = 9; k < 16; k++) {
12970 for (uint32_t m = 1; m <= 1; m++) {
12971 for (uint32_t n = 1; n <= 4; n++) {
12972 GemmMicrokernelTester()
12973 .mr(1)
12974 .nr(4)
12975 .kr(2)
12976 .sr(1)
12977 .m(m)
12978 .n(n)
12979 .k(k)
12980 .iterations(1)
12981 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
12982 }
12983 }
12984 }
12985 }
12986
12987 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_div_8) {
12988 TEST_REQUIRES_X86_XOP;
12989 for (size_t k = 16; k <= 80; k += 8) {
12990 GemmMicrokernelTester()
12991 .mr(1)
12992 .nr(4)
12993 .kr(2)
12994 .sr(1)
12995 .m(1)
12996 .n(4)
12997 .k(k)
12998 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
12999 }
13000 }
13001
13002 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_div_8_strided_a) {
13003 TEST_REQUIRES_X86_XOP;
13004 for (size_t k = 16; k <= 80; k += 8) {
13005 GemmMicrokernelTester()
13006 .mr(1)
13007 .nr(4)
13008 .kr(2)
13009 .sr(1)
13010 .m(1)
13011 .n(4)
13012 .k(k)
13013 .a_stride(83)
13014 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
13015 }
13016 }
13017
13018 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_div_8_subtile) {
13019 TEST_REQUIRES_X86_XOP;
13020 for (size_t k = 16; k <= 80; k += 8) {
13021 for (uint32_t m = 1; m <= 1; m++) {
13022 for (uint32_t n = 1; n <= 4; n++) {
13023 GemmMicrokernelTester()
13024 .mr(1)
13025 .nr(4)
13026 .kr(2)
13027 .sr(1)
13028 .m(m)
13029 .n(n)
13030 .k(k)
13031 .iterations(1)
13032 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
13033 }
13034 }
13035 }
13036 }
13037
13038 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, n_gt_4) {
13039 TEST_REQUIRES_X86_XOP;
13040 for (uint32_t n = 5; n < 8; n++) {
13041 for (size_t k = 1; k <= 40; k += 9) {
13042 GemmMicrokernelTester()
13043 .mr(1)
13044 .nr(4)
13045 .kr(2)
13046 .sr(1)
13047 .m(1)
13048 .n(4)
13049 .k(k)
13050 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
13051 }
13052 }
13053 }
13054
13055 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, n_gt_4_strided_cn) {
13056 TEST_REQUIRES_X86_XOP;
13057 for (uint32_t n = 5; n < 8; n++) {
13058 for (size_t k = 1; k <= 40; k += 9) {
13059 GemmMicrokernelTester()
13060 .mr(1)
13061 .nr(4)
13062 .kr(2)
13063 .sr(1)
13064 .m(1)
13065 .n(4)
13066 .k(k)
13067 .cn_stride(7)
13068 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
13069 }
13070 }
13071 }
13072
13073 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, n_gt_4_strided_a) {
13074 TEST_REQUIRES_X86_XOP;
13075 for (uint32_t n = 5; n < 8; n++) {
13076 for (size_t k = 1; k <= 40; k += 9) {
13077 GemmMicrokernelTester()
13078 .mr(1)
13079 .nr(4)
13080 .kr(2)
13081 .sr(1)
13082 .m(1)
13083 .n(n)
13084 .k(k)
13085 .a_stride(43)
13086 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
13087 }
13088 }
13089 }
13090
13091 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, n_gt_4_subtile) {
13092 TEST_REQUIRES_X86_XOP;
13093 for (uint32_t n = 5; n < 8; n++) {
13094 for (size_t k = 1; k <= 40; k += 9) {
13095 for (uint32_t m = 1; m <= 1; m++) {
13096 GemmMicrokernelTester()
13097 .mr(1)
13098 .nr(4)
13099 .kr(2)
13100 .sr(1)
13101 .m(m)
13102 .n(n)
13103 .k(k)
13104 .iterations(1)
13105 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
13106 }
13107 }
13108 }
13109 }
13110
13111 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, n_div_4) {
13112 TEST_REQUIRES_X86_XOP;
13113 for (uint32_t n = 8; n <= 12; n += 4) {
13114 for (size_t k = 1; k <= 40; k += 9) {
13115 GemmMicrokernelTester()
13116 .mr(1)
13117 .nr(4)
13118 .kr(2)
13119 .sr(1)
13120 .m(1)
13121 .n(4)
13122 .k(k)
13123 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
13124 }
13125 }
13126 }
13127
13128 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, n_div_4_strided_cn) {
13129 TEST_REQUIRES_X86_XOP;
13130 for (uint32_t n = 8; n <= 12; n += 4) {
13131 for (size_t k = 1; k <= 40; k += 9) {
13132 GemmMicrokernelTester()
13133 .mr(1)
13134 .nr(4)
13135 .kr(2)
13136 .sr(1)
13137 .m(1)
13138 .n(n)
13139 .k(k)
13140 .cn_stride(7)
13141 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
13142 }
13143 }
13144 }
13145
13146 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, n_div_4_strided_a) {
13147 TEST_REQUIRES_X86_XOP;
13148 for (uint32_t n = 8; n <= 12; n += 4) {
13149 for (size_t k = 1; k <= 40; k += 9) {
13150 GemmMicrokernelTester()
13151 .mr(1)
13152 .nr(4)
13153 .kr(2)
13154 .sr(1)
13155 .m(1)
13156 .n(n)
13157 .k(k)
13158 .a_stride(43)
13159 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
13160 }
13161 }
13162 }
13163
13164 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, n_div_4_subtile) {
13165 TEST_REQUIRES_X86_XOP;
13166 for (uint32_t n = 8; n <= 12; n += 4) {
13167 for (size_t k = 1; k <= 40; k += 9) {
13168 for (uint32_t m = 1; m <= 1; m++) {
13169 GemmMicrokernelTester()
13170 .mr(1)
13171 .nr(4)
13172 .kr(2)
13173 .sr(1)
13174 .m(m)
13175 .n(n)
13176 .k(k)
13177 .iterations(1)
13178 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
13179 }
13180 }
13181 }
13182 }
13183
13184 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, strided_cm_subtile) {
13185 TEST_REQUIRES_X86_XOP;
13186 for (size_t k = 1; k <= 40; k += 9) {
13187 for (uint32_t m = 1; m <= 1; m++) {
13188 for (uint32_t n = 1; n <= 4; n++) {
13189 GemmMicrokernelTester()
13190 .mr(1)
13191 .nr(4)
13192 .kr(2)
13193 .sr(1)
13194 .m(m)
13195 .n(n)
13196 .k(k)
13197 .cm_stride(7)
13198 .iterations(1)
13199 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
13200 }
13201 }
13202 }
13203 }
13204
13205 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, qmin) {
13206 TEST_REQUIRES_X86_XOP;
13207 GemmMicrokernelTester()
13208 .mr(1)
13209 .nr(4)
13210 .kr(2)
13211 .sr(1)
13212 .m(1)
13213 .n(4)
13214 .k(8)
13215 .qmin(128)
13216 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
13217 }
13218
13219 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, qmax) {
13220 TEST_REQUIRES_X86_XOP;
13221 GemmMicrokernelTester()
13222 .mr(1)
13223 .nr(4)
13224 .kr(2)
13225 .sr(1)
13226 .m(1)
13227 .n(4)
13228 .k(8)
13229 .qmax(128)
13230 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
13231 }
13232
13233 TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, strided_cm) {
13234 TEST_REQUIRES_X86_XOP;
13235 GemmMicrokernelTester()
13236 .mr(1)
13237 .nr(4)
13238 .kr(2)
13239 .sr(1)
13240 .m(1)
13241 .n(4)
13242 .k(8)
13243 .cm_stride(7)
13244 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
13245 }
13246#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
13247
13248
13249#if XNN_ARCH_X86 || XNN_ARCH_X86_64
13250 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_eq_8) {
13251 TEST_REQUIRES_X86_XOP;
13252 GemmMicrokernelTester()
13253 .mr(2)
13254 .nr(4)
13255 .kr(2)
13256 .sr(1)
13257 .m(2)
13258 .n(4)
13259 .k(8)
13260 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
13261 }
13262
13263 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, strided_cn) {
13264 TEST_REQUIRES_X86_XOP;
13265 GemmMicrokernelTester()
13266 .mr(2)
13267 .nr(4)
13268 .kr(2)
13269 .sr(1)
13270 .m(2)
13271 .n(4)
13272 .k(8)
13273 .cn_stride(7)
13274 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
13275 }
13276
13277 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_eq_8_strided_a) {
13278 TEST_REQUIRES_X86_XOP;
13279 GemmMicrokernelTester()
13280 .mr(2)
13281 .nr(4)
13282 .kr(2)
13283 .sr(1)
13284 .m(2)
13285 .n(4)
13286 .k(8)
13287 .a_stride(11)
13288 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
13289 }
13290
13291 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_eq_8_subtile) {
13292 TEST_REQUIRES_X86_XOP;
13293 for (uint32_t m = 1; m <= 2; m++) {
13294 for (uint32_t n = 1; n <= 4; n++) {
13295 GemmMicrokernelTester()
13296 .mr(2)
13297 .nr(4)
13298 .kr(2)
13299 .sr(1)
13300 .m(m)
13301 .n(n)
13302 .k(8)
13303 .iterations(1)
13304 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
13305 }
13306 }
13307 }
13308
13309 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_eq_8_subtile_m) {
13310 TEST_REQUIRES_X86_XOP;
13311 for (uint32_t m = 1; m <= 2; m++) {
13312 GemmMicrokernelTester()
13313 .mr(2)
13314 .nr(4)
13315 .kr(2)
13316 .sr(1)
13317 .m(m)
13318 .n(4)
13319 .k(8)
13320 .iterations(1)
13321 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
13322 }
13323 }
13324
13325 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_eq_8_subtile_n) {
13326 TEST_REQUIRES_X86_XOP;
13327 for (uint32_t n = 1; n <= 4; n++) {
13328 GemmMicrokernelTester()
13329 .mr(2)
13330 .nr(4)
13331 .kr(2)
13332 .sr(1)
13333 .m(2)
13334 .n(n)
13335 .k(8)
13336 .iterations(1)
13337 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
13338 }
13339 }
13340
13341 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_lt_8) {
13342 TEST_REQUIRES_X86_XOP;
13343 for (size_t k = 1; k < 8; k++) {
13344 GemmMicrokernelTester()
13345 .mr(2)
13346 .nr(4)
13347 .kr(2)
13348 .sr(1)
13349 .m(2)
13350 .n(4)
13351 .k(k)
13352 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
13353 }
13354 }
13355
13356 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_lt_8_strided_a) {
13357 TEST_REQUIRES_X86_XOP;
13358 for (size_t k = 1; k < 8; k++) {
13359 GemmMicrokernelTester()
13360 .mr(2)
13361 .nr(4)
13362 .kr(2)
13363 .sr(1)
13364 .m(2)
13365 .n(4)
13366 .k(k)
13367 .a_stride(11)
13368 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
13369 }
13370 }
13371
13372 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_lt_8_subtile) {
13373 TEST_REQUIRES_X86_XOP;
13374 for (size_t k = 1; k < 8; k++) {
13375 for (uint32_t m = 1; m <= 2; m++) {
13376 for (uint32_t n = 1; n <= 4; n++) {
13377 GemmMicrokernelTester()
13378 .mr(2)
13379 .nr(4)
13380 .kr(2)
13381 .sr(1)
13382 .m(m)
13383 .n(n)
13384 .k(k)
13385 .iterations(1)
13386 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
13387 }
13388 }
13389 }
13390 }
13391
13392 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_gt_8) {
13393 TEST_REQUIRES_X86_XOP;
13394 for (size_t k = 9; k < 16; k++) {
13395 GemmMicrokernelTester()
13396 .mr(2)
13397 .nr(4)
13398 .kr(2)
13399 .sr(1)
13400 .m(2)
13401 .n(4)
13402 .k(k)
13403 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
13404 }
13405 }
13406
13407 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_gt_8_strided_a) {
13408 TEST_REQUIRES_X86_XOP;
13409 for (size_t k = 9; k < 16; k++) {
13410 GemmMicrokernelTester()
13411 .mr(2)
13412 .nr(4)
13413 .kr(2)
13414 .sr(1)
13415 .m(2)
13416 .n(4)
13417 .k(k)
13418 .a_stride(19)
13419 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
13420 }
13421 }
13422
13423 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_gt_8_subtile) {
13424 TEST_REQUIRES_X86_XOP;
13425 for (size_t k = 9; k < 16; k++) {
13426 for (uint32_t m = 1; m <= 2; m++) {
13427 for (uint32_t n = 1; n <= 4; n++) {
13428 GemmMicrokernelTester()
13429 .mr(2)
13430 .nr(4)
13431 .kr(2)
13432 .sr(1)
13433 .m(m)
13434 .n(n)
13435 .k(k)
13436 .iterations(1)
13437 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
13438 }
13439 }
13440 }
13441 }
13442
13443 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_div_8) {
13444 TEST_REQUIRES_X86_XOP;
13445 for (size_t k = 16; k <= 80; k += 8) {
13446 GemmMicrokernelTester()
13447 .mr(2)
13448 .nr(4)
13449 .kr(2)
13450 .sr(1)
13451 .m(2)
13452 .n(4)
13453 .k(k)
13454 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
13455 }
13456 }
13457
13458 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_div_8_strided_a) {
13459 TEST_REQUIRES_X86_XOP;
13460 for (size_t k = 16; k <= 80; k += 8) {
13461 GemmMicrokernelTester()
13462 .mr(2)
13463 .nr(4)
13464 .kr(2)
13465 .sr(1)
13466 .m(2)
13467 .n(4)
13468 .k(k)
13469 .a_stride(83)
13470 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
13471 }
13472 }
13473
13474 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_div_8_subtile) {
13475 TEST_REQUIRES_X86_XOP;
13476 for (size_t k = 16; k <= 80; k += 8) {
13477 for (uint32_t m = 1; m <= 2; m++) {
13478 for (uint32_t n = 1; n <= 4; n++) {
13479 GemmMicrokernelTester()
13480 .mr(2)
13481 .nr(4)
13482 .kr(2)
13483 .sr(1)
13484 .m(m)
13485 .n(n)
13486 .k(k)
13487 .iterations(1)
13488 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
13489 }
13490 }
13491 }
13492 }
13493
13494 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, n_gt_4) {
13495 TEST_REQUIRES_X86_XOP;
13496 for (uint32_t n = 5; n < 8; n++) {
13497 for (size_t k = 1; k <= 40; k += 9) {
13498 GemmMicrokernelTester()
13499 .mr(2)
13500 .nr(4)
13501 .kr(2)
13502 .sr(1)
13503 .m(2)
13504 .n(4)
13505 .k(k)
13506 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
13507 }
13508 }
13509 }
13510
13511 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, n_gt_4_strided_cn) {
13512 TEST_REQUIRES_X86_XOP;
13513 for (uint32_t n = 5; n < 8; n++) {
13514 for (size_t k = 1; k <= 40; k += 9) {
13515 GemmMicrokernelTester()
13516 .mr(2)
13517 .nr(4)
13518 .kr(2)
13519 .sr(1)
13520 .m(2)
13521 .n(4)
13522 .k(k)
13523 .cn_stride(7)
13524 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
13525 }
13526 }
13527 }
13528
13529 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, n_gt_4_strided_a) {
13530 TEST_REQUIRES_X86_XOP;
13531 for (uint32_t n = 5; n < 8; n++) {
13532 for (size_t k = 1; k <= 40; k += 9) {
13533 GemmMicrokernelTester()
13534 .mr(2)
13535 .nr(4)
13536 .kr(2)
13537 .sr(1)
13538 .m(2)
13539 .n(n)
13540 .k(k)
13541 .a_stride(43)
13542 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
13543 }
13544 }
13545 }
13546
13547 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, n_gt_4_subtile) {
13548 TEST_REQUIRES_X86_XOP;
13549 for (uint32_t n = 5; n < 8; n++) {
13550 for (size_t k = 1; k <= 40; k += 9) {
13551 for (uint32_t m = 1; m <= 2; m++) {
13552 GemmMicrokernelTester()
13553 .mr(2)
13554 .nr(4)
13555 .kr(2)
13556 .sr(1)
13557 .m(m)
13558 .n(n)
13559 .k(k)
13560 .iterations(1)
13561 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
13562 }
13563 }
13564 }
13565 }
13566
13567 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, n_div_4) {
13568 TEST_REQUIRES_X86_XOP;
13569 for (uint32_t n = 8; n <= 12; n += 4) {
13570 for (size_t k = 1; k <= 40; k += 9) {
13571 GemmMicrokernelTester()
13572 .mr(2)
13573 .nr(4)
13574 .kr(2)
13575 .sr(1)
13576 .m(2)
13577 .n(4)
13578 .k(k)
13579 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
13580 }
13581 }
13582 }
13583
13584 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, n_div_4_strided_cn) {
13585 TEST_REQUIRES_X86_XOP;
13586 for (uint32_t n = 8; n <= 12; n += 4) {
13587 for (size_t k = 1; k <= 40; k += 9) {
13588 GemmMicrokernelTester()
13589 .mr(2)
13590 .nr(4)
13591 .kr(2)
13592 .sr(1)
13593 .m(2)
13594 .n(n)
13595 .k(k)
13596 .cn_stride(7)
13597 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
13598 }
13599 }
13600 }
13601
13602 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, n_div_4_strided_a) {
13603 TEST_REQUIRES_X86_XOP;
13604 for (uint32_t n = 8; n <= 12; n += 4) {
13605 for (size_t k = 1; k <= 40; k += 9) {
13606 GemmMicrokernelTester()
13607 .mr(2)
13608 .nr(4)
13609 .kr(2)
13610 .sr(1)
13611 .m(2)
13612 .n(n)
13613 .k(k)
13614 .a_stride(43)
13615 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
13616 }
13617 }
13618 }
13619
13620 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, n_div_4_subtile) {
13621 TEST_REQUIRES_X86_XOP;
13622 for (uint32_t n = 8; n <= 12; n += 4) {
13623 for (size_t k = 1; k <= 40; k += 9) {
13624 for (uint32_t m = 1; m <= 2; m++) {
13625 GemmMicrokernelTester()
13626 .mr(2)
13627 .nr(4)
13628 .kr(2)
13629 .sr(1)
13630 .m(m)
13631 .n(n)
13632 .k(k)
13633 .iterations(1)
13634 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
13635 }
13636 }
13637 }
13638 }
13639
13640 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, strided_cm_subtile) {
13641 TEST_REQUIRES_X86_XOP;
13642 for (size_t k = 1; k <= 40; k += 9) {
13643 for (uint32_t m = 1; m <= 2; m++) {
13644 for (uint32_t n = 1; n <= 4; n++) {
13645 GemmMicrokernelTester()
13646 .mr(2)
13647 .nr(4)
13648 .kr(2)
13649 .sr(1)
13650 .m(m)
13651 .n(n)
13652 .k(k)
13653 .cm_stride(7)
13654 .iterations(1)
13655 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
13656 }
13657 }
13658 }
13659 }
13660
13661 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, qmin) {
13662 TEST_REQUIRES_X86_XOP;
13663 GemmMicrokernelTester()
13664 .mr(2)
13665 .nr(4)
13666 .kr(2)
13667 .sr(1)
13668 .m(2)
13669 .n(4)
13670 .k(8)
13671 .qmin(128)
13672 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
13673 }
13674
13675 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, qmax) {
13676 TEST_REQUIRES_X86_XOP;
13677 GemmMicrokernelTester()
13678 .mr(2)
13679 .nr(4)
13680 .kr(2)
13681 .sr(1)
13682 .m(2)
13683 .n(4)
13684 .k(8)
13685 .qmax(128)
13686 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
13687 }
13688
13689 TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, strided_cm) {
13690 TEST_REQUIRES_X86_XOP;
13691 GemmMicrokernelTester()
13692 .mr(2)
13693 .nr(4)
13694 .kr(2)
13695 .sr(1)
13696 .m(2)
13697 .n(4)
13698 .k(8)
13699 .cm_stride(7)
13700 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
13701 }
13702#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
13703
13704
13705#if XNN_ARCH_X86 || XNN_ARCH_X86_64
13706 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_eq_8) {
13707 TEST_REQUIRES_X86_XOP;
13708 GemmMicrokernelTester()
13709 .mr(3)
13710 .nr(4)
13711 .kr(2)
13712 .sr(1)
13713 .m(3)
13714 .n(4)
13715 .k(8)
13716 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
13717 }
13718
13719 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, strided_cn) {
13720 TEST_REQUIRES_X86_XOP;
13721 GemmMicrokernelTester()
13722 .mr(3)
13723 .nr(4)
13724 .kr(2)
13725 .sr(1)
13726 .m(3)
13727 .n(4)
13728 .k(8)
13729 .cn_stride(7)
13730 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
13731 }
13732
13733 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_eq_8_strided_a) {
13734 TEST_REQUIRES_X86_XOP;
13735 GemmMicrokernelTester()
13736 .mr(3)
13737 .nr(4)
13738 .kr(2)
13739 .sr(1)
13740 .m(3)
13741 .n(4)
13742 .k(8)
13743 .a_stride(11)
13744 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
13745 }
13746
13747 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_eq_8_subtile) {
13748 TEST_REQUIRES_X86_XOP;
13749 for (uint32_t m = 1; m <= 3; m++) {
13750 for (uint32_t n = 1; n <= 4; n++) {
13751 GemmMicrokernelTester()
13752 .mr(3)
13753 .nr(4)
13754 .kr(2)
13755 .sr(1)
13756 .m(m)
13757 .n(n)
13758 .k(8)
13759 .iterations(1)
13760 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
13761 }
13762 }
13763 }
13764
13765 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_eq_8_subtile_m) {
13766 TEST_REQUIRES_X86_XOP;
13767 for (uint32_t m = 1; m <= 3; m++) {
13768 GemmMicrokernelTester()
13769 .mr(3)
13770 .nr(4)
13771 .kr(2)
13772 .sr(1)
13773 .m(m)
13774 .n(4)
13775 .k(8)
13776 .iterations(1)
13777 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
13778 }
13779 }
13780
13781 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_eq_8_subtile_n) {
13782 TEST_REQUIRES_X86_XOP;
13783 for (uint32_t n = 1; n <= 4; n++) {
13784 GemmMicrokernelTester()
13785 .mr(3)
13786 .nr(4)
13787 .kr(2)
13788 .sr(1)
13789 .m(3)
13790 .n(n)
13791 .k(8)
13792 .iterations(1)
13793 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
13794 }
13795 }
13796
13797 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_lt_8) {
13798 TEST_REQUIRES_X86_XOP;
13799 for (size_t k = 1; k < 8; k++) {
13800 GemmMicrokernelTester()
13801 .mr(3)
13802 .nr(4)
13803 .kr(2)
13804 .sr(1)
13805 .m(3)
13806 .n(4)
13807 .k(k)
13808 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
13809 }
13810 }
13811
13812 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_lt_8_strided_a) {
13813 TEST_REQUIRES_X86_XOP;
13814 for (size_t k = 1; k < 8; k++) {
13815 GemmMicrokernelTester()
13816 .mr(3)
13817 .nr(4)
13818 .kr(2)
13819 .sr(1)
13820 .m(3)
13821 .n(4)
13822 .k(k)
13823 .a_stride(11)
13824 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
13825 }
13826 }
13827
13828 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_lt_8_subtile) {
13829 TEST_REQUIRES_X86_XOP;
13830 for (size_t k = 1; k < 8; k++) {
13831 for (uint32_t m = 1; m <= 3; m++) {
13832 for (uint32_t n = 1; n <= 4; n++) {
13833 GemmMicrokernelTester()
13834 .mr(3)
13835 .nr(4)
13836 .kr(2)
13837 .sr(1)
13838 .m(m)
13839 .n(n)
13840 .k(k)
13841 .iterations(1)
13842 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
13843 }
13844 }
13845 }
13846 }
13847
13848 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_gt_8) {
13849 TEST_REQUIRES_X86_XOP;
13850 for (size_t k = 9; k < 16; k++) {
13851 GemmMicrokernelTester()
13852 .mr(3)
13853 .nr(4)
13854 .kr(2)
13855 .sr(1)
13856 .m(3)
13857 .n(4)
13858 .k(k)
13859 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
13860 }
13861 }
13862
13863 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_gt_8_strided_a) {
13864 TEST_REQUIRES_X86_XOP;
13865 for (size_t k = 9; k < 16; k++) {
13866 GemmMicrokernelTester()
13867 .mr(3)
13868 .nr(4)
13869 .kr(2)
13870 .sr(1)
13871 .m(3)
13872 .n(4)
13873 .k(k)
13874 .a_stride(19)
13875 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
13876 }
13877 }
13878
13879 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_gt_8_subtile) {
13880 TEST_REQUIRES_X86_XOP;
13881 for (size_t k = 9; k < 16; k++) {
13882 for (uint32_t m = 1; m <= 3; m++) {
13883 for (uint32_t n = 1; n <= 4; n++) {
13884 GemmMicrokernelTester()
13885 .mr(3)
13886 .nr(4)
13887 .kr(2)
13888 .sr(1)
13889 .m(m)
13890 .n(n)
13891 .k(k)
13892 .iterations(1)
13893 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
13894 }
13895 }
13896 }
13897 }
13898
13899 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_div_8) {
13900 TEST_REQUIRES_X86_XOP;
13901 for (size_t k = 16; k <= 80; k += 8) {
13902 GemmMicrokernelTester()
13903 .mr(3)
13904 .nr(4)
13905 .kr(2)
13906 .sr(1)
13907 .m(3)
13908 .n(4)
13909 .k(k)
13910 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
13911 }
13912 }
13913
13914 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_div_8_strided_a) {
13915 TEST_REQUIRES_X86_XOP;
13916 for (size_t k = 16; k <= 80; k += 8) {
13917 GemmMicrokernelTester()
13918 .mr(3)
13919 .nr(4)
13920 .kr(2)
13921 .sr(1)
13922 .m(3)
13923 .n(4)
13924 .k(k)
13925 .a_stride(83)
13926 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
13927 }
13928 }
13929
13930 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_div_8_subtile) {
13931 TEST_REQUIRES_X86_XOP;
13932 for (size_t k = 16; k <= 80; k += 8) {
13933 for (uint32_t m = 1; m <= 3; m++) {
13934 for (uint32_t n = 1; n <= 4; n++) {
13935 GemmMicrokernelTester()
13936 .mr(3)
13937 .nr(4)
13938 .kr(2)
13939 .sr(1)
13940 .m(m)
13941 .n(n)
13942 .k(k)
13943 .iterations(1)
13944 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
13945 }
13946 }
13947 }
13948 }
13949
13950 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, n_gt_4) {
13951 TEST_REQUIRES_X86_XOP;
13952 for (uint32_t n = 5; n < 8; n++) {
13953 for (size_t k = 1; k <= 40; k += 9) {
13954 GemmMicrokernelTester()
13955 .mr(3)
13956 .nr(4)
13957 .kr(2)
13958 .sr(1)
13959 .m(3)
13960 .n(4)
13961 .k(k)
13962 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
13963 }
13964 }
13965 }
13966
13967 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, n_gt_4_strided_cn) {
13968 TEST_REQUIRES_X86_XOP;
13969 for (uint32_t n = 5; n < 8; n++) {
13970 for (size_t k = 1; k <= 40; k += 9) {
13971 GemmMicrokernelTester()
13972 .mr(3)
13973 .nr(4)
13974 .kr(2)
13975 .sr(1)
13976 .m(3)
13977 .n(4)
13978 .k(k)
13979 .cn_stride(7)
13980 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
13981 }
13982 }
13983 }
13984
13985 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, n_gt_4_strided_a) {
13986 TEST_REQUIRES_X86_XOP;
13987 for (uint32_t n = 5; n < 8; n++) {
13988 for (size_t k = 1; k <= 40; k += 9) {
13989 GemmMicrokernelTester()
13990 .mr(3)
13991 .nr(4)
13992 .kr(2)
13993 .sr(1)
13994 .m(3)
13995 .n(n)
13996 .k(k)
13997 .a_stride(43)
13998 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
13999 }
14000 }
14001 }
14002
14003 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, n_gt_4_subtile) {
14004 TEST_REQUIRES_X86_XOP;
14005 for (uint32_t n = 5; n < 8; n++) {
14006 for (size_t k = 1; k <= 40; k += 9) {
14007 for (uint32_t m = 1; m <= 3; m++) {
14008 GemmMicrokernelTester()
14009 .mr(3)
14010 .nr(4)
14011 .kr(2)
14012 .sr(1)
14013 .m(m)
14014 .n(n)
14015 .k(k)
14016 .iterations(1)
14017 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
14018 }
14019 }
14020 }
14021 }
14022
14023 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, n_div_4) {
14024 TEST_REQUIRES_X86_XOP;
14025 for (uint32_t n = 8; n <= 12; n += 4) {
14026 for (size_t k = 1; k <= 40; k += 9) {
14027 GemmMicrokernelTester()
14028 .mr(3)
14029 .nr(4)
14030 .kr(2)
14031 .sr(1)
14032 .m(3)
14033 .n(4)
14034 .k(k)
14035 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
14036 }
14037 }
14038 }
14039
14040 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, n_div_4_strided_cn) {
14041 TEST_REQUIRES_X86_XOP;
14042 for (uint32_t n = 8; n <= 12; n += 4) {
14043 for (size_t k = 1; k <= 40; k += 9) {
14044 GemmMicrokernelTester()
14045 .mr(3)
14046 .nr(4)
14047 .kr(2)
14048 .sr(1)
14049 .m(3)
14050 .n(n)
14051 .k(k)
14052 .cn_stride(7)
14053 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
14054 }
14055 }
14056 }
14057
14058 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, n_div_4_strided_a) {
14059 TEST_REQUIRES_X86_XOP;
14060 for (uint32_t n = 8; n <= 12; n += 4) {
14061 for (size_t k = 1; k <= 40; k += 9) {
14062 GemmMicrokernelTester()
14063 .mr(3)
14064 .nr(4)
14065 .kr(2)
14066 .sr(1)
14067 .m(3)
14068 .n(n)
14069 .k(k)
14070 .a_stride(43)
14071 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
14072 }
14073 }
14074 }
14075
14076 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, n_div_4_subtile) {
14077 TEST_REQUIRES_X86_XOP;
14078 for (uint32_t n = 8; n <= 12; n += 4) {
14079 for (size_t k = 1; k <= 40; k += 9) {
14080 for (uint32_t m = 1; m <= 3; m++) {
14081 GemmMicrokernelTester()
14082 .mr(3)
14083 .nr(4)
14084 .kr(2)
14085 .sr(1)
14086 .m(m)
14087 .n(n)
14088 .k(k)
14089 .iterations(1)
14090 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
14091 }
14092 }
14093 }
14094 }
14095
14096 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, strided_cm_subtile) {
14097 TEST_REQUIRES_X86_XOP;
14098 for (size_t k = 1; k <= 40; k += 9) {
14099 for (uint32_t m = 1; m <= 3; m++) {
14100 for (uint32_t n = 1; n <= 4; n++) {
14101 GemmMicrokernelTester()
14102 .mr(3)
14103 .nr(4)
14104 .kr(2)
14105 .sr(1)
14106 .m(m)
14107 .n(n)
14108 .k(k)
14109 .cm_stride(7)
14110 .iterations(1)
14111 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
14112 }
14113 }
14114 }
14115 }
14116
14117 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, qmin) {
14118 TEST_REQUIRES_X86_XOP;
14119 GemmMicrokernelTester()
14120 .mr(3)
14121 .nr(4)
14122 .kr(2)
14123 .sr(1)
14124 .m(3)
14125 .n(4)
14126 .k(8)
14127 .qmin(128)
14128 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
14129 }
14130
14131 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, qmax) {
14132 TEST_REQUIRES_X86_XOP;
14133 GemmMicrokernelTester()
14134 .mr(3)
14135 .nr(4)
14136 .kr(2)
14137 .sr(1)
14138 .m(3)
14139 .n(4)
14140 .k(8)
14141 .qmax(128)
14142 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
14143 }
14144
14145 TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, strided_cm) {
14146 TEST_REQUIRES_X86_XOP;
14147 GemmMicrokernelTester()
14148 .mr(3)
14149 .nr(4)
14150 .kr(2)
14151 .sr(1)
14152 .m(3)
14153 .n(4)
14154 .k(8)
14155 .cm_stride(7)
14156 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
14157 }
14158#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
14159
14160
14161#if XNN_ARCH_X86 || XNN_ARCH_X86_64
14162 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_eq_8) {
14163 TEST_REQUIRES_X86_XOP;
14164 GemmMicrokernelTester()
14165 .mr(4)
14166 .nr(4)
14167 .kr(2)
14168 .sr(1)
14169 .m(4)
14170 .n(4)
14171 .k(8)
14172 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
14173 }
14174
14175 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, strided_cn) {
14176 TEST_REQUIRES_X86_XOP;
14177 GemmMicrokernelTester()
14178 .mr(4)
14179 .nr(4)
14180 .kr(2)
14181 .sr(1)
14182 .m(4)
14183 .n(4)
14184 .k(8)
14185 .cn_stride(7)
14186 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
14187 }
14188
14189 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_eq_8_strided_a) {
14190 TEST_REQUIRES_X86_XOP;
14191 GemmMicrokernelTester()
14192 .mr(4)
14193 .nr(4)
14194 .kr(2)
14195 .sr(1)
14196 .m(4)
14197 .n(4)
14198 .k(8)
14199 .a_stride(11)
14200 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
14201 }
14202
14203 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_eq_8_subtile) {
14204 TEST_REQUIRES_X86_XOP;
14205 for (uint32_t m = 1; m <= 4; m++) {
14206 for (uint32_t n = 1; n <= 4; n++) {
14207 GemmMicrokernelTester()
14208 .mr(4)
14209 .nr(4)
14210 .kr(2)
14211 .sr(1)
14212 .m(m)
14213 .n(n)
14214 .k(8)
14215 .iterations(1)
14216 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
14217 }
14218 }
14219 }
14220
14221 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_eq_8_subtile_m) {
14222 TEST_REQUIRES_X86_XOP;
14223 for (uint32_t m = 1; m <= 4; m++) {
14224 GemmMicrokernelTester()
14225 .mr(4)
14226 .nr(4)
14227 .kr(2)
14228 .sr(1)
14229 .m(m)
14230 .n(4)
14231 .k(8)
14232 .iterations(1)
14233 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
14234 }
14235 }
14236
14237 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_eq_8_subtile_n) {
14238 TEST_REQUIRES_X86_XOP;
14239 for (uint32_t n = 1; n <= 4; n++) {
14240 GemmMicrokernelTester()
14241 .mr(4)
14242 .nr(4)
14243 .kr(2)
14244 .sr(1)
14245 .m(4)
14246 .n(n)
14247 .k(8)
14248 .iterations(1)
14249 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
14250 }
14251 }
14252
14253 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_lt_8) {
14254 TEST_REQUIRES_X86_XOP;
14255 for (size_t k = 1; k < 8; k++) {
14256 GemmMicrokernelTester()
14257 .mr(4)
14258 .nr(4)
14259 .kr(2)
14260 .sr(1)
14261 .m(4)
14262 .n(4)
14263 .k(k)
14264 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
14265 }
14266 }
14267
14268 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_lt_8_strided_a) {
14269 TEST_REQUIRES_X86_XOP;
14270 for (size_t k = 1; k < 8; k++) {
14271 GemmMicrokernelTester()
14272 .mr(4)
14273 .nr(4)
14274 .kr(2)
14275 .sr(1)
14276 .m(4)
14277 .n(4)
14278 .k(k)
14279 .a_stride(11)
14280 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
14281 }
14282 }
14283
14284 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_lt_8_subtile) {
14285 TEST_REQUIRES_X86_XOP;
14286 for (size_t k = 1; k < 8; k++) {
14287 for (uint32_t m = 1; m <= 4; m++) {
14288 for (uint32_t n = 1; n <= 4; n++) {
14289 GemmMicrokernelTester()
14290 .mr(4)
14291 .nr(4)
14292 .kr(2)
14293 .sr(1)
14294 .m(m)
14295 .n(n)
14296 .k(k)
14297 .iterations(1)
14298 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
14299 }
14300 }
14301 }
14302 }
14303
14304 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_gt_8) {
14305 TEST_REQUIRES_X86_XOP;
14306 for (size_t k = 9; k < 16; k++) {
14307 GemmMicrokernelTester()
14308 .mr(4)
14309 .nr(4)
14310 .kr(2)
14311 .sr(1)
14312 .m(4)
14313 .n(4)
14314 .k(k)
14315 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
14316 }
14317 }
14318
14319 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_gt_8_strided_a) {
14320 TEST_REQUIRES_X86_XOP;
14321 for (size_t k = 9; k < 16; k++) {
14322 GemmMicrokernelTester()
14323 .mr(4)
14324 .nr(4)
14325 .kr(2)
14326 .sr(1)
14327 .m(4)
14328 .n(4)
14329 .k(k)
14330 .a_stride(19)
14331 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
14332 }
14333 }
14334
14335 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_gt_8_subtile) {
14336 TEST_REQUIRES_X86_XOP;
14337 for (size_t k = 9; k < 16; k++) {
14338 for (uint32_t m = 1; m <= 4; m++) {
14339 for (uint32_t n = 1; n <= 4; n++) {
14340 GemmMicrokernelTester()
14341 .mr(4)
14342 .nr(4)
14343 .kr(2)
14344 .sr(1)
14345 .m(m)
14346 .n(n)
14347 .k(k)
14348 .iterations(1)
14349 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
14350 }
14351 }
14352 }
14353 }
14354
14355 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_div_8) {
14356 TEST_REQUIRES_X86_XOP;
14357 for (size_t k = 16; k <= 80; k += 8) {
14358 GemmMicrokernelTester()
14359 .mr(4)
14360 .nr(4)
14361 .kr(2)
14362 .sr(1)
14363 .m(4)
14364 .n(4)
14365 .k(k)
14366 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
14367 }
14368 }
14369
14370 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_div_8_strided_a) {
14371 TEST_REQUIRES_X86_XOP;
14372 for (size_t k = 16; k <= 80; k += 8) {
14373 GemmMicrokernelTester()
14374 .mr(4)
14375 .nr(4)
14376 .kr(2)
14377 .sr(1)
14378 .m(4)
14379 .n(4)
14380 .k(k)
14381 .a_stride(83)
14382 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
14383 }
14384 }
14385
14386 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_div_8_subtile) {
14387 TEST_REQUIRES_X86_XOP;
14388 for (size_t k = 16; k <= 80; k += 8) {
14389 for (uint32_t m = 1; m <= 4; m++) {
14390 for (uint32_t n = 1; n <= 4; n++) {
14391 GemmMicrokernelTester()
14392 .mr(4)
14393 .nr(4)
14394 .kr(2)
14395 .sr(1)
14396 .m(m)
14397 .n(n)
14398 .k(k)
14399 .iterations(1)
14400 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
14401 }
14402 }
14403 }
14404 }
14405
14406 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, n_gt_4) {
14407 TEST_REQUIRES_X86_XOP;
14408 for (uint32_t n = 5; n < 8; n++) {
14409 for (size_t k = 1; k <= 40; k += 9) {
14410 GemmMicrokernelTester()
14411 .mr(4)
14412 .nr(4)
14413 .kr(2)
14414 .sr(1)
14415 .m(4)
14416 .n(4)
14417 .k(k)
14418 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
14419 }
14420 }
14421 }
14422
14423 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, n_gt_4_strided_cn) {
14424 TEST_REQUIRES_X86_XOP;
14425 for (uint32_t n = 5; n < 8; n++) {
14426 for (size_t k = 1; k <= 40; k += 9) {
14427 GemmMicrokernelTester()
14428 .mr(4)
14429 .nr(4)
14430 .kr(2)
14431 .sr(1)
14432 .m(4)
14433 .n(4)
14434 .k(k)
14435 .cn_stride(7)
14436 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
14437 }
14438 }
14439 }
14440
14441 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, n_gt_4_strided_a) {
14442 TEST_REQUIRES_X86_XOP;
14443 for (uint32_t n = 5; n < 8; n++) {
14444 for (size_t k = 1; k <= 40; k += 9) {
14445 GemmMicrokernelTester()
14446 .mr(4)
14447 .nr(4)
14448 .kr(2)
14449 .sr(1)
14450 .m(4)
14451 .n(n)
14452 .k(k)
14453 .a_stride(43)
14454 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
14455 }
14456 }
14457 }
14458
14459 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, n_gt_4_subtile) {
14460 TEST_REQUIRES_X86_XOP;
14461 for (uint32_t n = 5; n < 8; n++) {
14462 for (size_t k = 1; k <= 40; k += 9) {
14463 for (uint32_t m = 1; m <= 4; m++) {
14464 GemmMicrokernelTester()
14465 .mr(4)
14466 .nr(4)
14467 .kr(2)
14468 .sr(1)
14469 .m(m)
14470 .n(n)
14471 .k(k)
14472 .iterations(1)
14473 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
14474 }
14475 }
14476 }
14477 }
14478
14479 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, n_div_4) {
14480 TEST_REQUIRES_X86_XOP;
14481 for (uint32_t n = 8; n <= 12; n += 4) {
14482 for (size_t k = 1; k <= 40; k += 9) {
14483 GemmMicrokernelTester()
14484 .mr(4)
14485 .nr(4)
14486 .kr(2)
14487 .sr(1)
14488 .m(4)
14489 .n(4)
14490 .k(k)
14491 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
14492 }
14493 }
14494 }
14495
14496 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, n_div_4_strided_cn) {
14497 TEST_REQUIRES_X86_XOP;
14498 for (uint32_t n = 8; n <= 12; n += 4) {
14499 for (size_t k = 1; k <= 40; k += 9) {
14500 GemmMicrokernelTester()
14501 .mr(4)
14502 .nr(4)
14503 .kr(2)
14504 .sr(1)
14505 .m(4)
14506 .n(n)
14507 .k(k)
14508 .cn_stride(7)
14509 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
14510 }
14511 }
14512 }
14513
14514 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, n_div_4_strided_a) {
14515 TEST_REQUIRES_X86_XOP;
14516 for (uint32_t n = 8; n <= 12; n += 4) {
14517 for (size_t k = 1; k <= 40; k += 9) {
14518 GemmMicrokernelTester()
14519 .mr(4)
14520 .nr(4)
14521 .kr(2)
14522 .sr(1)
14523 .m(4)
14524 .n(n)
14525 .k(k)
14526 .a_stride(43)
14527 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
14528 }
14529 }
14530 }
14531
14532 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, n_div_4_subtile) {
14533 TEST_REQUIRES_X86_XOP;
14534 for (uint32_t n = 8; n <= 12; n += 4) {
14535 for (size_t k = 1; k <= 40; k += 9) {
14536 for (uint32_t m = 1; m <= 4; m++) {
14537 GemmMicrokernelTester()
14538 .mr(4)
14539 .nr(4)
14540 .kr(2)
14541 .sr(1)
14542 .m(m)
14543 .n(n)
14544 .k(k)
14545 .iterations(1)
14546 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
14547 }
14548 }
14549 }
14550 }
14551
14552 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, strided_cm_subtile) {
14553 TEST_REQUIRES_X86_XOP;
14554 for (size_t k = 1; k <= 40; k += 9) {
14555 for (uint32_t m = 1; m <= 4; m++) {
14556 for (uint32_t n = 1; n <= 4; n++) {
14557 GemmMicrokernelTester()
14558 .mr(4)
14559 .nr(4)
14560 .kr(2)
14561 .sr(1)
14562 .m(m)
14563 .n(n)
14564 .k(k)
14565 .cm_stride(7)
14566 .iterations(1)
14567 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
14568 }
14569 }
14570 }
14571 }
14572
14573 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, qmin) {
14574 TEST_REQUIRES_X86_XOP;
14575 GemmMicrokernelTester()
14576 .mr(4)
14577 .nr(4)
14578 .kr(2)
14579 .sr(1)
14580 .m(4)
14581 .n(4)
14582 .k(8)
14583 .qmin(128)
14584 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
14585 }
14586
14587 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, qmax) {
14588 TEST_REQUIRES_X86_XOP;
14589 GemmMicrokernelTester()
14590 .mr(4)
14591 .nr(4)
14592 .kr(2)
14593 .sr(1)
14594 .m(4)
14595 .n(4)
14596 .k(8)
14597 .qmax(128)
14598 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
14599 }
14600
14601 TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, strided_cm) {
14602 TEST_REQUIRES_X86_XOP;
14603 GemmMicrokernelTester()
14604 .mr(4)
14605 .nr(4)
14606 .kr(2)
14607 .sr(1)
14608 .m(4)
14609 .n(4)
14610 .k(8)
14611 .cm_stride(7)
14612 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
14613 }
14614#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
14615
14616
14617#if XNN_ARCH_X86 || XNN_ARCH_X86_64
14618 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_eq_8) {
14619 TEST_REQUIRES_X86_SSE2;
14620 GemmMicrokernelTester()
14621 .mr(1)
14622 .nr(4)
14623 .kr(8)
14624 .sr(1)
14625 .m(1)
14626 .n(4)
14627 .k(8)
14628 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
14629 }
14630
14631 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, strided_cn) {
14632 TEST_REQUIRES_X86_SSE2;
14633 GemmMicrokernelTester()
14634 .mr(1)
14635 .nr(4)
14636 .kr(8)
14637 .sr(1)
14638 .m(1)
14639 .n(4)
14640 .k(8)
14641 .cn_stride(7)
14642 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
14643 }
14644
14645 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_eq_8_strided_a) {
14646 TEST_REQUIRES_X86_SSE2;
14647 GemmMicrokernelTester()
14648 .mr(1)
14649 .nr(4)
14650 .kr(8)
14651 .sr(1)
14652 .m(1)
14653 .n(4)
14654 .k(8)
14655 .a_stride(11)
14656 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
14657 }
14658
14659 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_eq_8_subtile) {
14660 TEST_REQUIRES_X86_SSE2;
14661 for (uint32_t m = 1; m <= 1; m++) {
14662 for (uint32_t n = 1; n <= 4; n++) {
14663 GemmMicrokernelTester()
14664 .mr(1)
14665 .nr(4)
14666 .kr(8)
14667 .sr(1)
14668 .m(m)
14669 .n(n)
14670 .k(8)
14671 .iterations(1)
14672 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
14673 }
14674 }
14675 }
14676
14677 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_eq_8_subtile_m) {
14678 TEST_REQUIRES_X86_SSE2;
14679 for (uint32_t m = 1; m <= 1; m++) {
14680 GemmMicrokernelTester()
14681 .mr(1)
14682 .nr(4)
14683 .kr(8)
14684 .sr(1)
14685 .m(m)
14686 .n(4)
14687 .k(8)
14688 .iterations(1)
14689 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
14690 }
14691 }
14692
14693 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_eq_8_subtile_n) {
14694 TEST_REQUIRES_X86_SSE2;
14695 for (uint32_t n = 1; n <= 4; n++) {
14696 GemmMicrokernelTester()
14697 .mr(1)
14698 .nr(4)
14699 .kr(8)
14700 .sr(1)
14701 .m(1)
14702 .n(n)
14703 .k(8)
14704 .iterations(1)
14705 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
14706 }
14707 }
14708
14709 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_lt_8) {
14710 TEST_REQUIRES_X86_SSE2;
14711 for (size_t k = 1; k < 8; k++) {
14712 GemmMicrokernelTester()
14713 .mr(1)
14714 .nr(4)
14715 .kr(8)
14716 .sr(1)
14717 .m(1)
14718 .n(4)
14719 .k(k)
14720 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
14721 }
14722 }
14723
14724 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_lt_8_strided_a) {
14725 TEST_REQUIRES_X86_SSE2;
14726 for (size_t k = 1; k < 8; k++) {
14727 GemmMicrokernelTester()
14728 .mr(1)
14729 .nr(4)
14730 .kr(8)
14731 .sr(1)
14732 .m(1)
14733 .n(4)
14734 .k(k)
14735 .a_stride(11)
14736 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
14737 }
14738 }
14739
14740 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_lt_8_subtile) {
14741 TEST_REQUIRES_X86_SSE2;
14742 for (size_t k = 1; k < 8; k++) {
14743 for (uint32_t m = 1; m <= 1; m++) {
14744 for (uint32_t n = 1; n <= 4; n++) {
14745 GemmMicrokernelTester()
14746 .mr(1)
14747 .nr(4)
14748 .kr(8)
14749 .sr(1)
14750 .m(m)
14751 .n(n)
14752 .k(k)
14753 .iterations(1)
14754 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
14755 }
14756 }
14757 }
14758 }
14759
14760 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_gt_8) {
14761 TEST_REQUIRES_X86_SSE2;
14762 for (size_t k = 9; k < 16; k++) {
14763 GemmMicrokernelTester()
14764 .mr(1)
14765 .nr(4)
14766 .kr(8)
14767 .sr(1)
14768 .m(1)
14769 .n(4)
14770 .k(k)
14771 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
14772 }
14773 }
14774
14775 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_gt_8_strided_a) {
14776 TEST_REQUIRES_X86_SSE2;
14777 for (size_t k = 9; k < 16; k++) {
14778 GemmMicrokernelTester()
14779 .mr(1)
14780 .nr(4)
14781 .kr(8)
14782 .sr(1)
14783 .m(1)
14784 .n(4)
14785 .k(k)
14786 .a_stride(19)
14787 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
14788 }
14789 }
14790
14791 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_gt_8_subtile) {
14792 TEST_REQUIRES_X86_SSE2;
14793 for (size_t k = 9; k < 16; k++) {
14794 for (uint32_t m = 1; m <= 1; m++) {
14795 for (uint32_t n = 1; n <= 4; n++) {
14796 GemmMicrokernelTester()
14797 .mr(1)
14798 .nr(4)
14799 .kr(8)
14800 .sr(1)
14801 .m(m)
14802 .n(n)
14803 .k(k)
14804 .iterations(1)
14805 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
14806 }
14807 }
14808 }
14809 }
14810
14811 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_div_8) {
14812 TEST_REQUIRES_X86_SSE2;
14813 for (size_t k = 16; k <= 80; k += 8) {
14814 GemmMicrokernelTester()
14815 .mr(1)
14816 .nr(4)
14817 .kr(8)
14818 .sr(1)
14819 .m(1)
14820 .n(4)
14821 .k(k)
14822 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
14823 }
14824 }
14825
14826 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_div_8_strided_a) {
14827 TEST_REQUIRES_X86_SSE2;
14828 for (size_t k = 16; k <= 80; k += 8) {
14829 GemmMicrokernelTester()
14830 .mr(1)
14831 .nr(4)
14832 .kr(8)
14833 .sr(1)
14834 .m(1)
14835 .n(4)
14836 .k(k)
14837 .a_stride(83)
14838 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
14839 }
14840 }
14841
14842 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_div_8_subtile) {
14843 TEST_REQUIRES_X86_SSE2;
14844 for (size_t k = 16; k <= 80; k += 8) {
14845 for (uint32_t m = 1; m <= 1; m++) {
14846 for (uint32_t n = 1; n <= 4; n++) {
14847 GemmMicrokernelTester()
14848 .mr(1)
14849 .nr(4)
14850 .kr(8)
14851 .sr(1)
14852 .m(m)
14853 .n(n)
14854 .k(k)
14855 .iterations(1)
14856 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
14857 }
14858 }
14859 }
14860 }
14861
14862 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, n_gt_4) {
14863 TEST_REQUIRES_X86_SSE2;
14864 for (uint32_t n = 5; n < 8; n++) {
14865 for (size_t k = 1; k <= 40; k += 9) {
14866 GemmMicrokernelTester()
14867 .mr(1)
14868 .nr(4)
14869 .kr(8)
14870 .sr(1)
14871 .m(1)
14872 .n(4)
14873 .k(k)
14874 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
14875 }
14876 }
14877 }
14878
14879 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, n_gt_4_strided_cn) {
14880 TEST_REQUIRES_X86_SSE2;
14881 for (uint32_t n = 5; n < 8; n++) {
14882 for (size_t k = 1; k <= 40; k += 9) {
14883 GemmMicrokernelTester()
14884 .mr(1)
14885 .nr(4)
14886 .kr(8)
14887 .sr(1)
14888 .m(1)
14889 .n(4)
14890 .k(k)
14891 .cn_stride(7)
14892 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
14893 }
14894 }
14895 }
14896
14897 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, n_gt_4_strided_a) {
14898 TEST_REQUIRES_X86_SSE2;
14899 for (uint32_t n = 5; n < 8; n++) {
14900 for (size_t k = 1; k <= 40; k += 9) {
14901 GemmMicrokernelTester()
14902 .mr(1)
14903 .nr(4)
14904 .kr(8)
14905 .sr(1)
14906 .m(1)
14907 .n(n)
14908 .k(k)
14909 .a_stride(43)
14910 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
14911 }
14912 }
14913 }
14914
14915 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, n_gt_4_subtile) {
14916 TEST_REQUIRES_X86_SSE2;
14917 for (uint32_t n = 5; n < 8; n++) {
14918 for (size_t k = 1; k <= 40; k += 9) {
14919 for (uint32_t m = 1; m <= 1; m++) {
14920 GemmMicrokernelTester()
14921 .mr(1)
14922 .nr(4)
14923 .kr(8)
14924 .sr(1)
14925 .m(m)
14926 .n(n)
14927 .k(k)
14928 .iterations(1)
14929 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
14930 }
14931 }
14932 }
14933 }
14934
14935 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, n_div_4) {
14936 TEST_REQUIRES_X86_SSE2;
14937 for (uint32_t n = 8; n <= 12; n += 4) {
14938 for (size_t k = 1; k <= 40; k += 9) {
14939 GemmMicrokernelTester()
14940 .mr(1)
14941 .nr(4)
14942 .kr(8)
14943 .sr(1)
14944 .m(1)
14945 .n(4)
14946 .k(k)
14947 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
14948 }
14949 }
14950 }
14951
14952 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, n_div_4_strided_cn) {
14953 TEST_REQUIRES_X86_SSE2;
14954 for (uint32_t n = 8; n <= 12; n += 4) {
14955 for (size_t k = 1; k <= 40; k += 9) {
14956 GemmMicrokernelTester()
14957 .mr(1)
14958 .nr(4)
14959 .kr(8)
14960 .sr(1)
14961 .m(1)
14962 .n(n)
14963 .k(k)
14964 .cn_stride(7)
14965 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
14966 }
14967 }
14968 }
14969
14970 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, n_div_4_strided_a) {
14971 TEST_REQUIRES_X86_SSE2;
14972 for (uint32_t n = 8; n <= 12; n += 4) {
14973 for (size_t k = 1; k <= 40; k += 9) {
14974 GemmMicrokernelTester()
14975 .mr(1)
14976 .nr(4)
14977 .kr(8)
14978 .sr(1)
14979 .m(1)
14980 .n(n)
14981 .k(k)
14982 .a_stride(43)
14983 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
14984 }
14985 }
14986 }
14987
14988 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, n_div_4_subtile) {
14989 TEST_REQUIRES_X86_SSE2;
14990 for (uint32_t n = 8; n <= 12; n += 4) {
14991 for (size_t k = 1; k <= 40; k += 9) {
14992 for (uint32_t m = 1; m <= 1; m++) {
14993 GemmMicrokernelTester()
14994 .mr(1)
14995 .nr(4)
14996 .kr(8)
14997 .sr(1)
14998 .m(m)
14999 .n(n)
15000 .k(k)
15001 .iterations(1)
15002 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
15003 }
15004 }
15005 }
15006 }
15007
15008 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, strided_cm_subtile) {
15009 TEST_REQUIRES_X86_SSE2;
15010 for (size_t k = 1; k <= 40; k += 9) {
15011 for (uint32_t m = 1; m <= 1; m++) {
15012 for (uint32_t n = 1; n <= 4; n++) {
15013 GemmMicrokernelTester()
15014 .mr(1)
15015 .nr(4)
15016 .kr(8)
15017 .sr(1)
15018 .m(m)
15019 .n(n)
15020 .k(k)
15021 .cm_stride(7)
15022 .iterations(1)
15023 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
15024 }
15025 }
15026 }
15027 }
15028
15029 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, qmin) {
15030 TEST_REQUIRES_X86_SSE2;
15031 GemmMicrokernelTester()
15032 .mr(1)
15033 .nr(4)
15034 .kr(8)
15035 .sr(1)
15036 .m(1)
15037 .n(4)
15038 .k(8)
15039 .qmin(128)
15040 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
15041 }
15042
15043 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, qmax) {
15044 TEST_REQUIRES_X86_SSE2;
15045 GemmMicrokernelTester()
15046 .mr(1)
15047 .nr(4)
15048 .kr(8)
15049 .sr(1)
15050 .m(1)
15051 .n(4)
15052 .k(8)
15053 .qmax(128)
15054 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
15055 }
15056
15057 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, strided_cm) {
15058 TEST_REQUIRES_X86_SSE2;
15059 GemmMicrokernelTester()
15060 .mr(1)
15061 .nr(4)
15062 .kr(8)
15063 .sr(1)
15064 .m(1)
15065 .n(4)
15066 .k(8)
15067 .cm_stride(7)
15068 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
15069 }
15070#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
15071
15072
15073#if XNN_ARCH_X86 || XNN_ARCH_X86_64
15074 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_eq_8) {
15075 TEST_REQUIRES_X86_SSE2;
15076 GemmMicrokernelTester()
15077 .mr(2)
15078 .nr(4)
15079 .kr(8)
15080 .sr(1)
15081 .m(2)
15082 .n(4)
15083 .k(8)
15084 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
15085 }
15086
15087 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, strided_cn) {
15088 TEST_REQUIRES_X86_SSE2;
15089 GemmMicrokernelTester()
15090 .mr(2)
15091 .nr(4)
15092 .kr(8)
15093 .sr(1)
15094 .m(2)
15095 .n(4)
15096 .k(8)
15097 .cn_stride(7)
15098 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
15099 }
15100
15101 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_eq_8_strided_a) {
15102 TEST_REQUIRES_X86_SSE2;
15103 GemmMicrokernelTester()
15104 .mr(2)
15105 .nr(4)
15106 .kr(8)
15107 .sr(1)
15108 .m(2)
15109 .n(4)
15110 .k(8)
15111 .a_stride(11)
15112 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
15113 }
15114
15115 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_eq_8_subtile) {
15116 TEST_REQUIRES_X86_SSE2;
15117 for (uint32_t m = 1; m <= 2; m++) {
15118 for (uint32_t n = 1; n <= 4; n++) {
15119 GemmMicrokernelTester()
15120 .mr(2)
15121 .nr(4)
15122 .kr(8)
15123 .sr(1)
15124 .m(m)
15125 .n(n)
15126 .k(8)
15127 .iterations(1)
15128 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
15129 }
15130 }
15131 }
15132
15133 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_eq_8_subtile_m) {
15134 TEST_REQUIRES_X86_SSE2;
15135 for (uint32_t m = 1; m <= 2; m++) {
15136 GemmMicrokernelTester()
15137 .mr(2)
15138 .nr(4)
15139 .kr(8)
15140 .sr(1)
15141 .m(m)
15142 .n(4)
15143 .k(8)
15144 .iterations(1)
15145 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
15146 }
15147 }
15148
15149 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_eq_8_subtile_n) {
15150 TEST_REQUIRES_X86_SSE2;
15151 for (uint32_t n = 1; n <= 4; n++) {
15152 GemmMicrokernelTester()
15153 .mr(2)
15154 .nr(4)
15155 .kr(8)
15156 .sr(1)
15157 .m(2)
15158 .n(n)
15159 .k(8)
15160 .iterations(1)
15161 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
15162 }
15163 }
15164
15165 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_lt_8) {
15166 TEST_REQUIRES_X86_SSE2;
15167 for (size_t k = 1; k < 8; k++) {
15168 GemmMicrokernelTester()
15169 .mr(2)
15170 .nr(4)
15171 .kr(8)
15172 .sr(1)
15173 .m(2)
15174 .n(4)
15175 .k(k)
15176 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
15177 }
15178 }
15179
15180 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_lt_8_strided_a) {
15181 TEST_REQUIRES_X86_SSE2;
15182 for (size_t k = 1; k < 8; k++) {
15183 GemmMicrokernelTester()
15184 .mr(2)
15185 .nr(4)
15186 .kr(8)
15187 .sr(1)
15188 .m(2)
15189 .n(4)
15190 .k(k)
15191 .a_stride(11)
15192 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
15193 }
15194 }
15195
15196 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_lt_8_subtile) {
15197 TEST_REQUIRES_X86_SSE2;
15198 for (size_t k = 1; k < 8; k++) {
15199 for (uint32_t m = 1; m <= 2; m++) {
15200 for (uint32_t n = 1; n <= 4; n++) {
15201 GemmMicrokernelTester()
15202 .mr(2)
15203 .nr(4)
15204 .kr(8)
15205 .sr(1)
15206 .m(m)
15207 .n(n)
15208 .k(k)
15209 .iterations(1)
15210 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
15211 }
15212 }
15213 }
15214 }
15215
15216 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_gt_8) {
15217 TEST_REQUIRES_X86_SSE2;
15218 for (size_t k = 9; k < 16; k++) {
15219 GemmMicrokernelTester()
15220 .mr(2)
15221 .nr(4)
15222 .kr(8)
15223 .sr(1)
15224 .m(2)
15225 .n(4)
15226 .k(k)
15227 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
15228 }
15229 }
15230
15231 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_gt_8_strided_a) {
15232 TEST_REQUIRES_X86_SSE2;
15233 for (size_t k = 9; k < 16; k++) {
15234 GemmMicrokernelTester()
15235 .mr(2)
15236 .nr(4)
15237 .kr(8)
15238 .sr(1)
15239 .m(2)
15240 .n(4)
15241 .k(k)
15242 .a_stride(19)
15243 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
15244 }
15245 }
15246
15247 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_gt_8_subtile) {
15248 TEST_REQUIRES_X86_SSE2;
15249 for (size_t k = 9; k < 16; k++) {
15250 for (uint32_t m = 1; m <= 2; m++) {
15251 for (uint32_t n = 1; n <= 4; n++) {
15252 GemmMicrokernelTester()
15253 .mr(2)
15254 .nr(4)
15255 .kr(8)
15256 .sr(1)
15257 .m(m)
15258 .n(n)
15259 .k(k)
15260 .iterations(1)
15261 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
15262 }
15263 }
15264 }
15265 }
15266
15267 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_div_8) {
15268 TEST_REQUIRES_X86_SSE2;
15269 for (size_t k = 16; k <= 80; k += 8) {
15270 GemmMicrokernelTester()
15271 .mr(2)
15272 .nr(4)
15273 .kr(8)
15274 .sr(1)
15275 .m(2)
15276 .n(4)
15277 .k(k)
15278 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
15279 }
15280 }
15281
15282 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_div_8_strided_a) {
15283 TEST_REQUIRES_X86_SSE2;
15284 for (size_t k = 16; k <= 80; k += 8) {
15285 GemmMicrokernelTester()
15286 .mr(2)
15287 .nr(4)
15288 .kr(8)
15289 .sr(1)
15290 .m(2)
15291 .n(4)
15292 .k(k)
15293 .a_stride(83)
15294 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
15295 }
15296 }
15297
15298 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_div_8_subtile) {
15299 TEST_REQUIRES_X86_SSE2;
15300 for (size_t k = 16; k <= 80; k += 8) {
15301 for (uint32_t m = 1; m <= 2; m++) {
15302 for (uint32_t n = 1; n <= 4; n++) {
15303 GemmMicrokernelTester()
15304 .mr(2)
15305 .nr(4)
15306 .kr(8)
15307 .sr(1)
15308 .m(m)
15309 .n(n)
15310 .k(k)
15311 .iterations(1)
15312 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
15313 }
15314 }
15315 }
15316 }
15317
15318 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, n_gt_4) {
15319 TEST_REQUIRES_X86_SSE2;
15320 for (uint32_t n = 5; n < 8; n++) {
15321 for (size_t k = 1; k <= 40; k += 9) {
15322 GemmMicrokernelTester()
15323 .mr(2)
15324 .nr(4)
15325 .kr(8)
15326 .sr(1)
15327 .m(2)
15328 .n(4)
15329 .k(k)
15330 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
15331 }
15332 }
15333 }
15334
15335 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, n_gt_4_strided_cn) {
15336 TEST_REQUIRES_X86_SSE2;
15337 for (uint32_t n = 5; n < 8; n++) {
15338 for (size_t k = 1; k <= 40; k += 9) {
15339 GemmMicrokernelTester()
15340 .mr(2)
15341 .nr(4)
15342 .kr(8)
15343 .sr(1)
15344 .m(2)
15345 .n(4)
15346 .k(k)
15347 .cn_stride(7)
15348 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
15349 }
15350 }
15351 }
15352
15353 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, n_gt_4_strided_a) {
15354 TEST_REQUIRES_X86_SSE2;
15355 for (uint32_t n = 5; n < 8; n++) {
15356 for (size_t k = 1; k <= 40; k += 9) {
15357 GemmMicrokernelTester()
15358 .mr(2)
15359 .nr(4)
15360 .kr(8)
15361 .sr(1)
15362 .m(2)
15363 .n(n)
15364 .k(k)
15365 .a_stride(43)
15366 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
15367 }
15368 }
15369 }
15370
15371 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, n_gt_4_subtile) {
15372 TEST_REQUIRES_X86_SSE2;
15373 for (uint32_t n = 5; n < 8; n++) {
15374 for (size_t k = 1; k <= 40; k += 9) {
15375 for (uint32_t m = 1; m <= 2; m++) {
15376 GemmMicrokernelTester()
15377 .mr(2)
15378 .nr(4)
15379 .kr(8)
15380 .sr(1)
15381 .m(m)
15382 .n(n)
15383 .k(k)
15384 .iterations(1)
15385 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
15386 }
15387 }
15388 }
15389 }
15390
15391 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, n_div_4) {
15392 TEST_REQUIRES_X86_SSE2;
15393 for (uint32_t n = 8; n <= 12; n += 4) {
15394 for (size_t k = 1; k <= 40; k += 9) {
15395 GemmMicrokernelTester()
15396 .mr(2)
15397 .nr(4)
15398 .kr(8)
15399 .sr(1)
15400 .m(2)
15401 .n(4)
15402 .k(k)
15403 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
15404 }
15405 }
15406 }
15407
15408 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, n_div_4_strided_cn) {
15409 TEST_REQUIRES_X86_SSE2;
15410 for (uint32_t n = 8; n <= 12; n += 4) {
15411 for (size_t k = 1; k <= 40; k += 9) {
15412 GemmMicrokernelTester()
15413 .mr(2)
15414 .nr(4)
15415 .kr(8)
15416 .sr(1)
15417 .m(2)
15418 .n(n)
15419 .k(k)
15420 .cn_stride(7)
15421 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
15422 }
15423 }
15424 }
15425
15426 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, n_div_4_strided_a) {
15427 TEST_REQUIRES_X86_SSE2;
15428 for (uint32_t n = 8; n <= 12; n += 4) {
15429 for (size_t k = 1; k <= 40; k += 9) {
15430 GemmMicrokernelTester()
15431 .mr(2)
15432 .nr(4)
15433 .kr(8)
15434 .sr(1)
15435 .m(2)
15436 .n(n)
15437 .k(k)
15438 .a_stride(43)
15439 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
15440 }
15441 }
15442 }
15443
15444 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, n_div_4_subtile) {
15445 TEST_REQUIRES_X86_SSE2;
15446 for (uint32_t n = 8; n <= 12; n += 4) {
15447 for (size_t k = 1; k <= 40; k += 9) {
15448 for (uint32_t m = 1; m <= 2; m++) {
15449 GemmMicrokernelTester()
15450 .mr(2)
15451 .nr(4)
15452 .kr(8)
15453 .sr(1)
15454 .m(m)
15455 .n(n)
15456 .k(k)
15457 .iterations(1)
15458 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
15459 }
15460 }
15461 }
15462 }
15463
15464 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, strided_cm_subtile) {
15465 TEST_REQUIRES_X86_SSE2;
15466 for (size_t k = 1; k <= 40; k += 9) {
15467 for (uint32_t m = 1; m <= 2; m++) {
15468 for (uint32_t n = 1; n <= 4; n++) {
15469 GemmMicrokernelTester()
15470 .mr(2)
15471 .nr(4)
15472 .kr(8)
15473 .sr(1)
15474 .m(m)
15475 .n(n)
15476 .k(k)
15477 .cm_stride(7)
15478 .iterations(1)
15479 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
15480 }
15481 }
15482 }
15483 }
15484
15485 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, qmin) {
15486 TEST_REQUIRES_X86_SSE2;
15487 GemmMicrokernelTester()
15488 .mr(2)
15489 .nr(4)
15490 .kr(8)
15491 .sr(1)
15492 .m(2)
15493 .n(4)
15494 .k(8)
15495 .qmin(128)
15496 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
15497 }
15498
15499 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, qmax) {
15500 TEST_REQUIRES_X86_SSE2;
15501 GemmMicrokernelTester()
15502 .mr(2)
15503 .nr(4)
15504 .kr(8)
15505 .sr(1)
15506 .m(2)
15507 .n(4)
15508 .k(8)
15509 .qmax(128)
15510 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
15511 }
15512
15513 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, strided_cm) {
15514 TEST_REQUIRES_X86_SSE2;
15515 GemmMicrokernelTester()
15516 .mr(2)
15517 .nr(4)
15518 .kr(8)
15519 .sr(1)
15520 .m(2)
15521 .n(4)
15522 .k(8)
15523 .cm_stride(7)
15524 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
15525 }
15526#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
15527
15528
15529#if XNN_ARCH_X86 || XNN_ARCH_X86_64
15530 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_eq_8) {
15531 TEST_REQUIRES_X86_SSE2;
15532 GemmMicrokernelTester()
15533 .mr(3)
15534 .nr(4)
15535 .kr(8)
15536 .sr(1)
15537 .m(3)
15538 .n(4)
15539 .k(8)
15540 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
15541 }
15542
15543 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, strided_cn) {
15544 TEST_REQUIRES_X86_SSE2;
15545 GemmMicrokernelTester()
15546 .mr(3)
15547 .nr(4)
15548 .kr(8)
15549 .sr(1)
15550 .m(3)
15551 .n(4)
15552 .k(8)
15553 .cn_stride(7)
15554 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
15555 }
15556
15557 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_eq_8_strided_a) {
15558 TEST_REQUIRES_X86_SSE2;
15559 GemmMicrokernelTester()
15560 .mr(3)
15561 .nr(4)
15562 .kr(8)
15563 .sr(1)
15564 .m(3)
15565 .n(4)
15566 .k(8)
15567 .a_stride(11)
15568 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
15569 }
15570
15571 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_eq_8_subtile) {
15572 TEST_REQUIRES_X86_SSE2;
15573 for (uint32_t m = 1; m <= 3; m++) {
15574 for (uint32_t n = 1; n <= 4; n++) {
15575 GemmMicrokernelTester()
15576 .mr(3)
15577 .nr(4)
15578 .kr(8)
15579 .sr(1)
15580 .m(m)
15581 .n(n)
15582 .k(8)
15583 .iterations(1)
15584 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
15585 }
15586 }
15587 }
15588
15589 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_eq_8_subtile_m) {
15590 TEST_REQUIRES_X86_SSE2;
15591 for (uint32_t m = 1; m <= 3; m++) {
15592 GemmMicrokernelTester()
15593 .mr(3)
15594 .nr(4)
15595 .kr(8)
15596 .sr(1)
15597 .m(m)
15598 .n(4)
15599 .k(8)
15600 .iterations(1)
15601 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
15602 }
15603 }
15604
15605 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_eq_8_subtile_n) {
15606 TEST_REQUIRES_X86_SSE2;
15607 for (uint32_t n = 1; n <= 4; n++) {
15608 GemmMicrokernelTester()
15609 .mr(3)
15610 .nr(4)
15611 .kr(8)
15612 .sr(1)
15613 .m(3)
15614 .n(n)
15615 .k(8)
15616 .iterations(1)
15617 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
15618 }
15619 }
15620
15621 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_lt_8) {
15622 TEST_REQUIRES_X86_SSE2;
15623 for (size_t k = 1; k < 8; k++) {
15624 GemmMicrokernelTester()
15625 .mr(3)
15626 .nr(4)
15627 .kr(8)
15628 .sr(1)
15629 .m(3)
15630 .n(4)
15631 .k(k)
15632 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
15633 }
15634 }
15635
15636 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_lt_8_strided_a) {
15637 TEST_REQUIRES_X86_SSE2;
15638 for (size_t k = 1; k < 8; k++) {
15639 GemmMicrokernelTester()
15640 .mr(3)
15641 .nr(4)
15642 .kr(8)
15643 .sr(1)
15644 .m(3)
15645 .n(4)
15646 .k(k)
15647 .a_stride(11)
15648 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
15649 }
15650 }
15651
15652 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_lt_8_subtile) {
15653 TEST_REQUIRES_X86_SSE2;
15654 for (size_t k = 1; k < 8; k++) {
15655 for (uint32_t m = 1; m <= 3; m++) {
15656 for (uint32_t n = 1; n <= 4; n++) {
15657 GemmMicrokernelTester()
15658 .mr(3)
15659 .nr(4)
15660 .kr(8)
15661 .sr(1)
15662 .m(m)
15663 .n(n)
15664 .k(k)
15665 .iterations(1)
15666 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
15667 }
15668 }
15669 }
15670 }
15671
15672 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_gt_8) {
15673 TEST_REQUIRES_X86_SSE2;
15674 for (size_t k = 9; k < 16; k++) {
15675 GemmMicrokernelTester()
15676 .mr(3)
15677 .nr(4)
15678 .kr(8)
15679 .sr(1)
15680 .m(3)
15681 .n(4)
15682 .k(k)
15683 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
15684 }
15685 }
15686
15687 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_gt_8_strided_a) {
15688 TEST_REQUIRES_X86_SSE2;
15689 for (size_t k = 9; k < 16; k++) {
15690 GemmMicrokernelTester()
15691 .mr(3)
15692 .nr(4)
15693 .kr(8)
15694 .sr(1)
15695 .m(3)
15696 .n(4)
15697 .k(k)
15698 .a_stride(19)
15699 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
15700 }
15701 }
15702
15703 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_gt_8_subtile) {
15704 TEST_REQUIRES_X86_SSE2;
15705 for (size_t k = 9; k < 16; k++) {
15706 for (uint32_t m = 1; m <= 3; m++) {
15707 for (uint32_t n = 1; n <= 4; n++) {
15708 GemmMicrokernelTester()
15709 .mr(3)
15710 .nr(4)
15711 .kr(8)
15712 .sr(1)
15713 .m(m)
15714 .n(n)
15715 .k(k)
15716 .iterations(1)
15717 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
15718 }
15719 }
15720 }
15721 }
15722
15723 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_div_8) {
15724 TEST_REQUIRES_X86_SSE2;
15725 for (size_t k = 16; k <= 80; k += 8) {
15726 GemmMicrokernelTester()
15727 .mr(3)
15728 .nr(4)
15729 .kr(8)
15730 .sr(1)
15731 .m(3)
15732 .n(4)
15733 .k(k)
15734 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
15735 }
15736 }
15737
15738 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_div_8_strided_a) {
15739 TEST_REQUIRES_X86_SSE2;
15740 for (size_t k = 16; k <= 80; k += 8) {
15741 GemmMicrokernelTester()
15742 .mr(3)
15743 .nr(4)
15744 .kr(8)
15745 .sr(1)
15746 .m(3)
15747 .n(4)
15748 .k(k)
15749 .a_stride(83)
15750 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
15751 }
15752 }
15753
15754 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_div_8_subtile) {
15755 TEST_REQUIRES_X86_SSE2;
15756 for (size_t k = 16; k <= 80; k += 8) {
15757 for (uint32_t m = 1; m <= 3; m++) {
15758 for (uint32_t n = 1; n <= 4; n++) {
15759 GemmMicrokernelTester()
15760 .mr(3)
15761 .nr(4)
15762 .kr(8)
15763 .sr(1)
15764 .m(m)
15765 .n(n)
15766 .k(k)
15767 .iterations(1)
15768 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
15769 }
15770 }
15771 }
15772 }
15773
15774 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, n_gt_4) {
15775 TEST_REQUIRES_X86_SSE2;
15776 for (uint32_t n = 5; n < 8; n++) {
15777 for (size_t k = 1; k <= 40; k += 9) {
15778 GemmMicrokernelTester()
15779 .mr(3)
15780 .nr(4)
15781 .kr(8)
15782 .sr(1)
15783 .m(3)
15784 .n(4)
15785 .k(k)
15786 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
15787 }
15788 }
15789 }
15790
15791 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, n_gt_4_strided_cn) {
15792 TEST_REQUIRES_X86_SSE2;
15793 for (uint32_t n = 5; n < 8; n++) {
15794 for (size_t k = 1; k <= 40; k += 9) {
15795 GemmMicrokernelTester()
15796 .mr(3)
15797 .nr(4)
15798 .kr(8)
15799 .sr(1)
15800 .m(3)
15801 .n(4)
15802 .k(k)
15803 .cn_stride(7)
15804 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
15805 }
15806 }
15807 }
15808
15809 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, n_gt_4_strided_a) {
15810 TEST_REQUIRES_X86_SSE2;
15811 for (uint32_t n = 5; n < 8; n++) {
15812 for (size_t k = 1; k <= 40; k += 9) {
15813 GemmMicrokernelTester()
15814 .mr(3)
15815 .nr(4)
15816 .kr(8)
15817 .sr(1)
15818 .m(3)
15819 .n(n)
15820 .k(k)
15821 .a_stride(43)
15822 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
15823 }
15824 }
15825 }
15826
15827 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, n_gt_4_subtile) {
15828 TEST_REQUIRES_X86_SSE2;
15829 for (uint32_t n = 5; n < 8; n++) {
15830 for (size_t k = 1; k <= 40; k += 9) {
15831 for (uint32_t m = 1; m <= 3; m++) {
15832 GemmMicrokernelTester()
15833 .mr(3)
15834 .nr(4)
15835 .kr(8)
15836 .sr(1)
15837 .m(m)
15838 .n(n)
15839 .k(k)
15840 .iterations(1)
15841 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
15842 }
15843 }
15844 }
15845 }
15846
15847 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, n_div_4) {
15848 TEST_REQUIRES_X86_SSE2;
15849 for (uint32_t n = 8; n <= 12; n += 4) {
15850 for (size_t k = 1; k <= 40; k += 9) {
15851 GemmMicrokernelTester()
15852 .mr(3)
15853 .nr(4)
15854 .kr(8)
15855 .sr(1)
15856 .m(3)
15857 .n(4)
15858 .k(k)
15859 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
15860 }
15861 }
15862 }
15863
15864 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, n_div_4_strided_cn) {
15865 TEST_REQUIRES_X86_SSE2;
15866 for (uint32_t n = 8; n <= 12; n += 4) {
15867 for (size_t k = 1; k <= 40; k += 9) {
15868 GemmMicrokernelTester()
15869 .mr(3)
15870 .nr(4)
15871 .kr(8)
15872 .sr(1)
15873 .m(3)
15874 .n(n)
15875 .k(k)
15876 .cn_stride(7)
15877 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
15878 }
15879 }
15880 }
15881
15882 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, n_div_4_strided_a) {
15883 TEST_REQUIRES_X86_SSE2;
15884 for (uint32_t n = 8; n <= 12; n += 4) {
15885 for (size_t k = 1; k <= 40; k += 9) {
15886 GemmMicrokernelTester()
15887 .mr(3)
15888 .nr(4)
15889 .kr(8)
15890 .sr(1)
15891 .m(3)
15892 .n(n)
15893 .k(k)
15894 .a_stride(43)
15895 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
15896 }
15897 }
15898 }
15899
15900 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, n_div_4_subtile) {
15901 TEST_REQUIRES_X86_SSE2;
15902 for (uint32_t n = 8; n <= 12; n += 4) {
15903 for (size_t k = 1; k <= 40; k += 9) {
15904 for (uint32_t m = 1; m <= 3; m++) {
15905 GemmMicrokernelTester()
15906 .mr(3)
15907 .nr(4)
15908 .kr(8)
15909 .sr(1)
15910 .m(m)
15911 .n(n)
15912 .k(k)
15913 .iterations(1)
15914 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
15915 }
15916 }
15917 }
15918 }
15919
15920 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, strided_cm_subtile) {
15921 TEST_REQUIRES_X86_SSE2;
15922 for (size_t k = 1; k <= 40; k += 9) {
15923 for (uint32_t m = 1; m <= 3; m++) {
15924 for (uint32_t n = 1; n <= 4; n++) {
15925 GemmMicrokernelTester()
15926 .mr(3)
15927 .nr(4)
15928 .kr(8)
15929 .sr(1)
15930 .m(m)
15931 .n(n)
15932 .k(k)
15933 .cm_stride(7)
15934 .iterations(1)
15935 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
15936 }
15937 }
15938 }
15939 }
15940
15941 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, qmin) {
15942 TEST_REQUIRES_X86_SSE2;
15943 GemmMicrokernelTester()
15944 .mr(3)
15945 .nr(4)
15946 .kr(8)
15947 .sr(1)
15948 .m(3)
15949 .n(4)
15950 .k(8)
15951 .qmin(128)
15952 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
15953 }
15954
15955 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, qmax) {
15956 TEST_REQUIRES_X86_SSE2;
15957 GemmMicrokernelTester()
15958 .mr(3)
15959 .nr(4)
15960 .kr(8)
15961 .sr(1)
15962 .m(3)
15963 .n(4)
15964 .k(8)
15965 .qmax(128)
15966 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
15967 }
15968
15969 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, strided_cm) {
15970 TEST_REQUIRES_X86_SSE2;
15971 GemmMicrokernelTester()
15972 .mr(3)
15973 .nr(4)
15974 .kr(8)
15975 .sr(1)
15976 .m(3)
15977 .n(4)
15978 .k(8)
15979 .cm_stride(7)
15980 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
15981 }
15982#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
15983
15984
15985#if XNN_ARCH_X86 || XNN_ARCH_X86_64
15986 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_eq_8) {
15987 TEST_REQUIRES_X86_SSE41;
15988 GemmMicrokernelTester()
15989 .mr(1)
15990 .nr(4)
15991 .kr(8)
15992 .sr(1)
15993 .m(1)
15994 .n(4)
15995 .k(8)
15996 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
15997 }
15998
15999 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, strided_cn) {
16000 TEST_REQUIRES_X86_SSE41;
16001 GemmMicrokernelTester()
16002 .mr(1)
16003 .nr(4)
16004 .kr(8)
16005 .sr(1)
16006 .m(1)
16007 .n(4)
16008 .k(8)
16009 .cn_stride(7)
16010 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
16011 }
16012
16013 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_eq_8_strided_a) {
16014 TEST_REQUIRES_X86_SSE41;
16015 GemmMicrokernelTester()
16016 .mr(1)
16017 .nr(4)
16018 .kr(8)
16019 .sr(1)
16020 .m(1)
16021 .n(4)
16022 .k(8)
16023 .a_stride(11)
16024 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
16025 }
16026
16027 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_eq_8_subtile) {
16028 TEST_REQUIRES_X86_SSE41;
16029 for (uint32_t m = 1; m <= 1; m++) {
16030 for (uint32_t n = 1; n <= 4; n++) {
16031 GemmMicrokernelTester()
16032 .mr(1)
16033 .nr(4)
16034 .kr(8)
16035 .sr(1)
16036 .m(m)
16037 .n(n)
16038 .k(8)
16039 .iterations(1)
16040 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
16041 }
16042 }
16043 }
16044
16045 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_eq_8_subtile_m) {
16046 TEST_REQUIRES_X86_SSE41;
16047 for (uint32_t m = 1; m <= 1; m++) {
16048 GemmMicrokernelTester()
16049 .mr(1)
16050 .nr(4)
16051 .kr(8)
16052 .sr(1)
16053 .m(m)
16054 .n(4)
16055 .k(8)
16056 .iterations(1)
16057 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
16058 }
16059 }
16060
16061 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_eq_8_subtile_n) {
16062 TEST_REQUIRES_X86_SSE41;
16063 for (uint32_t n = 1; n <= 4; n++) {
16064 GemmMicrokernelTester()
16065 .mr(1)
16066 .nr(4)
16067 .kr(8)
16068 .sr(1)
16069 .m(1)
16070 .n(n)
16071 .k(8)
16072 .iterations(1)
16073 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
16074 }
16075 }
16076
16077 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_lt_8) {
16078 TEST_REQUIRES_X86_SSE41;
16079 for (size_t k = 1; k < 8; k++) {
16080 GemmMicrokernelTester()
16081 .mr(1)
16082 .nr(4)
16083 .kr(8)
16084 .sr(1)
16085 .m(1)
16086 .n(4)
16087 .k(k)
16088 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
16089 }
16090 }
16091
16092 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_lt_8_strided_a) {
16093 TEST_REQUIRES_X86_SSE41;
16094 for (size_t k = 1; k < 8; k++) {
16095 GemmMicrokernelTester()
16096 .mr(1)
16097 .nr(4)
16098 .kr(8)
16099 .sr(1)
16100 .m(1)
16101 .n(4)
16102 .k(k)
16103 .a_stride(11)
16104 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
16105 }
16106 }
16107
16108 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_lt_8_subtile) {
16109 TEST_REQUIRES_X86_SSE41;
16110 for (size_t k = 1; k < 8; k++) {
16111 for (uint32_t m = 1; m <= 1; m++) {
16112 for (uint32_t n = 1; n <= 4; n++) {
16113 GemmMicrokernelTester()
16114 .mr(1)
16115 .nr(4)
16116 .kr(8)
16117 .sr(1)
16118 .m(m)
16119 .n(n)
16120 .k(k)
16121 .iterations(1)
16122 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
16123 }
16124 }
16125 }
16126 }
16127
16128 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_gt_8) {
16129 TEST_REQUIRES_X86_SSE41;
16130 for (size_t k = 9; k < 16; k++) {
16131 GemmMicrokernelTester()
16132 .mr(1)
16133 .nr(4)
16134 .kr(8)
16135 .sr(1)
16136 .m(1)
16137 .n(4)
16138 .k(k)
16139 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
16140 }
16141 }
16142
16143 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_gt_8_strided_a) {
16144 TEST_REQUIRES_X86_SSE41;
16145 for (size_t k = 9; k < 16; k++) {
16146 GemmMicrokernelTester()
16147 .mr(1)
16148 .nr(4)
16149 .kr(8)
16150 .sr(1)
16151 .m(1)
16152 .n(4)
16153 .k(k)
16154 .a_stride(19)
16155 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
16156 }
16157 }
16158
16159 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_gt_8_subtile) {
16160 TEST_REQUIRES_X86_SSE41;
16161 for (size_t k = 9; k < 16; k++) {
16162 for (uint32_t m = 1; m <= 1; m++) {
16163 for (uint32_t n = 1; n <= 4; n++) {
16164 GemmMicrokernelTester()
16165 .mr(1)
16166 .nr(4)
16167 .kr(8)
16168 .sr(1)
16169 .m(m)
16170 .n(n)
16171 .k(k)
16172 .iterations(1)
16173 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
16174 }
16175 }
16176 }
16177 }
16178
16179 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_div_8) {
16180 TEST_REQUIRES_X86_SSE41;
16181 for (size_t k = 16; k <= 80; k += 8) {
16182 GemmMicrokernelTester()
16183 .mr(1)
16184 .nr(4)
16185 .kr(8)
16186 .sr(1)
16187 .m(1)
16188 .n(4)
16189 .k(k)
16190 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
16191 }
16192 }
16193
16194 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_div_8_strided_a) {
16195 TEST_REQUIRES_X86_SSE41;
16196 for (size_t k = 16; k <= 80; k += 8) {
16197 GemmMicrokernelTester()
16198 .mr(1)
16199 .nr(4)
16200 .kr(8)
16201 .sr(1)
16202 .m(1)
16203 .n(4)
16204 .k(k)
16205 .a_stride(83)
16206 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
16207 }
16208 }
16209
16210 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_div_8_subtile) {
16211 TEST_REQUIRES_X86_SSE41;
16212 for (size_t k = 16; k <= 80; k += 8) {
16213 for (uint32_t m = 1; m <= 1; m++) {
16214 for (uint32_t n = 1; n <= 4; n++) {
16215 GemmMicrokernelTester()
16216 .mr(1)
16217 .nr(4)
16218 .kr(8)
16219 .sr(1)
16220 .m(m)
16221 .n(n)
16222 .k(k)
16223 .iterations(1)
16224 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
16225 }
16226 }
16227 }
16228 }
16229
16230 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, n_gt_4) {
16231 TEST_REQUIRES_X86_SSE41;
16232 for (uint32_t n = 5; n < 8; n++) {
16233 for (size_t k = 1; k <= 40; k += 9) {
16234 GemmMicrokernelTester()
16235 .mr(1)
16236 .nr(4)
16237 .kr(8)
16238 .sr(1)
16239 .m(1)
16240 .n(4)
16241 .k(k)
16242 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
16243 }
16244 }
16245 }
16246
16247 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, n_gt_4_strided_cn) {
16248 TEST_REQUIRES_X86_SSE41;
16249 for (uint32_t n = 5; n < 8; n++) {
16250 for (size_t k = 1; k <= 40; k += 9) {
16251 GemmMicrokernelTester()
16252 .mr(1)
16253 .nr(4)
16254 .kr(8)
16255 .sr(1)
16256 .m(1)
16257 .n(4)
16258 .k(k)
16259 .cn_stride(7)
16260 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
16261 }
16262 }
16263 }
16264
16265 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, n_gt_4_strided_a) {
16266 TEST_REQUIRES_X86_SSE41;
16267 for (uint32_t n = 5; n < 8; n++) {
16268 for (size_t k = 1; k <= 40; k += 9) {
16269 GemmMicrokernelTester()
16270 .mr(1)
16271 .nr(4)
16272 .kr(8)
16273 .sr(1)
16274 .m(1)
16275 .n(n)
16276 .k(k)
16277 .a_stride(43)
16278 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
16279 }
16280 }
16281 }
16282
16283 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, n_gt_4_subtile) {
16284 TEST_REQUIRES_X86_SSE41;
16285 for (uint32_t n = 5; n < 8; n++) {
16286 for (size_t k = 1; k <= 40; k += 9) {
16287 for (uint32_t m = 1; m <= 1; m++) {
16288 GemmMicrokernelTester()
16289 .mr(1)
16290 .nr(4)
16291 .kr(8)
16292 .sr(1)
16293 .m(m)
16294 .n(n)
16295 .k(k)
16296 .iterations(1)
16297 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
16298 }
16299 }
16300 }
16301 }
16302
16303 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, n_div_4) {
16304 TEST_REQUIRES_X86_SSE41;
16305 for (uint32_t n = 8; n <= 12; n += 4) {
16306 for (size_t k = 1; k <= 40; k += 9) {
16307 GemmMicrokernelTester()
16308 .mr(1)
16309 .nr(4)
16310 .kr(8)
16311 .sr(1)
16312 .m(1)
16313 .n(4)
16314 .k(k)
16315 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
16316 }
16317 }
16318 }
16319
16320 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, n_div_4_strided_cn) {
16321 TEST_REQUIRES_X86_SSE41;
16322 for (uint32_t n = 8; n <= 12; n += 4) {
16323 for (size_t k = 1; k <= 40; k += 9) {
16324 GemmMicrokernelTester()
16325 .mr(1)
16326 .nr(4)
16327 .kr(8)
16328 .sr(1)
16329 .m(1)
16330 .n(n)
16331 .k(k)
16332 .cn_stride(7)
16333 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
16334 }
16335 }
16336 }
16337
16338 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, n_div_4_strided_a) {
16339 TEST_REQUIRES_X86_SSE41;
16340 for (uint32_t n = 8; n <= 12; n += 4) {
16341 for (size_t k = 1; k <= 40; k += 9) {
16342 GemmMicrokernelTester()
16343 .mr(1)
16344 .nr(4)
16345 .kr(8)
16346 .sr(1)
16347 .m(1)
16348 .n(n)
16349 .k(k)
16350 .a_stride(43)
16351 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
16352 }
16353 }
16354 }
16355
16356 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, n_div_4_subtile) {
16357 TEST_REQUIRES_X86_SSE41;
16358 for (uint32_t n = 8; n <= 12; n += 4) {
16359 for (size_t k = 1; k <= 40; k += 9) {
16360 for (uint32_t m = 1; m <= 1; m++) {
16361 GemmMicrokernelTester()
16362 .mr(1)
16363 .nr(4)
16364 .kr(8)
16365 .sr(1)
16366 .m(m)
16367 .n(n)
16368 .k(k)
16369 .iterations(1)
16370 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
16371 }
16372 }
16373 }
16374 }
16375
16376 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, strided_cm_subtile) {
16377 TEST_REQUIRES_X86_SSE41;
16378 for (size_t k = 1; k <= 40; k += 9) {
16379 for (uint32_t m = 1; m <= 1; m++) {
16380 for (uint32_t n = 1; n <= 4; n++) {
16381 GemmMicrokernelTester()
16382 .mr(1)
16383 .nr(4)
16384 .kr(8)
16385 .sr(1)
16386 .m(m)
16387 .n(n)
16388 .k(k)
16389 .cm_stride(7)
16390 .iterations(1)
16391 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
16392 }
16393 }
16394 }
16395 }
16396
16397 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, qmin) {
16398 TEST_REQUIRES_X86_SSE41;
16399 GemmMicrokernelTester()
16400 .mr(1)
16401 .nr(4)
16402 .kr(8)
16403 .sr(1)
16404 .m(1)
16405 .n(4)
16406 .k(8)
16407 .qmin(128)
16408 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
16409 }
16410
16411 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, qmax) {
16412 TEST_REQUIRES_X86_SSE41;
16413 GemmMicrokernelTester()
16414 .mr(1)
16415 .nr(4)
16416 .kr(8)
16417 .sr(1)
16418 .m(1)
16419 .n(4)
16420 .k(8)
16421 .qmax(128)
16422 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
16423 }
16424
16425 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, strided_cm) {
16426 TEST_REQUIRES_X86_SSE41;
16427 GemmMicrokernelTester()
16428 .mr(1)
16429 .nr(4)
16430 .kr(8)
16431 .sr(1)
16432 .m(1)
16433 .n(4)
16434 .k(8)
16435 .cm_stride(7)
16436 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
16437 }
16438#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
16439
16440
16441#if XNN_ARCH_X86 || XNN_ARCH_X86_64
16442 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_eq_8) {
16443 TEST_REQUIRES_X86_SSE41;
16444 GemmMicrokernelTester()
16445 .mr(2)
16446 .nr(4)
16447 .kr(8)
16448 .sr(1)
16449 .m(2)
16450 .n(4)
16451 .k(8)
16452 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
16453 }
16454
16455 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, strided_cn) {
16456 TEST_REQUIRES_X86_SSE41;
16457 GemmMicrokernelTester()
16458 .mr(2)
16459 .nr(4)
16460 .kr(8)
16461 .sr(1)
16462 .m(2)
16463 .n(4)
16464 .k(8)
16465 .cn_stride(7)
16466 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
16467 }
16468
16469 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_eq_8_strided_a) {
16470 TEST_REQUIRES_X86_SSE41;
16471 GemmMicrokernelTester()
16472 .mr(2)
16473 .nr(4)
16474 .kr(8)
16475 .sr(1)
16476 .m(2)
16477 .n(4)
16478 .k(8)
16479 .a_stride(11)
16480 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
16481 }
16482
16483 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_eq_8_subtile) {
16484 TEST_REQUIRES_X86_SSE41;
16485 for (uint32_t m = 1; m <= 2; m++) {
16486 for (uint32_t n = 1; n <= 4; n++) {
16487 GemmMicrokernelTester()
16488 .mr(2)
16489 .nr(4)
16490 .kr(8)
16491 .sr(1)
16492 .m(m)
16493 .n(n)
16494 .k(8)
16495 .iterations(1)
16496 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
16497 }
16498 }
16499 }
16500
16501 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_eq_8_subtile_m) {
16502 TEST_REQUIRES_X86_SSE41;
16503 for (uint32_t m = 1; m <= 2; m++) {
16504 GemmMicrokernelTester()
16505 .mr(2)
16506 .nr(4)
16507 .kr(8)
16508 .sr(1)
16509 .m(m)
16510 .n(4)
16511 .k(8)
16512 .iterations(1)
16513 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
16514 }
16515 }
16516
16517 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_eq_8_subtile_n) {
16518 TEST_REQUIRES_X86_SSE41;
16519 for (uint32_t n = 1; n <= 4; n++) {
16520 GemmMicrokernelTester()
16521 .mr(2)
16522 .nr(4)
16523 .kr(8)
16524 .sr(1)
16525 .m(2)
16526 .n(n)
16527 .k(8)
16528 .iterations(1)
16529 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
16530 }
16531 }
16532
16533 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_lt_8) {
16534 TEST_REQUIRES_X86_SSE41;
16535 for (size_t k = 1; k < 8; k++) {
16536 GemmMicrokernelTester()
16537 .mr(2)
16538 .nr(4)
16539 .kr(8)
16540 .sr(1)
16541 .m(2)
16542 .n(4)
16543 .k(k)
16544 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
16545 }
16546 }
16547
16548 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_lt_8_strided_a) {
16549 TEST_REQUIRES_X86_SSE41;
16550 for (size_t k = 1; k < 8; k++) {
16551 GemmMicrokernelTester()
16552 .mr(2)
16553 .nr(4)
16554 .kr(8)
16555 .sr(1)
16556 .m(2)
16557 .n(4)
16558 .k(k)
16559 .a_stride(11)
16560 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
16561 }
16562 }
16563
16564 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_lt_8_subtile) {
16565 TEST_REQUIRES_X86_SSE41;
16566 for (size_t k = 1; k < 8; k++) {
16567 for (uint32_t m = 1; m <= 2; m++) {
16568 for (uint32_t n = 1; n <= 4; n++) {
16569 GemmMicrokernelTester()
16570 .mr(2)
16571 .nr(4)
16572 .kr(8)
16573 .sr(1)
16574 .m(m)
16575 .n(n)
16576 .k(k)
16577 .iterations(1)
16578 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
16579 }
16580 }
16581 }
16582 }
16583
16584 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_gt_8) {
16585 TEST_REQUIRES_X86_SSE41;
16586 for (size_t k = 9; k < 16; k++) {
16587 GemmMicrokernelTester()
16588 .mr(2)
16589 .nr(4)
16590 .kr(8)
16591 .sr(1)
16592 .m(2)
16593 .n(4)
16594 .k(k)
16595 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
16596 }
16597 }
16598
16599 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_gt_8_strided_a) {
16600 TEST_REQUIRES_X86_SSE41;
16601 for (size_t k = 9; k < 16; k++) {
16602 GemmMicrokernelTester()
16603 .mr(2)
16604 .nr(4)
16605 .kr(8)
16606 .sr(1)
16607 .m(2)
16608 .n(4)
16609 .k(k)
16610 .a_stride(19)
16611 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
16612 }
16613 }
16614
16615 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_gt_8_subtile) {
16616 TEST_REQUIRES_X86_SSE41;
16617 for (size_t k = 9; k < 16; k++) {
16618 for (uint32_t m = 1; m <= 2; m++) {
16619 for (uint32_t n = 1; n <= 4; n++) {
16620 GemmMicrokernelTester()
16621 .mr(2)
16622 .nr(4)
16623 .kr(8)
16624 .sr(1)
16625 .m(m)
16626 .n(n)
16627 .k(k)
16628 .iterations(1)
16629 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
16630 }
16631 }
16632 }
16633 }
16634
16635 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_div_8) {
16636 TEST_REQUIRES_X86_SSE41;
16637 for (size_t k = 16; k <= 80; k += 8) {
16638 GemmMicrokernelTester()
16639 .mr(2)
16640 .nr(4)
16641 .kr(8)
16642 .sr(1)
16643 .m(2)
16644 .n(4)
16645 .k(k)
16646 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
16647 }
16648 }
16649
16650 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_div_8_strided_a) {
16651 TEST_REQUIRES_X86_SSE41;
16652 for (size_t k = 16; k <= 80; k += 8) {
16653 GemmMicrokernelTester()
16654 .mr(2)
16655 .nr(4)
16656 .kr(8)
16657 .sr(1)
16658 .m(2)
16659 .n(4)
16660 .k(k)
16661 .a_stride(83)
16662 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
16663 }
16664 }
16665
16666 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_div_8_subtile) {
16667 TEST_REQUIRES_X86_SSE41;
16668 for (size_t k = 16; k <= 80; k += 8) {
16669 for (uint32_t m = 1; m <= 2; m++) {
16670 for (uint32_t n = 1; n <= 4; n++) {
16671 GemmMicrokernelTester()
16672 .mr(2)
16673 .nr(4)
16674 .kr(8)
16675 .sr(1)
16676 .m(m)
16677 .n(n)
16678 .k(k)
16679 .iterations(1)
16680 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
16681 }
16682 }
16683 }
16684 }
16685
16686 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, n_gt_4) {
16687 TEST_REQUIRES_X86_SSE41;
16688 for (uint32_t n = 5; n < 8; n++) {
16689 for (size_t k = 1; k <= 40; k += 9) {
16690 GemmMicrokernelTester()
16691 .mr(2)
16692 .nr(4)
16693 .kr(8)
16694 .sr(1)
16695 .m(2)
16696 .n(4)
16697 .k(k)
16698 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
16699 }
16700 }
16701 }
16702
16703 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, n_gt_4_strided_cn) {
16704 TEST_REQUIRES_X86_SSE41;
16705 for (uint32_t n = 5; n < 8; n++) {
16706 for (size_t k = 1; k <= 40; k += 9) {
16707 GemmMicrokernelTester()
16708 .mr(2)
16709 .nr(4)
16710 .kr(8)
16711 .sr(1)
16712 .m(2)
16713 .n(4)
16714 .k(k)
16715 .cn_stride(7)
16716 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
16717 }
16718 }
16719 }
16720
16721 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, n_gt_4_strided_a) {
16722 TEST_REQUIRES_X86_SSE41;
16723 for (uint32_t n = 5; n < 8; n++) {
16724 for (size_t k = 1; k <= 40; k += 9) {
16725 GemmMicrokernelTester()
16726 .mr(2)
16727 .nr(4)
16728 .kr(8)
16729 .sr(1)
16730 .m(2)
16731 .n(n)
16732 .k(k)
16733 .a_stride(43)
16734 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
16735 }
16736 }
16737 }
16738
16739 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, n_gt_4_subtile) {
16740 TEST_REQUIRES_X86_SSE41;
16741 for (uint32_t n = 5; n < 8; n++) {
16742 for (size_t k = 1; k <= 40; k += 9) {
16743 for (uint32_t m = 1; m <= 2; m++) {
16744 GemmMicrokernelTester()
16745 .mr(2)
16746 .nr(4)
16747 .kr(8)
16748 .sr(1)
16749 .m(m)
16750 .n(n)
16751 .k(k)
16752 .iterations(1)
16753 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
16754 }
16755 }
16756 }
16757 }
16758
16759 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, n_div_4) {
16760 TEST_REQUIRES_X86_SSE41;
16761 for (uint32_t n = 8; n <= 12; n += 4) {
16762 for (size_t k = 1; k <= 40; k += 9) {
16763 GemmMicrokernelTester()
16764 .mr(2)
16765 .nr(4)
16766 .kr(8)
16767 .sr(1)
16768 .m(2)
16769 .n(4)
16770 .k(k)
16771 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
16772 }
16773 }
16774 }
16775
16776 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, n_div_4_strided_cn) {
16777 TEST_REQUIRES_X86_SSE41;
16778 for (uint32_t n = 8; n <= 12; n += 4) {
16779 for (size_t k = 1; k <= 40; k += 9) {
16780 GemmMicrokernelTester()
16781 .mr(2)
16782 .nr(4)
16783 .kr(8)
16784 .sr(1)
16785 .m(2)
16786 .n(n)
16787 .k(k)
16788 .cn_stride(7)
16789 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
16790 }
16791 }
16792 }
16793
16794 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, n_div_4_strided_a) {
16795 TEST_REQUIRES_X86_SSE41;
16796 for (uint32_t n = 8; n <= 12; n += 4) {
16797 for (size_t k = 1; k <= 40; k += 9) {
16798 GemmMicrokernelTester()
16799 .mr(2)
16800 .nr(4)
16801 .kr(8)
16802 .sr(1)
16803 .m(2)
16804 .n(n)
16805 .k(k)
16806 .a_stride(43)
16807 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
16808 }
16809 }
16810 }
16811
16812 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, n_div_4_subtile) {
16813 TEST_REQUIRES_X86_SSE41;
16814 for (uint32_t n = 8; n <= 12; n += 4) {
16815 for (size_t k = 1; k <= 40; k += 9) {
16816 for (uint32_t m = 1; m <= 2; m++) {
16817 GemmMicrokernelTester()
16818 .mr(2)
16819 .nr(4)
16820 .kr(8)
16821 .sr(1)
16822 .m(m)
16823 .n(n)
16824 .k(k)
16825 .iterations(1)
16826 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
16827 }
16828 }
16829 }
16830 }
16831
16832 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, strided_cm_subtile) {
16833 TEST_REQUIRES_X86_SSE41;
16834 for (size_t k = 1; k <= 40; k += 9) {
16835 for (uint32_t m = 1; m <= 2; m++) {
16836 for (uint32_t n = 1; n <= 4; n++) {
16837 GemmMicrokernelTester()
16838 .mr(2)
16839 .nr(4)
16840 .kr(8)
16841 .sr(1)
16842 .m(m)
16843 .n(n)
16844 .k(k)
16845 .cm_stride(7)
16846 .iterations(1)
16847 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
16848 }
16849 }
16850 }
16851 }
16852
16853 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, qmin) {
16854 TEST_REQUIRES_X86_SSE41;
16855 GemmMicrokernelTester()
16856 .mr(2)
16857 .nr(4)
16858 .kr(8)
16859 .sr(1)
16860 .m(2)
16861 .n(4)
16862 .k(8)
16863 .qmin(128)
16864 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
16865 }
16866
16867 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, qmax) {
16868 TEST_REQUIRES_X86_SSE41;
16869 GemmMicrokernelTester()
16870 .mr(2)
16871 .nr(4)
16872 .kr(8)
16873 .sr(1)
16874 .m(2)
16875 .n(4)
16876 .k(8)
16877 .qmax(128)
16878 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
16879 }
16880
16881 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, strided_cm) {
16882 TEST_REQUIRES_X86_SSE41;
16883 GemmMicrokernelTester()
16884 .mr(2)
16885 .nr(4)
16886 .kr(8)
16887 .sr(1)
16888 .m(2)
16889 .n(4)
16890 .k(8)
16891 .cm_stride(7)
16892 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
16893 }
16894#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
16895
16896
16897#if XNN_ARCH_X86 || XNN_ARCH_X86_64
16898 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_eq_8) {
16899 TEST_REQUIRES_X86_SSE41;
16900 GemmMicrokernelTester()
16901 .mr(3)
16902 .nr(4)
16903 .kr(8)
16904 .sr(1)
16905 .m(3)
16906 .n(4)
16907 .k(8)
16908 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
16909 }
16910
16911 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, strided_cn) {
16912 TEST_REQUIRES_X86_SSE41;
16913 GemmMicrokernelTester()
16914 .mr(3)
16915 .nr(4)
16916 .kr(8)
16917 .sr(1)
16918 .m(3)
16919 .n(4)
16920 .k(8)
16921 .cn_stride(7)
16922 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
16923 }
16924
16925 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_eq_8_strided_a) {
16926 TEST_REQUIRES_X86_SSE41;
16927 GemmMicrokernelTester()
16928 .mr(3)
16929 .nr(4)
16930 .kr(8)
16931 .sr(1)
16932 .m(3)
16933 .n(4)
16934 .k(8)
16935 .a_stride(11)
16936 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
16937 }
16938
16939 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_eq_8_subtile) {
16940 TEST_REQUIRES_X86_SSE41;
16941 for (uint32_t m = 1; m <= 3; m++) {
16942 for (uint32_t n = 1; n <= 4; n++) {
16943 GemmMicrokernelTester()
16944 .mr(3)
16945 .nr(4)
16946 .kr(8)
16947 .sr(1)
16948 .m(m)
16949 .n(n)
16950 .k(8)
16951 .iterations(1)
16952 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
16953 }
16954 }
16955 }
16956
16957 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_eq_8_subtile_m) {
16958 TEST_REQUIRES_X86_SSE41;
16959 for (uint32_t m = 1; m <= 3; m++) {
16960 GemmMicrokernelTester()
16961 .mr(3)
16962 .nr(4)
16963 .kr(8)
16964 .sr(1)
16965 .m(m)
16966 .n(4)
16967 .k(8)
16968 .iterations(1)
16969 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
16970 }
16971 }
16972
16973 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_eq_8_subtile_n) {
16974 TEST_REQUIRES_X86_SSE41;
16975 for (uint32_t n = 1; n <= 4; n++) {
16976 GemmMicrokernelTester()
16977 .mr(3)
16978 .nr(4)
16979 .kr(8)
16980 .sr(1)
16981 .m(3)
16982 .n(n)
16983 .k(8)
16984 .iterations(1)
16985 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
16986 }
16987 }
16988
16989 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_lt_8) {
16990 TEST_REQUIRES_X86_SSE41;
16991 for (size_t k = 1; k < 8; k++) {
16992 GemmMicrokernelTester()
16993 .mr(3)
16994 .nr(4)
16995 .kr(8)
16996 .sr(1)
16997 .m(3)
16998 .n(4)
16999 .k(k)
17000 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
17001 }
17002 }
17003
17004 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_lt_8_strided_a) {
17005 TEST_REQUIRES_X86_SSE41;
17006 for (size_t k = 1; k < 8; k++) {
17007 GemmMicrokernelTester()
17008 .mr(3)
17009 .nr(4)
17010 .kr(8)
17011 .sr(1)
17012 .m(3)
17013 .n(4)
17014 .k(k)
17015 .a_stride(11)
17016 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
17017 }
17018 }
17019
17020 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_lt_8_subtile) {
17021 TEST_REQUIRES_X86_SSE41;
17022 for (size_t k = 1; k < 8; k++) {
17023 for (uint32_t m = 1; m <= 3; m++) {
17024 for (uint32_t n = 1; n <= 4; n++) {
17025 GemmMicrokernelTester()
17026 .mr(3)
17027 .nr(4)
17028 .kr(8)
17029 .sr(1)
17030 .m(m)
17031 .n(n)
17032 .k(k)
17033 .iterations(1)
17034 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
17035 }
17036 }
17037 }
17038 }
17039
17040 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_gt_8) {
17041 TEST_REQUIRES_X86_SSE41;
17042 for (size_t k = 9; k < 16; k++) {
17043 GemmMicrokernelTester()
17044 .mr(3)
17045 .nr(4)
17046 .kr(8)
17047 .sr(1)
17048 .m(3)
17049 .n(4)
17050 .k(k)
17051 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
17052 }
17053 }
17054
17055 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_gt_8_strided_a) {
17056 TEST_REQUIRES_X86_SSE41;
17057 for (size_t k = 9; k < 16; k++) {
17058 GemmMicrokernelTester()
17059 .mr(3)
17060 .nr(4)
17061 .kr(8)
17062 .sr(1)
17063 .m(3)
17064 .n(4)
17065 .k(k)
17066 .a_stride(19)
17067 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
17068 }
17069 }
17070
17071 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_gt_8_subtile) {
17072 TEST_REQUIRES_X86_SSE41;
17073 for (size_t k = 9; k < 16; k++) {
17074 for (uint32_t m = 1; m <= 3; m++) {
17075 for (uint32_t n = 1; n <= 4; n++) {
17076 GemmMicrokernelTester()
17077 .mr(3)
17078 .nr(4)
17079 .kr(8)
17080 .sr(1)
17081 .m(m)
17082 .n(n)
17083 .k(k)
17084 .iterations(1)
17085 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
17086 }
17087 }
17088 }
17089 }
17090
17091 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_div_8) {
17092 TEST_REQUIRES_X86_SSE41;
17093 for (size_t k = 16; k <= 80; k += 8) {
17094 GemmMicrokernelTester()
17095 .mr(3)
17096 .nr(4)
17097 .kr(8)
17098 .sr(1)
17099 .m(3)
17100 .n(4)
17101 .k(k)
17102 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
17103 }
17104 }
17105
17106 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_div_8_strided_a) {
17107 TEST_REQUIRES_X86_SSE41;
17108 for (size_t k = 16; k <= 80; k += 8) {
17109 GemmMicrokernelTester()
17110 .mr(3)
17111 .nr(4)
17112 .kr(8)
17113 .sr(1)
17114 .m(3)
17115 .n(4)
17116 .k(k)
17117 .a_stride(83)
17118 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
17119 }
17120 }
17121
17122 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_div_8_subtile) {
17123 TEST_REQUIRES_X86_SSE41;
17124 for (size_t k = 16; k <= 80; k += 8) {
17125 for (uint32_t m = 1; m <= 3; m++) {
17126 for (uint32_t n = 1; n <= 4; n++) {
17127 GemmMicrokernelTester()
17128 .mr(3)
17129 .nr(4)
17130 .kr(8)
17131 .sr(1)
17132 .m(m)
17133 .n(n)
17134 .k(k)
17135 .iterations(1)
17136 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
17137 }
17138 }
17139 }
17140 }
17141
17142 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, n_gt_4) {
17143 TEST_REQUIRES_X86_SSE41;
17144 for (uint32_t n = 5; n < 8; n++) {
17145 for (size_t k = 1; k <= 40; k += 9) {
17146 GemmMicrokernelTester()
17147 .mr(3)
17148 .nr(4)
17149 .kr(8)
17150 .sr(1)
17151 .m(3)
17152 .n(4)
17153 .k(k)
17154 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
17155 }
17156 }
17157 }
17158
17159 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, n_gt_4_strided_cn) {
17160 TEST_REQUIRES_X86_SSE41;
17161 for (uint32_t n = 5; n < 8; n++) {
17162 for (size_t k = 1; k <= 40; k += 9) {
17163 GemmMicrokernelTester()
17164 .mr(3)
17165 .nr(4)
17166 .kr(8)
17167 .sr(1)
17168 .m(3)
17169 .n(4)
17170 .k(k)
17171 .cn_stride(7)
17172 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
17173 }
17174 }
17175 }
17176
17177 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, n_gt_4_strided_a) {
17178 TEST_REQUIRES_X86_SSE41;
17179 for (uint32_t n = 5; n < 8; n++) {
17180 for (size_t k = 1; k <= 40; k += 9) {
17181 GemmMicrokernelTester()
17182 .mr(3)
17183 .nr(4)
17184 .kr(8)
17185 .sr(1)
17186 .m(3)
17187 .n(n)
17188 .k(k)
17189 .a_stride(43)
17190 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
17191 }
17192 }
17193 }
17194
17195 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, n_gt_4_subtile) {
17196 TEST_REQUIRES_X86_SSE41;
17197 for (uint32_t n = 5; n < 8; n++) {
17198 for (size_t k = 1; k <= 40; k += 9) {
17199 for (uint32_t m = 1; m <= 3; m++) {
17200 GemmMicrokernelTester()
17201 .mr(3)
17202 .nr(4)
17203 .kr(8)
17204 .sr(1)
17205 .m(m)
17206 .n(n)
17207 .k(k)
17208 .iterations(1)
17209 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
17210 }
17211 }
17212 }
17213 }
17214
17215 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, n_div_4) {
17216 TEST_REQUIRES_X86_SSE41;
17217 for (uint32_t n = 8; n <= 12; n += 4) {
17218 for (size_t k = 1; k <= 40; k += 9) {
17219 GemmMicrokernelTester()
17220 .mr(3)
17221 .nr(4)
17222 .kr(8)
17223 .sr(1)
17224 .m(3)
17225 .n(4)
17226 .k(k)
17227 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
17228 }
17229 }
17230 }
17231
17232 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, n_div_4_strided_cn) {
17233 TEST_REQUIRES_X86_SSE41;
17234 for (uint32_t n = 8; n <= 12; n += 4) {
17235 for (size_t k = 1; k <= 40; k += 9) {
17236 GemmMicrokernelTester()
17237 .mr(3)
17238 .nr(4)
17239 .kr(8)
17240 .sr(1)
17241 .m(3)
17242 .n(n)
17243 .k(k)
17244 .cn_stride(7)
17245 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
17246 }
17247 }
17248 }
17249
17250 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, n_div_4_strided_a) {
17251 TEST_REQUIRES_X86_SSE41;
17252 for (uint32_t n = 8; n <= 12; n += 4) {
17253 for (size_t k = 1; k <= 40; k += 9) {
17254 GemmMicrokernelTester()
17255 .mr(3)
17256 .nr(4)
17257 .kr(8)
17258 .sr(1)
17259 .m(3)
17260 .n(n)
17261 .k(k)
17262 .a_stride(43)
17263 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
17264 }
17265 }
17266 }
17267
17268 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, n_div_4_subtile) {
17269 TEST_REQUIRES_X86_SSE41;
17270 for (uint32_t n = 8; n <= 12; n += 4) {
17271 for (size_t k = 1; k <= 40; k += 9) {
17272 for (uint32_t m = 1; m <= 3; m++) {
17273 GemmMicrokernelTester()
17274 .mr(3)
17275 .nr(4)
17276 .kr(8)
17277 .sr(1)
17278 .m(m)
17279 .n(n)
17280 .k(k)
17281 .iterations(1)
17282 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
17283 }
17284 }
17285 }
17286 }
17287
17288 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, strided_cm_subtile) {
17289 TEST_REQUIRES_X86_SSE41;
17290 for (size_t k = 1; k <= 40; k += 9) {
17291 for (uint32_t m = 1; m <= 3; m++) {
17292 for (uint32_t n = 1; n <= 4; n++) {
17293 GemmMicrokernelTester()
17294 .mr(3)
17295 .nr(4)
17296 .kr(8)
17297 .sr(1)
17298 .m(m)
17299 .n(n)
17300 .k(k)
17301 .cm_stride(7)
17302 .iterations(1)
17303 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
17304 }
17305 }
17306 }
17307 }
17308
17309 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, qmin) {
17310 TEST_REQUIRES_X86_SSE41;
17311 GemmMicrokernelTester()
17312 .mr(3)
17313 .nr(4)
17314 .kr(8)
17315 .sr(1)
17316 .m(3)
17317 .n(4)
17318 .k(8)
17319 .qmin(128)
17320 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
17321 }
17322
17323 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, qmax) {
17324 TEST_REQUIRES_X86_SSE41;
17325 GemmMicrokernelTester()
17326 .mr(3)
17327 .nr(4)
17328 .kr(8)
17329 .sr(1)
17330 .m(3)
17331 .n(4)
17332 .k(8)
17333 .qmax(128)
17334 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
17335 }
17336
17337 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, strided_cm) {
17338 TEST_REQUIRES_X86_SSE41;
17339 GemmMicrokernelTester()
17340 .mr(3)
17341 .nr(4)
17342 .kr(8)
17343 .sr(1)
17344 .m(3)
17345 .n(4)
17346 .k(8)
17347 .cm_stride(7)
17348 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
17349 }
17350#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
17351
17352
17353#if XNN_ARCH_X86 || XNN_ARCH_X86_64
17354 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_eq_8) {
17355 TEST_REQUIRES_X86_AVX;
17356 GemmMicrokernelTester()
17357 .mr(1)
17358 .nr(4)
17359 .kr(8)
17360 .sr(1)
17361 .m(1)
17362 .n(4)
17363 .k(8)
17364 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
17365 }
17366
17367 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, strided_cn) {
17368 TEST_REQUIRES_X86_AVX;
17369 GemmMicrokernelTester()
17370 .mr(1)
17371 .nr(4)
17372 .kr(8)
17373 .sr(1)
17374 .m(1)
17375 .n(4)
17376 .k(8)
17377 .cn_stride(7)
17378 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
17379 }
17380
17381 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_eq_8_strided_a) {
17382 TEST_REQUIRES_X86_AVX;
17383 GemmMicrokernelTester()
17384 .mr(1)
17385 .nr(4)
17386 .kr(8)
17387 .sr(1)
17388 .m(1)
17389 .n(4)
17390 .k(8)
17391 .a_stride(11)
17392 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
17393 }
17394
17395 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_eq_8_subtile) {
17396 TEST_REQUIRES_X86_AVX;
17397 for (uint32_t m = 1; m <= 1; m++) {
17398 for (uint32_t n = 1; n <= 4; n++) {
17399 GemmMicrokernelTester()
17400 .mr(1)
17401 .nr(4)
17402 .kr(8)
17403 .sr(1)
17404 .m(m)
17405 .n(n)
17406 .k(8)
17407 .iterations(1)
17408 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
17409 }
17410 }
17411 }
17412
17413 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_eq_8_subtile_m) {
17414 TEST_REQUIRES_X86_AVX;
17415 for (uint32_t m = 1; m <= 1; m++) {
17416 GemmMicrokernelTester()
17417 .mr(1)
17418 .nr(4)
17419 .kr(8)
17420 .sr(1)
17421 .m(m)
17422 .n(4)
17423 .k(8)
17424 .iterations(1)
17425 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
17426 }
17427 }
17428
17429 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_eq_8_subtile_n) {
17430 TEST_REQUIRES_X86_AVX;
17431 for (uint32_t n = 1; n <= 4; n++) {
17432 GemmMicrokernelTester()
17433 .mr(1)
17434 .nr(4)
17435 .kr(8)
17436 .sr(1)
17437 .m(1)
17438 .n(n)
17439 .k(8)
17440 .iterations(1)
17441 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
17442 }
17443 }
17444
17445 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_lt_8) {
17446 TEST_REQUIRES_X86_AVX;
17447 for (size_t k = 1; k < 8; k++) {
17448 GemmMicrokernelTester()
17449 .mr(1)
17450 .nr(4)
17451 .kr(8)
17452 .sr(1)
17453 .m(1)
17454 .n(4)
17455 .k(k)
17456 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
17457 }
17458 }
17459
17460 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_lt_8_strided_a) {
17461 TEST_REQUIRES_X86_AVX;
17462 for (size_t k = 1; k < 8; k++) {
17463 GemmMicrokernelTester()
17464 .mr(1)
17465 .nr(4)
17466 .kr(8)
17467 .sr(1)
17468 .m(1)
17469 .n(4)
17470 .k(k)
17471 .a_stride(11)
17472 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
17473 }
17474 }
17475
17476 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_lt_8_subtile) {
17477 TEST_REQUIRES_X86_AVX;
17478 for (size_t k = 1; k < 8; k++) {
17479 for (uint32_t m = 1; m <= 1; m++) {
17480 for (uint32_t n = 1; n <= 4; n++) {
17481 GemmMicrokernelTester()
17482 .mr(1)
17483 .nr(4)
17484 .kr(8)
17485 .sr(1)
17486 .m(m)
17487 .n(n)
17488 .k(k)
17489 .iterations(1)
17490 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
17491 }
17492 }
17493 }
17494 }
17495
17496 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_gt_8) {
17497 TEST_REQUIRES_X86_AVX;
17498 for (size_t k = 9; k < 16; k++) {
17499 GemmMicrokernelTester()
17500 .mr(1)
17501 .nr(4)
17502 .kr(8)
17503 .sr(1)
17504 .m(1)
17505 .n(4)
17506 .k(k)
17507 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
17508 }
17509 }
17510
17511 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_gt_8_strided_a) {
17512 TEST_REQUIRES_X86_AVX;
17513 for (size_t k = 9; k < 16; k++) {
17514 GemmMicrokernelTester()
17515 .mr(1)
17516 .nr(4)
17517 .kr(8)
17518 .sr(1)
17519 .m(1)
17520 .n(4)
17521 .k(k)
17522 .a_stride(19)
17523 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
17524 }
17525 }
17526
17527 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_gt_8_subtile) {
17528 TEST_REQUIRES_X86_AVX;
17529 for (size_t k = 9; k < 16; k++) {
17530 for (uint32_t m = 1; m <= 1; m++) {
17531 for (uint32_t n = 1; n <= 4; n++) {
17532 GemmMicrokernelTester()
17533 .mr(1)
17534 .nr(4)
17535 .kr(8)
17536 .sr(1)
17537 .m(m)
17538 .n(n)
17539 .k(k)
17540 .iterations(1)
17541 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
17542 }
17543 }
17544 }
17545 }
17546
17547 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_div_8) {
17548 TEST_REQUIRES_X86_AVX;
17549 for (size_t k = 16; k <= 80; k += 8) {
17550 GemmMicrokernelTester()
17551 .mr(1)
17552 .nr(4)
17553 .kr(8)
17554 .sr(1)
17555 .m(1)
17556 .n(4)
17557 .k(k)
17558 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
17559 }
17560 }
17561
17562 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_div_8_strided_a) {
17563 TEST_REQUIRES_X86_AVX;
17564 for (size_t k = 16; k <= 80; k += 8) {
17565 GemmMicrokernelTester()
17566 .mr(1)
17567 .nr(4)
17568 .kr(8)
17569 .sr(1)
17570 .m(1)
17571 .n(4)
17572 .k(k)
17573 .a_stride(83)
17574 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
17575 }
17576 }
17577
17578 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_div_8_subtile) {
17579 TEST_REQUIRES_X86_AVX;
17580 for (size_t k = 16; k <= 80; k += 8) {
17581 for (uint32_t m = 1; m <= 1; m++) {
17582 for (uint32_t n = 1; n <= 4; n++) {
17583 GemmMicrokernelTester()
17584 .mr(1)
17585 .nr(4)
17586 .kr(8)
17587 .sr(1)
17588 .m(m)
17589 .n(n)
17590 .k(k)
17591 .iterations(1)
17592 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
17593 }
17594 }
17595 }
17596 }
17597
17598 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, n_gt_4) {
17599 TEST_REQUIRES_X86_AVX;
17600 for (uint32_t n = 5; n < 8; n++) {
17601 for (size_t k = 1; k <= 40; k += 9) {
17602 GemmMicrokernelTester()
17603 .mr(1)
17604 .nr(4)
17605 .kr(8)
17606 .sr(1)
17607 .m(1)
17608 .n(4)
17609 .k(k)
17610 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
17611 }
17612 }
17613 }
17614
17615 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, n_gt_4_strided_cn) {
17616 TEST_REQUIRES_X86_AVX;
17617 for (uint32_t n = 5; n < 8; n++) {
17618 for (size_t k = 1; k <= 40; k += 9) {
17619 GemmMicrokernelTester()
17620 .mr(1)
17621 .nr(4)
17622 .kr(8)
17623 .sr(1)
17624 .m(1)
17625 .n(4)
17626 .k(k)
17627 .cn_stride(7)
17628 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
17629 }
17630 }
17631 }
17632
17633 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, n_gt_4_strided_a) {
17634 TEST_REQUIRES_X86_AVX;
17635 for (uint32_t n = 5; n < 8; n++) {
17636 for (size_t k = 1; k <= 40; k += 9) {
17637 GemmMicrokernelTester()
17638 .mr(1)
17639 .nr(4)
17640 .kr(8)
17641 .sr(1)
17642 .m(1)
17643 .n(n)
17644 .k(k)
17645 .a_stride(43)
17646 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
17647 }
17648 }
17649 }
17650
17651 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, n_gt_4_subtile) {
17652 TEST_REQUIRES_X86_AVX;
17653 for (uint32_t n = 5; n < 8; n++) {
17654 for (size_t k = 1; k <= 40; k += 9) {
17655 for (uint32_t m = 1; m <= 1; m++) {
17656 GemmMicrokernelTester()
17657 .mr(1)
17658 .nr(4)
17659 .kr(8)
17660 .sr(1)
17661 .m(m)
17662 .n(n)
17663 .k(k)
17664 .iterations(1)
17665 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
17666 }
17667 }
17668 }
17669 }
17670
17671 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, n_div_4) {
17672 TEST_REQUIRES_X86_AVX;
17673 for (uint32_t n = 8; n <= 12; n += 4) {
17674 for (size_t k = 1; k <= 40; k += 9) {
17675 GemmMicrokernelTester()
17676 .mr(1)
17677 .nr(4)
17678 .kr(8)
17679 .sr(1)
17680 .m(1)
17681 .n(4)
17682 .k(k)
17683 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
17684 }
17685 }
17686 }
17687
17688 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, n_div_4_strided_cn) {
17689 TEST_REQUIRES_X86_AVX;
17690 for (uint32_t n = 8; n <= 12; n += 4) {
17691 for (size_t k = 1; k <= 40; k += 9) {
17692 GemmMicrokernelTester()
17693 .mr(1)
17694 .nr(4)
17695 .kr(8)
17696 .sr(1)
17697 .m(1)
17698 .n(n)
17699 .k(k)
17700 .cn_stride(7)
17701 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
17702 }
17703 }
17704 }
17705
17706 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, n_div_4_strided_a) {
17707 TEST_REQUIRES_X86_AVX;
17708 for (uint32_t n = 8; n <= 12; n += 4) {
17709 for (size_t k = 1; k <= 40; k += 9) {
17710 GemmMicrokernelTester()
17711 .mr(1)
17712 .nr(4)
17713 .kr(8)
17714 .sr(1)
17715 .m(1)
17716 .n(n)
17717 .k(k)
17718 .a_stride(43)
17719 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
17720 }
17721 }
17722 }
17723
17724 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, n_div_4_subtile) {
17725 TEST_REQUIRES_X86_AVX;
17726 for (uint32_t n = 8; n <= 12; n += 4) {
17727 for (size_t k = 1; k <= 40; k += 9) {
17728 for (uint32_t m = 1; m <= 1; m++) {
17729 GemmMicrokernelTester()
17730 .mr(1)
17731 .nr(4)
17732 .kr(8)
17733 .sr(1)
17734 .m(m)
17735 .n(n)
17736 .k(k)
17737 .iterations(1)
17738 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
17739 }
17740 }
17741 }
17742 }
17743
17744 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, strided_cm_subtile) {
17745 TEST_REQUIRES_X86_AVX;
17746 for (size_t k = 1; k <= 40; k += 9) {
17747 for (uint32_t m = 1; m <= 1; m++) {
17748 for (uint32_t n = 1; n <= 4; n++) {
17749 GemmMicrokernelTester()
17750 .mr(1)
17751 .nr(4)
17752 .kr(8)
17753 .sr(1)
17754 .m(m)
17755 .n(n)
17756 .k(k)
17757 .cm_stride(7)
17758 .iterations(1)
17759 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
17760 }
17761 }
17762 }
17763 }
17764
17765 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, qmin) {
17766 TEST_REQUIRES_X86_AVX;
17767 GemmMicrokernelTester()
17768 .mr(1)
17769 .nr(4)
17770 .kr(8)
17771 .sr(1)
17772 .m(1)
17773 .n(4)
17774 .k(8)
17775 .qmin(128)
17776 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
17777 }
17778
17779 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, qmax) {
17780 TEST_REQUIRES_X86_AVX;
17781 GemmMicrokernelTester()
17782 .mr(1)
17783 .nr(4)
17784 .kr(8)
17785 .sr(1)
17786 .m(1)
17787 .n(4)
17788 .k(8)
17789 .qmax(128)
17790 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
17791 }
17792
17793 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, strided_cm) {
17794 TEST_REQUIRES_X86_AVX;
17795 GemmMicrokernelTester()
17796 .mr(1)
17797 .nr(4)
17798 .kr(8)
17799 .sr(1)
17800 .m(1)
17801 .n(4)
17802 .k(8)
17803 .cm_stride(7)
17804 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
17805 }
17806#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
17807
17808
17809#if XNN_ARCH_X86 || XNN_ARCH_X86_64
17810 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_eq_8) {
17811 TEST_REQUIRES_X86_AVX;
17812 GemmMicrokernelTester()
17813 .mr(2)
17814 .nr(4)
17815 .kr(8)
17816 .sr(1)
17817 .m(2)
17818 .n(4)
17819 .k(8)
17820 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
17821 }
17822
17823 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, strided_cn) {
17824 TEST_REQUIRES_X86_AVX;
17825 GemmMicrokernelTester()
17826 .mr(2)
17827 .nr(4)
17828 .kr(8)
17829 .sr(1)
17830 .m(2)
17831 .n(4)
17832 .k(8)
17833 .cn_stride(7)
17834 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
17835 }
17836
17837 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_eq_8_strided_a) {
17838 TEST_REQUIRES_X86_AVX;
17839 GemmMicrokernelTester()
17840 .mr(2)
17841 .nr(4)
17842 .kr(8)
17843 .sr(1)
17844 .m(2)
17845 .n(4)
17846 .k(8)
17847 .a_stride(11)
17848 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
17849 }
17850
17851 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_eq_8_subtile) {
17852 TEST_REQUIRES_X86_AVX;
17853 for (uint32_t m = 1; m <= 2; m++) {
17854 for (uint32_t n = 1; n <= 4; n++) {
17855 GemmMicrokernelTester()
17856 .mr(2)
17857 .nr(4)
17858 .kr(8)
17859 .sr(1)
17860 .m(m)
17861 .n(n)
17862 .k(8)
17863 .iterations(1)
17864 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
17865 }
17866 }
17867 }
17868
17869 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_eq_8_subtile_m) {
17870 TEST_REQUIRES_X86_AVX;
17871 for (uint32_t m = 1; m <= 2; m++) {
17872 GemmMicrokernelTester()
17873 .mr(2)
17874 .nr(4)
17875 .kr(8)
17876 .sr(1)
17877 .m(m)
17878 .n(4)
17879 .k(8)
17880 .iterations(1)
17881 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
17882 }
17883 }
17884
17885 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_eq_8_subtile_n) {
17886 TEST_REQUIRES_X86_AVX;
17887 for (uint32_t n = 1; n <= 4; n++) {
17888 GemmMicrokernelTester()
17889 .mr(2)
17890 .nr(4)
17891 .kr(8)
17892 .sr(1)
17893 .m(2)
17894 .n(n)
17895 .k(8)
17896 .iterations(1)
17897 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
17898 }
17899 }
17900
17901 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_lt_8) {
17902 TEST_REQUIRES_X86_AVX;
17903 for (size_t k = 1; k < 8; k++) {
17904 GemmMicrokernelTester()
17905 .mr(2)
17906 .nr(4)
17907 .kr(8)
17908 .sr(1)
17909 .m(2)
17910 .n(4)
17911 .k(k)
17912 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
17913 }
17914 }
17915
17916 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_lt_8_strided_a) {
17917 TEST_REQUIRES_X86_AVX;
17918 for (size_t k = 1; k < 8; k++) {
17919 GemmMicrokernelTester()
17920 .mr(2)
17921 .nr(4)
17922 .kr(8)
17923 .sr(1)
17924 .m(2)
17925 .n(4)
17926 .k(k)
17927 .a_stride(11)
17928 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
17929 }
17930 }
17931
17932 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_lt_8_subtile) {
17933 TEST_REQUIRES_X86_AVX;
17934 for (size_t k = 1; k < 8; k++) {
17935 for (uint32_t m = 1; m <= 2; m++) {
17936 for (uint32_t n = 1; n <= 4; n++) {
17937 GemmMicrokernelTester()
17938 .mr(2)
17939 .nr(4)
17940 .kr(8)
17941 .sr(1)
17942 .m(m)
17943 .n(n)
17944 .k(k)
17945 .iterations(1)
17946 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
17947 }
17948 }
17949 }
17950 }
17951
17952 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_gt_8) {
17953 TEST_REQUIRES_X86_AVX;
17954 for (size_t k = 9; k < 16; k++) {
17955 GemmMicrokernelTester()
17956 .mr(2)
17957 .nr(4)
17958 .kr(8)
17959 .sr(1)
17960 .m(2)
17961 .n(4)
17962 .k(k)
17963 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
17964 }
17965 }
17966
17967 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_gt_8_strided_a) {
17968 TEST_REQUIRES_X86_AVX;
17969 for (size_t k = 9; k < 16; k++) {
17970 GemmMicrokernelTester()
17971 .mr(2)
17972 .nr(4)
17973 .kr(8)
17974 .sr(1)
17975 .m(2)
17976 .n(4)
17977 .k(k)
17978 .a_stride(19)
17979 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
17980 }
17981 }
17982
17983 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_gt_8_subtile) {
17984 TEST_REQUIRES_X86_AVX;
17985 for (size_t k = 9; k < 16; k++) {
17986 for (uint32_t m = 1; m <= 2; m++) {
17987 for (uint32_t n = 1; n <= 4; n++) {
17988 GemmMicrokernelTester()
17989 .mr(2)
17990 .nr(4)
17991 .kr(8)
17992 .sr(1)
17993 .m(m)
17994 .n(n)
17995 .k(k)
17996 .iterations(1)
17997 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
17998 }
17999 }
18000 }
18001 }
18002
18003 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_div_8) {
18004 TEST_REQUIRES_X86_AVX;
18005 for (size_t k = 16; k <= 80; k += 8) {
18006 GemmMicrokernelTester()
18007 .mr(2)
18008 .nr(4)
18009 .kr(8)
18010 .sr(1)
18011 .m(2)
18012 .n(4)
18013 .k(k)
18014 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
18015 }
18016 }
18017
18018 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_div_8_strided_a) {
18019 TEST_REQUIRES_X86_AVX;
18020 for (size_t k = 16; k <= 80; k += 8) {
18021 GemmMicrokernelTester()
18022 .mr(2)
18023 .nr(4)
18024 .kr(8)
18025 .sr(1)
18026 .m(2)
18027 .n(4)
18028 .k(k)
18029 .a_stride(83)
18030 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
18031 }
18032 }
18033
18034 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_div_8_subtile) {
18035 TEST_REQUIRES_X86_AVX;
18036 for (size_t k = 16; k <= 80; k += 8) {
18037 for (uint32_t m = 1; m <= 2; m++) {
18038 for (uint32_t n = 1; n <= 4; n++) {
18039 GemmMicrokernelTester()
18040 .mr(2)
18041 .nr(4)
18042 .kr(8)
18043 .sr(1)
18044 .m(m)
18045 .n(n)
18046 .k(k)
18047 .iterations(1)
18048 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
18049 }
18050 }
18051 }
18052 }
18053
18054 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, n_gt_4) {
18055 TEST_REQUIRES_X86_AVX;
18056 for (uint32_t n = 5; n < 8; n++) {
18057 for (size_t k = 1; k <= 40; k += 9) {
18058 GemmMicrokernelTester()
18059 .mr(2)
18060 .nr(4)
18061 .kr(8)
18062 .sr(1)
18063 .m(2)
18064 .n(4)
18065 .k(k)
18066 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
18067 }
18068 }
18069 }
18070
18071 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, n_gt_4_strided_cn) {
18072 TEST_REQUIRES_X86_AVX;
18073 for (uint32_t n = 5; n < 8; n++) {
18074 for (size_t k = 1; k <= 40; k += 9) {
18075 GemmMicrokernelTester()
18076 .mr(2)
18077 .nr(4)
18078 .kr(8)
18079 .sr(1)
18080 .m(2)
18081 .n(4)
18082 .k(k)
18083 .cn_stride(7)
18084 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
18085 }
18086 }
18087 }
18088
18089 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, n_gt_4_strided_a) {
18090 TEST_REQUIRES_X86_AVX;
18091 for (uint32_t n = 5; n < 8; n++) {
18092 for (size_t k = 1; k <= 40; k += 9) {
18093 GemmMicrokernelTester()
18094 .mr(2)
18095 .nr(4)
18096 .kr(8)
18097 .sr(1)
18098 .m(2)
18099 .n(n)
18100 .k(k)
18101 .a_stride(43)
18102 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
18103 }
18104 }
18105 }
18106
18107 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, n_gt_4_subtile) {
18108 TEST_REQUIRES_X86_AVX;
18109 for (uint32_t n = 5; n < 8; n++) {
18110 for (size_t k = 1; k <= 40; k += 9) {
18111 for (uint32_t m = 1; m <= 2; m++) {
18112 GemmMicrokernelTester()
18113 .mr(2)
18114 .nr(4)
18115 .kr(8)
18116 .sr(1)
18117 .m(m)
18118 .n(n)
18119 .k(k)
18120 .iterations(1)
18121 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
18122 }
18123 }
18124 }
18125 }
18126
18127 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, n_div_4) {
18128 TEST_REQUIRES_X86_AVX;
18129 for (uint32_t n = 8; n <= 12; n += 4) {
18130 for (size_t k = 1; k <= 40; k += 9) {
18131 GemmMicrokernelTester()
18132 .mr(2)
18133 .nr(4)
18134 .kr(8)
18135 .sr(1)
18136 .m(2)
18137 .n(4)
18138 .k(k)
18139 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
18140 }
18141 }
18142 }
18143
18144 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, n_div_4_strided_cn) {
18145 TEST_REQUIRES_X86_AVX;
18146 for (uint32_t n = 8; n <= 12; n += 4) {
18147 for (size_t k = 1; k <= 40; k += 9) {
18148 GemmMicrokernelTester()
18149 .mr(2)
18150 .nr(4)
18151 .kr(8)
18152 .sr(1)
18153 .m(2)
18154 .n(n)
18155 .k(k)
18156 .cn_stride(7)
18157 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
18158 }
18159 }
18160 }
18161
18162 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, n_div_4_strided_a) {
18163 TEST_REQUIRES_X86_AVX;
18164 for (uint32_t n = 8; n <= 12; n += 4) {
18165 for (size_t k = 1; k <= 40; k += 9) {
18166 GemmMicrokernelTester()
18167 .mr(2)
18168 .nr(4)
18169 .kr(8)
18170 .sr(1)
18171 .m(2)
18172 .n(n)
18173 .k(k)
18174 .a_stride(43)
18175 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
18176 }
18177 }
18178 }
18179
18180 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, n_div_4_subtile) {
18181 TEST_REQUIRES_X86_AVX;
18182 for (uint32_t n = 8; n <= 12; n += 4) {
18183 for (size_t k = 1; k <= 40; k += 9) {
18184 for (uint32_t m = 1; m <= 2; m++) {
18185 GemmMicrokernelTester()
18186 .mr(2)
18187 .nr(4)
18188 .kr(8)
18189 .sr(1)
18190 .m(m)
18191 .n(n)
18192 .k(k)
18193 .iterations(1)
18194 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
18195 }
18196 }
18197 }
18198 }
18199
18200 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, strided_cm_subtile) {
18201 TEST_REQUIRES_X86_AVX;
18202 for (size_t k = 1; k <= 40; k += 9) {
18203 for (uint32_t m = 1; m <= 2; m++) {
18204 for (uint32_t n = 1; n <= 4; n++) {
18205 GemmMicrokernelTester()
18206 .mr(2)
18207 .nr(4)
18208 .kr(8)
18209 .sr(1)
18210 .m(m)
18211 .n(n)
18212 .k(k)
18213 .cm_stride(7)
18214 .iterations(1)
18215 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
18216 }
18217 }
18218 }
18219 }
18220
18221 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, qmin) {
18222 TEST_REQUIRES_X86_AVX;
18223 GemmMicrokernelTester()
18224 .mr(2)
18225 .nr(4)
18226 .kr(8)
18227 .sr(1)
18228 .m(2)
18229 .n(4)
18230 .k(8)
18231 .qmin(128)
18232 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
18233 }
18234
18235 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, qmax) {
18236 TEST_REQUIRES_X86_AVX;
18237 GemmMicrokernelTester()
18238 .mr(2)
18239 .nr(4)
18240 .kr(8)
18241 .sr(1)
18242 .m(2)
18243 .n(4)
18244 .k(8)
18245 .qmax(128)
18246 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
18247 }
18248
18249 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, strided_cm) {
18250 TEST_REQUIRES_X86_AVX;
18251 GemmMicrokernelTester()
18252 .mr(2)
18253 .nr(4)
18254 .kr(8)
18255 .sr(1)
18256 .m(2)
18257 .n(4)
18258 .k(8)
18259 .cm_stride(7)
18260 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
18261 }
18262#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
18263
18264
18265#if XNN_ARCH_X86 || XNN_ARCH_X86_64
18266 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_eq_8) {
18267 TEST_REQUIRES_X86_AVX;
18268 GemmMicrokernelTester()
18269 .mr(3)
18270 .nr(4)
18271 .kr(8)
18272 .sr(1)
18273 .m(3)
18274 .n(4)
18275 .k(8)
18276 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
18277 }
18278
18279 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, strided_cn) {
18280 TEST_REQUIRES_X86_AVX;
18281 GemmMicrokernelTester()
18282 .mr(3)
18283 .nr(4)
18284 .kr(8)
18285 .sr(1)
18286 .m(3)
18287 .n(4)
18288 .k(8)
18289 .cn_stride(7)
18290 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
18291 }
18292
18293 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_eq_8_strided_a) {
18294 TEST_REQUIRES_X86_AVX;
18295 GemmMicrokernelTester()
18296 .mr(3)
18297 .nr(4)
18298 .kr(8)
18299 .sr(1)
18300 .m(3)
18301 .n(4)
18302 .k(8)
18303 .a_stride(11)
18304 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
18305 }
18306
18307 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_eq_8_subtile) {
18308 TEST_REQUIRES_X86_AVX;
18309 for (uint32_t m = 1; m <= 3; m++) {
18310 for (uint32_t n = 1; n <= 4; n++) {
18311 GemmMicrokernelTester()
18312 .mr(3)
18313 .nr(4)
18314 .kr(8)
18315 .sr(1)
18316 .m(m)
18317 .n(n)
18318 .k(8)
18319 .iterations(1)
18320 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
18321 }
18322 }
18323 }
18324
18325 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_eq_8_subtile_m) {
18326 TEST_REQUIRES_X86_AVX;
18327 for (uint32_t m = 1; m <= 3; m++) {
18328 GemmMicrokernelTester()
18329 .mr(3)
18330 .nr(4)
18331 .kr(8)
18332 .sr(1)
18333 .m(m)
18334 .n(4)
18335 .k(8)
18336 .iterations(1)
18337 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
18338 }
18339 }
18340
18341 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_eq_8_subtile_n) {
18342 TEST_REQUIRES_X86_AVX;
18343 for (uint32_t n = 1; n <= 4; n++) {
18344 GemmMicrokernelTester()
18345 .mr(3)
18346 .nr(4)
18347 .kr(8)
18348 .sr(1)
18349 .m(3)
18350 .n(n)
18351 .k(8)
18352 .iterations(1)
18353 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
18354 }
18355 }
18356
18357 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_lt_8) {
18358 TEST_REQUIRES_X86_AVX;
18359 for (size_t k = 1; k < 8; k++) {
18360 GemmMicrokernelTester()
18361 .mr(3)
18362 .nr(4)
18363 .kr(8)
18364 .sr(1)
18365 .m(3)
18366 .n(4)
18367 .k(k)
18368 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
18369 }
18370 }
18371
18372 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_lt_8_strided_a) {
18373 TEST_REQUIRES_X86_AVX;
18374 for (size_t k = 1; k < 8; k++) {
18375 GemmMicrokernelTester()
18376 .mr(3)
18377 .nr(4)
18378 .kr(8)
18379 .sr(1)
18380 .m(3)
18381 .n(4)
18382 .k(k)
18383 .a_stride(11)
18384 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
18385 }
18386 }
18387
18388 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_lt_8_subtile) {
18389 TEST_REQUIRES_X86_AVX;
18390 for (size_t k = 1; k < 8; k++) {
18391 for (uint32_t m = 1; m <= 3; m++) {
18392 for (uint32_t n = 1; n <= 4; n++) {
18393 GemmMicrokernelTester()
18394 .mr(3)
18395 .nr(4)
18396 .kr(8)
18397 .sr(1)
18398 .m(m)
18399 .n(n)
18400 .k(k)
18401 .iterations(1)
18402 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
18403 }
18404 }
18405 }
18406 }
18407
18408 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_gt_8) {
18409 TEST_REQUIRES_X86_AVX;
18410 for (size_t k = 9; k < 16; k++) {
18411 GemmMicrokernelTester()
18412 .mr(3)
18413 .nr(4)
18414 .kr(8)
18415 .sr(1)
18416 .m(3)
18417 .n(4)
18418 .k(k)
18419 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
18420 }
18421 }
18422
18423 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_gt_8_strided_a) {
18424 TEST_REQUIRES_X86_AVX;
18425 for (size_t k = 9; k < 16; k++) {
18426 GemmMicrokernelTester()
18427 .mr(3)
18428 .nr(4)
18429 .kr(8)
18430 .sr(1)
18431 .m(3)
18432 .n(4)
18433 .k(k)
18434 .a_stride(19)
18435 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
18436 }
18437 }
18438
18439 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_gt_8_subtile) {
18440 TEST_REQUIRES_X86_AVX;
18441 for (size_t k = 9; k < 16; k++) {
18442 for (uint32_t m = 1; m <= 3; m++) {
18443 for (uint32_t n = 1; n <= 4; n++) {
18444 GemmMicrokernelTester()
18445 .mr(3)
18446 .nr(4)
18447 .kr(8)
18448 .sr(1)
18449 .m(m)
18450 .n(n)
18451 .k(k)
18452 .iterations(1)
18453 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
18454 }
18455 }
18456 }
18457 }
18458
18459 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_div_8) {
18460 TEST_REQUIRES_X86_AVX;
18461 for (size_t k = 16; k <= 80; k += 8) {
18462 GemmMicrokernelTester()
18463 .mr(3)
18464 .nr(4)
18465 .kr(8)
18466 .sr(1)
18467 .m(3)
18468 .n(4)
18469 .k(k)
18470 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
18471 }
18472 }
18473
18474 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_div_8_strided_a) {
18475 TEST_REQUIRES_X86_AVX;
18476 for (size_t k = 16; k <= 80; k += 8) {
18477 GemmMicrokernelTester()
18478 .mr(3)
18479 .nr(4)
18480 .kr(8)
18481 .sr(1)
18482 .m(3)
18483 .n(4)
18484 .k(k)
18485 .a_stride(83)
18486 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
18487 }
18488 }
18489
18490 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_div_8_subtile) {
18491 TEST_REQUIRES_X86_AVX;
18492 for (size_t k = 16; k <= 80; k += 8) {
18493 for (uint32_t m = 1; m <= 3; m++) {
18494 for (uint32_t n = 1; n <= 4; n++) {
18495 GemmMicrokernelTester()
18496 .mr(3)
18497 .nr(4)
18498 .kr(8)
18499 .sr(1)
18500 .m(m)
18501 .n(n)
18502 .k(k)
18503 .iterations(1)
18504 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
18505 }
18506 }
18507 }
18508 }
18509
18510 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, n_gt_4) {
18511 TEST_REQUIRES_X86_AVX;
18512 for (uint32_t n = 5; n < 8; n++) {
18513 for (size_t k = 1; k <= 40; k += 9) {
18514 GemmMicrokernelTester()
18515 .mr(3)
18516 .nr(4)
18517 .kr(8)
18518 .sr(1)
18519 .m(3)
18520 .n(4)
18521 .k(k)
18522 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
18523 }
18524 }
18525 }
18526
18527 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, n_gt_4_strided_cn) {
18528 TEST_REQUIRES_X86_AVX;
18529 for (uint32_t n = 5; n < 8; n++) {
18530 for (size_t k = 1; k <= 40; k += 9) {
18531 GemmMicrokernelTester()
18532 .mr(3)
18533 .nr(4)
18534 .kr(8)
18535 .sr(1)
18536 .m(3)
18537 .n(4)
18538 .k(k)
18539 .cn_stride(7)
18540 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
18541 }
18542 }
18543 }
18544
18545 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, n_gt_4_strided_a) {
18546 TEST_REQUIRES_X86_AVX;
18547 for (uint32_t n = 5; n < 8; n++) {
18548 for (size_t k = 1; k <= 40; k += 9) {
18549 GemmMicrokernelTester()
18550 .mr(3)
18551 .nr(4)
18552 .kr(8)
18553 .sr(1)
18554 .m(3)
18555 .n(n)
18556 .k(k)
18557 .a_stride(43)
18558 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
18559 }
18560 }
18561 }
18562
18563 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, n_gt_4_subtile) {
18564 TEST_REQUIRES_X86_AVX;
18565 for (uint32_t n = 5; n < 8; n++) {
18566 for (size_t k = 1; k <= 40; k += 9) {
18567 for (uint32_t m = 1; m <= 3; m++) {
18568 GemmMicrokernelTester()
18569 .mr(3)
18570 .nr(4)
18571 .kr(8)
18572 .sr(1)
18573 .m(m)
18574 .n(n)
18575 .k(k)
18576 .iterations(1)
18577 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
18578 }
18579 }
18580 }
18581 }
18582
18583 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, n_div_4) {
18584 TEST_REQUIRES_X86_AVX;
18585 for (uint32_t n = 8; n <= 12; n += 4) {
18586 for (size_t k = 1; k <= 40; k += 9) {
18587 GemmMicrokernelTester()
18588 .mr(3)
18589 .nr(4)
18590 .kr(8)
18591 .sr(1)
18592 .m(3)
18593 .n(4)
18594 .k(k)
18595 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
18596 }
18597 }
18598 }
18599
18600 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, n_div_4_strided_cn) {
18601 TEST_REQUIRES_X86_AVX;
18602 for (uint32_t n = 8; n <= 12; n += 4) {
18603 for (size_t k = 1; k <= 40; k += 9) {
18604 GemmMicrokernelTester()
18605 .mr(3)
18606 .nr(4)
18607 .kr(8)
18608 .sr(1)
18609 .m(3)
18610 .n(n)
18611 .k(k)
18612 .cn_stride(7)
18613 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
18614 }
18615 }
18616 }
18617
18618 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, n_div_4_strided_a) {
18619 TEST_REQUIRES_X86_AVX;
18620 for (uint32_t n = 8; n <= 12; n += 4) {
18621 for (size_t k = 1; k <= 40; k += 9) {
18622 GemmMicrokernelTester()
18623 .mr(3)
18624 .nr(4)
18625 .kr(8)
18626 .sr(1)
18627 .m(3)
18628 .n(n)
18629 .k(k)
18630 .a_stride(43)
18631 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
18632 }
18633 }
18634 }
18635
18636 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, n_div_4_subtile) {
18637 TEST_REQUIRES_X86_AVX;
18638 for (uint32_t n = 8; n <= 12; n += 4) {
18639 for (size_t k = 1; k <= 40; k += 9) {
18640 for (uint32_t m = 1; m <= 3; m++) {
18641 GemmMicrokernelTester()
18642 .mr(3)
18643 .nr(4)
18644 .kr(8)
18645 .sr(1)
18646 .m(m)
18647 .n(n)
18648 .k(k)
18649 .iterations(1)
18650 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
18651 }
18652 }
18653 }
18654 }
18655
18656 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, strided_cm_subtile) {
18657 TEST_REQUIRES_X86_AVX;
18658 for (size_t k = 1; k <= 40; k += 9) {
18659 for (uint32_t m = 1; m <= 3; m++) {
18660 for (uint32_t n = 1; n <= 4; n++) {
18661 GemmMicrokernelTester()
18662 .mr(3)
18663 .nr(4)
18664 .kr(8)
18665 .sr(1)
18666 .m(m)
18667 .n(n)
18668 .k(k)
18669 .cm_stride(7)
18670 .iterations(1)
18671 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
18672 }
18673 }
18674 }
18675 }
18676
18677 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, qmin) {
18678 TEST_REQUIRES_X86_AVX;
18679 GemmMicrokernelTester()
18680 .mr(3)
18681 .nr(4)
18682 .kr(8)
18683 .sr(1)
18684 .m(3)
18685 .n(4)
18686 .k(8)
18687 .qmin(128)
18688 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
18689 }
18690
18691 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, qmax) {
18692 TEST_REQUIRES_X86_AVX;
18693 GemmMicrokernelTester()
18694 .mr(3)
18695 .nr(4)
18696 .kr(8)
18697 .sr(1)
18698 .m(3)
18699 .n(4)
18700 .k(8)
18701 .qmax(128)
18702 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
18703 }
18704
18705 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, strided_cm) {
18706 TEST_REQUIRES_X86_AVX;
18707 GemmMicrokernelTester()
18708 .mr(3)
18709 .nr(4)
18710 .kr(8)
18711 .sr(1)
18712 .m(3)
18713 .n(4)
18714 .k(8)
18715 .cm_stride(7)
18716 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
18717 }
18718#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
18719
18720
18721#if XNN_ARCH_X86 || XNN_ARCH_X86_64
18722 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_eq_8) {
18723 TEST_REQUIRES_X86_XOP;
18724 GemmMicrokernelTester()
18725 .mr(1)
18726 .nr(4)
18727 .kr(8)
18728 .sr(1)
18729 .m(1)
18730 .n(4)
18731 .k(8)
18732 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
18733 }
18734
18735 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, strided_cn) {
18736 TEST_REQUIRES_X86_XOP;
18737 GemmMicrokernelTester()
18738 .mr(1)
18739 .nr(4)
18740 .kr(8)
18741 .sr(1)
18742 .m(1)
18743 .n(4)
18744 .k(8)
18745 .cn_stride(7)
18746 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
18747 }
18748
18749 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_eq_8_strided_a) {
18750 TEST_REQUIRES_X86_XOP;
18751 GemmMicrokernelTester()
18752 .mr(1)
18753 .nr(4)
18754 .kr(8)
18755 .sr(1)
18756 .m(1)
18757 .n(4)
18758 .k(8)
18759 .a_stride(11)
18760 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
18761 }
18762
18763 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_eq_8_subtile) {
18764 TEST_REQUIRES_X86_XOP;
18765 for (uint32_t m = 1; m <= 1; m++) {
18766 for (uint32_t n = 1; n <= 4; n++) {
18767 GemmMicrokernelTester()
18768 .mr(1)
18769 .nr(4)
18770 .kr(8)
18771 .sr(1)
18772 .m(m)
18773 .n(n)
18774 .k(8)
18775 .iterations(1)
18776 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
18777 }
18778 }
18779 }
18780
18781 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_eq_8_subtile_m) {
18782 TEST_REQUIRES_X86_XOP;
18783 for (uint32_t m = 1; m <= 1; m++) {
18784 GemmMicrokernelTester()
18785 .mr(1)
18786 .nr(4)
18787 .kr(8)
18788 .sr(1)
18789 .m(m)
18790 .n(4)
18791 .k(8)
18792 .iterations(1)
18793 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
18794 }
18795 }
18796
18797 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_eq_8_subtile_n) {
18798 TEST_REQUIRES_X86_XOP;
18799 for (uint32_t n = 1; n <= 4; n++) {
18800 GemmMicrokernelTester()
18801 .mr(1)
18802 .nr(4)
18803 .kr(8)
18804 .sr(1)
18805 .m(1)
18806 .n(n)
18807 .k(8)
18808 .iterations(1)
18809 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
18810 }
18811 }
18812
18813 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_lt_8) {
18814 TEST_REQUIRES_X86_XOP;
18815 for (size_t k = 1; k < 8; k++) {
18816 GemmMicrokernelTester()
18817 .mr(1)
18818 .nr(4)
18819 .kr(8)
18820 .sr(1)
18821 .m(1)
18822 .n(4)
18823 .k(k)
18824 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
18825 }
18826 }
18827
18828 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_lt_8_strided_a) {
18829 TEST_REQUIRES_X86_XOP;
18830 for (size_t k = 1; k < 8; k++) {
18831 GemmMicrokernelTester()
18832 .mr(1)
18833 .nr(4)
18834 .kr(8)
18835 .sr(1)
18836 .m(1)
18837 .n(4)
18838 .k(k)
18839 .a_stride(11)
18840 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
18841 }
18842 }
18843
18844 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_lt_8_subtile) {
18845 TEST_REQUIRES_X86_XOP;
18846 for (size_t k = 1; k < 8; k++) {
18847 for (uint32_t m = 1; m <= 1; m++) {
18848 for (uint32_t n = 1; n <= 4; n++) {
18849 GemmMicrokernelTester()
18850 .mr(1)
18851 .nr(4)
18852 .kr(8)
18853 .sr(1)
18854 .m(m)
18855 .n(n)
18856 .k(k)
18857 .iterations(1)
18858 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
18859 }
18860 }
18861 }
18862 }
18863
18864 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_gt_8) {
18865 TEST_REQUIRES_X86_XOP;
18866 for (size_t k = 9; k < 16; k++) {
18867 GemmMicrokernelTester()
18868 .mr(1)
18869 .nr(4)
18870 .kr(8)
18871 .sr(1)
18872 .m(1)
18873 .n(4)
18874 .k(k)
18875 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
18876 }
18877 }
18878
18879 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_gt_8_strided_a) {
18880 TEST_REQUIRES_X86_XOP;
18881 for (size_t k = 9; k < 16; k++) {
18882 GemmMicrokernelTester()
18883 .mr(1)
18884 .nr(4)
18885 .kr(8)
18886 .sr(1)
18887 .m(1)
18888 .n(4)
18889 .k(k)
18890 .a_stride(19)
18891 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
18892 }
18893 }
18894
18895 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_gt_8_subtile) {
18896 TEST_REQUIRES_X86_XOP;
18897 for (size_t k = 9; k < 16; k++) {
18898 for (uint32_t m = 1; m <= 1; m++) {
18899 for (uint32_t n = 1; n <= 4; n++) {
18900 GemmMicrokernelTester()
18901 .mr(1)
18902 .nr(4)
18903 .kr(8)
18904 .sr(1)
18905 .m(m)
18906 .n(n)
18907 .k(k)
18908 .iterations(1)
18909 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
18910 }
18911 }
18912 }
18913 }
18914
18915 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_div_8) {
18916 TEST_REQUIRES_X86_XOP;
18917 for (size_t k = 16; k <= 80; k += 8) {
18918 GemmMicrokernelTester()
18919 .mr(1)
18920 .nr(4)
18921 .kr(8)
18922 .sr(1)
18923 .m(1)
18924 .n(4)
18925 .k(k)
18926 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
18927 }
18928 }
18929
18930 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_div_8_strided_a) {
18931 TEST_REQUIRES_X86_XOP;
18932 for (size_t k = 16; k <= 80; k += 8) {
18933 GemmMicrokernelTester()
18934 .mr(1)
18935 .nr(4)
18936 .kr(8)
18937 .sr(1)
18938 .m(1)
18939 .n(4)
18940 .k(k)
18941 .a_stride(83)
18942 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
18943 }
18944 }
18945
18946 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_div_8_subtile) {
18947 TEST_REQUIRES_X86_XOP;
18948 for (size_t k = 16; k <= 80; k += 8) {
18949 for (uint32_t m = 1; m <= 1; m++) {
18950 for (uint32_t n = 1; n <= 4; n++) {
18951 GemmMicrokernelTester()
18952 .mr(1)
18953 .nr(4)
18954 .kr(8)
18955 .sr(1)
18956 .m(m)
18957 .n(n)
18958 .k(k)
18959 .iterations(1)
18960 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
18961 }
18962 }
18963 }
18964 }
18965
18966 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, n_gt_4) {
18967 TEST_REQUIRES_X86_XOP;
18968 for (uint32_t n = 5; n < 8; n++) {
18969 for (size_t k = 1; k <= 40; k += 9) {
18970 GemmMicrokernelTester()
18971 .mr(1)
18972 .nr(4)
18973 .kr(8)
18974 .sr(1)
18975 .m(1)
18976 .n(4)
18977 .k(k)
18978 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
18979 }
18980 }
18981 }
18982
18983 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, n_gt_4_strided_cn) {
18984 TEST_REQUIRES_X86_XOP;
18985 for (uint32_t n = 5; n < 8; n++) {
18986 for (size_t k = 1; k <= 40; k += 9) {
18987 GemmMicrokernelTester()
18988 .mr(1)
18989 .nr(4)
18990 .kr(8)
18991 .sr(1)
18992 .m(1)
18993 .n(4)
18994 .k(k)
18995 .cn_stride(7)
18996 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
18997 }
18998 }
18999 }
19000
19001 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, n_gt_4_strided_a) {
19002 TEST_REQUIRES_X86_XOP;
19003 for (uint32_t n = 5; n < 8; n++) {
19004 for (size_t k = 1; k <= 40; k += 9) {
19005 GemmMicrokernelTester()
19006 .mr(1)
19007 .nr(4)
19008 .kr(8)
19009 .sr(1)
19010 .m(1)
19011 .n(n)
19012 .k(k)
19013 .a_stride(43)
19014 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
19015 }
19016 }
19017 }
19018
19019 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, n_gt_4_subtile) {
19020 TEST_REQUIRES_X86_XOP;
19021 for (uint32_t n = 5; n < 8; n++) {
19022 for (size_t k = 1; k <= 40; k += 9) {
19023 for (uint32_t m = 1; m <= 1; m++) {
19024 GemmMicrokernelTester()
19025 .mr(1)
19026 .nr(4)
19027 .kr(8)
19028 .sr(1)
19029 .m(m)
19030 .n(n)
19031 .k(k)
19032 .iterations(1)
19033 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
19034 }
19035 }
19036 }
19037 }
19038
19039 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, n_div_4) {
19040 TEST_REQUIRES_X86_XOP;
19041 for (uint32_t n = 8; n <= 12; n += 4) {
19042 for (size_t k = 1; k <= 40; k += 9) {
19043 GemmMicrokernelTester()
19044 .mr(1)
19045 .nr(4)
19046 .kr(8)
19047 .sr(1)
19048 .m(1)
19049 .n(4)
19050 .k(k)
19051 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
19052 }
19053 }
19054 }
19055
19056 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, n_div_4_strided_cn) {
19057 TEST_REQUIRES_X86_XOP;
19058 for (uint32_t n = 8; n <= 12; n += 4) {
19059 for (size_t k = 1; k <= 40; k += 9) {
19060 GemmMicrokernelTester()
19061 .mr(1)
19062 .nr(4)
19063 .kr(8)
19064 .sr(1)
19065 .m(1)
19066 .n(n)
19067 .k(k)
19068 .cn_stride(7)
19069 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
19070 }
19071 }
19072 }
19073
19074 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, n_div_4_strided_a) {
19075 TEST_REQUIRES_X86_XOP;
19076 for (uint32_t n = 8; n <= 12; n += 4) {
19077 for (size_t k = 1; k <= 40; k += 9) {
19078 GemmMicrokernelTester()
19079 .mr(1)
19080 .nr(4)
19081 .kr(8)
19082 .sr(1)
19083 .m(1)
19084 .n(n)
19085 .k(k)
19086 .a_stride(43)
19087 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
19088 }
19089 }
19090 }
19091
19092 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, n_div_4_subtile) {
19093 TEST_REQUIRES_X86_XOP;
19094 for (uint32_t n = 8; n <= 12; n += 4) {
19095 for (size_t k = 1; k <= 40; k += 9) {
19096 for (uint32_t m = 1; m <= 1; m++) {
19097 GemmMicrokernelTester()
19098 .mr(1)
19099 .nr(4)
19100 .kr(8)
19101 .sr(1)
19102 .m(m)
19103 .n(n)
19104 .k(k)
19105 .iterations(1)
19106 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
19107 }
19108 }
19109 }
19110 }
19111
19112 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, strided_cm_subtile) {
19113 TEST_REQUIRES_X86_XOP;
19114 for (size_t k = 1; k <= 40; k += 9) {
19115 for (uint32_t m = 1; m <= 1; m++) {
19116 for (uint32_t n = 1; n <= 4; n++) {
19117 GemmMicrokernelTester()
19118 .mr(1)
19119 .nr(4)
19120 .kr(8)
19121 .sr(1)
19122 .m(m)
19123 .n(n)
19124 .k(k)
19125 .cm_stride(7)
19126 .iterations(1)
19127 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
19128 }
19129 }
19130 }
19131 }
19132
19133 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, qmin) {
19134 TEST_REQUIRES_X86_XOP;
19135 GemmMicrokernelTester()
19136 .mr(1)
19137 .nr(4)
19138 .kr(8)
19139 .sr(1)
19140 .m(1)
19141 .n(4)
19142 .k(8)
19143 .qmin(128)
19144 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
19145 }
19146
19147 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, qmax) {
19148 TEST_REQUIRES_X86_XOP;
19149 GemmMicrokernelTester()
19150 .mr(1)
19151 .nr(4)
19152 .kr(8)
19153 .sr(1)
19154 .m(1)
19155 .n(4)
19156 .k(8)
19157 .qmax(128)
19158 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
19159 }
19160
19161 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, strided_cm) {
19162 TEST_REQUIRES_X86_XOP;
19163 GemmMicrokernelTester()
19164 .mr(1)
19165 .nr(4)
19166 .kr(8)
19167 .sr(1)
19168 .m(1)
19169 .n(4)
19170 .k(8)
19171 .cm_stride(7)
19172 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
19173 }
19174#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
19175
19176
19177#if XNN_ARCH_X86 || XNN_ARCH_X86_64
19178 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_eq_8) {
19179 TEST_REQUIRES_X86_XOP;
19180 GemmMicrokernelTester()
19181 .mr(2)
19182 .nr(4)
19183 .kr(8)
19184 .sr(1)
19185 .m(2)
19186 .n(4)
19187 .k(8)
19188 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
19189 }
19190
19191 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, strided_cn) {
19192 TEST_REQUIRES_X86_XOP;
19193 GemmMicrokernelTester()
19194 .mr(2)
19195 .nr(4)
19196 .kr(8)
19197 .sr(1)
19198 .m(2)
19199 .n(4)
19200 .k(8)
19201 .cn_stride(7)
19202 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
19203 }
19204
19205 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_eq_8_strided_a) {
19206 TEST_REQUIRES_X86_XOP;
19207 GemmMicrokernelTester()
19208 .mr(2)
19209 .nr(4)
19210 .kr(8)
19211 .sr(1)
19212 .m(2)
19213 .n(4)
19214 .k(8)
19215 .a_stride(11)
19216 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
19217 }
19218
19219 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_eq_8_subtile) {
19220 TEST_REQUIRES_X86_XOP;
19221 for (uint32_t m = 1; m <= 2; m++) {
19222 for (uint32_t n = 1; n <= 4; n++) {
19223 GemmMicrokernelTester()
19224 .mr(2)
19225 .nr(4)
19226 .kr(8)
19227 .sr(1)
19228 .m(m)
19229 .n(n)
19230 .k(8)
19231 .iterations(1)
19232 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
19233 }
19234 }
19235 }
19236
19237 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_eq_8_subtile_m) {
19238 TEST_REQUIRES_X86_XOP;
19239 for (uint32_t m = 1; m <= 2; m++) {
19240 GemmMicrokernelTester()
19241 .mr(2)
19242 .nr(4)
19243 .kr(8)
19244 .sr(1)
19245 .m(m)
19246 .n(4)
19247 .k(8)
19248 .iterations(1)
19249 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
19250 }
19251 }
19252
19253 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_eq_8_subtile_n) {
19254 TEST_REQUIRES_X86_XOP;
19255 for (uint32_t n = 1; n <= 4; n++) {
19256 GemmMicrokernelTester()
19257 .mr(2)
19258 .nr(4)
19259 .kr(8)
19260 .sr(1)
19261 .m(2)
19262 .n(n)
19263 .k(8)
19264 .iterations(1)
19265 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
19266 }
19267 }
19268
19269 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_lt_8) {
19270 TEST_REQUIRES_X86_XOP;
19271 for (size_t k = 1; k < 8; k++) {
19272 GemmMicrokernelTester()
19273 .mr(2)
19274 .nr(4)
19275 .kr(8)
19276 .sr(1)
19277 .m(2)
19278 .n(4)
19279 .k(k)
19280 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
19281 }
19282 }
19283
19284 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_lt_8_strided_a) {
19285 TEST_REQUIRES_X86_XOP;
19286 for (size_t k = 1; k < 8; k++) {
19287 GemmMicrokernelTester()
19288 .mr(2)
19289 .nr(4)
19290 .kr(8)
19291 .sr(1)
19292 .m(2)
19293 .n(4)
19294 .k(k)
19295 .a_stride(11)
19296 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
19297 }
19298 }
19299
19300 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_lt_8_subtile) {
19301 TEST_REQUIRES_X86_XOP;
19302 for (size_t k = 1; k < 8; k++) {
19303 for (uint32_t m = 1; m <= 2; m++) {
19304 for (uint32_t n = 1; n <= 4; n++) {
19305 GemmMicrokernelTester()
19306 .mr(2)
19307 .nr(4)
19308 .kr(8)
19309 .sr(1)
19310 .m(m)
19311 .n(n)
19312 .k(k)
19313 .iterations(1)
19314 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
19315 }
19316 }
19317 }
19318 }
19319
19320 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_gt_8) {
19321 TEST_REQUIRES_X86_XOP;
19322 for (size_t k = 9; k < 16; k++) {
19323 GemmMicrokernelTester()
19324 .mr(2)
19325 .nr(4)
19326 .kr(8)
19327 .sr(1)
19328 .m(2)
19329 .n(4)
19330 .k(k)
19331 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
19332 }
19333 }
19334
19335 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_gt_8_strided_a) {
19336 TEST_REQUIRES_X86_XOP;
19337 for (size_t k = 9; k < 16; k++) {
19338 GemmMicrokernelTester()
19339 .mr(2)
19340 .nr(4)
19341 .kr(8)
19342 .sr(1)
19343 .m(2)
19344 .n(4)
19345 .k(k)
19346 .a_stride(19)
19347 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
19348 }
19349 }
19350
19351 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_gt_8_subtile) {
19352 TEST_REQUIRES_X86_XOP;
19353 for (size_t k = 9; k < 16; k++) {
19354 for (uint32_t m = 1; m <= 2; m++) {
19355 for (uint32_t n = 1; n <= 4; n++) {
19356 GemmMicrokernelTester()
19357 .mr(2)
19358 .nr(4)
19359 .kr(8)
19360 .sr(1)
19361 .m(m)
19362 .n(n)
19363 .k(k)
19364 .iterations(1)
19365 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
19366 }
19367 }
19368 }
19369 }
19370
19371 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_div_8) {
19372 TEST_REQUIRES_X86_XOP;
19373 for (size_t k = 16; k <= 80; k += 8) {
19374 GemmMicrokernelTester()
19375 .mr(2)
19376 .nr(4)
19377 .kr(8)
19378 .sr(1)
19379 .m(2)
19380 .n(4)
19381 .k(k)
19382 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
19383 }
19384 }
19385
19386 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_div_8_strided_a) {
19387 TEST_REQUIRES_X86_XOP;
19388 for (size_t k = 16; k <= 80; k += 8) {
19389 GemmMicrokernelTester()
19390 .mr(2)
19391 .nr(4)
19392 .kr(8)
19393 .sr(1)
19394 .m(2)
19395 .n(4)
19396 .k(k)
19397 .a_stride(83)
19398 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
19399 }
19400 }
19401
19402 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_div_8_subtile) {
19403 TEST_REQUIRES_X86_XOP;
19404 for (size_t k = 16; k <= 80; k += 8) {
19405 for (uint32_t m = 1; m <= 2; m++) {
19406 for (uint32_t n = 1; n <= 4; n++) {
19407 GemmMicrokernelTester()
19408 .mr(2)
19409 .nr(4)
19410 .kr(8)
19411 .sr(1)
19412 .m(m)
19413 .n(n)
19414 .k(k)
19415 .iterations(1)
19416 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
19417 }
19418 }
19419 }
19420 }
19421
19422 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, n_gt_4) {
19423 TEST_REQUIRES_X86_XOP;
19424 for (uint32_t n = 5; n < 8; n++) {
19425 for (size_t k = 1; k <= 40; k += 9) {
19426 GemmMicrokernelTester()
19427 .mr(2)
19428 .nr(4)
19429 .kr(8)
19430 .sr(1)
19431 .m(2)
19432 .n(4)
19433 .k(k)
19434 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
19435 }
19436 }
19437 }
19438
19439 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, n_gt_4_strided_cn) {
19440 TEST_REQUIRES_X86_XOP;
19441 for (uint32_t n = 5; n < 8; n++) {
19442 for (size_t k = 1; k <= 40; k += 9) {
19443 GemmMicrokernelTester()
19444 .mr(2)
19445 .nr(4)
19446 .kr(8)
19447 .sr(1)
19448 .m(2)
19449 .n(4)
19450 .k(k)
19451 .cn_stride(7)
19452 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
19453 }
19454 }
19455 }
19456
19457 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, n_gt_4_strided_a) {
19458 TEST_REQUIRES_X86_XOP;
19459 for (uint32_t n = 5; n < 8; n++) {
19460 for (size_t k = 1; k <= 40; k += 9) {
19461 GemmMicrokernelTester()
19462 .mr(2)
19463 .nr(4)
19464 .kr(8)
19465 .sr(1)
19466 .m(2)
19467 .n(n)
19468 .k(k)
19469 .a_stride(43)
19470 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
19471 }
19472 }
19473 }
19474
19475 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, n_gt_4_subtile) {
19476 TEST_REQUIRES_X86_XOP;
19477 for (uint32_t n = 5; n < 8; n++) {
19478 for (size_t k = 1; k <= 40; k += 9) {
19479 for (uint32_t m = 1; m <= 2; m++) {
19480 GemmMicrokernelTester()
19481 .mr(2)
19482 .nr(4)
19483 .kr(8)
19484 .sr(1)
19485 .m(m)
19486 .n(n)
19487 .k(k)
19488 .iterations(1)
19489 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
19490 }
19491 }
19492 }
19493 }
19494
19495 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, n_div_4) {
19496 TEST_REQUIRES_X86_XOP;
19497 for (uint32_t n = 8; n <= 12; n += 4) {
19498 for (size_t k = 1; k <= 40; k += 9) {
19499 GemmMicrokernelTester()
19500 .mr(2)
19501 .nr(4)
19502 .kr(8)
19503 .sr(1)
19504 .m(2)
19505 .n(4)
19506 .k(k)
19507 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
19508 }
19509 }
19510 }
19511
19512 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, n_div_4_strided_cn) {
19513 TEST_REQUIRES_X86_XOP;
19514 for (uint32_t n = 8; n <= 12; n += 4) {
19515 for (size_t k = 1; k <= 40; k += 9) {
19516 GemmMicrokernelTester()
19517 .mr(2)
19518 .nr(4)
19519 .kr(8)
19520 .sr(1)
19521 .m(2)
19522 .n(n)
19523 .k(k)
19524 .cn_stride(7)
19525 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
19526 }
19527 }
19528 }
19529
19530 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, n_div_4_strided_a) {
19531 TEST_REQUIRES_X86_XOP;
19532 for (uint32_t n = 8; n <= 12; n += 4) {
19533 for (size_t k = 1; k <= 40; k += 9) {
19534 GemmMicrokernelTester()
19535 .mr(2)
19536 .nr(4)
19537 .kr(8)
19538 .sr(1)
19539 .m(2)
19540 .n(n)
19541 .k(k)
19542 .a_stride(43)
19543 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
19544 }
19545 }
19546 }
19547
19548 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, n_div_4_subtile) {
19549 TEST_REQUIRES_X86_XOP;
19550 for (uint32_t n = 8; n <= 12; n += 4) {
19551 for (size_t k = 1; k <= 40; k += 9) {
19552 for (uint32_t m = 1; m <= 2; m++) {
19553 GemmMicrokernelTester()
19554 .mr(2)
19555 .nr(4)
19556 .kr(8)
19557 .sr(1)
19558 .m(m)
19559 .n(n)
19560 .k(k)
19561 .iterations(1)
19562 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
19563 }
19564 }
19565 }
19566 }
19567
19568 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, strided_cm_subtile) {
19569 TEST_REQUIRES_X86_XOP;
19570 for (size_t k = 1; k <= 40; k += 9) {
19571 for (uint32_t m = 1; m <= 2; m++) {
19572 for (uint32_t n = 1; n <= 4; n++) {
19573 GemmMicrokernelTester()
19574 .mr(2)
19575 .nr(4)
19576 .kr(8)
19577 .sr(1)
19578 .m(m)
19579 .n(n)
19580 .k(k)
19581 .cm_stride(7)
19582 .iterations(1)
19583 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
19584 }
19585 }
19586 }
19587 }
19588
19589 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, qmin) {
19590 TEST_REQUIRES_X86_XOP;
19591 GemmMicrokernelTester()
19592 .mr(2)
19593 .nr(4)
19594 .kr(8)
19595 .sr(1)
19596 .m(2)
19597 .n(4)
19598 .k(8)
19599 .qmin(128)
19600 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
19601 }
19602
19603 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, qmax) {
19604 TEST_REQUIRES_X86_XOP;
19605 GemmMicrokernelTester()
19606 .mr(2)
19607 .nr(4)
19608 .kr(8)
19609 .sr(1)
19610 .m(2)
19611 .n(4)
19612 .k(8)
19613 .qmax(128)
19614 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
19615 }
19616
19617 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, strided_cm) {
19618 TEST_REQUIRES_X86_XOP;
19619 GemmMicrokernelTester()
19620 .mr(2)
19621 .nr(4)
19622 .kr(8)
19623 .sr(1)
19624 .m(2)
19625 .n(4)
19626 .k(8)
19627 .cm_stride(7)
19628 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
19629 }
19630#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
19631
19632
19633#if XNN_ARCH_X86 || XNN_ARCH_X86_64
19634 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_eq_8) {
19635 TEST_REQUIRES_X86_XOP;
19636 GemmMicrokernelTester()
19637 .mr(3)
19638 .nr(4)
19639 .kr(8)
19640 .sr(1)
19641 .m(3)
19642 .n(4)
19643 .k(8)
19644 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
19645 }
19646
19647 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, strided_cn) {
19648 TEST_REQUIRES_X86_XOP;
19649 GemmMicrokernelTester()
19650 .mr(3)
19651 .nr(4)
19652 .kr(8)
19653 .sr(1)
19654 .m(3)
19655 .n(4)
19656 .k(8)
19657 .cn_stride(7)
19658 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
19659 }
19660
19661 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_eq_8_strided_a) {
19662 TEST_REQUIRES_X86_XOP;
19663 GemmMicrokernelTester()
19664 .mr(3)
19665 .nr(4)
19666 .kr(8)
19667 .sr(1)
19668 .m(3)
19669 .n(4)
19670 .k(8)
19671 .a_stride(11)
19672 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
19673 }
19674
19675 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_eq_8_subtile) {
19676 TEST_REQUIRES_X86_XOP;
19677 for (uint32_t m = 1; m <= 3; m++) {
19678 for (uint32_t n = 1; n <= 4; n++) {
19679 GemmMicrokernelTester()
19680 .mr(3)
19681 .nr(4)
19682 .kr(8)
19683 .sr(1)
19684 .m(m)
19685 .n(n)
19686 .k(8)
19687 .iterations(1)
19688 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
19689 }
19690 }
19691 }
19692
19693 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_eq_8_subtile_m) {
19694 TEST_REQUIRES_X86_XOP;
19695 for (uint32_t m = 1; m <= 3; m++) {
19696 GemmMicrokernelTester()
19697 .mr(3)
19698 .nr(4)
19699 .kr(8)
19700 .sr(1)
19701 .m(m)
19702 .n(4)
19703 .k(8)
19704 .iterations(1)
19705 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
19706 }
19707 }
19708
19709 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_eq_8_subtile_n) {
19710 TEST_REQUIRES_X86_XOP;
19711 for (uint32_t n = 1; n <= 4; n++) {
19712 GemmMicrokernelTester()
19713 .mr(3)
19714 .nr(4)
19715 .kr(8)
19716 .sr(1)
19717 .m(3)
19718 .n(n)
19719 .k(8)
19720 .iterations(1)
19721 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
19722 }
19723 }
19724
19725 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_lt_8) {
19726 TEST_REQUIRES_X86_XOP;
19727 for (size_t k = 1; k < 8; k++) {
19728 GemmMicrokernelTester()
19729 .mr(3)
19730 .nr(4)
19731 .kr(8)
19732 .sr(1)
19733 .m(3)
19734 .n(4)
19735 .k(k)
19736 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
19737 }
19738 }
19739
19740 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_lt_8_strided_a) {
19741 TEST_REQUIRES_X86_XOP;
19742 for (size_t k = 1; k < 8; k++) {
19743 GemmMicrokernelTester()
19744 .mr(3)
19745 .nr(4)
19746 .kr(8)
19747 .sr(1)
19748 .m(3)
19749 .n(4)
19750 .k(k)
19751 .a_stride(11)
19752 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
19753 }
19754 }
19755
19756 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_lt_8_subtile) {
19757 TEST_REQUIRES_X86_XOP;
19758 for (size_t k = 1; k < 8; k++) {
19759 for (uint32_t m = 1; m <= 3; m++) {
19760 for (uint32_t n = 1; n <= 4; n++) {
19761 GemmMicrokernelTester()
19762 .mr(3)
19763 .nr(4)
19764 .kr(8)
19765 .sr(1)
19766 .m(m)
19767 .n(n)
19768 .k(k)
19769 .iterations(1)
19770 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
19771 }
19772 }
19773 }
19774 }
19775
19776 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_gt_8) {
19777 TEST_REQUIRES_X86_XOP;
19778 for (size_t k = 9; k < 16; k++) {
19779 GemmMicrokernelTester()
19780 .mr(3)
19781 .nr(4)
19782 .kr(8)
19783 .sr(1)
19784 .m(3)
19785 .n(4)
19786 .k(k)
19787 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
19788 }
19789 }
19790
19791 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_gt_8_strided_a) {
19792 TEST_REQUIRES_X86_XOP;
19793 for (size_t k = 9; k < 16; k++) {
19794 GemmMicrokernelTester()
19795 .mr(3)
19796 .nr(4)
19797 .kr(8)
19798 .sr(1)
19799 .m(3)
19800 .n(4)
19801 .k(k)
19802 .a_stride(19)
19803 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
19804 }
19805 }
19806
19807 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_gt_8_subtile) {
19808 TEST_REQUIRES_X86_XOP;
19809 for (size_t k = 9; k < 16; k++) {
19810 for (uint32_t m = 1; m <= 3; m++) {
19811 for (uint32_t n = 1; n <= 4; n++) {
19812 GemmMicrokernelTester()
19813 .mr(3)
19814 .nr(4)
19815 .kr(8)
19816 .sr(1)
19817 .m(m)
19818 .n(n)
19819 .k(k)
19820 .iterations(1)
19821 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
19822 }
19823 }
19824 }
19825 }
19826
19827 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_div_8) {
19828 TEST_REQUIRES_X86_XOP;
19829 for (size_t k = 16; k <= 80; k += 8) {
19830 GemmMicrokernelTester()
19831 .mr(3)
19832 .nr(4)
19833 .kr(8)
19834 .sr(1)
19835 .m(3)
19836 .n(4)
19837 .k(k)
19838 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
19839 }
19840 }
19841
19842 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_div_8_strided_a) {
19843 TEST_REQUIRES_X86_XOP;
19844 for (size_t k = 16; k <= 80; k += 8) {
19845 GemmMicrokernelTester()
19846 .mr(3)
19847 .nr(4)
19848 .kr(8)
19849 .sr(1)
19850 .m(3)
19851 .n(4)
19852 .k(k)
19853 .a_stride(83)
19854 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
19855 }
19856 }
19857
19858 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_div_8_subtile) {
19859 TEST_REQUIRES_X86_XOP;
19860 for (size_t k = 16; k <= 80; k += 8) {
19861 for (uint32_t m = 1; m <= 3; m++) {
19862 for (uint32_t n = 1; n <= 4; n++) {
19863 GemmMicrokernelTester()
19864 .mr(3)
19865 .nr(4)
19866 .kr(8)
19867 .sr(1)
19868 .m(m)
19869 .n(n)
19870 .k(k)
19871 .iterations(1)
19872 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
19873 }
19874 }
19875 }
19876 }
19877
19878 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, n_gt_4) {
19879 TEST_REQUIRES_X86_XOP;
19880 for (uint32_t n = 5; n < 8; n++) {
19881 for (size_t k = 1; k <= 40; k += 9) {
19882 GemmMicrokernelTester()
19883 .mr(3)
19884 .nr(4)
19885 .kr(8)
19886 .sr(1)
19887 .m(3)
19888 .n(4)
19889 .k(k)
19890 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
19891 }
19892 }
19893 }
19894
19895 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, n_gt_4_strided_cn) {
19896 TEST_REQUIRES_X86_XOP;
19897 for (uint32_t n = 5; n < 8; n++) {
19898 for (size_t k = 1; k <= 40; k += 9) {
19899 GemmMicrokernelTester()
19900 .mr(3)
19901 .nr(4)
19902 .kr(8)
19903 .sr(1)
19904 .m(3)
19905 .n(4)
19906 .k(k)
19907 .cn_stride(7)
19908 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
19909 }
19910 }
19911 }
19912
19913 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, n_gt_4_strided_a) {
19914 TEST_REQUIRES_X86_XOP;
19915 for (uint32_t n = 5; n < 8; n++) {
19916 for (size_t k = 1; k <= 40; k += 9) {
19917 GemmMicrokernelTester()
19918 .mr(3)
19919 .nr(4)
19920 .kr(8)
19921 .sr(1)
19922 .m(3)
19923 .n(n)
19924 .k(k)
19925 .a_stride(43)
19926 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
19927 }
19928 }
19929 }
19930
19931 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, n_gt_4_subtile) {
19932 TEST_REQUIRES_X86_XOP;
19933 for (uint32_t n = 5; n < 8; n++) {
19934 for (size_t k = 1; k <= 40; k += 9) {
19935 for (uint32_t m = 1; m <= 3; m++) {
19936 GemmMicrokernelTester()
19937 .mr(3)
19938 .nr(4)
19939 .kr(8)
19940 .sr(1)
19941 .m(m)
19942 .n(n)
19943 .k(k)
19944 .iterations(1)
19945 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
19946 }
19947 }
19948 }
19949 }
19950
19951 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, n_div_4) {
19952 TEST_REQUIRES_X86_XOP;
19953 for (uint32_t n = 8; n <= 12; n += 4) {
19954 for (size_t k = 1; k <= 40; k += 9) {
19955 GemmMicrokernelTester()
19956 .mr(3)
19957 .nr(4)
19958 .kr(8)
19959 .sr(1)
19960 .m(3)
19961 .n(4)
19962 .k(k)
19963 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
19964 }
19965 }
19966 }
19967
19968 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, n_div_4_strided_cn) {
19969 TEST_REQUIRES_X86_XOP;
19970 for (uint32_t n = 8; n <= 12; n += 4) {
19971 for (size_t k = 1; k <= 40; k += 9) {
19972 GemmMicrokernelTester()
19973 .mr(3)
19974 .nr(4)
19975 .kr(8)
19976 .sr(1)
19977 .m(3)
19978 .n(n)
19979 .k(k)
19980 .cn_stride(7)
19981 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
19982 }
19983 }
19984 }
19985
19986 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, n_div_4_strided_a) {
19987 TEST_REQUIRES_X86_XOP;
19988 for (uint32_t n = 8; n <= 12; n += 4) {
19989 for (size_t k = 1; k <= 40; k += 9) {
19990 GemmMicrokernelTester()
19991 .mr(3)
19992 .nr(4)
19993 .kr(8)
19994 .sr(1)
19995 .m(3)
19996 .n(n)
19997 .k(k)
19998 .a_stride(43)
19999 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
20000 }
20001 }
20002 }
20003
20004 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, n_div_4_subtile) {
20005 TEST_REQUIRES_X86_XOP;
20006 for (uint32_t n = 8; n <= 12; n += 4) {
20007 for (size_t k = 1; k <= 40; k += 9) {
20008 for (uint32_t m = 1; m <= 3; m++) {
20009 GemmMicrokernelTester()
20010 .mr(3)
20011 .nr(4)
20012 .kr(8)
20013 .sr(1)
20014 .m(m)
20015 .n(n)
20016 .k(k)
20017 .iterations(1)
20018 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
20019 }
20020 }
20021 }
20022 }
20023
20024 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, strided_cm_subtile) {
20025 TEST_REQUIRES_X86_XOP;
20026 for (size_t k = 1; k <= 40; k += 9) {
20027 for (uint32_t m = 1; m <= 3; m++) {
20028 for (uint32_t n = 1; n <= 4; n++) {
20029 GemmMicrokernelTester()
20030 .mr(3)
20031 .nr(4)
20032 .kr(8)
20033 .sr(1)
20034 .m(m)
20035 .n(n)
20036 .k(k)
20037 .cm_stride(7)
20038 .iterations(1)
20039 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
20040 }
20041 }
20042 }
20043 }
20044
20045 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, qmin) {
20046 TEST_REQUIRES_X86_XOP;
20047 GemmMicrokernelTester()
20048 .mr(3)
20049 .nr(4)
20050 .kr(8)
20051 .sr(1)
20052 .m(3)
20053 .n(4)
20054 .k(8)
20055 .qmin(128)
20056 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
20057 }
20058
20059 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, qmax) {
20060 TEST_REQUIRES_X86_XOP;
20061 GemmMicrokernelTester()
20062 .mr(3)
20063 .nr(4)
20064 .kr(8)
20065 .sr(1)
20066 .m(3)
20067 .n(4)
20068 .k(8)
20069 .qmax(128)
20070 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
20071 }
20072
20073 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, strided_cm) {
20074 TEST_REQUIRES_X86_XOP;
20075 GemmMicrokernelTester()
20076 .mr(3)
20077 .nr(4)
20078 .kr(8)
20079 .sr(1)
20080 .m(3)
20081 .n(4)
20082 .k(8)
20083 .cm_stride(7)
20084 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
20085 }
20086#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
20087
20088
20089#if XNN_ARCH_X86 || XNN_ARCH_X86_64
20090 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_eq_8) {
20091 TEST_REQUIRES_X86_SSE2;
20092 GemmMicrokernelTester()
20093 .mr(1)
20094 .nr(4)
20095 .kr(8)
20096 .sr(1)
20097 .m(1)
20098 .n(4)
20099 .k(8)
20100 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
20101 }
20102
20103 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, strided_cn) {
20104 TEST_REQUIRES_X86_SSE2;
20105 GemmMicrokernelTester()
20106 .mr(1)
20107 .nr(4)
20108 .kr(8)
20109 .sr(1)
20110 .m(1)
20111 .n(4)
20112 .k(8)
20113 .cn_stride(7)
20114 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
20115 }
20116
20117 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_eq_8_strided_a) {
20118 TEST_REQUIRES_X86_SSE2;
20119 GemmMicrokernelTester()
20120 .mr(1)
20121 .nr(4)
20122 .kr(8)
20123 .sr(1)
20124 .m(1)
20125 .n(4)
20126 .k(8)
20127 .a_stride(11)
20128 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
20129 }
20130
20131 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_eq_8_subtile) {
20132 TEST_REQUIRES_X86_SSE2;
20133 for (uint32_t m = 1; m <= 1; m++) {
20134 for (uint32_t n = 1; n <= 4; n++) {
20135 GemmMicrokernelTester()
20136 .mr(1)
20137 .nr(4)
20138 .kr(8)
20139 .sr(1)
20140 .m(m)
20141 .n(n)
20142 .k(8)
20143 .iterations(1)
20144 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
20145 }
20146 }
20147 }
20148
20149 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_eq_8_subtile_m) {
20150 TEST_REQUIRES_X86_SSE2;
20151 for (uint32_t m = 1; m <= 1; m++) {
20152 GemmMicrokernelTester()
20153 .mr(1)
20154 .nr(4)
20155 .kr(8)
20156 .sr(1)
20157 .m(m)
20158 .n(4)
20159 .k(8)
20160 .iterations(1)
20161 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
20162 }
20163 }
20164
20165 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_eq_8_subtile_n) {
20166 TEST_REQUIRES_X86_SSE2;
20167 for (uint32_t n = 1; n <= 4; n++) {
20168 GemmMicrokernelTester()
20169 .mr(1)
20170 .nr(4)
20171 .kr(8)
20172 .sr(1)
20173 .m(1)
20174 .n(n)
20175 .k(8)
20176 .iterations(1)
20177 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
20178 }
20179 }
20180
20181 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_lt_8) {
20182 TEST_REQUIRES_X86_SSE2;
20183 for (size_t k = 1; k < 8; k++) {
20184 GemmMicrokernelTester()
20185 .mr(1)
20186 .nr(4)
20187 .kr(8)
20188 .sr(1)
20189 .m(1)
20190 .n(4)
20191 .k(k)
20192 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
20193 }
20194 }
20195
20196 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_lt_8_strided_a) {
20197 TEST_REQUIRES_X86_SSE2;
20198 for (size_t k = 1; k < 8; k++) {
20199 GemmMicrokernelTester()
20200 .mr(1)
20201 .nr(4)
20202 .kr(8)
20203 .sr(1)
20204 .m(1)
20205 .n(4)
20206 .k(k)
20207 .a_stride(11)
20208 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
20209 }
20210 }
20211
20212 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_lt_8_subtile) {
20213 TEST_REQUIRES_X86_SSE2;
20214 for (size_t k = 1; k < 8; k++) {
20215 for (uint32_t m = 1; m <= 1; m++) {
20216 for (uint32_t n = 1; n <= 4; n++) {
20217 GemmMicrokernelTester()
20218 .mr(1)
20219 .nr(4)
20220 .kr(8)
20221 .sr(1)
20222 .m(m)
20223 .n(n)
20224 .k(k)
20225 .iterations(1)
20226 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
20227 }
20228 }
20229 }
20230 }
20231
20232 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_gt_8) {
20233 TEST_REQUIRES_X86_SSE2;
20234 for (size_t k = 9; k < 16; k++) {
20235 GemmMicrokernelTester()
20236 .mr(1)
20237 .nr(4)
20238 .kr(8)
20239 .sr(1)
20240 .m(1)
20241 .n(4)
20242 .k(k)
20243 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
20244 }
20245 }
20246
20247 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_gt_8_strided_a) {
20248 TEST_REQUIRES_X86_SSE2;
20249 for (size_t k = 9; k < 16; k++) {
20250 GemmMicrokernelTester()
20251 .mr(1)
20252 .nr(4)
20253 .kr(8)
20254 .sr(1)
20255 .m(1)
20256 .n(4)
20257 .k(k)
20258 .a_stride(19)
20259 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
20260 }
20261 }
20262
20263 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_gt_8_subtile) {
20264 TEST_REQUIRES_X86_SSE2;
20265 for (size_t k = 9; k < 16; k++) {
20266 for (uint32_t m = 1; m <= 1; m++) {
20267 for (uint32_t n = 1; n <= 4; n++) {
20268 GemmMicrokernelTester()
20269 .mr(1)
20270 .nr(4)
20271 .kr(8)
20272 .sr(1)
20273 .m(m)
20274 .n(n)
20275 .k(k)
20276 .iterations(1)
20277 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
20278 }
20279 }
20280 }
20281 }
20282
20283 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_div_8) {
20284 TEST_REQUIRES_X86_SSE2;
20285 for (size_t k = 16; k <= 80; k += 8) {
20286 GemmMicrokernelTester()
20287 .mr(1)
20288 .nr(4)
20289 .kr(8)
20290 .sr(1)
20291 .m(1)
20292 .n(4)
20293 .k(k)
20294 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
20295 }
20296 }
20297
20298 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_div_8_strided_a) {
20299 TEST_REQUIRES_X86_SSE2;
20300 for (size_t k = 16; k <= 80; k += 8) {
20301 GemmMicrokernelTester()
20302 .mr(1)
20303 .nr(4)
20304 .kr(8)
20305 .sr(1)
20306 .m(1)
20307 .n(4)
20308 .k(k)
20309 .a_stride(83)
20310 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
20311 }
20312 }
20313
20314 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_div_8_subtile) {
20315 TEST_REQUIRES_X86_SSE2;
20316 for (size_t k = 16; k <= 80; k += 8) {
20317 for (uint32_t m = 1; m <= 1; m++) {
20318 for (uint32_t n = 1; n <= 4; n++) {
20319 GemmMicrokernelTester()
20320 .mr(1)
20321 .nr(4)
20322 .kr(8)
20323 .sr(1)
20324 .m(m)
20325 .n(n)
20326 .k(k)
20327 .iterations(1)
20328 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
20329 }
20330 }
20331 }
20332 }
20333
20334 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, n_gt_4) {
20335 TEST_REQUIRES_X86_SSE2;
20336 for (uint32_t n = 5; n < 8; n++) {
20337 for (size_t k = 1; k <= 40; k += 9) {
20338 GemmMicrokernelTester()
20339 .mr(1)
20340 .nr(4)
20341 .kr(8)
20342 .sr(1)
20343 .m(1)
20344 .n(4)
20345 .k(k)
20346 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
20347 }
20348 }
20349 }
20350
20351 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, n_gt_4_strided_cn) {
20352 TEST_REQUIRES_X86_SSE2;
20353 for (uint32_t n = 5; n < 8; n++) {
20354 for (size_t k = 1; k <= 40; k += 9) {
20355 GemmMicrokernelTester()
20356 .mr(1)
20357 .nr(4)
20358 .kr(8)
20359 .sr(1)
20360 .m(1)
20361 .n(4)
20362 .k(k)
20363 .cn_stride(7)
20364 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
20365 }
20366 }
20367 }
20368
20369 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, n_gt_4_strided_a) {
20370 TEST_REQUIRES_X86_SSE2;
20371 for (uint32_t n = 5; n < 8; n++) {
20372 for (size_t k = 1; k <= 40; k += 9) {
20373 GemmMicrokernelTester()
20374 .mr(1)
20375 .nr(4)
20376 .kr(8)
20377 .sr(1)
20378 .m(1)
20379 .n(n)
20380 .k(k)
20381 .a_stride(43)
20382 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
20383 }
20384 }
20385 }
20386
20387 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, n_gt_4_subtile) {
20388 TEST_REQUIRES_X86_SSE2;
20389 for (uint32_t n = 5; n < 8; n++) {
20390 for (size_t k = 1; k <= 40; k += 9) {
20391 for (uint32_t m = 1; m <= 1; m++) {
20392 GemmMicrokernelTester()
20393 .mr(1)
20394 .nr(4)
20395 .kr(8)
20396 .sr(1)
20397 .m(m)
20398 .n(n)
20399 .k(k)
20400 .iterations(1)
20401 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
20402 }
20403 }
20404 }
20405 }
20406
20407 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, n_div_4) {
20408 TEST_REQUIRES_X86_SSE2;
20409 for (uint32_t n = 8; n <= 12; n += 4) {
20410 for (size_t k = 1; k <= 40; k += 9) {
20411 GemmMicrokernelTester()
20412 .mr(1)
20413 .nr(4)
20414 .kr(8)
20415 .sr(1)
20416 .m(1)
20417 .n(4)
20418 .k(k)
20419 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
20420 }
20421 }
20422 }
20423
20424 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, n_div_4_strided_cn) {
20425 TEST_REQUIRES_X86_SSE2;
20426 for (uint32_t n = 8; n <= 12; n += 4) {
20427 for (size_t k = 1; k <= 40; k += 9) {
20428 GemmMicrokernelTester()
20429 .mr(1)
20430 .nr(4)
20431 .kr(8)
20432 .sr(1)
20433 .m(1)
20434 .n(n)
20435 .k(k)
20436 .cn_stride(7)
20437 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
20438 }
20439 }
20440 }
20441
20442 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, n_div_4_strided_a) {
20443 TEST_REQUIRES_X86_SSE2;
20444 for (uint32_t n = 8; n <= 12; n += 4) {
20445 for (size_t k = 1; k <= 40; k += 9) {
20446 GemmMicrokernelTester()
20447 .mr(1)
20448 .nr(4)
20449 .kr(8)
20450 .sr(1)
20451 .m(1)
20452 .n(n)
20453 .k(k)
20454 .a_stride(43)
20455 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
20456 }
20457 }
20458 }
20459
20460 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, n_div_4_subtile) {
20461 TEST_REQUIRES_X86_SSE2;
20462 for (uint32_t n = 8; n <= 12; n += 4) {
20463 for (size_t k = 1; k <= 40; k += 9) {
20464 for (uint32_t m = 1; m <= 1; m++) {
20465 GemmMicrokernelTester()
20466 .mr(1)
20467 .nr(4)
20468 .kr(8)
20469 .sr(1)
20470 .m(m)
20471 .n(n)
20472 .k(k)
20473 .iterations(1)
20474 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
20475 }
20476 }
20477 }
20478 }
20479
20480 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, strided_cm_subtile) {
20481 TEST_REQUIRES_X86_SSE2;
20482 for (size_t k = 1; k <= 40; k += 9) {
20483 for (uint32_t m = 1; m <= 1; m++) {
20484 for (uint32_t n = 1; n <= 4; n++) {
20485 GemmMicrokernelTester()
20486 .mr(1)
20487 .nr(4)
20488 .kr(8)
20489 .sr(1)
20490 .m(m)
20491 .n(n)
20492 .k(k)
20493 .cm_stride(7)
20494 .iterations(1)
20495 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
20496 }
20497 }
20498 }
20499 }
20500
20501 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, qmin) {
20502 TEST_REQUIRES_X86_SSE2;
20503 GemmMicrokernelTester()
20504 .mr(1)
20505 .nr(4)
20506 .kr(8)
20507 .sr(1)
20508 .m(1)
20509 .n(4)
20510 .k(8)
20511 .qmin(128)
20512 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
20513 }
20514
20515 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, qmax) {
20516 TEST_REQUIRES_X86_SSE2;
20517 GemmMicrokernelTester()
20518 .mr(1)
20519 .nr(4)
20520 .kr(8)
20521 .sr(1)
20522 .m(1)
20523 .n(4)
20524 .k(8)
20525 .qmax(128)
20526 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
20527 }
20528
20529 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, strided_cm) {
20530 TEST_REQUIRES_X86_SSE2;
20531 GemmMicrokernelTester()
20532 .mr(1)
20533 .nr(4)
20534 .kr(8)
20535 .sr(1)
20536 .m(1)
20537 .n(4)
20538 .k(8)
20539 .cm_stride(7)
20540 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
20541 }
20542#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
20543
20544
20545#if XNN_ARCH_X86 || XNN_ARCH_X86_64
20546 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_eq_8) {
20547 TEST_REQUIRES_X86_SSE2;
20548 GemmMicrokernelTester()
20549 .mr(2)
20550 .nr(4)
20551 .kr(8)
20552 .sr(1)
20553 .m(2)
20554 .n(4)
20555 .k(8)
20556 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
20557 }
20558
20559 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, strided_cn) {
20560 TEST_REQUIRES_X86_SSE2;
20561 GemmMicrokernelTester()
20562 .mr(2)
20563 .nr(4)
20564 .kr(8)
20565 .sr(1)
20566 .m(2)
20567 .n(4)
20568 .k(8)
20569 .cn_stride(7)
20570 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
20571 }
20572
20573 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_eq_8_strided_a) {
20574 TEST_REQUIRES_X86_SSE2;
20575 GemmMicrokernelTester()
20576 .mr(2)
20577 .nr(4)
20578 .kr(8)
20579 .sr(1)
20580 .m(2)
20581 .n(4)
20582 .k(8)
20583 .a_stride(11)
20584 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
20585 }
20586
20587 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_eq_8_subtile) {
20588 TEST_REQUIRES_X86_SSE2;
20589 for (uint32_t m = 1; m <= 2; m++) {
20590 for (uint32_t n = 1; n <= 4; n++) {
20591 GemmMicrokernelTester()
20592 .mr(2)
20593 .nr(4)
20594 .kr(8)
20595 .sr(1)
20596 .m(m)
20597 .n(n)
20598 .k(8)
20599 .iterations(1)
20600 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
20601 }
20602 }
20603 }
20604
20605 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_eq_8_subtile_m) {
20606 TEST_REQUIRES_X86_SSE2;
20607 for (uint32_t m = 1; m <= 2; m++) {
20608 GemmMicrokernelTester()
20609 .mr(2)
20610 .nr(4)
20611 .kr(8)
20612 .sr(1)
20613 .m(m)
20614 .n(4)
20615 .k(8)
20616 .iterations(1)
20617 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
20618 }
20619 }
20620
20621 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_eq_8_subtile_n) {
20622 TEST_REQUIRES_X86_SSE2;
20623 for (uint32_t n = 1; n <= 4; n++) {
20624 GemmMicrokernelTester()
20625 .mr(2)
20626 .nr(4)
20627 .kr(8)
20628 .sr(1)
20629 .m(2)
20630 .n(n)
20631 .k(8)
20632 .iterations(1)
20633 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
20634 }
20635 }
20636
20637 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_lt_8) {
20638 TEST_REQUIRES_X86_SSE2;
20639 for (size_t k = 1; k < 8; k++) {
20640 GemmMicrokernelTester()
20641 .mr(2)
20642 .nr(4)
20643 .kr(8)
20644 .sr(1)
20645 .m(2)
20646 .n(4)
20647 .k(k)
20648 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
20649 }
20650 }
20651
20652 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_lt_8_strided_a) {
20653 TEST_REQUIRES_X86_SSE2;
20654 for (size_t k = 1; k < 8; k++) {
20655 GemmMicrokernelTester()
20656 .mr(2)
20657 .nr(4)
20658 .kr(8)
20659 .sr(1)
20660 .m(2)
20661 .n(4)
20662 .k(k)
20663 .a_stride(11)
20664 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
20665 }
20666 }
20667
20668 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_lt_8_subtile) {
20669 TEST_REQUIRES_X86_SSE2;
20670 for (size_t k = 1; k < 8; k++) {
20671 for (uint32_t m = 1; m <= 2; m++) {
20672 for (uint32_t n = 1; n <= 4; n++) {
20673 GemmMicrokernelTester()
20674 .mr(2)
20675 .nr(4)
20676 .kr(8)
20677 .sr(1)
20678 .m(m)
20679 .n(n)
20680 .k(k)
20681 .iterations(1)
20682 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
20683 }
20684 }
20685 }
20686 }
20687
20688 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_gt_8) {
20689 TEST_REQUIRES_X86_SSE2;
20690 for (size_t k = 9; k < 16; k++) {
20691 GemmMicrokernelTester()
20692 .mr(2)
20693 .nr(4)
20694 .kr(8)
20695 .sr(1)
20696 .m(2)
20697 .n(4)
20698 .k(k)
20699 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
20700 }
20701 }
20702
20703 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_gt_8_strided_a) {
20704 TEST_REQUIRES_X86_SSE2;
20705 for (size_t k = 9; k < 16; k++) {
20706 GemmMicrokernelTester()
20707 .mr(2)
20708 .nr(4)
20709 .kr(8)
20710 .sr(1)
20711 .m(2)
20712 .n(4)
20713 .k(k)
20714 .a_stride(19)
20715 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
20716 }
20717 }
20718
20719 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_gt_8_subtile) {
20720 TEST_REQUIRES_X86_SSE2;
20721 for (size_t k = 9; k < 16; k++) {
20722 for (uint32_t m = 1; m <= 2; m++) {
20723 for (uint32_t n = 1; n <= 4; n++) {
20724 GemmMicrokernelTester()
20725 .mr(2)
20726 .nr(4)
20727 .kr(8)
20728 .sr(1)
20729 .m(m)
20730 .n(n)
20731 .k(k)
20732 .iterations(1)
20733 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
20734 }
20735 }
20736 }
20737 }
20738
20739 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_div_8) {
20740 TEST_REQUIRES_X86_SSE2;
20741 for (size_t k = 16; k <= 80; k += 8) {
20742 GemmMicrokernelTester()
20743 .mr(2)
20744 .nr(4)
20745 .kr(8)
20746 .sr(1)
20747 .m(2)
20748 .n(4)
20749 .k(k)
20750 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
20751 }
20752 }
20753
20754 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_div_8_strided_a) {
20755 TEST_REQUIRES_X86_SSE2;
20756 for (size_t k = 16; k <= 80; k += 8) {
20757 GemmMicrokernelTester()
20758 .mr(2)
20759 .nr(4)
20760 .kr(8)
20761 .sr(1)
20762 .m(2)
20763 .n(4)
20764 .k(k)
20765 .a_stride(83)
20766 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
20767 }
20768 }
20769
20770 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_div_8_subtile) {
20771 TEST_REQUIRES_X86_SSE2;
20772 for (size_t k = 16; k <= 80; k += 8) {
20773 for (uint32_t m = 1; m <= 2; m++) {
20774 for (uint32_t n = 1; n <= 4; n++) {
20775 GemmMicrokernelTester()
20776 .mr(2)
20777 .nr(4)
20778 .kr(8)
20779 .sr(1)
20780 .m(m)
20781 .n(n)
20782 .k(k)
20783 .iterations(1)
20784 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
20785 }
20786 }
20787 }
20788 }
20789
20790 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, n_gt_4) {
20791 TEST_REQUIRES_X86_SSE2;
20792 for (uint32_t n = 5; n < 8; n++) {
20793 for (size_t k = 1; k <= 40; k += 9) {
20794 GemmMicrokernelTester()
20795 .mr(2)
20796 .nr(4)
20797 .kr(8)
20798 .sr(1)
20799 .m(2)
20800 .n(4)
20801 .k(k)
20802 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
20803 }
20804 }
20805 }
20806
20807 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, n_gt_4_strided_cn) {
20808 TEST_REQUIRES_X86_SSE2;
20809 for (uint32_t n = 5; n < 8; n++) {
20810 for (size_t k = 1; k <= 40; k += 9) {
20811 GemmMicrokernelTester()
20812 .mr(2)
20813 .nr(4)
20814 .kr(8)
20815 .sr(1)
20816 .m(2)
20817 .n(4)
20818 .k(k)
20819 .cn_stride(7)
20820 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
20821 }
20822 }
20823 }
20824
20825 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, n_gt_4_strided_a) {
20826 TEST_REQUIRES_X86_SSE2;
20827 for (uint32_t n = 5; n < 8; n++) {
20828 for (size_t k = 1; k <= 40; k += 9) {
20829 GemmMicrokernelTester()
20830 .mr(2)
20831 .nr(4)
20832 .kr(8)
20833 .sr(1)
20834 .m(2)
20835 .n(n)
20836 .k(k)
20837 .a_stride(43)
20838 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
20839 }
20840 }
20841 }
20842
20843 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, n_gt_4_subtile) {
20844 TEST_REQUIRES_X86_SSE2;
20845 for (uint32_t n = 5; n < 8; n++) {
20846 for (size_t k = 1; k <= 40; k += 9) {
20847 for (uint32_t m = 1; m <= 2; m++) {
20848 GemmMicrokernelTester()
20849 .mr(2)
20850 .nr(4)
20851 .kr(8)
20852 .sr(1)
20853 .m(m)
20854 .n(n)
20855 .k(k)
20856 .iterations(1)
20857 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
20858 }
20859 }
20860 }
20861 }
20862
20863 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, n_div_4) {
20864 TEST_REQUIRES_X86_SSE2;
20865 for (uint32_t n = 8; n <= 12; n += 4) {
20866 for (size_t k = 1; k <= 40; k += 9) {
20867 GemmMicrokernelTester()
20868 .mr(2)
20869 .nr(4)
20870 .kr(8)
20871 .sr(1)
20872 .m(2)
20873 .n(4)
20874 .k(k)
20875 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
20876 }
20877 }
20878 }
20879
20880 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, n_div_4_strided_cn) {
20881 TEST_REQUIRES_X86_SSE2;
20882 for (uint32_t n = 8; n <= 12; n += 4) {
20883 for (size_t k = 1; k <= 40; k += 9) {
20884 GemmMicrokernelTester()
20885 .mr(2)
20886 .nr(4)
20887 .kr(8)
20888 .sr(1)
20889 .m(2)
20890 .n(n)
20891 .k(k)
20892 .cn_stride(7)
20893 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
20894 }
20895 }
20896 }
20897
20898 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, n_div_4_strided_a) {
20899 TEST_REQUIRES_X86_SSE2;
20900 for (uint32_t n = 8; n <= 12; n += 4) {
20901 for (size_t k = 1; k <= 40; k += 9) {
20902 GemmMicrokernelTester()
20903 .mr(2)
20904 .nr(4)
20905 .kr(8)
20906 .sr(1)
20907 .m(2)
20908 .n(n)
20909 .k(k)
20910 .a_stride(43)
20911 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
20912 }
20913 }
20914 }
20915
20916 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, n_div_4_subtile) {
20917 TEST_REQUIRES_X86_SSE2;
20918 for (uint32_t n = 8; n <= 12; n += 4) {
20919 for (size_t k = 1; k <= 40; k += 9) {
20920 for (uint32_t m = 1; m <= 2; m++) {
20921 GemmMicrokernelTester()
20922 .mr(2)
20923 .nr(4)
20924 .kr(8)
20925 .sr(1)
20926 .m(m)
20927 .n(n)
20928 .k(k)
20929 .iterations(1)
20930 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
20931 }
20932 }
20933 }
20934 }
20935
20936 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, strided_cm_subtile) {
20937 TEST_REQUIRES_X86_SSE2;
20938 for (size_t k = 1; k <= 40; k += 9) {
20939 for (uint32_t m = 1; m <= 2; m++) {
20940 for (uint32_t n = 1; n <= 4; n++) {
20941 GemmMicrokernelTester()
20942 .mr(2)
20943 .nr(4)
20944 .kr(8)
20945 .sr(1)
20946 .m(m)
20947 .n(n)
20948 .k(k)
20949 .cm_stride(7)
20950 .iterations(1)
20951 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
20952 }
20953 }
20954 }
20955 }
20956
20957 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, qmin) {
20958 TEST_REQUIRES_X86_SSE2;
20959 GemmMicrokernelTester()
20960 .mr(2)
20961 .nr(4)
20962 .kr(8)
20963 .sr(1)
20964 .m(2)
20965 .n(4)
20966 .k(8)
20967 .qmin(128)
20968 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
20969 }
20970
20971 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, qmax) {
20972 TEST_REQUIRES_X86_SSE2;
20973 GemmMicrokernelTester()
20974 .mr(2)
20975 .nr(4)
20976 .kr(8)
20977 .sr(1)
20978 .m(2)
20979 .n(4)
20980 .k(8)
20981 .qmax(128)
20982 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
20983 }
20984
20985 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, strided_cm) {
20986 TEST_REQUIRES_X86_SSE2;
20987 GemmMicrokernelTester()
20988 .mr(2)
20989 .nr(4)
20990 .kr(8)
20991 .sr(1)
20992 .m(2)
20993 .n(4)
20994 .k(8)
20995 .cm_stride(7)
20996 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
20997 }
20998#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
20999
21000
21001#if XNN_ARCH_X86 || XNN_ARCH_X86_64
21002 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_eq_8) {
21003 TEST_REQUIRES_X86_SSE2;
21004 GemmMicrokernelTester()
21005 .mr(3)
21006 .nr(4)
21007 .kr(8)
21008 .sr(1)
21009 .m(3)
21010 .n(4)
21011 .k(8)
21012 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
21013 }
21014
21015 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, strided_cn) {
21016 TEST_REQUIRES_X86_SSE2;
21017 GemmMicrokernelTester()
21018 .mr(3)
21019 .nr(4)
21020 .kr(8)
21021 .sr(1)
21022 .m(3)
21023 .n(4)
21024 .k(8)
21025 .cn_stride(7)
21026 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
21027 }
21028
21029 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_eq_8_strided_a) {
21030 TEST_REQUIRES_X86_SSE2;
21031 GemmMicrokernelTester()
21032 .mr(3)
21033 .nr(4)
21034 .kr(8)
21035 .sr(1)
21036 .m(3)
21037 .n(4)
21038 .k(8)
21039 .a_stride(11)
21040 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
21041 }
21042
21043 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_eq_8_subtile) {
21044 TEST_REQUIRES_X86_SSE2;
21045 for (uint32_t m = 1; m <= 3; m++) {
21046 for (uint32_t n = 1; n <= 4; n++) {
21047 GemmMicrokernelTester()
21048 .mr(3)
21049 .nr(4)
21050 .kr(8)
21051 .sr(1)
21052 .m(m)
21053 .n(n)
21054 .k(8)
21055 .iterations(1)
21056 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
21057 }
21058 }
21059 }
21060
21061 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_eq_8_subtile_m) {
21062 TEST_REQUIRES_X86_SSE2;
21063 for (uint32_t m = 1; m <= 3; m++) {
21064 GemmMicrokernelTester()
21065 .mr(3)
21066 .nr(4)
21067 .kr(8)
21068 .sr(1)
21069 .m(m)
21070 .n(4)
21071 .k(8)
21072 .iterations(1)
21073 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
21074 }
21075 }
21076
21077 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_eq_8_subtile_n) {
21078 TEST_REQUIRES_X86_SSE2;
21079 for (uint32_t n = 1; n <= 4; n++) {
21080 GemmMicrokernelTester()
21081 .mr(3)
21082 .nr(4)
21083 .kr(8)
21084 .sr(1)
21085 .m(3)
21086 .n(n)
21087 .k(8)
21088 .iterations(1)
21089 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
21090 }
21091 }
21092
21093 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_lt_8) {
21094 TEST_REQUIRES_X86_SSE2;
21095 for (size_t k = 1; k < 8; k++) {
21096 GemmMicrokernelTester()
21097 .mr(3)
21098 .nr(4)
21099 .kr(8)
21100 .sr(1)
21101 .m(3)
21102 .n(4)
21103 .k(k)
21104 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
21105 }
21106 }
21107
21108 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_lt_8_strided_a) {
21109 TEST_REQUIRES_X86_SSE2;
21110 for (size_t k = 1; k < 8; k++) {
21111 GemmMicrokernelTester()
21112 .mr(3)
21113 .nr(4)
21114 .kr(8)
21115 .sr(1)
21116 .m(3)
21117 .n(4)
21118 .k(k)
21119 .a_stride(11)
21120 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
21121 }
21122 }
21123
21124 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_lt_8_subtile) {
21125 TEST_REQUIRES_X86_SSE2;
21126 for (size_t k = 1; k < 8; k++) {
21127 for (uint32_t m = 1; m <= 3; m++) {
21128 for (uint32_t n = 1; n <= 4; n++) {
21129 GemmMicrokernelTester()
21130 .mr(3)
21131 .nr(4)
21132 .kr(8)
21133 .sr(1)
21134 .m(m)
21135 .n(n)
21136 .k(k)
21137 .iterations(1)
21138 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
21139 }
21140 }
21141 }
21142 }
21143
21144 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_gt_8) {
21145 TEST_REQUIRES_X86_SSE2;
21146 for (size_t k = 9; k < 16; k++) {
21147 GemmMicrokernelTester()
21148 .mr(3)
21149 .nr(4)
21150 .kr(8)
21151 .sr(1)
21152 .m(3)
21153 .n(4)
21154 .k(k)
21155 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
21156 }
21157 }
21158
21159 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_gt_8_strided_a) {
21160 TEST_REQUIRES_X86_SSE2;
21161 for (size_t k = 9; k < 16; k++) {
21162 GemmMicrokernelTester()
21163 .mr(3)
21164 .nr(4)
21165 .kr(8)
21166 .sr(1)
21167 .m(3)
21168 .n(4)
21169 .k(k)
21170 .a_stride(19)
21171 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
21172 }
21173 }
21174
21175 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_gt_8_subtile) {
21176 TEST_REQUIRES_X86_SSE2;
21177 for (size_t k = 9; k < 16; k++) {
21178 for (uint32_t m = 1; m <= 3; m++) {
21179 for (uint32_t n = 1; n <= 4; n++) {
21180 GemmMicrokernelTester()
21181 .mr(3)
21182 .nr(4)
21183 .kr(8)
21184 .sr(1)
21185 .m(m)
21186 .n(n)
21187 .k(k)
21188 .iterations(1)
21189 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
21190 }
21191 }
21192 }
21193 }
21194
21195 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_div_8) {
21196 TEST_REQUIRES_X86_SSE2;
21197 for (size_t k = 16; k <= 80; k += 8) {
21198 GemmMicrokernelTester()
21199 .mr(3)
21200 .nr(4)
21201 .kr(8)
21202 .sr(1)
21203 .m(3)
21204 .n(4)
21205 .k(k)
21206 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
21207 }
21208 }
21209
21210 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_div_8_strided_a) {
21211 TEST_REQUIRES_X86_SSE2;
21212 for (size_t k = 16; k <= 80; k += 8) {
21213 GemmMicrokernelTester()
21214 .mr(3)
21215 .nr(4)
21216 .kr(8)
21217 .sr(1)
21218 .m(3)
21219 .n(4)
21220 .k(k)
21221 .a_stride(83)
21222 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
21223 }
21224 }
21225
21226 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_div_8_subtile) {
21227 TEST_REQUIRES_X86_SSE2;
21228 for (size_t k = 16; k <= 80; k += 8) {
21229 for (uint32_t m = 1; m <= 3; m++) {
21230 for (uint32_t n = 1; n <= 4; n++) {
21231 GemmMicrokernelTester()
21232 .mr(3)
21233 .nr(4)
21234 .kr(8)
21235 .sr(1)
21236 .m(m)
21237 .n(n)
21238 .k(k)
21239 .iterations(1)
21240 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
21241 }
21242 }
21243 }
21244 }
21245
21246 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, n_gt_4) {
21247 TEST_REQUIRES_X86_SSE2;
21248 for (uint32_t n = 5; n < 8; n++) {
21249 for (size_t k = 1; k <= 40; k += 9) {
21250 GemmMicrokernelTester()
21251 .mr(3)
21252 .nr(4)
21253 .kr(8)
21254 .sr(1)
21255 .m(3)
21256 .n(4)
21257 .k(k)
21258 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
21259 }
21260 }
21261 }
21262
21263 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, n_gt_4_strided_cn) {
21264 TEST_REQUIRES_X86_SSE2;
21265 for (uint32_t n = 5; n < 8; n++) {
21266 for (size_t k = 1; k <= 40; k += 9) {
21267 GemmMicrokernelTester()
21268 .mr(3)
21269 .nr(4)
21270 .kr(8)
21271 .sr(1)
21272 .m(3)
21273 .n(4)
21274 .k(k)
21275 .cn_stride(7)
21276 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
21277 }
21278 }
21279 }
21280
21281 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, n_gt_4_strided_a) {
21282 TEST_REQUIRES_X86_SSE2;
21283 for (uint32_t n = 5; n < 8; n++) {
21284 for (size_t k = 1; k <= 40; k += 9) {
21285 GemmMicrokernelTester()
21286 .mr(3)
21287 .nr(4)
21288 .kr(8)
21289 .sr(1)
21290 .m(3)
21291 .n(n)
21292 .k(k)
21293 .a_stride(43)
21294 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
21295 }
21296 }
21297 }
21298
21299 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, n_gt_4_subtile) {
21300 TEST_REQUIRES_X86_SSE2;
21301 for (uint32_t n = 5; n < 8; n++) {
21302 for (size_t k = 1; k <= 40; k += 9) {
21303 for (uint32_t m = 1; m <= 3; m++) {
21304 GemmMicrokernelTester()
21305 .mr(3)
21306 .nr(4)
21307 .kr(8)
21308 .sr(1)
21309 .m(m)
21310 .n(n)
21311 .k(k)
21312 .iterations(1)
21313 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
21314 }
21315 }
21316 }
21317 }
21318
21319 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, n_div_4) {
21320 TEST_REQUIRES_X86_SSE2;
21321 for (uint32_t n = 8; n <= 12; n += 4) {
21322 for (size_t k = 1; k <= 40; k += 9) {
21323 GemmMicrokernelTester()
21324 .mr(3)
21325 .nr(4)
21326 .kr(8)
21327 .sr(1)
21328 .m(3)
21329 .n(4)
21330 .k(k)
21331 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
21332 }
21333 }
21334 }
21335
21336 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, n_div_4_strided_cn) {
21337 TEST_REQUIRES_X86_SSE2;
21338 for (uint32_t n = 8; n <= 12; n += 4) {
21339 for (size_t k = 1; k <= 40; k += 9) {
21340 GemmMicrokernelTester()
21341 .mr(3)
21342 .nr(4)
21343 .kr(8)
21344 .sr(1)
21345 .m(3)
21346 .n(n)
21347 .k(k)
21348 .cn_stride(7)
21349 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
21350 }
21351 }
21352 }
21353
21354 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, n_div_4_strided_a) {
21355 TEST_REQUIRES_X86_SSE2;
21356 for (uint32_t n = 8; n <= 12; n += 4) {
21357 for (size_t k = 1; k <= 40; k += 9) {
21358 GemmMicrokernelTester()
21359 .mr(3)
21360 .nr(4)
21361 .kr(8)
21362 .sr(1)
21363 .m(3)
21364 .n(n)
21365 .k(k)
21366 .a_stride(43)
21367 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
21368 }
21369 }
21370 }
21371
21372 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, n_div_4_subtile) {
21373 TEST_REQUIRES_X86_SSE2;
21374 for (uint32_t n = 8; n <= 12; n += 4) {
21375 for (size_t k = 1; k <= 40; k += 9) {
21376 for (uint32_t m = 1; m <= 3; m++) {
21377 GemmMicrokernelTester()
21378 .mr(3)
21379 .nr(4)
21380 .kr(8)
21381 .sr(1)
21382 .m(m)
21383 .n(n)
21384 .k(k)
21385 .iterations(1)
21386 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
21387 }
21388 }
21389 }
21390 }
21391
21392 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, strided_cm_subtile) {
21393 TEST_REQUIRES_X86_SSE2;
21394 for (size_t k = 1; k <= 40; k += 9) {
21395 for (uint32_t m = 1; m <= 3; m++) {
21396 for (uint32_t n = 1; n <= 4; n++) {
21397 GemmMicrokernelTester()
21398 .mr(3)
21399 .nr(4)
21400 .kr(8)
21401 .sr(1)
21402 .m(m)
21403 .n(n)
21404 .k(k)
21405 .cm_stride(7)
21406 .iterations(1)
21407 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
21408 }
21409 }
21410 }
21411 }
21412
21413 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, qmin) {
21414 TEST_REQUIRES_X86_SSE2;
21415 GemmMicrokernelTester()
21416 .mr(3)
21417 .nr(4)
21418 .kr(8)
21419 .sr(1)
21420 .m(3)
21421 .n(4)
21422 .k(8)
21423 .qmin(128)
21424 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
21425 }
21426
21427 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, qmax) {
21428 TEST_REQUIRES_X86_SSE2;
21429 GemmMicrokernelTester()
21430 .mr(3)
21431 .nr(4)
21432 .kr(8)
21433 .sr(1)
21434 .m(3)
21435 .n(4)
21436 .k(8)
21437 .qmax(128)
21438 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
21439 }
21440
21441 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, strided_cm) {
21442 TEST_REQUIRES_X86_SSE2;
21443 GemmMicrokernelTester()
21444 .mr(3)
21445 .nr(4)
21446 .kr(8)
21447 .sr(1)
21448 .m(3)
21449 .n(4)
21450 .k(8)
21451 .cm_stride(7)
21452 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
21453 }
21454#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
21455
21456
21457#if XNN_ARCH_X86 || XNN_ARCH_X86_64
21458 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_eq_8) {
21459 TEST_REQUIRES_X86_SSE41;
21460 GemmMicrokernelTester()
21461 .mr(1)
21462 .nr(4)
21463 .kr(8)
21464 .sr(1)
21465 .m(1)
21466 .n(4)
21467 .k(8)
21468 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
21469 }
21470
21471 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, strided_cn) {
21472 TEST_REQUIRES_X86_SSE41;
21473 GemmMicrokernelTester()
21474 .mr(1)
21475 .nr(4)
21476 .kr(8)
21477 .sr(1)
21478 .m(1)
21479 .n(4)
21480 .k(8)
21481 .cn_stride(7)
21482 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
21483 }
21484
21485 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_eq_8_strided_a) {
21486 TEST_REQUIRES_X86_SSE41;
21487 GemmMicrokernelTester()
21488 .mr(1)
21489 .nr(4)
21490 .kr(8)
21491 .sr(1)
21492 .m(1)
21493 .n(4)
21494 .k(8)
21495 .a_stride(11)
21496 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
21497 }
21498
21499 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_eq_8_subtile) {
21500 TEST_REQUIRES_X86_SSE41;
21501 for (uint32_t m = 1; m <= 1; m++) {
21502 for (uint32_t n = 1; n <= 4; n++) {
21503 GemmMicrokernelTester()
21504 .mr(1)
21505 .nr(4)
21506 .kr(8)
21507 .sr(1)
21508 .m(m)
21509 .n(n)
21510 .k(8)
21511 .iterations(1)
21512 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
21513 }
21514 }
21515 }
21516
21517 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_eq_8_subtile_m) {
21518 TEST_REQUIRES_X86_SSE41;
21519 for (uint32_t m = 1; m <= 1; m++) {
21520 GemmMicrokernelTester()
21521 .mr(1)
21522 .nr(4)
21523 .kr(8)
21524 .sr(1)
21525 .m(m)
21526 .n(4)
21527 .k(8)
21528 .iterations(1)
21529 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
21530 }
21531 }
21532
21533 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_eq_8_subtile_n) {
21534 TEST_REQUIRES_X86_SSE41;
21535 for (uint32_t n = 1; n <= 4; n++) {
21536 GemmMicrokernelTester()
21537 .mr(1)
21538 .nr(4)
21539 .kr(8)
21540 .sr(1)
21541 .m(1)
21542 .n(n)
21543 .k(8)
21544 .iterations(1)
21545 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
21546 }
21547 }
21548
21549 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_lt_8) {
21550 TEST_REQUIRES_X86_SSE41;
21551 for (size_t k = 1; k < 8; k++) {
21552 GemmMicrokernelTester()
21553 .mr(1)
21554 .nr(4)
21555 .kr(8)
21556 .sr(1)
21557 .m(1)
21558 .n(4)
21559 .k(k)
21560 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
21561 }
21562 }
21563
21564 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_lt_8_strided_a) {
21565 TEST_REQUIRES_X86_SSE41;
21566 for (size_t k = 1; k < 8; k++) {
21567 GemmMicrokernelTester()
21568 .mr(1)
21569 .nr(4)
21570 .kr(8)
21571 .sr(1)
21572 .m(1)
21573 .n(4)
21574 .k(k)
21575 .a_stride(11)
21576 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
21577 }
21578 }
21579
21580 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_lt_8_subtile) {
21581 TEST_REQUIRES_X86_SSE41;
21582 for (size_t k = 1; k < 8; k++) {
21583 for (uint32_t m = 1; m <= 1; m++) {
21584 for (uint32_t n = 1; n <= 4; n++) {
21585 GemmMicrokernelTester()
21586 .mr(1)
21587 .nr(4)
21588 .kr(8)
21589 .sr(1)
21590 .m(m)
21591 .n(n)
21592 .k(k)
21593 .iterations(1)
21594 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
21595 }
21596 }
21597 }
21598 }
21599
21600 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_gt_8) {
21601 TEST_REQUIRES_X86_SSE41;
21602 for (size_t k = 9; k < 16; k++) {
21603 GemmMicrokernelTester()
21604 .mr(1)
21605 .nr(4)
21606 .kr(8)
21607 .sr(1)
21608 .m(1)
21609 .n(4)
21610 .k(k)
21611 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
21612 }
21613 }
21614
21615 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_gt_8_strided_a) {
21616 TEST_REQUIRES_X86_SSE41;
21617 for (size_t k = 9; k < 16; k++) {
21618 GemmMicrokernelTester()
21619 .mr(1)
21620 .nr(4)
21621 .kr(8)
21622 .sr(1)
21623 .m(1)
21624 .n(4)
21625 .k(k)
21626 .a_stride(19)
21627 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
21628 }
21629 }
21630
21631 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_gt_8_subtile) {
21632 TEST_REQUIRES_X86_SSE41;
21633 for (size_t k = 9; k < 16; k++) {
21634 for (uint32_t m = 1; m <= 1; m++) {
21635 for (uint32_t n = 1; n <= 4; n++) {
21636 GemmMicrokernelTester()
21637 .mr(1)
21638 .nr(4)
21639 .kr(8)
21640 .sr(1)
21641 .m(m)
21642 .n(n)
21643 .k(k)
21644 .iterations(1)
21645 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
21646 }
21647 }
21648 }
21649 }
21650
21651 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_div_8) {
21652 TEST_REQUIRES_X86_SSE41;
21653 for (size_t k = 16; k <= 80; k += 8) {
21654 GemmMicrokernelTester()
21655 .mr(1)
21656 .nr(4)
21657 .kr(8)
21658 .sr(1)
21659 .m(1)
21660 .n(4)
21661 .k(k)
21662 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
21663 }
21664 }
21665
21666 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_div_8_strided_a) {
21667 TEST_REQUIRES_X86_SSE41;
21668 for (size_t k = 16; k <= 80; k += 8) {
21669 GemmMicrokernelTester()
21670 .mr(1)
21671 .nr(4)
21672 .kr(8)
21673 .sr(1)
21674 .m(1)
21675 .n(4)
21676 .k(k)
21677 .a_stride(83)
21678 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
21679 }
21680 }
21681
21682 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_div_8_subtile) {
21683 TEST_REQUIRES_X86_SSE41;
21684 for (size_t k = 16; k <= 80; k += 8) {
21685 for (uint32_t m = 1; m <= 1; m++) {
21686 for (uint32_t n = 1; n <= 4; n++) {
21687 GemmMicrokernelTester()
21688 .mr(1)
21689 .nr(4)
21690 .kr(8)
21691 .sr(1)
21692 .m(m)
21693 .n(n)
21694 .k(k)
21695 .iterations(1)
21696 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
21697 }
21698 }
21699 }
21700 }
21701
21702 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, n_gt_4) {
21703 TEST_REQUIRES_X86_SSE41;
21704 for (uint32_t n = 5; n < 8; n++) {
21705 for (size_t k = 1; k <= 40; k += 9) {
21706 GemmMicrokernelTester()
21707 .mr(1)
21708 .nr(4)
21709 .kr(8)
21710 .sr(1)
21711 .m(1)
21712 .n(4)
21713 .k(k)
21714 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
21715 }
21716 }
21717 }
21718
21719 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, n_gt_4_strided_cn) {
21720 TEST_REQUIRES_X86_SSE41;
21721 for (uint32_t n = 5; n < 8; n++) {
21722 for (size_t k = 1; k <= 40; k += 9) {
21723 GemmMicrokernelTester()
21724 .mr(1)
21725 .nr(4)
21726 .kr(8)
21727 .sr(1)
21728 .m(1)
21729 .n(4)
21730 .k(k)
21731 .cn_stride(7)
21732 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
21733 }
21734 }
21735 }
21736
21737 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, n_gt_4_strided_a) {
21738 TEST_REQUIRES_X86_SSE41;
21739 for (uint32_t n = 5; n < 8; n++) {
21740 for (size_t k = 1; k <= 40; k += 9) {
21741 GemmMicrokernelTester()
21742 .mr(1)
21743 .nr(4)
21744 .kr(8)
21745 .sr(1)
21746 .m(1)
21747 .n(n)
21748 .k(k)
21749 .a_stride(43)
21750 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
21751 }
21752 }
21753 }
21754
21755 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, n_gt_4_subtile) {
21756 TEST_REQUIRES_X86_SSE41;
21757 for (uint32_t n = 5; n < 8; n++) {
21758 for (size_t k = 1; k <= 40; k += 9) {
21759 for (uint32_t m = 1; m <= 1; m++) {
21760 GemmMicrokernelTester()
21761 .mr(1)
21762 .nr(4)
21763 .kr(8)
21764 .sr(1)
21765 .m(m)
21766 .n(n)
21767 .k(k)
21768 .iterations(1)
21769 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
21770 }
21771 }
21772 }
21773 }
21774
21775 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, n_div_4) {
21776 TEST_REQUIRES_X86_SSE41;
21777 for (uint32_t n = 8; n <= 12; n += 4) {
21778 for (size_t k = 1; k <= 40; k += 9) {
21779 GemmMicrokernelTester()
21780 .mr(1)
21781 .nr(4)
21782 .kr(8)
21783 .sr(1)
21784 .m(1)
21785 .n(4)
21786 .k(k)
21787 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
21788 }
21789 }
21790 }
21791
21792 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, n_div_4_strided_cn) {
21793 TEST_REQUIRES_X86_SSE41;
21794 for (uint32_t n = 8; n <= 12; n += 4) {
21795 for (size_t k = 1; k <= 40; k += 9) {
21796 GemmMicrokernelTester()
21797 .mr(1)
21798 .nr(4)
21799 .kr(8)
21800 .sr(1)
21801 .m(1)
21802 .n(n)
21803 .k(k)
21804 .cn_stride(7)
21805 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
21806 }
21807 }
21808 }
21809
21810 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, n_div_4_strided_a) {
21811 TEST_REQUIRES_X86_SSE41;
21812 for (uint32_t n = 8; n <= 12; n += 4) {
21813 for (size_t k = 1; k <= 40; k += 9) {
21814 GemmMicrokernelTester()
21815 .mr(1)
21816 .nr(4)
21817 .kr(8)
21818 .sr(1)
21819 .m(1)
21820 .n(n)
21821 .k(k)
21822 .a_stride(43)
21823 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
21824 }
21825 }
21826 }
21827
21828 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, n_div_4_subtile) {
21829 TEST_REQUIRES_X86_SSE41;
21830 for (uint32_t n = 8; n <= 12; n += 4) {
21831 for (size_t k = 1; k <= 40; k += 9) {
21832 for (uint32_t m = 1; m <= 1; m++) {
21833 GemmMicrokernelTester()
21834 .mr(1)
21835 .nr(4)
21836 .kr(8)
21837 .sr(1)
21838 .m(m)
21839 .n(n)
21840 .k(k)
21841 .iterations(1)
21842 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
21843 }
21844 }
21845 }
21846 }
21847
21848 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, strided_cm_subtile) {
21849 TEST_REQUIRES_X86_SSE41;
21850 for (size_t k = 1; k <= 40; k += 9) {
21851 for (uint32_t m = 1; m <= 1; m++) {
21852 for (uint32_t n = 1; n <= 4; n++) {
21853 GemmMicrokernelTester()
21854 .mr(1)
21855 .nr(4)
21856 .kr(8)
21857 .sr(1)
21858 .m(m)
21859 .n(n)
21860 .k(k)
21861 .cm_stride(7)
21862 .iterations(1)
21863 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
21864 }
21865 }
21866 }
21867 }
21868
21869 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, qmin) {
21870 TEST_REQUIRES_X86_SSE41;
21871 GemmMicrokernelTester()
21872 .mr(1)
21873 .nr(4)
21874 .kr(8)
21875 .sr(1)
21876 .m(1)
21877 .n(4)
21878 .k(8)
21879 .qmin(128)
21880 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
21881 }
21882
21883 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, qmax) {
21884 TEST_REQUIRES_X86_SSE41;
21885 GemmMicrokernelTester()
21886 .mr(1)
21887 .nr(4)
21888 .kr(8)
21889 .sr(1)
21890 .m(1)
21891 .n(4)
21892 .k(8)
21893 .qmax(128)
21894 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
21895 }
21896
21897 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, strided_cm) {
21898 TEST_REQUIRES_X86_SSE41;
21899 GemmMicrokernelTester()
21900 .mr(1)
21901 .nr(4)
21902 .kr(8)
21903 .sr(1)
21904 .m(1)
21905 .n(4)
21906 .k(8)
21907 .cm_stride(7)
21908 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
21909 }
21910#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
21911
21912
21913#if XNN_ARCH_X86 || XNN_ARCH_X86_64
21914 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_eq_8) {
21915 TEST_REQUIRES_X86_SSE41;
21916 GemmMicrokernelTester()
21917 .mr(2)
21918 .nr(4)
21919 .kr(8)
21920 .sr(1)
21921 .m(2)
21922 .n(4)
21923 .k(8)
21924 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
21925 }
21926
21927 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, strided_cn) {
21928 TEST_REQUIRES_X86_SSE41;
21929 GemmMicrokernelTester()
21930 .mr(2)
21931 .nr(4)
21932 .kr(8)
21933 .sr(1)
21934 .m(2)
21935 .n(4)
21936 .k(8)
21937 .cn_stride(7)
21938 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
21939 }
21940
21941 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_eq_8_strided_a) {
21942 TEST_REQUIRES_X86_SSE41;
21943 GemmMicrokernelTester()
21944 .mr(2)
21945 .nr(4)
21946 .kr(8)
21947 .sr(1)
21948 .m(2)
21949 .n(4)
21950 .k(8)
21951 .a_stride(11)
21952 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
21953 }
21954
21955 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_eq_8_subtile) {
21956 TEST_REQUIRES_X86_SSE41;
21957 for (uint32_t m = 1; m <= 2; m++) {
21958 for (uint32_t n = 1; n <= 4; n++) {
21959 GemmMicrokernelTester()
21960 .mr(2)
21961 .nr(4)
21962 .kr(8)
21963 .sr(1)
21964 .m(m)
21965 .n(n)
21966 .k(8)
21967 .iterations(1)
21968 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
21969 }
21970 }
21971 }
21972
21973 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_eq_8_subtile_m) {
21974 TEST_REQUIRES_X86_SSE41;
21975 for (uint32_t m = 1; m <= 2; m++) {
21976 GemmMicrokernelTester()
21977 .mr(2)
21978 .nr(4)
21979 .kr(8)
21980 .sr(1)
21981 .m(m)
21982 .n(4)
21983 .k(8)
21984 .iterations(1)
21985 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
21986 }
21987 }
21988
21989 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_eq_8_subtile_n) {
21990 TEST_REQUIRES_X86_SSE41;
21991 for (uint32_t n = 1; n <= 4; n++) {
21992 GemmMicrokernelTester()
21993 .mr(2)
21994 .nr(4)
21995 .kr(8)
21996 .sr(1)
21997 .m(2)
21998 .n(n)
21999 .k(8)
22000 .iterations(1)
22001 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
22002 }
22003 }
22004
22005 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_lt_8) {
22006 TEST_REQUIRES_X86_SSE41;
22007 for (size_t k = 1; k < 8; k++) {
22008 GemmMicrokernelTester()
22009 .mr(2)
22010 .nr(4)
22011 .kr(8)
22012 .sr(1)
22013 .m(2)
22014 .n(4)
22015 .k(k)
22016 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
22017 }
22018 }
22019
22020 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_lt_8_strided_a) {
22021 TEST_REQUIRES_X86_SSE41;
22022 for (size_t k = 1; k < 8; k++) {
22023 GemmMicrokernelTester()
22024 .mr(2)
22025 .nr(4)
22026 .kr(8)
22027 .sr(1)
22028 .m(2)
22029 .n(4)
22030 .k(k)
22031 .a_stride(11)
22032 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
22033 }
22034 }
22035
22036 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_lt_8_subtile) {
22037 TEST_REQUIRES_X86_SSE41;
22038 for (size_t k = 1; k < 8; k++) {
22039 for (uint32_t m = 1; m <= 2; m++) {
22040 for (uint32_t n = 1; n <= 4; n++) {
22041 GemmMicrokernelTester()
22042 .mr(2)
22043 .nr(4)
22044 .kr(8)
22045 .sr(1)
22046 .m(m)
22047 .n(n)
22048 .k(k)
22049 .iterations(1)
22050 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
22051 }
22052 }
22053 }
22054 }
22055
22056 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_gt_8) {
22057 TEST_REQUIRES_X86_SSE41;
22058 for (size_t k = 9; k < 16; k++) {
22059 GemmMicrokernelTester()
22060 .mr(2)
22061 .nr(4)
22062 .kr(8)
22063 .sr(1)
22064 .m(2)
22065 .n(4)
22066 .k(k)
22067 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
22068 }
22069 }
22070
22071 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_gt_8_strided_a) {
22072 TEST_REQUIRES_X86_SSE41;
22073 for (size_t k = 9; k < 16; k++) {
22074 GemmMicrokernelTester()
22075 .mr(2)
22076 .nr(4)
22077 .kr(8)
22078 .sr(1)
22079 .m(2)
22080 .n(4)
22081 .k(k)
22082 .a_stride(19)
22083 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
22084 }
22085 }
22086
22087 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_gt_8_subtile) {
22088 TEST_REQUIRES_X86_SSE41;
22089 for (size_t k = 9; k < 16; k++) {
22090 for (uint32_t m = 1; m <= 2; m++) {
22091 for (uint32_t n = 1; n <= 4; n++) {
22092 GemmMicrokernelTester()
22093 .mr(2)
22094 .nr(4)
22095 .kr(8)
22096 .sr(1)
22097 .m(m)
22098 .n(n)
22099 .k(k)
22100 .iterations(1)
22101 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
22102 }
22103 }
22104 }
22105 }
22106
22107 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_div_8) {
22108 TEST_REQUIRES_X86_SSE41;
22109 for (size_t k = 16; k <= 80; k += 8) {
22110 GemmMicrokernelTester()
22111 .mr(2)
22112 .nr(4)
22113 .kr(8)
22114 .sr(1)
22115 .m(2)
22116 .n(4)
22117 .k(k)
22118 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
22119 }
22120 }
22121
22122 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_div_8_strided_a) {
22123 TEST_REQUIRES_X86_SSE41;
22124 for (size_t k = 16; k <= 80; k += 8) {
22125 GemmMicrokernelTester()
22126 .mr(2)
22127 .nr(4)
22128 .kr(8)
22129 .sr(1)
22130 .m(2)
22131 .n(4)
22132 .k(k)
22133 .a_stride(83)
22134 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
22135 }
22136 }
22137
22138 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_div_8_subtile) {
22139 TEST_REQUIRES_X86_SSE41;
22140 for (size_t k = 16; k <= 80; k += 8) {
22141 for (uint32_t m = 1; m <= 2; m++) {
22142 for (uint32_t n = 1; n <= 4; n++) {
22143 GemmMicrokernelTester()
22144 .mr(2)
22145 .nr(4)
22146 .kr(8)
22147 .sr(1)
22148 .m(m)
22149 .n(n)
22150 .k(k)
22151 .iterations(1)
22152 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
22153 }
22154 }
22155 }
22156 }
22157
22158 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, n_gt_4) {
22159 TEST_REQUIRES_X86_SSE41;
22160 for (uint32_t n = 5; n < 8; n++) {
22161 for (size_t k = 1; k <= 40; k += 9) {
22162 GemmMicrokernelTester()
22163 .mr(2)
22164 .nr(4)
22165 .kr(8)
22166 .sr(1)
22167 .m(2)
22168 .n(4)
22169 .k(k)
22170 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
22171 }
22172 }
22173 }
22174
22175 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, n_gt_4_strided_cn) {
22176 TEST_REQUIRES_X86_SSE41;
22177 for (uint32_t n = 5; n < 8; n++) {
22178 for (size_t k = 1; k <= 40; k += 9) {
22179 GemmMicrokernelTester()
22180 .mr(2)
22181 .nr(4)
22182 .kr(8)
22183 .sr(1)
22184 .m(2)
22185 .n(4)
22186 .k(k)
22187 .cn_stride(7)
22188 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
22189 }
22190 }
22191 }
22192
22193 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, n_gt_4_strided_a) {
22194 TEST_REQUIRES_X86_SSE41;
22195 for (uint32_t n = 5; n < 8; n++) {
22196 for (size_t k = 1; k <= 40; k += 9) {
22197 GemmMicrokernelTester()
22198 .mr(2)
22199 .nr(4)
22200 .kr(8)
22201 .sr(1)
22202 .m(2)
22203 .n(n)
22204 .k(k)
22205 .a_stride(43)
22206 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
22207 }
22208 }
22209 }
22210
22211 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, n_gt_4_subtile) {
22212 TEST_REQUIRES_X86_SSE41;
22213 for (uint32_t n = 5; n < 8; n++) {
22214 for (size_t k = 1; k <= 40; k += 9) {
22215 for (uint32_t m = 1; m <= 2; m++) {
22216 GemmMicrokernelTester()
22217 .mr(2)
22218 .nr(4)
22219 .kr(8)
22220 .sr(1)
22221 .m(m)
22222 .n(n)
22223 .k(k)
22224 .iterations(1)
22225 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
22226 }
22227 }
22228 }
22229 }
22230
22231 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, n_div_4) {
22232 TEST_REQUIRES_X86_SSE41;
22233 for (uint32_t n = 8; n <= 12; n += 4) {
22234 for (size_t k = 1; k <= 40; k += 9) {
22235 GemmMicrokernelTester()
22236 .mr(2)
22237 .nr(4)
22238 .kr(8)
22239 .sr(1)
22240 .m(2)
22241 .n(4)
22242 .k(k)
22243 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
22244 }
22245 }
22246 }
22247
22248 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, n_div_4_strided_cn) {
22249 TEST_REQUIRES_X86_SSE41;
22250 for (uint32_t n = 8; n <= 12; n += 4) {
22251 for (size_t k = 1; k <= 40; k += 9) {
22252 GemmMicrokernelTester()
22253 .mr(2)
22254 .nr(4)
22255 .kr(8)
22256 .sr(1)
22257 .m(2)
22258 .n(n)
22259 .k(k)
22260 .cn_stride(7)
22261 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
22262 }
22263 }
22264 }
22265
22266 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, n_div_4_strided_a) {
22267 TEST_REQUIRES_X86_SSE41;
22268 for (uint32_t n = 8; n <= 12; n += 4) {
22269 for (size_t k = 1; k <= 40; k += 9) {
22270 GemmMicrokernelTester()
22271 .mr(2)
22272 .nr(4)
22273 .kr(8)
22274 .sr(1)
22275 .m(2)
22276 .n(n)
22277 .k(k)
22278 .a_stride(43)
22279 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
22280 }
22281 }
22282 }
22283
22284 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, n_div_4_subtile) {
22285 TEST_REQUIRES_X86_SSE41;
22286 for (uint32_t n = 8; n <= 12; n += 4) {
22287 for (size_t k = 1; k <= 40; k += 9) {
22288 for (uint32_t m = 1; m <= 2; m++) {
22289 GemmMicrokernelTester()
22290 .mr(2)
22291 .nr(4)
22292 .kr(8)
22293 .sr(1)
22294 .m(m)
22295 .n(n)
22296 .k(k)
22297 .iterations(1)
22298 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
22299 }
22300 }
22301 }
22302 }
22303
22304 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, strided_cm_subtile) {
22305 TEST_REQUIRES_X86_SSE41;
22306 for (size_t k = 1; k <= 40; k += 9) {
22307 for (uint32_t m = 1; m <= 2; m++) {
22308 for (uint32_t n = 1; n <= 4; n++) {
22309 GemmMicrokernelTester()
22310 .mr(2)
22311 .nr(4)
22312 .kr(8)
22313 .sr(1)
22314 .m(m)
22315 .n(n)
22316 .k(k)
22317 .cm_stride(7)
22318 .iterations(1)
22319 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
22320 }
22321 }
22322 }
22323 }
22324
22325 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, qmin) {
22326 TEST_REQUIRES_X86_SSE41;
22327 GemmMicrokernelTester()
22328 .mr(2)
22329 .nr(4)
22330 .kr(8)
22331 .sr(1)
22332 .m(2)
22333 .n(4)
22334 .k(8)
22335 .qmin(128)
22336 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
22337 }
22338
22339 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, qmax) {
22340 TEST_REQUIRES_X86_SSE41;
22341 GemmMicrokernelTester()
22342 .mr(2)
22343 .nr(4)
22344 .kr(8)
22345 .sr(1)
22346 .m(2)
22347 .n(4)
22348 .k(8)
22349 .qmax(128)
22350 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
22351 }
22352
22353 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, strided_cm) {
22354 TEST_REQUIRES_X86_SSE41;
22355 GemmMicrokernelTester()
22356 .mr(2)
22357 .nr(4)
22358 .kr(8)
22359 .sr(1)
22360 .m(2)
22361 .n(4)
22362 .k(8)
22363 .cm_stride(7)
22364 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
22365 }
22366#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
22367
22368
22369#if XNN_ARCH_X86 || XNN_ARCH_X86_64
22370 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_eq_8) {
22371 TEST_REQUIRES_X86_SSE41;
22372 GemmMicrokernelTester()
22373 .mr(3)
22374 .nr(4)
22375 .kr(8)
22376 .sr(1)
22377 .m(3)
22378 .n(4)
22379 .k(8)
22380 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
22381 }
22382
22383 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, strided_cn) {
22384 TEST_REQUIRES_X86_SSE41;
22385 GemmMicrokernelTester()
22386 .mr(3)
22387 .nr(4)
22388 .kr(8)
22389 .sr(1)
22390 .m(3)
22391 .n(4)
22392 .k(8)
22393 .cn_stride(7)
22394 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
22395 }
22396
22397 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_eq_8_strided_a) {
22398 TEST_REQUIRES_X86_SSE41;
22399 GemmMicrokernelTester()
22400 .mr(3)
22401 .nr(4)
22402 .kr(8)
22403 .sr(1)
22404 .m(3)
22405 .n(4)
22406 .k(8)
22407 .a_stride(11)
22408 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
22409 }
22410
22411 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_eq_8_subtile) {
22412 TEST_REQUIRES_X86_SSE41;
22413 for (uint32_t m = 1; m <= 3; m++) {
22414 for (uint32_t n = 1; n <= 4; n++) {
22415 GemmMicrokernelTester()
22416 .mr(3)
22417 .nr(4)
22418 .kr(8)
22419 .sr(1)
22420 .m(m)
22421 .n(n)
22422 .k(8)
22423 .iterations(1)
22424 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
22425 }
22426 }
22427 }
22428
22429 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_eq_8_subtile_m) {
22430 TEST_REQUIRES_X86_SSE41;
22431 for (uint32_t m = 1; m <= 3; m++) {
22432 GemmMicrokernelTester()
22433 .mr(3)
22434 .nr(4)
22435 .kr(8)
22436 .sr(1)
22437 .m(m)
22438 .n(4)
22439 .k(8)
22440 .iterations(1)
22441 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
22442 }
22443 }
22444
22445 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_eq_8_subtile_n) {
22446 TEST_REQUIRES_X86_SSE41;
22447 for (uint32_t n = 1; n <= 4; n++) {
22448 GemmMicrokernelTester()
22449 .mr(3)
22450 .nr(4)
22451 .kr(8)
22452 .sr(1)
22453 .m(3)
22454 .n(n)
22455 .k(8)
22456 .iterations(1)
22457 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
22458 }
22459 }
22460
22461 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_lt_8) {
22462 TEST_REQUIRES_X86_SSE41;
22463 for (size_t k = 1; k < 8; k++) {
22464 GemmMicrokernelTester()
22465 .mr(3)
22466 .nr(4)
22467 .kr(8)
22468 .sr(1)
22469 .m(3)
22470 .n(4)
22471 .k(k)
22472 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
22473 }
22474 }
22475
22476 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_lt_8_strided_a) {
22477 TEST_REQUIRES_X86_SSE41;
22478 for (size_t k = 1; k < 8; k++) {
22479 GemmMicrokernelTester()
22480 .mr(3)
22481 .nr(4)
22482 .kr(8)
22483 .sr(1)
22484 .m(3)
22485 .n(4)
22486 .k(k)
22487 .a_stride(11)
22488 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
22489 }
22490 }
22491
22492 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_lt_8_subtile) {
22493 TEST_REQUIRES_X86_SSE41;
22494 for (size_t k = 1; k < 8; k++) {
22495 for (uint32_t m = 1; m <= 3; m++) {
22496 for (uint32_t n = 1; n <= 4; n++) {
22497 GemmMicrokernelTester()
22498 .mr(3)
22499 .nr(4)
22500 .kr(8)
22501 .sr(1)
22502 .m(m)
22503 .n(n)
22504 .k(k)
22505 .iterations(1)
22506 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
22507 }
22508 }
22509 }
22510 }
22511
22512 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_gt_8) {
22513 TEST_REQUIRES_X86_SSE41;
22514 for (size_t k = 9; k < 16; k++) {
22515 GemmMicrokernelTester()
22516 .mr(3)
22517 .nr(4)
22518 .kr(8)
22519 .sr(1)
22520 .m(3)
22521 .n(4)
22522 .k(k)
22523 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
22524 }
22525 }
22526
22527 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_gt_8_strided_a) {
22528 TEST_REQUIRES_X86_SSE41;
22529 for (size_t k = 9; k < 16; k++) {
22530 GemmMicrokernelTester()
22531 .mr(3)
22532 .nr(4)
22533 .kr(8)
22534 .sr(1)
22535 .m(3)
22536 .n(4)
22537 .k(k)
22538 .a_stride(19)
22539 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
22540 }
22541 }
22542
22543 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_gt_8_subtile) {
22544 TEST_REQUIRES_X86_SSE41;
22545 for (size_t k = 9; k < 16; k++) {
22546 for (uint32_t m = 1; m <= 3; m++) {
22547 for (uint32_t n = 1; n <= 4; n++) {
22548 GemmMicrokernelTester()
22549 .mr(3)
22550 .nr(4)
22551 .kr(8)
22552 .sr(1)
22553 .m(m)
22554 .n(n)
22555 .k(k)
22556 .iterations(1)
22557 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
22558 }
22559 }
22560 }
22561 }
22562
22563 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_div_8) {
22564 TEST_REQUIRES_X86_SSE41;
22565 for (size_t k = 16; k <= 80; k += 8) {
22566 GemmMicrokernelTester()
22567 .mr(3)
22568 .nr(4)
22569 .kr(8)
22570 .sr(1)
22571 .m(3)
22572 .n(4)
22573 .k(k)
22574 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
22575 }
22576 }
22577
22578 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_div_8_strided_a) {
22579 TEST_REQUIRES_X86_SSE41;
22580 for (size_t k = 16; k <= 80; k += 8) {
22581 GemmMicrokernelTester()
22582 .mr(3)
22583 .nr(4)
22584 .kr(8)
22585 .sr(1)
22586 .m(3)
22587 .n(4)
22588 .k(k)
22589 .a_stride(83)
22590 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
22591 }
22592 }
22593
22594 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_div_8_subtile) {
22595 TEST_REQUIRES_X86_SSE41;
22596 for (size_t k = 16; k <= 80; k += 8) {
22597 for (uint32_t m = 1; m <= 3; m++) {
22598 for (uint32_t n = 1; n <= 4; n++) {
22599 GemmMicrokernelTester()
22600 .mr(3)
22601 .nr(4)
22602 .kr(8)
22603 .sr(1)
22604 .m(m)
22605 .n(n)
22606 .k(k)
22607 .iterations(1)
22608 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
22609 }
22610 }
22611 }
22612 }
22613
22614 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, n_gt_4) {
22615 TEST_REQUIRES_X86_SSE41;
22616 for (uint32_t n = 5; n < 8; n++) {
22617 for (size_t k = 1; k <= 40; k += 9) {
22618 GemmMicrokernelTester()
22619 .mr(3)
22620 .nr(4)
22621 .kr(8)
22622 .sr(1)
22623 .m(3)
22624 .n(4)
22625 .k(k)
22626 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
22627 }
22628 }
22629 }
22630
22631 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, n_gt_4_strided_cn) {
22632 TEST_REQUIRES_X86_SSE41;
22633 for (uint32_t n = 5; n < 8; n++) {
22634 for (size_t k = 1; k <= 40; k += 9) {
22635 GemmMicrokernelTester()
22636 .mr(3)
22637 .nr(4)
22638 .kr(8)
22639 .sr(1)
22640 .m(3)
22641 .n(4)
22642 .k(k)
22643 .cn_stride(7)
22644 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
22645 }
22646 }
22647 }
22648
22649 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, n_gt_4_strided_a) {
22650 TEST_REQUIRES_X86_SSE41;
22651 for (uint32_t n = 5; n < 8; n++) {
22652 for (size_t k = 1; k <= 40; k += 9) {
22653 GemmMicrokernelTester()
22654 .mr(3)
22655 .nr(4)
22656 .kr(8)
22657 .sr(1)
22658 .m(3)
22659 .n(n)
22660 .k(k)
22661 .a_stride(43)
22662 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
22663 }
22664 }
22665 }
22666
22667 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, n_gt_4_subtile) {
22668 TEST_REQUIRES_X86_SSE41;
22669 for (uint32_t n = 5; n < 8; n++) {
22670 for (size_t k = 1; k <= 40; k += 9) {
22671 for (uint32_t m = 1; m <= 3; m++) {
22672 GemmMicrokernelTester()
22673 .mr(3)
22674 .nr(4)
22675 .kr(8)
22676 .sr(1)
22677 .m(m)
22678 .n(n)
22679 .k(k)
22680 .iterations(1)
22681 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
22682 }
22683 }
22684 }
22685 }
22686
22687 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, n_div_4) {
22688 TEST_REQUIRES_X86_SSE41;
22689 for (uint32_t n = 8; n <= 12; n += 4) {
22690 for (size_t k = 1; k <= 40; k += 9) {
22691 GemmMicrokernelTester()
22692 .mr(3)
22693 .nr(4)
22694 .kr(8)
22695 .sr(1)
22696 .m(3)
22697 .n(4)
22698 .k(k)
22699 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
22700 }
22701 }
22702 }
22703
22704 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, n_div_4_strided_cn) {
22705 TEST_REQUIRES_X86_SSE41;
22706 for (uint32_t n = 8; n <= 12; n += 4) {
22707 for (size_t k = 1; k <= 40; k += 9) {
22708 GemmMicrokernelTester()
22709 .mr(3)
22710 .nr(4)
22711 .kr(8)
22712 .sr(1)
22713 .m(3)
22714 .n(n)
22715 .k(k)
22716 .cn_stride(7)
22717 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
22718 }
22719 }
22720 }
22721
22722 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, n_div_4_strided_a) {
22723 TEST_REQUIRES_X86_SSE41;
22724 for (uint32_t n = 8; n <= 12; n += 4) {
22725 for (size_t k = 1; k <= 40; k += 9) {
22726 GemmMicrokernelTester()
22727 .mr(3)
22728 .nr(4)
22729 .kr(8)
22730 .sr(1)
22731 .m(3)
22732 .n(n)
22733 .k(k)
22734 .a_stride(43)
22735 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
22736 }
22737 }
22738 }
22739
22740 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, n_div_4_subtile) {
22741 TEST_REQUIRES_X86_SSE41;
22742 for (uint32_t n = 8; n <= 12; n += 4) {
22743 for (size_t k = 1; k <= 40; k += 9) {
22744 for (uint32_t m = 1; m <= 3; m++) {
22745 GemmMicrokernelTester()
22746 .mr(3)
22747 .nr(4)
22748 .kr(8)
22749 .sr(1)
22750 .m(m)
22751 .n(n)
22752 .k(k)
22753 .iterations(1)
22754 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
22755 }
22756 }
22757 }
22758 }
22759
22760 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, strided_cm_subtile) {
22761 TEST_REQUIRES_X86_SSE41;
22762 for (size_t k = 1; k <= 40; k += 9) {
22763 for (uint32_t m = 1; m <= 3; m++) {
22764 for (uint32_t n = 1; n <= 4; n++) {
22765 GemmMicrokernelTester()
22766 .mr(3)
22767 .nr(4)
22768 .kr(8)
22769 .sr(1)
22770 .m(m)
22771 .n(n)
22772 .k(k)
22773 .cm_stride(7)
22774 .iterations(1)
22775 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
22776 }
22777 }
22778 }
22779 }
22780
22781 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, qmin) {
22782 TEST_REQUIRES_X86_SSE41;
22783 GemmMicrokernelTester()
22784 .mr(3)
22785 .nr(4)
22786 .kr(8)
22787 .sr(1)
22788 .m(3)
22789 .n(4)
22790 .k(8)
22791 .qmin(128)
22792 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
22793 }
22794
22795 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, qmax) {
22796 TEST_REQUIRES_X86_SSE41;
22797 GemmMicrokernelTester()
22798 .mr(3)
22799 .nr(4)
22800 .kr(8)
22801 .sr(1)
22802 .m(3)
22803 .n(4)
22804 .k(8)
22805 .qmax(128)
22806 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
22807 }
22808
22809 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, strided_cm) {
22810 TEST_REQUIRES_X86_SSE41;
22811 GemmMicrokernelTester()
22812 .mr(3)
22813 .nr(4)
22814 .kr(8)
22815 .sr(1)
22816 .m(3)
22817 .n(4)
22818 .k(8)
22819 .cm_stride(7)
22820 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
22821 }
22822#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
22823
22824
22825#if XNN_ARCH_X86 || XNN_ARCH_X86_64
22826 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_eq_8) {
22827 TEST_REQUIRES_X86_AVX;
22828 GemmMicrokernelTester()
22829 .mr(1)
22830 .nr(4)
22831 .kr(8)
22832 .sr(1)
22833 .m(1)
22834 .n(4)
22835 .k(8)
22836 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
22837 }
22838
22839 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, strided_cn) {
22840 TEST_REQUIRES_X86_AVX;
22841 GemmMicrokernelTester()
22842 .mr(1)
22843 .nr(4)
22844 .kr(8)
22845 .sr(1)
22846 .m(1)
22847 .n(4)
22848 .k(8)
22849 .cn_stride(7)
22850 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
22851 }
22852
22853 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_eq_8_strided_a) {
22854 TEST_REQUIRES_X86_AVX;
22855 GemmMicrokernelTester()
22856 .mr(1)
22857 .nr(4)
22858 .kr(8)
22859 .sr(1)
22860 .m(1)
22861 .n(4)
22862 .k(8)
22863 .a_stride(11)
22864 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
22865 }
22866
22867 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_eq_8_subtile) {
22868 TEST_REQUIRES_X86_AVX;
22869 for (uint32_t m = 1; m <= 1; m++) {
22870 for (uint32_t n = 1; n <= 4; n++) {
22871 GemmMicrokernelTester()
22872 .mr(1)
22873 .nr(4)
22874 .kr(8)
22875 .sr(1)
22876 .m(m)
22877 .n(n)
22878 .k(8)
22879 .iterations(1)
22880 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
22881 }
22882 }
22883 }
22884
22885 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_eq_8_subtile_m) {
22886 TEST_REQUIRES_X86_AVX;
22887 for (uint32_t m = 1; m <= 1; m++) {
22888 GemmMicrokernelTester()
22889 .mr(1)
22890 .nr(4)
22891 .kr(8)
22892 .sr(1)
22893 .m(m)
22894 .n(4)
22895 .k(8)
22896 .iterations(1)
22897 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
22898 }
22899 }
22900
22901 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_eq_8_subtile_n) {
22902 TEST_REQUIRES_X86_AVX;
22903 for (uint32_t n = 1; n <= 4; n++) {
22904 GemmMicrokernelTester()
22905 .mr(1)
22906 .nr(4)
22907 .kr(8)
22908 .sr(1)
22909 .m(1)
22910 .n(n)
22911 .k(8)
22912 .iterations(1)
22913 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
22914 }
22915 }
22916
22917 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_lt_8) {
22918 TEST_REQUIRES_X86_AVX;
22919 for (size_t k = 1; k < 8; k++) {
22920 GemmMicrokernelTester()
22921 .mr(1)
22922 .nr(4)
22923 .kr(8)
22924 .sr(1)
22925 .m(1)
22926 .n(4)
22927 .k(k)
22928 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
22929 }
22930 }
22931
22932 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_lt_8_strided_a) {
22933 TEST_REQUIRES_X86_AVX;
22934 for (size_t k = 1; k < 8; k++) {
22935 GemmMicrokernelTester()
22936 .mr(1)
22937 .nr(4)
22938 .kr(8)
22939 .sr(1)
22940 .m(1)
22941 .n(4)
22942 .k(k)
22943 .a_stride(11)
22944 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
22945 }
22946 }
22947
22948 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_lt_8_subtile) {
22949 TEST_REQUIRES_X86_AVX;
22950 for (size_t k = 1; k < 8; k++) {
22951 for (uint32_t m = 1; m <= 1; m++) {
22952 for (uint32_t n = 1; n <= 4; n++) {
22953 GemmMicrokernelTester()
22954 .mr(1)
22955 .nr(4)
22956 .kr(8)
22957 .sr(1)
22958 .m(m)
22959 .n(n)
22960 .k(k)
22961 .iterations(1)
22962 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
22963 }
22964 }
22965 }
22966 }
22967
22968 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_gt_8) {
22969 TEST_REQUIRES_X86_AVX;
22970 for (size_t k = 9; k < 16; k++) {
22971 GemmMicrokernelTester()
22972 .mr(1)
22973 .nr(4)
22974 .kr(8)
22975 .sr(1)
22976 .m(1)
22977 .n(4)
22978 .k(k)
22979 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
22980 }
22981 }
22982
22983 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_gt_8_strided_a) {
22984 TEST_REQUIRES_X86_AVX;
22985 for (size_t k = 9; k < 16; k++) {
22986 GemmMicrokernelTester()
22987 .mr(1)
22988 .nr(4)
22989 .kr(8)
22990 .sr(1)
22991 .m(1)
22992 .n(4)
22993 .k(k)
22994 .a_stride(19)
22995 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
22996 }
22997 }
22998
22999 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_gt_8_subtile) {
23000 TEST_REQUIRES_X86_AVX;
23001 for (size_t k = 9; k < 16; k++) {
23002 for (uint32_t m = 1; m <= 1; m++) {
23003 for (uint32_t n = 1; n <= 4; n++) {
23004 GemmMicrokernelTester()
23005 .mr(1)
23006 .nr(4)
23007 .kr(8)
23008 .sr(1)
23009 .m(m)
23010 .n(n)
23011 .k(k)
23012 .iterations(1)
23013 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
23014 }
23015 }
23016 }
23017 }
23018
23019 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_div_8) {
23020 TEST_REQUIRES_X86_AVX;
23021 for (size_t k = 16; k <= 80; k += 8) {
23022 GemmMicrokernelTester()
23023 .mr(1)
23024 .nr(4)
23025 .kr(8)
23026 .sr(1)
23027 .m(1)
23028 .n(4)
23029 .k(k)
23030 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
23031 }
23032 }
23033
23034 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_div_8_strided_a) {
23035 TEST_REQUIRES_X86_AVX;
23036 for (size_t k = 16; k <= 80; k += 8) {
23037 GemmMicrokernelTester()
23038 .mr(1)
23039 .nr(4)
23040 .kr(8)
23041 .sr(1)
23042 .m(1)
23043 .n(4)
23044 .k(k)
23045 .a_stride(83)
23046 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
23047 }
23048 }
23049
23050 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_div_8_subtile) {
23051 TEST_REQUIRES_X86_AVX;
23052 for (size_t k = 16; k <= 80; k += 8) {
23053 for (uint32_t m = 1; m <= 1; m++) {
23054 for (uint32_t n = 1; n <= 4; n++) {
23055 GemmMicrokernelTester()
23056 .mr(1)
23057 .nr(4)
23058 .kr(8)
23059 .sr(1)
23060 .m(m)
23061 .n(n)
23062 .k(k)
23063 .iterations(1)
23064 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
23065 }
23066 }
23067 }
23068 }
23069
23070 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, n_gt_4) {
23071 TEST_REQUIRES_X86_AVX;
23072 for (uint32_t n = 5; n < 8; n++) {
23073 for (size_t k = 1; k <= 40; k += 9) {
23074 GemmMicrokernelTester()
23075 .mr(1)
23076 .nr(4)
23077 .kr(8)
23078 .sr(1)
23079 .m(1)
23080 .n(4)
23081 .k(k)
23082 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
23083 }
23084 }
23085 }
23086
23087 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, n_gt_4_strided_cn) {
23088 TEST_REQUIRES_X86_AVX;
23089 for (uint32_t n = 5; n < 8; n++) {
23090 for (size_t k = 1; k <= 40; k += 9) {
23091 GemmMicrokernelTester()
23092 .mr(1)
23093 .nr(4)
23094 .kr(8)
23095 .sr(1)
23096 .m(1)
23097 .n(4)
23098 .k(k)
23099 .cn_stride(7)
23100 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
23101 }
23102 }
23103 }
23104
23105 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, n_gt_4_strided_a) {
23106 TEST_REQUIRES_X86_AVX;
23107 for (uint32_t n = 5; n < 8; n++) {
23108 for (size_t k = 1; k <= 40; k += 9) {
23109 GemmMicrokernelTester()
23110 .mr(1)
23111 .nr(4)
23112 .kr(8)
23113 .sr(1)
23114 .m(1)
23115 .n(n)
23116 .k(k)
23117 .a_stride(43)
23118 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
23119 }
23120 }
23121 }
23122
23123 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, n_gt_4_subtile) {
23124 TEST_REQUIRES_X86_AVX;
23125 for (uint32_t n = 5; n < 8; n++) {
23126 for (size_t k = 1; k <= 40; k += 9) {
23127 for (uint32_t m = 1; m <= 1; m++) {
23128 GemmMicrokernelTester()
23129 .mr(1)
23130 .nr(4)
23131 .kr(8)
23132 .sr(1)
23133 .m(m)
23134 .n(n)
23135 .k(k)
23136 .iterations(1)
23137 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
23138 }
23139 }
23140 }
23141 }
23142
23143 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, n_div_4) {
23144 TEST_REQUIRES_X86_AVX;
23145 for (uint32_t n = 8; n <= 12; n += 4) {
23146 for (size_t k = 1; k <= 40; k += 9) {
23147 GemmMicrokernelTester()
23148 .mr(1)
23149 .nr(4)
23150 .kr(8)
23151 .sr(1)
23152 .m(1)
23153 .n(4)
23154 .k(k)
23155 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
23156 }
23157 }
23158 }
23159
23160 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, n_div_4_strided_cn) {
23161 TEST_REQUIRES_X86_AVX;
23162 for (uint32_t n = 8; n <= 12; n += 4) {
23163 for (size_t k = 1; k <= 40; k += 9) {
23164 GemmMicrokernelTester()
23165 .mr(1)
23166 .nr(4)
23167 .kr(8)
23168 .sr(1)
23169 .m(1)
23170 .n(n)
23171 .k(k)
23172 .cn_stride(7)
23173 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
23174 }
23175 }
23176 }
23177
23178 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, n_div_4_strided_a) {
23179 TEST_REQUIRES_X86_AVX;
23180 for (uint32_t n = 8; n <= 12; n += 4) {
23181 for (size_t k = 1; k <= 40; k += 9) {
23182 GemmMicrokernelTester()
23183 .mr(1)
23184 .nr(4)
23185 .kr(8)
23186 .sr(1)
23187 .m(1)
23188 .n(n)
23189 .k(k)
23190 .a_stride(43)
23191 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
23192 }
23193 }
23194 }
23195
23196 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, n_div_4_subtile) {
23197 TEST_REQUIRES_X86_AVX;
23198 for (uint32_t n = 8; n <= 12; n += 4) {
23199 for (size_t k = 1; k <= 40; k += 9) {
23200 for (uint32_t m = 1; m <= 1; m++) {
23201 GemmMicrokernelTester()
23202 .mr(1)
23203 .nr(4)
23204 .kr(8)
23205 .sr(1)
23206 .m(m)
23207 .n(n)
23208 .k(k)
23209 .iterations(1)
23210 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
23211 }
23212 }
23213 }
23214 }
23215
23216 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, strided_cm_subtile) {
23217 TEST_REQUIRES_X86_AVX;
23218 for (size_t k = 1; k <= 40; k += 9) {
23219 for (uint32_t m = 1; m <= 1; m++) {
23220 for (uint32_t n = 1; n <= 4; n++) {
23221 GemmMicrokernelTester()
23222 .mr(1)
23223 .nr(4)
23224 .kr(8)
23225 .sr(1)
23226 .m(m)
23227 .n(n)
23228 .k(k)
23229 .cm_stride(7)
23230 .iterations(1)
23231 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
23232 }
23233 }
23234 }
23235 }
23236
23237 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, qmin) {
23238 TEST_REQUIRES_X86_AVX;
23239 GemmMicrokernelTester()
23240 .mr(1)
23241 .nr(4)
23242 .kr(8)
23243 .sr(1)
23244 .m(1)
23245 .n(4)
23246 .k(8)
23247 .qmin(128)
23248 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
23249 }
23250
23251 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, qmax) {
23252 TEST_REQUIRES_X86_AVX;
23253 GemmMicrokernelTester()
23254 .mr(1)
23255 .nr(4)
23256 .kr(8)
23257 .sr(1)
23258 .m(1)
23259 .n(4)
23260 .k(8)
23261 .qmax(128)
23262 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
23263 }
23264
23265 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, strided_cm) {
23266 TEST_REQUIRES_X86_AVX;
23267 GemmMicrokernelTester()
23268 .mr(1)
23269 .nr(4)
23270 .kr(8)
23271 .sr(1)
23272 .m(1)
23273 .n(4)
23274 .k(8)
23275 .cm_stride(7)
23276 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
23277 }
23278#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
23279
23280
23281#if XNN_ARCH_X86 || XNN_ARCH_X86_64
23282 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_eq_8) {
23283 TEST_REQUIRES_X86_AVX;
23284 GemmMicrokernelTester()
23285 .mr(2)
23286 .nr(4)
23287 .kr(8)
23288 .sr(1)
23289 .m(2)
23290 .n(4)
23291 .k(8)
23292 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
23293 }
23294
23295 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, strided_cn) {
23296 TEST_REQUIRES_X86_AVX;
23297 GemmMicrokernelTester()
23298 .mr(2)
23299 .nr(4)
23300 .kr(8)
23301 .sr(1)
23302 .m(2)
23303 .n(4)
23304 .k(8)
23305 .cn_stride(7)
23306 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
23307 }
23308
23309 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_eq_8_strided_a) {
23310 TEST_REQUIRES_X86_AVX;
23311 GemmMicrokernelTester()
23312 .mr(2)
23313 .nr(4)
23314 .kr(8)
23315 .sr(1)
23316 .m(2)
23317 .n(4)
23318 .k(8)
23319 .a_stride(11)
23320 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
23321 }
23322
23323 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_eq_8_subtile) {
23324 TEST_REQUIRES_X86_AVX;
23325 for (uint32_t m = 1; m <= 2; m++) {
23326 for (uint32_t n = 1; n <= 4; n++) {
23327 GemmMicrokernelTester()
23328 .mr(2)
23329 .nr(4)
23330 .kr(8)
23331 .sr(1)
23332 .m(m)
23333 .n(n)
23334 .k(8)
23335 .iterations(1)
23336 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
23337 }
23338 }
23339 }
23340
23341 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_eq_8_subtile_m) {
23342 TEST_REQUIRES_X86_AVX;
23343 for (uint32_t m = 1; m <= 2; m++) {
23344 GemmMicrokernelTester()
23345 .mr(2)
23346 .nr(4)
23347 .kr(8)
23348 .sr(1)
23349 .m(m)
23350 .n(4)
23351 .k(8)
23352 .iterations(1)
23353 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
23354 }
23355 }
23356
23357 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_eq_8_subtile_n) {
23358 TEST_REQUIRES_X86_AVX;
23359 for (uint32_t n = 1; n <= 4; n++) {
23360 GemmMicrokernelTester()
23361 .mr(2)
23362 .nr(4)
23363 .kr(8)
23364 .sr(1)
23365 .m(2)
23366 .n(n)
23367 .k(8)
23368 .iterations(1)
23369 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
23370 }
23371 }
23372
23373 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_lt_8) {
23374 TEST_REQUIRES_X86_AVX;
23375 for (size_t k = 1; k < 8; k++) {
23376 GemmMicrokernelTester()
23377 .mr(2)
23378 .nr(4)
23379 .kr(8)
23380 .sr(1)
23381 .m(2)
23382 .n(4)
23383 .k(k)
23384 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
23385 }
23386 }
23387
23388 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_lt_8_strided_a) {
23389 TEST_REQUIRES_X86_AVX;
23390 for (size_t k = 1; k < 8; k++) {
23391 GemmMicrokernelTester()
23392 .mr(2)
23393 .nr(4)
23394 .kr(8)
23395 .sr(1)
23396 .m(2)
23397 .n(4)
23398 .k(k)
23399 .a_stride(11)
23400 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
23401 }
23402 }
23403
23404 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_lt_8_subtile) {
23405 TEST_REQUIRES_X86_AVX;
23406 for (size_t k = 1; k < 8; k++) {
23407 for (uint32_t m = 1; m <= 2; m++) {
23408 for (uint32_t n = 1; n <= 4; n++) {
23409 GemmMicrokernelTester()
23410 .mr(2)
23411 .nr(4)
23412 .kr(8)
23413 .sr(1)
23414 .m(m)
23415 .n(n)
23416 .k(k)
23417 .iterations(1)
23418 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
23419 }
23420 }
23421 }
23422 }
23423
23424 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_gt_8) {
23425 TEST_REQUIRES_X86_AVX;
23426 for (size_t k = 9; k < 16; k++) {
23427 GemmMicrokernelTester()
23428 .mr(2)
23429 .nr(4)
23430 .kr(8)
23431 .sr(1)
23432 .m(2)
23433 .n(4)
23434 .k(k)
23435 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
23436 }
23437 }
23438
23439 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_gt_8_strided_a) {
23440 TEST_REQUIRES_X86_AVX;
23441 for (size_t k = 9; k < 16; k++) {
23442 GemmMicrokernelTester()
23443 .mr(2)
23444 .nr(4)
23445 .kr(8)
23446 .sr(1)
23447 .m(2)
23448 .n(4)
23449 .k(k)
23450 .a_stride(19)
23451 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
23452 }
23453 }
23454
23455 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_gt_8_subtile) {
23456 TEST_REQUIRES_X86_AVX;
23457 for (size_t k = 9; k < 16; k++) {
23458 for (uint32_t m = 1; m <= 2; m++) {
23459 for (uint32_t n = 1; n <= 4; n++) {
23460 GemmMicrokernelTester()
23461 .mr(2)
23462 .nr(4)
23463 .kr(8)
23464 .sr(1)
23465 .m(m)
23466 .n(n)
23467 .k(k)
23468 .iterations(1)
23469 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
23470 }
23471 }
23472 }
23473 }
23474
23475 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_div_8) {
23476 TEST_REQUIRES_X86_AVX;
23477 for (size_t k = 16; k <= 80; k += 8) {
23478 GemmMicrokernelTester()
23479 .mr(2)
23480 .nr(4)
23481 .kr(8)
23482 .sr(1)
23483 .m(2)
23484 .n(4)
23485 .k(k)
23486 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
23487 }
23488 }
23489
23490 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_div_8_strided_a) {
23491 TEST_REQUIRES_X86_AVX;
23492 for (size_t k = 16; k <= 80; k += 8) {
23493 GemmMicrokernelTester()
23494 .mr(2)
23495 .nr(4)
23496 .kr(8)
23497 .sr(1)
23498 .m(2)
23499 .n(4)
23500 .k(k)
23501 .a_stride(83)
23502 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
23503 }
23504 }
23505
23506 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_div_8_subtile) {
23507 TEST_REQUIRES_X86_AVX;
23508 for (size_t k = 16; k <= 80; k += 8) {
23509 for (uint32_t m = 1; m <= 2; m++) {
23510 for (uint32_t n = 1; n <= 4; n++) {
23511 GemmMicrokernelTester()
23512 .mr(2)
23513 .nr(4)
23514 .kr(8)
23515 .sr(1)
23516 .m(m)
23517 .n(n)
23518 .k(k)
23519 .iterations(1)
23520 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
23521 }
23522 }
23523 }
23524 }
23525
23526 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, n_gt_4) {
23527 TEST_REQUIRES_X86_AVX;
23528 for (uint32_t n = 5; n < 8; n++) {
23529 for (size_t k = 1; k <= 40; k += 9) {
23530 GemmMicrokernelTester()
23531 .mr(2)
23532 .nr(4)
23533 .kr(8)
23534 .sr(1)
23535 .m(2)
23536 .n(4)
23537 .k(k)
23538 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
23539 }
23540 }
23541 }
23542
23543 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, n_gt_4_strided_cn) {
23544 TEST_REQUIRES_X86_AVX;
23545 for (uint32_t n = 5; n < 8; n++) {
23546 for (size_t k = 1; k <= 40; k += 9) {
23547 GemmMicrokernelTester()
23548 .mr(2)
23549 .nr(4)
23550 .kr(8)
23551 .sr(1)
23552 .m(2)
23553 .n(4)
23554 .k(k)
23555 .cn_stride(7)
23556 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
23557 }
23558 }
23559 }
23560
23561 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, n_gt_4_strided_a) {
23562 TEST_REQUIRES_X86_AVX;
23563 for (uint32_t n = 5; n < 8; n++) {
23564 for (size_t k = 1; k <= 40; k += 9) {
23565 GemmMicrokernelTester()
23566 .mr(2)
23567 .nr(4)
23568 .kr(8)
23569 .sr(1)
23570 .m(2)
23571 .n(n)
23572 .k(k)
23573 .a_stride(43)
23574 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
23575 }
23576 }
23577 }
23578
23579 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, n_gt_4_subtile) {
23580 TEST_REQUIRES_X86_AVX;
23581 for (uint32_t n = 5; n < 8; n++) {
23582 for (size_t k = 1; k <= 40; k += 9) {
23583 for (uint32_t m = 1; m <= 2; m++) {
23584 GemmMicrokernelTester()
23585 .mr(2)
23586 .nr(4)
23587 .kr(8)
23588 .sr(1)
23589 .m(m)
23590 .n(n)
23591 .k(k)
23592 .iterations(1)
23593 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
23594 }
23595 }
23596 }
23597 }
23598
23599 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, n_div_4) {
23600 TEST_REQUIRES_X86_AVX;
23601 for (uint32_t n = 8; n <= 12; n += 4) {
23602 for (size_t k = 1; k <= 40; k += 9) {
23603 GemmMicrokernelTester()
23604 .mr(2)
23605 .nr(4)
23606 .kr(8)
23607 .sr(1)
23608 .m(2)
23609 .n(4)
23610 .k(k)
23611 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
23612 }
23613 }
23614 }
23615
23616 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, n_div_4_strided_cn) {
23617 TEST_REQUIRES_X86_AVX;
23618 for (uint32_t n = 8; n <= 12; n += 4) {
23619 for (size_t k = 1; k <= 40; k += 9) {
23620 GemmMicrokernelTester()
23621 .mr(2)
23622 .nr(4)
23623 .kr(8)
23624 .sr(1)
23625 .m(2)
23626 .n(n)
23627 .k(k)
23628 .cn_stride(7)
23629 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
23630 }
23631 }
23632 }
23633
23634 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, n_div_4_strided_a) {
23635 TEST_REQUIRES_X86_AVX;
23636 for (uint32_t n = 8; n <= 12; n += 4) {
23637 for (size_t k = 1; k <= 40; k += 9) {
23638 GemmMicrokernelTester()
23639 .mr(2)
23640 .nr(4)
23641 .kr(8)
23642 .sr(1)
23643 .m(2)
23644 .n(n)
23645 .k(k)
23646 .a_stride(43)
23647 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
23648 }
23649 }
23650 }
23651
23652 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, n_div_4_subtile) {
23653 TEST_REQUIRES_X86_AVX;
23654 for (uint32_t n = 8; n <= 12; n += 4) {
23655 for (size_t k = 1; k <= 40; k += 9) {
23656 for (uint32_t m = 1; m <= 2; m++) {
23657 GemmMicrokernelTester()
23658 .mr(2)
23659 .nr(4)
23660 .kr(8)
23661 .sr(1)
23662 .m(m)
23663 .n(n)
23664 .k(k)
23665 .iterations(1)
23666 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
23667 }
23668 }
23669 }
23670 }
23671
23672 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, strided_cm_subtile) {
23673 TEST_REQUIRES_X86_AVX;
23674 for (size_t k = 1; k <= 40; k += 9) {
23675 for (uint32_t m = 1; m <= 2; m++) {
23676 for (uint32_t n = 1; n <= 4; n++) {
23677 GemmMicrokernelTester()
23678 .mr(2)
23679 .nr(4)
23680 .kr(8)
23681 .sr(1)
23682 .m(m)
23683 .n(n)
23684 .k(k)
23685 .cm_stride(7)
23686 .iterations(1)
23687 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
23688 }
23689 }
23690 }
23691 }
23692
23693 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, qmin) {
23694 TEST_REQUIRES_X86_AVX;
23695 GemmMicrokernelTester()
23696 .mr(2)
23697 .nr(4)
23698 .kr(8)
23699 .sr(1)
23700 .m(2)
23701 .n(4)
23702 .k(8)
23703 .qmin(128)
23704 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
23705 }
23706
23707 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, qmax) {
23708 TEST_REQUIRES_X86_AVX;
23709 GemmMicrokernelTester()
23710 .mr(2)
23711 .nr(4)
23712 .kr(8)
23713 .sr(1)
23714 .m(2)
23715 .n(4)
23716 .k(8)
23717 .qmax(128)
23718 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
23719 }
23720
23721 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, strided_cm) {
23722 TEST_REQUIRES_X86_AVX;
23723 GemmMicrokernelTester()
23724 .mr(2)
23725 .nr(4)
23726 .kr(8)
23727 .sr(1)
23728 .m(2)
23729 .n(4)
23730 .k(8)
23731 .cm_stride(7)
23732 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
23733 }
23734#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
23735
23736
23737#if XNN_ARCH_X86 || XNN_ARCH_X86_64
23738 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_eq_8) {
23739 TEST_REQUIRES_X86_AVX;
23740 GemmMicrokernelTester()
23741 .mr(3)
23742 .nr(4)
23743 .kr(8)
23744 .sr(1)
23745 .m(3)
23746 .n(4)
23747 .k(8)
23748 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
23749 }
23750
23751 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, strided_cn) {
23752 TEST_REQUIRES_X86_AVX;
23753 GemmMicrokernelTester()
23754 .mr(3)
23755 .nr(4)
23756 .kr(8)
23757 .sr(1)
23758 .m(3)
23759 .n(4)
23760 .k(8)
23761 .cn_stride(7)
23762 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
23763 }
23764
23765 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_eq_8_strided_a) {
23766 TEST_REQUIRES_X86_AVX;
23767 GemmMicrokernelTester()
23768 .mr(3)
23769 .nr(4)
23770 .kr(8)
23771 .sr(1)
23772 .m(3)
23773 .n(4)
23774 .k(8)
23775 .a_stride(11)
23776 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
23777 }
23778
23779 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_eq_8_subtile) {
23780 TEST_REQUIRES_X86_AVX;
23781 for (uint32_t m = 1; m <= 3; m++) {
23782 for (uint32_t n = 1; n <= 4; n++) {
23783 GemmMicrokernelTester()
23784 .mr(3)
23785 .nr(4)
23786 .kr(8)
23787 .sr(1)
23788 .m(m)
23789 .n(n)
23790 .k(8)
23791 .iterations(1)
23792 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
23793 }
23794 }
23795 }
23796
23797 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_eq_8_subtile_m) {
23798 TEST_REQUIRES_X86_AVX;
23799 for (uint32_t m = 1; m <= 3; m++) {
23800 GemmMicrokernelTester()
23801 .mr(3)
23802 .nr(4)
23803 .kr(8)
23804 .sr(1)
23805 .m(m)
23806 .n(4)
23807 .k(8)
23808 .iterations(1)
23809 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
23810 }
23811 }
23812
23813 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_eq_8_subtile_n) {
23814 TEST_REQUIRES_X86_AVX;
23815 for (uint32_t n = 1; n <= 4; n++) {
23816 GemmMicrokernelTester()
23817 .mr(3)
23818 .nr(4)
23819 .kr(8)
23820 .sr(1)
23821 .m(3)
23822 .n(n)
23823 .k(8)
23824 .iterations(1)
23825 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
23826 }
23827 }
23828
23829 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_lt_8) {
23830 TEST_REQUIRES_X86_AVX;
23831 for (size_t k = 1; k < 8; k++) {
23832 GemmMicrokernelTester()
23833 .mr(3)
23834 .nr(4)
23835 .kr(8)
23836 .sr(1)
23837 .m(3)
23838 .n(4)
23839 .k(k)
23840 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
23841 }
23842 }
23843
23844 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_lt_8_strided_a) {
23845 TEST_REQUIRES_X86_AVX;
23846 for (size_t k = 1; k < 8; k++) {
23847 GemmMicrokernelTester()
23848 .mr(3)
23849 .nr(4)
23850 .kr(8)
23851 .sr(1)
23852 .m(3)
23853 .n(4)
23854 .k(k)
23855 .a_stride(11)
23856 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
23857 }
23858 }
23859
23860 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_lt_8_subtile) {
23861 TEST_REQUIRES_X86_AVX;
23862 for (size_t k = 1; k < 8; k++) {
23863 for (uint32_t m = 1; m <= 3; m++) {
23864 for (uint32_t n = 1; n <= 4; n++) {
23865 GemmMicrokernelTester()
23866 .mr(3)
23867 .nr(4)
23868 .kr(8)
23869 .sr(1)
23870 .m(m)
23871 .n(n)
23872 .k(k)
23873 .iterations(1)
23874 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
23875 }
23876 }
23877 }
23878 }
23879
23880 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_gt_8) {
23881 TEST_REQUIRES_X86_AVX;
23882 for (size_t k = 9; k < 16; k++) {
23883 GemmMicrokernelTester()
23884 .mr(3)
23885 .nr(4)
23886 .kr(8)
23887 .sr(1)
23888 .m(3)
23889 .n(4)
23890 .k(k)
23891 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
23892 }
23893 }
23894
23895 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_gt_8_strided_a) {
23896 TEST_REQUIRES_X86_AVX;
23897 for (size_t k = 9; k < 16; k++) {
23898 GemmMicrokernelTester()
23899 .mr(3)
23900 .nr(4)
23901 .kr(8)
23902 .sr(1)
23903 .m(3)
23904 .n(4)
23905 .k(k)
23906 .a_stride(19)
23907 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
23908 }
23909 }
23910
23911 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_gt_8_subtile) {
23912 TEST_REQUIRES_X86_AVX;
23913 for (size_t k = 9; k < 16; k++) {
23914 for (uint32_t m = 1; m <= 3; m++) {
23915 for (uint32_t n = 1; n <= 4; n++) {
23916 GemmMicrokernelTester()
23917 .mr(3)
23918 .nr(4)
23919 .kr(8)
23920 .sr(1)
23921 .m(m)
23922 .n(n)
23923 .k(k)
23924 .iterations(1)
23925 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
23926 }
23927 }
23928 }
23929 }
23930
23931 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_div_8) {
23932 TEST_REQUIRES_X86_AVX;
23933 for (size_t k = 16; k <= 80; k += 8) {
23934 GemmMicrokernelTester()
23935 .mr(3)
23936 .nr(4)
23937 .kr(8)
23938 .sr(1)
23939 .m(3)
23940 .n(4)
23941 .k(k)
23942 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
23943 }
23944 }
23945
23946 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_div_8_strided_a) {
23947 TEST_REQUIRES_X86_AVX;
23948 for (size_t k = 16; k <= 80; k += 8) {
23949 GemmMicrokernelTester()
23950 .mr(3)
23951 .nr(4)
23952 .kr(8)
23953 .sr(1)
23954 .m(3)
23955 .n(4)
23956 .k(k)
23957 .a_stride(83)
23958 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
23959 }
23960 }
23961
23962 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_div_8_subtile) {
23963 TEST_REQUIRES_X86_AVX;
23964 for (size_t k = 16; k <= 80; k += 8) {
23965 for (uint32_t m = 1; m <= 3; m++) {
23966 for (uint32_t n = 1; n <= 4; n++) {
23967 GemmMicrokernelTester()
23968 .mr(3)
23969 .nr(4)
23970 .kr(8)
23971 .sr(1)
23972 .m(m)
23973 .n(n)
23974 .k(k)
23975 .iterations(1)
23976 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
23977 }
23978 }
23979 }
23980 }
23981
23982 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, n_gt_4) {
23983 TEST_REQUIRES_X86_AVX;
23984 for (uint32_t n = 5; n < 8; n++) {
23985 for (size_t k = 1; k <= 40; k += 9) {
23986 GemmMicrokernelTester()
23987 .mr(3)
23988 .nr(4)
23989 .kr(8)
23990 .sr(1)
23991 .m(3)
23992 .n(4)
23993 .k(k)
23994 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
23995 }
23996 }
23997 }
23998
23999 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, n_gt_4_strided_cn) {
24000 TEST_REQUIRES_X86_AVX;
24001 for (uint32_t n = 5; n < 8; n++) {
24002 for (size_t k = 1; k <= 40; k += 9) {
24003 GemmMicrokernelTester()
24004 .mr(3)
24005 .nr(4)
24006 .kr(8)
24007 .sr(1)
24008 .m(3)
24009 .n(4)
24010 .k(k)
24011 .cn_stride(7)
24012 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
24013 }
24014 }
24015 }
24016
24017 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, n_gt_4_strided_a) {
24018 TEST_REQUIRES_X86_AVX;
24019 for (uint32_t n = 5; n < 8; n++) {
24020 for (size_t k = 1; k <= 40; k += 9) {
24021 GemmMicrokernelTester()
24022 .mr(3)
24023 .nr(4)
24024 .kr(8)
24025 .sr(1)
24026 .m(3)
24027 .n(n)
24028 .k(k)
24029 .a_stride(43)
24030 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
24031 }
24032 }
24033 }
24034
24035 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, n_gt_4_subtile) {
24036 TEST_REQUIRES_X86_AVX;
24037 for (uint32_t n = 5; n < 8; n++) {
24038 for (size_t k = 1; k <= 40; k += 9) {
24039 for (uint32_t m = 1; m <= 3; m++) {
24040 GemmMicrokernelTester()
24041 .mr(3)
24042 .nr(4)
24043 .kr(8)
24044 .sr(1)
24045 .m(m)
24046 .n(n)
24047 .k(k)
24048 .iterations(1)
24049 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
24050 }
24051 }
24052 }
24053 }
24054
24055 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, n_div_4) {
24056 TEST_REQUIRES_X86_AVX;
24057 for (uint32_t n = 8; n <= 12; n += 4) {
24058 for (size_t k = 1; k <= 40; k += 9) {
24059 GemmMicrokernelTester()
24060 .mr(3)
24061 .nr(4)
24062 .kr(8)
24063 .sr(1)
24064 .m(3)
24065 .n(4)
24066 .k(k)
24067 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
24068 }
24069 }
24070 }
24071
24072 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, n_div_4_strided_cn) {
24073 TEST_REQUIRES_X86_AVX;
24074 for (uint32_t n = 8; n <= 12; n += 4) {
24075 for (size_t k = 1; k <= 40; k += 9) {
24076 GemmMicrokernelTester()
24077 .mr(3)
24078 .nr(4)
24079 .kr(8)
24080 .sr(1)
24081 .m(3)
24082 .n(n)
24083 .k(k)
24084 .cn_stride(7)
24085 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
24086 }
24087 }
24088 }
24089
24090 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, n_div_4_strided_a) {
24091 TEST_REQUIRES_X86_AVX;
24092 for (uint32_t n = 8; n <= 12; n += 4) {
24093 for (size_t k = 1; k <= 40; k += 9) {
24094 GemmMicrokernelTester()
24095 .mr(3)
24096 .nr(4)
24097 .kr(8)
24098 .sr(1)
24099 .m(3)
24100 .n(n)
24101 .k(k)
24102 .a_stride(43)
24103 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
24104 }
24105 }
24106 }
24107
24108 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, n_div_4_subtile) {
24109 TEST_REQUIRES_X86_AVX;
24110 for (uint32_t n = 8; n <= 12; n += 4) {
24111 for (size_t k = 1; k <= 40; k += 9) {
24112 for (uint32_t m = 1; m <= 3; m++) {
24113 GemmMicrokernelTester()
24114 .mr(3)
24115 .nr(4)
24116 .kr(8)
24117 .sr(1)
24118 .m(m)
24119 .n(n)
24120 .k(k)
24121 .iterations(1)
24122 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
24123 }
24124 }
24125 }
24126 }
24127
24128 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, strided_cm_subtile) {
24129 TEST_REQUIRES_X86_AVX;
24130 for (size_t k = 1; k <= 40; k += 9) {
24131 for (uint32_t m = 1; m <= 3; m++) {
24132 for (uint32_t n = 1; n <= 4; n++) {
24133 GemmMicrokernelTester()
24134 .mr(3)
24135 .nr(4)
24136 .kr(8)
24137 .sr(1)
24138 .m(m)
24139 .n(n)
24140 .k(k)
24141 .cm_stride(7)
24142 .iterations(1)
24143 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
24144 }
24145 }
24146 }
24147 }
24148
24149 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, qmin) {
24150 TEST_REQUIRES_X86_AVX;
24151 GemmMicrokernelTester()
24152 .mr(3)
24153 .nr(4)
24154 .kr(8)
24155 .sr(1)
24156 .m(3)
24157 .n(4)
24158 .k(8)
24159 .qmin(128)
24160 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
24161 }
24162
24163 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, qmax) {
24164 TEST_REQUIRES_X86_AVX;
24165 GemmMicrokernelTester()
24166 .mr(3)
24167 .nr(4)
24168 .kr(8)
24169 .sr(1)
24170 .m(3)
24171 .n(4)
24172 .k(8)
24173 .qmax(128)
24174 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
24175 }
24176
24177 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, strided_cm) {
24178 TEST_REQUIRES_X86_AVX;
24179 GemmMicrokernelTester()
24180 .mr(3)
24181 .nr(4)
24182 .kr(8)
24183 .sr(1)
24184 .m(3)
24185 .n(4)
24186 .k(8)
24187 .cm_stride(7)
24188 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
24189 }
24190#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
24191
24192
24193#if XNN_ARCH_X86 || XNN_ARCH_X86_64
24194 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_eq_8) {
24195 TEST_REQUIRES_X86_XOP;
24196 GemmMicrokernelTester()
24197 .mr(1)
24198 .nr(4)
24199 .kr(8)
24200 .sr(1)
24201 .m(1)
24202 .n(4)
24203 .k(8)
24204 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
24205 }
24206
24207 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, strided_cn) {
24208 TEST_REQUIRES_X86_XOP;
24209 GemmMicrokernelTester()
24210 .mr(1)
24211 .nr(4)
24212 .kr(8)
24213 .sr(1)
24214 .m(1)
24215 .n(4)
24216 .k(8)
24217 .cn_stride(7)
24218 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
24219 }
24220
24221 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_eq_8_strided_a) {
24222 TEST_REQUIRES_X86_XOP;
24223 GemmMicrokernelTester()
24224 .mr(1)
24225 .nr(4)
24226 .kr(8)
24227 .sr(1)
24228 .m(1)
24229 .n(4)
24230 .k(8)
24231 .a_stride(11)
24232 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
24233 }
24234
24235 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_eq_8_subtile) {
24236 TEST_REQUIRES_X86_XOP;
24237 for (uint32_t m = 1; m <= 1; m++) {
24238 for (uint32_t n = 1; n <= 4; n++) {
24239 GemmMicrokernelTester()
24240 .mr(1)
24241 .nr(4)
24242 .kr(8)
24243 .sr(1)
24244 .m(m)
24245 .n(n)
24246 .k(8)
24247 .iterations(1)
24248 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
24249 }
24250 }
24251 }
24252
24253 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_eq_8_subtile_m) {
24254 TEST_REQUIRES_X86_XOP;
24255 for (uint32_t m = 1; m <= 1; m++) {
24256 GemmMicrokernelTester()
24257 .mr(1)
24258 .nr(4)
24259 .kr(8)
24260 .sr(1)
24261 .m(m)
24262 .n(4)
24263 .k(8)
24264 .iterations(1)
24265 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
24266 }
24267 }
24268
24269 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_eq_8_subtile_n) {
24270 TEST_REQUIRES_X86_XOP;
24271 for (uint32_t n = 1; n <= 4; n++) {
24272 GemmMicrokernelTester()
24273 .mr(1)
24274 .nr(4)
24275 .kr(8)
24276 .sr(1)
24277 .m(1)
24278 .n(n)
24279 .k(8)
24280 .iterations(1)
24281 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
24282 }
24283 }
24284
24285 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_lt_8) {
24286 TEST_REQUIRES_X86_XOP;
24287 for (size_t k = 1; k < 8; k++) {
24288 GemmMicrokernelTester()
24289 .mr(1)
24290 .nr(4)
24291 .kr(8)
24292 .sr(1)
24293 .m(1)
24294 .n(4)
24295 .k(k)
24296 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
24297 }
24298 }
24299
24300 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_lt_8_strided_a) {
24301 TEST_REQUIRES_X86_XOP;
24302 for (size_t k = 1; k < 8; k++) {
24303 GemmMicrokernelTester()
24304 .mr(1)
24305 .nr(4)
24306 .kr(8)
24307 .sr(1)
24308 .m(1)
24309 .n(4)
24310 .k(k)
24311 .a_stride(11)
24312 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
24313 }
24314 }
24315
24316 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_lt_8_subtile) {
24317 TEST_REQUIRES_X86_XOP;
24318 for (size_t k = 1; k < 8; k++) {
24319 for (uint32_t m = 1; m <= 1; m++) {
24320 for (uint32_t n = 1; n <= 4; n++) {
24321 GemmMicrokernelTester()
24322 .mr(1)
24323 .nr(4)
24324 .kr(8)
24325 .sr(1)
24326 .m(m)
24327 .n(n)
24328 .k(k)
24329 .iterations(1)
24330 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
24331 }
24332 }
24333 }
24334 }
24335
24336 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_gt_8) {
24337 TEST_REQUIRES_X86_XOP;
24338 for (size_t k = 9; k < 16; k++) {
24339 GemmMicrokernelTester()
24340 .mr(1)
24341 .nr(4)
24342 .kr(8)
24343 .sr(1)
24344 .m(1)
24345 .n(4)
24346 .k(k)
24347 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
24348 }
24349 }
24350
24351 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_gt_8_strided_a) {
24352 TEST_REQUIRES_X86_XOP;
24353 for (size_t k = 9; k < 16; k++) {
24354 GemmMicrokernelTester()
24355 .mr(1)
24356 .nr(4)
24357 .kr(8)
24358 .sr(1)
24359 .m(1)
24360 .n(4)
24361 .k(k)
24362 .a_stride(19)
24363 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
24364 }
24365 }
24366
24367 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_gt_8_subtile) {
24368 TEST_REQUIRES_X86_XOP;
24369 for (size_t k = 9; k < 16; k++) {
24370 for (uint32_t m = 1; m <= 1; m++) {
24371 for (uint32_t n = 1; n <= 4; n++) {
24372 GemmMicrokernelTester()
24373 .mr(1)
24374 .nr(4)
24375 .kr(8)
24376 .sr(1)
24377 .m(m)
24378 .n(n)
24379 .k(k)
24380 .iterations(1)
24381 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
24382 }
24383 }
24384 }
24385 }
24386
24387 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_div_8) {
24388 TEST_REQUIRES_X86_XOP;
24389 for (size_t k = 16; k <= 80; k += 8) {
24390 GemmMicrokernelTester()
24391 .mr(1)
24392 .nr(4)
24393 .kr(8)
24394 .sr(1)
24395 .m(1)
24396 .n(4)
24397 .k(k)
24398 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
24399 }
24400 }
24401
24402 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_div_8_strided_a) {
24403 TEST_REQUIRES_X86_XOP;
24404 for (size_t k = 16; k <= 80; k += 8) {
24405 GemmMicrokernelTester()
24406 .mr(1)
24407 .nr(4)
24408 .kr(8)
24409 .sr(1)
24410 .m(1)
24411 .n(4)
24412 .k(k)
24413 .a_stride(83)
24414 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
24415 }
24416 }
24417
24418 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_div_8_subtile) {
24419 TEST_REQUIRES_X86_XOP;
24420 for (size_t k = 16; k <= 80; k += 8) {
24421 for (uint32_t m = 1; m <= 1; m++) {
24422 for (uint32_t n = 1; n <= 4; n++) {
24423 GemmMicrokernelTester()
24424 .mr(1)
24425 .nr(4)
24426 .kr(8)
24427 .sr(1)
24428 .m(m)
24429 .n(n)
24430 .k(k)
24431 .iterations(1)
24432 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
24433 }
24434 }
24435 }
24436 }
24437
24438 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, n_gt_4) {
24439 TEST_REQUIRES_X86_XOP;
24440 for (uint32_t n = 5; n < 8; n++) {
24441 for (size_t k = 1; k <= 40; k += 9) {
24442 GemmMicrokernelTester()
24443 .mr(1)
24444 .nr(4)
24445 .kr(8)
24446 .sr(1)
24447 .m(1)
24448 .n(4)
24449 .k(k)
24450 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
24451 }
24452 }
24453 }
24454
24455 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, n_gt_4_strided_cn) {
24456 TEST_REQUIRES_X86_XOP;
24457 for (uint32_t n = 5; n < 8; n++) {
24458 for (size_t k = 1; k <= 40; k += 9) {
24459 GemmMicrokernelTester()
24460 .mr(1)
24461 .nr(4)
24462 .kr(8)
24463 .sr(1)
24464 .m(1)
24465 .n(4)
24466 .k(k)
24467 .cn_stride(7)
24468 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
24469 }
24470 }
24471 }
24472
24473 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, n_gt_4_strided_a) {
24474 TEST_REQUIRES_X86_XOP;
24475 for (uint32_t n = 5; n < 8; n++) {
24476 for (size_t k = 1; k <= 40; k += 9) {
24477 GemmMicrokernelTester()
24478 .mr(1)
24479 .nr(4)
24480 .kr(8)
24481 .sr(1)
24482 .m(1)
24483 .n(n)
24484 .k(k)
24485 .a_stride(43)
24486 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
24487 }
24488 }
24489 }
24490
24491 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, n_gt_4_subtile) {
24492 TEST_REQUIRES_X86_XOP;
24493 for (uint32_t n = 5; n < 8; n++) {
24494 for (size_t k = 1; k <= 40; k += 9) {
24495 for (uint32_t m = 1; m <= 1; m++) {
24496 GemmMicrokernelTester()
24497 .mr(1)
24498 .nr(4)
24499 .kr(8)
24500 .sr(1)
24501 .m(m)
24502 .n(n)
24503 .k(k)
24504 .iterations(1)
24505 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
24506 }
24507 }
24508 }
24509 }
24510
24511 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, n_div_4) {
24512 TEST_REQUIRES_X86_XOP;
24513 for (uint32_t n = 8; n <= 12; n += 4) {
24514 for (size_t k = 1; k <= 40; k += 9) {
24515 GemmMicrokernelTester()
24516 .mr(1)
24517 .nr(4)
24518 .kr(8)
24519 .sr(1)
24520 .m(1)
24521 .n(4)
24522 .k(k)
24523 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
24524 }
24525 }
24526 }
24527
24528 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, n_div_4_strided_cn) {
24529 TEST_REQUIRES_X86_XOP;
24530 for (uint32_t n = 8; n <= 12; n += 4) {
24531 for (size_t k = 1; k <= 40; k += 9) {
24532 GemmMicrokernelTester()
24533 .mr(1)
24534 .nr(4)
24535 .kr(8)
24536 .sr(1)
24537 .m(1)
24538 .n(n)
24539 .k(k)
24540 .cn_stride(7)
24541 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
24542 }
24543 }
24544 }
24545
24546 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, n_div_4_strided_a) {
24547 TEST_REQUIRES_X86_XOP;
24548 for (uint32_t n = 8; n <= 12; n += 4) {
24549 for (size_t k = 1; k <= 40; k += 9) {
24550 GemmMicrokernelTester()
24551 .mr(1)
24552 .nr(4)
24553 .kr(8)
24554 .sr(1)
24555 .m(1)
24556 .n(n)
24557 .k(k)
24558 .a_stride(43)
24559 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
24560 }
24561 }
24562 }
24563
24564 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, n_div_4_subtile) {
24565 TEST_REQUIRES_X86_XOP;
24566 for (uint32_t n = 8; n <= 12; n += 4) {
24567 for (size_t k = 1; k <= 40; k += 9) {
24568 for (uint32_t m = 1; m <= 1; m++) {
24569 GemmMicrokernelTester()
24570 .mr(1)
24571 .nr(4)
24572 .kr(8)
24573 .sr(1)
24574 .m(m)
24575 .n(n)
24576 .k(k)
24577 .iterations(1)
24578 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
24579 }
24580 }
24581 }
24582 }
24583
24584 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, strided_cm_subtile) {
24585 TEST_REQUIRES_X86_XOP;
24586 for (size_t k = 1; k <= 40; k += 9) {
24587 for (uint32_t m = 1; m <= 1; m++) {
24588 for (uint32_t n = 1; n <= 4; n++) {
24589 GemmMicrokernelTester()
24590 .mr(1)
24591 .nr(4)
24592 .kr(8)
24593 .sr(1)
24594 .m(m)
24595 .n(n)
24596 .k(k)
24597 .cm_stride(7)
24598 .iterations(1)
24599 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
24600 }
24601 }
24602 }
24603 }
24604
24605 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, qmin) {
24606 TEST_REQUIRES_X86_XOP;
24607 GemmMicrokernelTester()
24608 .mr(1)
24609 .nr(4)
24610 .kr(8)
24611 .sr(1)
24612 .m(1)
24613 .n(4)
24614 .k(8)
24615 .qmin(128)
24616 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
24617 }
24618
24619 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, qmax) {
24620 TEST_REQUIRES_X86_XOP;
24621 GemmMicrokernelTester()
24622 .mr(1)
24623 .nr(4)
24624 .kr(8)
24625 .sr(1)
24626 .m(1)
24627 .n(4)
24628 .k(8)
24629 .qmax(128)
24630 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
24631 }
24632
24633 TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, strided_cm) {
24634 TEST_REQUIRES_X86_XOP;
24635 GemmMicrokernelTester()
24636 .mr(1)
24637 .nr(4)
24638 .kr(8)
24639 .sr(1)
24640 .m(1)
24641 .n(4)
24642 .k(8)
24643 .cm_stride(7)
24644 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
24645 }
24646#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
24647
24648
24649#if XNN_ARCH_X86 || XNN_ARCH_X86_64
24650 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_eq_8) {
24651 TEST_REQUIRES_X86_XOP;
24652 GemmMicrokernelTester()
24653 .mr(2)
24654 .nr(4)
24655 .kr(8)
24656 .sr(1)
24657 .m(2)
24658 .n(4)
24659 .k(8)
24660 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
24661 }
24662
24663 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, strided_cn) {
24664 TEST_REQUIRES_X86_XOP;
24665 GemmMicrokernelTester()
24666 .mr(2)
24667 .nr(4)
24668 .kr(8)
24669 .sr(1)
24670 .m(2)
24671 .n(4)
24672 .k(8)
24673 .cn_stride(7)
24674 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
24675 }
24676
24677 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_eq_8_strided_a) {
24678 TEST_REQUIRES_X86_XOP;
24679 GemmMicrokernelTester()
24680 .mr(2)
24681 .nr(4)
24682 .kr(8)
24683 .sr(1)
24684 .m(2)
24685 .n(4)
24686 .k(8)
24687 .a_stride(11)
24688 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
24689 }
24690
24691 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_eq_8_subtile) {
24692 TEST_REQUIRES_X86_XOP;
24693 for (uint32_t m = 1; m <= 2; m++) {
24694 for (uint32_t n = 1; n <= 4; n++) {
24695 GemmMicrokernelTester()
24696 .mr(2)
24697 .nr(4)
24698 .kr(8)
24699 .sr(1)
24700 .m(m)
24701 .n(n)
24702 .k(8)
24703 .iterations(1)
24704 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
24705 }
24706 }
24707 }
24708
24709 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_eq_8_subtile_m) {
24710 TEST_REQUIRES_X86_XOP;
24711 for (uint32_t m = 1; m <= 2; m++) {
24712 GemmMicrokernelTester()
24713 .mr(2)
24714 .nr(4)
24715 .kr(8)
24716 .sr(1)
24717 .m(m)
24718 .n(4)
24719 .k(8)
24720 .iterations(1)
24721 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
24722 }
24723 }
24724
24725 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_eq_8_subtile_n) {
24726 TEST_REQUIRES_X86_XOP;
24727 for (uint32_t n = 1; n <= 4; n++) {
24728 GemmMicrokernelTester()
24729 .mr(2)
24730 .nr(4)
24731 .kr(8)
24732 .sr(1)
24733 .m(2)
24734 .n(n)
24735 .k(8)
24736 .iterations(1)
24737 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
24738 }
24739 }
24740
24741 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_lt_8) {
24742 TEST_REQUIRES_X86_XOP;
24743 for (size_t k = 1; k < 8; k++) {
24744 GemmMicrokernelTester()
24745 .mr(2)
24746 .nr(4)
24747 .kr(8)
24748 .sr(1)
24749 .m(2)
24750 .n(4)
24751 .k(k)
24752 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
24753 }
24754 }
24755
24756 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_lt_8_strided_a) {
24757 TEST_REQUIRES_X86_XOP;
24758 for (size_t k = 1; k < 8; k++) {
24759 GemmMicrokernelTester()
24760 .mr(2)
24761 .nr(4)
24762 .kr(8)
24763 .sr(1)
24764 .m(2)
24765 .n(4)
24766 .k(k)
24767 .a_stride(11)
24768 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
24769 }
24770 }
24771
24772 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_lt_8_subtile) {
24773 TEST_REQUIRES_X86_XOP;
24774 for (size_t k = 1; k < 8; k++) {
24775 for (uint32_t m = 1; m <= 2; m++) {
24776 for (uint32_t n = 1; n <= 4; n++) {
24777 GemmMicrokernelTester()
24778 .mr(2)
24779 .nr(4)
24780 .kr(8)
24781 .sr(1)
24782 .m(m)
24783 .n(n)
24784 .k(k)
24785 .iterations(1)
24786 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
24787 }
24788 }
24789 }
24790 }
24791
24792 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_gt_8) {
24793 TEST_REQUIRES_X86_XOP;
24794 for (size_t k = 9; k < 16; k++) {
24795 GemmMicrokernelTester()
24796 .mr(2)
24797 .nr(4)
24798 .kr(8)
24799 .sr(1)
24800 .m(2)
24801 .n(4)
24802 .k(k)
24803 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
24804 }
24805 }
24806
24807 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_gt_8_strided_a) {
24808 TEST_REQUIRES_X86_XOP;
24809 for (size_t k = 9; k < 16; k++) {
24810 GemmMicrokernelTester()
24811 .mr(2)
24812 .nr(4)
24813 .kr(8)
24814 .sr(1)
24815 .m(2)
24816 .n(4)
24817 .k(k)
24818 .a_stride(19)
24819 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
24820 }
24821 }
24822
24823 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_gt_8_subtile) {
24824 TEST_REQUIRES_X86_XOP;
24825 for (size_t k = 9; k < 16; k++) {
24826 for (uint32_t m = 1; m <= 2; m++) {
24827 for (uint32_t n = 1; n <= 4; n++) {
24828 GemmMicrokernelTester()
24829 .mr(2)
24830 .nr(4)
24831 .kr(8)
24832 .sr(1)
24833 .m(m)
24834 .n(n)
24835 .k(k)
24836 .iterations(1)
24837 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
24838 }
24839 }
24840 }
24841 }
24842
24843 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_div_8) {
24844 TEST_REQUIRES_X86_XOP;
24845 for (size_t k = 16; k <= 80; k += 8) {
24846 GemmMicrokernelTester()
24847 .mr(2)
24848 .nr(4)
24849 .kr(8)
24850 .sr(1)
24851 .m(2)
24852 .n(4)
24853 .k(k)
24854 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
24855 }
24856 }
24857
24858 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_div_8_strided_a) {
24859 TEST_REQUIRES_X86_XOP;
24860 for (size_t k = 16; k <= 80; k += 8) {
24861 GemmMicrokernelTester()
24862 .mr(2)
24863 .nr(4)
24864 .kr(8)
24865 .sr(1)
24866 .m(2)
24867 .n(4)
24868 .k(k)
24869 .a_stride(83)
24870 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
24871 }
24872 }
24873
24874 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_div_8_subtile) {
24875 TEST_REQUIRES_X86_XOP;
24876 for (size_t k = 16; k <= 80; k += 8) {
24877 for (uint32_t m = 1; m <= 2; m++) {
24878 for (uint32_t n = 1; n <= 4; n++) {
24879 GemmMicrokernelTester()
24880 .mr(2)
24881 .nr(4)
24882 .kr(8)
24883 .sr(1)
24884 .m(m)
24885 .n(n)
24886 .k(k)
24887 .iterations(1)
24888 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
24889 }
24890 }
24891 }
24892 }
24893
24894 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, n_gt_4) {
24895 TEST_REQUIRES_X86_XOP;
24896 for (uint32_t n = 5; n < 8; n++) {
24897 for (size_t k = 1; k <= 40; k += 9) {
24898 GemmMicrokernelTester()
24899 .mr(2)
24900 .nr(4)
24901 .kr(8)
24902 .sr(1)
24903 .m(2)
24904 .n(4)
24905 .k(k)
24906 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
24907 }
24908 }
24909 }
24910
24911 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, n_gt_4_strided_cn) {
24912 TEST_REQUIRES_X86_XOP;
24913 for (uint32_t n = 5; n < 8; n++) {
24914 for (size_t k = 1; k <= 40; k += 9) {
24915 GemmMicrokernelTester()
24916 .mr(2)
24917 .nr(4)
24918 .kr(8)
24919 .sr(1)
24920 .m(2)
24921 .n(4)
24922 .k(k)
24923 .cn_stride(7)
24924 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
24925 }
24926 }
24927 }
24928
24929 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, n_gt_4_strided_a) {
24930 TEST_REQUIRES_X86_XOP;
24931 for (uint32_t n = 5; n < 8; n++) {
24932 for (size_t k = 1; k <= 40; k += 9) {
24933 GemmMicrokernelTester()
24934 .mr(2)
24935 .nr(4)
24936 .kr(8)
24937 .sr(1)
24938 .m(2)
24939 .n(n)
24940 .k(k)
24941 .a_stride(43)
24942 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
24943 }
24944 }
24945 }
24946
24947 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, n_gt_4_subtile) {
24948 TEST_REQUIRES_X86_XOP;
24949 for (uint32_t n = 5; n < 8; n++) {
24950 for (size_t k = 1; k <= 40; k += 9) {
24951 for (uint32_t m = 1; m <= 2; m++) {
24952 GemmMicrokernelTester()
24953 .mr(2)
24954 .nr(4)
24955 .kr(8)
24956 .sr(1)
24957 .m(m)
24958 .n(n)
24959 .k(k)
24960 .iterations(1)
24961 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
24962 }
24963 }
24964 }
24965 }
24966
24967 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, n_div_4) {
24968 TEST_REQUIRES_X86_XOP;
24969 for (uint32_t n = 8; n <= 12; n += 4) {
24970 for (size_t k = 1; k <= 40; k += 9) {
24971 GemmMicrokernelTester()
24972 .mr(2)
24973 .nr(4)
24974 .kr(8)
24975 .sr(1)
24976 .m(2)
24977 .n(4)
24978 .k(k)
24979 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
24980 }
24981 }
24982 }
24983
24984 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, n_div_4_strided_cn) {
24985 TEST_REQUIRES_X86_XOP;
24986 for (uint32_t n = 8; n <= 12; n += 4) {
24987 for (size_t k = 1; k <= 40; k += 9) {
24988 GemmMicrokernelTester()
24989 .mr(2)
24990 .nr(4)
24991 .kr(8)
24992 .sr(1)
24993 .m(2)
24994 .n(n)
24995 .k(k)
24996 .cn_stride(7)
24997 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
24998 }
24999 }
25000 }
25001
25002 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, n_div_4_strided_a) {
25003 TEST_REQUIRES_X86_XOP;
25004 for (uint32_t n = 8; n <= 12; n += 4) {
25005 for (size_t k = 1; k <= 40; k += 9) {
25006 GemmMicrokernelTester()
25007 .mr(2)
25008 .nr(4)
25009 .kr(8)
25010 .sr(1)
25011 .m(2)
25012 .n(n)
25013 .k(k)
25014 .a_stride(43)
25015 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
25016 }
25017 }
25018 }
25019
25020 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, n_div_4_subtile) {
25021 TEST_REQUIRES_X86_XOP;
25022 for (uint32_t n = 8; n <= 12; n += 4) {
25023 for (size_t k = 1; k <= 40; k += 9) {
25024 for (uint32_t m = 1; m <= 2; m++) {
25025 GemmMicrokernelTester()
25026 .mr(2)
25027 .nr(4)
25028 .kr(8)
25029 .sr(1)
25030 .m(m)
25031 .n(n)
25032 .k(k)
25033 .iterations(1)
25034 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
25035 }
25036 }
25037 }
25038 }
25039
25040 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, strided_cm_subtile) {
25041 TEST_REQUIRES_X86_XOP;
25042 for (size_t k = 1; k <= 40; k += 9) {
25043 for (uint32_t m = 1; m <= 2; m++) {
25044 for (uint32_t n = 1; n <= 4; n++) {
25045 GemmMicrokernelTester()
25046 .mr(2)
25047 .nr(4)
25048 .kr(8)
25049 .sr(1)
25050 .m(m)
25051 .n(n)
25052 .k(k)
25053 .cm_stride(7)
25054 .iterations(1)
25055 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
25056 }
25057 }
25058 }
25059 }
25060
25061 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, qmin) {
25062 TEST_REQUIRES_X86_XOP;
25063 GemmMicrokernelTester()
25064 .mr(2)
25065 .nr(4)
25066 .kr(8)
25067 .sr(1)
25068 .m(2)
25069 .n(4)
25070 .k(8)
25071 .qmin(128)
25072 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
25073 }
25074
25075 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, qmax) {
25076 TEST_REQUIRES_X86_XOP;
25077 GemmMicrokernelTester()
25078 .mr(2)
25079 .nr(4)
25080 .kr(8)
25081 .sr(1)
25082 .m(2)
25083 .n(4)
25084 .k(8)
25085 .qmax(128)
25086 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
25087 }
25088
25089 TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, strided_cm) {
25090 TEST_REQUIRES_X86_XOP;
25091 GemmMicrokernelTester()
25092 .mr(2)
25093 .nr(4)
25094 .kr(8)
25095 .sr(1)
25096 .m(2)
25097 .n(4)
25098 .k(8)
25099 .cm_stride(7)
25100 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
25101 }
25102#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
25103
25104
25105#if XNN_ARCH_X86 || XNN_ARCH_X86_64
25106 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_eq_8) {
25107 TEST_REQUIRES_X86_XOP;
25108 GemmMicrokernelTester()
25109 .mr(3)
25110 .nr(4)
25111 .kr(8)
25112 .sr(1)
25113 .m(3)
25114 .n(4)
25115 .k(8)
25116 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
25117 }
25118
25119 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, strided_cn) {
25120 TEST_REQUIRES_X86_XOP;
25121 GemmMicrokernelTester()
25122 .mr(3)
25123 .nr(4)
25124 .kr(8)
25125 .sr(1)
25126 .m(3)
25127 .n(4)
25128 .k(8)
25129 .cn_stride(7)
25130 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
25131 }
25132
25133 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_eq_8_strided_a) {
25134 TEST_REQUIRES_X86_XOP;
25135 GemmMicrokernelTester()
25136 .mr(3)
25137 .nr(4)
25138 .kr(8)
25139 .sr(1)
25140 .m(3)
25141 .n(4)
25142 .k(8)
25143 .a_stride(11)
25144 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
25145 }
25146
25147 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_eq_8_subtile) {
25148 TEST_REQUIRES_X86_XOP;
25149 for (uint32_t m = 1; m <= 3; m++) {
25150 for (uint32_t n = 1; n <= 4; n++) {
25151 GemmMicrokernelTester()
25152 .mr(3)
25153 .nr(4)
25154 .kr(8)
25155 .sr(1)
25156 .m(m)
25157 .n(n)
25158 .k(8)
25159 .iterations(1)
25160 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
25161 }
25162 }
25163 }
25164
25165 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_eq_8_subtile_m) {
25166 TEST_REQUIRES_X86_XOP;
25167 for (uint32_t m = 1; m <= 3; m++) {
25168 GemmMicrokernelTester()
25169 .mr(3)
25170 .nr(4)
25171 .kr(8)
25172 .sr(1)
25173 .m(m)
25174 .n(4)
25175 .k(8)
25176 .iterations(1)
25177 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
25178 }
25179 }
25180
25181 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_eq_8_subtile_n) {
25182 TEST_REQUIRES_X86_XOP;
25183 for (uint32_t n = 1; n <= 4; n++) {
25184 GemmMicrokernelTester()
25185 .mr(3)
25186 .nr(4)
25187 .kr(8)
25188 .sr(1)
25189 .m(3)
25190 .n(n)
25191 .k(8)
25192 .iterations(1)
25193 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
25194 }
25195 }
25196
25197 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_lt_8) {
25198 TEST_REQUIRES_X86_XOP;
25199 for (size_t k = 1; k < 8; k++) {
25200 GemmMicrokernelTester()
25201 .mr(3)
25202 .nr(4)
25203 .kr(8)
25204 .sr(1)
25205 .m(3)
25206 .n(4)
25207 .k(k)
25208 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
25209 }
25210 }
25211
25212 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_lt_8_strided_a) {
25213 TEST_REQUIRES_X86_XOP;
25214 for (size_t k = 1; k < 8; k++) {
25215 GemmMicrokernelTester()
25216 .mr(3)
25217 .nr(4)
25218 .kr(8)
25219 .sr(1)
25220 .m(3)
25221 .n(4)
25222 .k(k)
25223 .a_stride(11)
25224 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
25225 }
25226 }
25227
25228 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_lt_8_subtile) {
25229 TEST_REQUIRES_X86_XOP;
25230 for (size_t k = 1; k < 8; k++) {
25231 for (uint32_t m = 1; m <= 3; m++) {
25232 for (uint32_t n = 1; n <= 4; n++) {
25233 GemmMicrokernelTester()
25234 .mr(3)
25235 .nr(4)
25236 .kr(8)
25237 .sr(1)
25238 .m(m)
25239 .n(n)
25240 .k(k)
25241 .iterations(1)
25242 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
25243 }
25244 }
25245 }
25246 }
25247
25248 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_gt_8) {
25249 TEST_REQUIRES_X86_XOP;
25250 for (size_t k = 9; k < 16; k++) {
25251 GemmMicrokernelTester()
25252 .mr(3)
25253 .nr(4)
25254 .kr(8)
25255 .sr(1)
25256 .m(3)
25257 .n(4)
25258 .k(k)
25259 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
25260 }
25261 }
25262
25263 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_gt_8_strided_a) {
25264 TEST_REQUIRES_X86_XOP;
25265 for (size_t k = 9; k < 16; k++) {
25266 GemmMicrokernelTester()
25267 .mr(3)
25268 .nr(4)
25269 .kr(8)
25270 .sr(1)
25271 .m(3)
25272 .n(4)
25273 .k(k)
25274 .a_stride(19)
25275 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
25276 }
25277 }
25278
25279 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_gt_8_subtile) {
25280 TEST_REQUIRES_X86_XOP;
25281 for (size_t k = 9; k < 16; k++) {
25282 for (uint32_t m = 1; m <= 3; m++) {
25283 for (uint32_t n = 1; n <= 4; n++) {
25284 GemmMicrokernelTester()
25285 .mr(3)
25286 .nr(4)
25287 .kr(8)
25288 .sr(1)
25289 .m(m)
25290 .n(n)
25291 .k(k)
25292 .iterations(1)
25293 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
25294 }
25295 }
25296 }
25297 }
25298
25299 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_div_8) {
25300 TEST_REQUIRES_X86_XOP;
25301 for (size_t k = 16; k <= 80; k += 8) {
25302 GemmMicrokernelTester()
25303 .mr(3)
25304 .nr(4)
25305 .kr(8)
25306 .sr(1)
25307 .m(3)
25308 .n(4)
25309 .k(k)
25310 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
25311 }
25312 }
25313
25314 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_div_8_strided_a) {
25315 TEST_REQUIRES_X86_XOP;
25316 for (size_t k = 16; k <= 80; k += 8) {
25317 GemmMicrokernelTester()
25318 .mr(3)
25319 .nr(4)
25320 .kr(8)
25321 .sr(1)
25322 .m(3)
25323 .n(4)
25324 .k(k)
25325 .a_stride(83)
25326 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
25327 }
25328 }
25329
25330 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_div_8_subtile) {
25331 TEST_REQUIRES_X86_XOP;
25332 for (size_t k = 16; k <= 80; k += 8) {
25333 for (uint32_t m = 1; m <= 3; m++) {
25334 for (uint32_t n = 1; n <= 4; n++) {
25335 GemmMicrokernelTester()
25336 .mr(3)
25337 .nr(4)
25338 .kr(8)
25339 .sr(1)
25340 .m(m)
25341 .n(n)
25342 .k(k)
25343 .iterations(1)
25344 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
25345 }
25346 }
25347 }
25348 }
25349
25350 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, n_gt_4) {
25351 TEST_REQUIRES_X86_XOP;
25352 for (uint32_t n = 5; n < 8; n++) {
25353 for (size_t k = 1; k <= 40; k += 9) {
25354 GemmMicrokernelTester()
25355 .mr(3)
25356 .nr(4)
25357 .kr(8)
25358 .sr(1)
25359 .m(3)
25360 .n(4)
25361 .k(k)
25362 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
25363 }
25364 }
25365 }
25366
25367 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, n_gt_4_strided_cn) {
25368 TEST_REQUIRES_X86_XOP;
25369 for (uint32_t n = 5; n < 8; n++) {
25370 for (size_t k = 1; k <= 40; k += 9) {
25371 GemmMicrokernelTester()
25372 .mr(3)
25373 .nr(4)
25374 .kr(8)
25375 .sr(1)
25376 .m(3)
25377 .n(4)
25378 .k(k)
25379 .cn_stride(7)
25380 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
25381 }
25382 }
25383 }
25384
25385 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, n_gt_4_strided_a) {
25386 TEST_REQUIRES_X86_XOP;
25387 for (uint32_t n = 5; n < 8; n++) {
25388 for (size_t k = 1; k <= 40; k += 9) {
25389 GemmMicrokernelTester()
25390 .mr(3)
25391 .nr(4)
25392 .kr(8)
25393 .sr(1)
25394 .m(3)
25395 .n(n)
25396 .k(k)
25397 .a_stride(43)
25398 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
25399 }
25400 }
25401 }
25402
25403 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, n_gt_4_subtile) {
25404 TEST_REQUIRES_X86_XOP;
25405 for (uint32_t n = 5; n < 8; n++) {
25406 for (size_t k = 1; k <= 40; k += 9) {
25407 for (uint32_t m = 1; m <= 3; m++) {
25408 GemmMicrokernelTester()
25409 .mr(3)
25410 .nr(4)
25411 .kr(8)
25412 .sr(1)
25413 .m(m)
25414 .n(n)
25415 .k(k)
25416 .iterations(1)
25417 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
25418 }
25419 }
25420 }
25421 }
25422
25423 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, n_div_4) {
25424 TEST_REQUIRES_X86_XOP;
25425 for (uint32_t n = 8; n <= 12; n += 4) {
25426 for (size_t k = 1; k <= 40; k += 9) {
25427 GemmMicrokernelTester()
25428 .mr(3)
25429 .nr(4)
25430 .kr(8)
25431 .sr(1)
25432 .m(3)
25433 .n(4)
25434 .k(k)
25435 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
25436 }
25437 }
25438 }
25439
25440 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, n_div_4_strided_cn) {
25441 TEST_REQUIRES_X86_XOP;
25442 for (uint32_t n = 8; n <= 12; n += 4) {
25443 for (size_t k = 1; k <= 40; k += 9) {
25444 GemmMicrokernelTester()
25445 .mr(3)
25446 .nr(4)
25447 .kr(8)
25448 .sr(1)
25449 .m(3)
25450 .n(n)
25451 .k(k)
25452 .cn_stride(7)
25453 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
25454 }
25455 }
25456 }
25457
25458 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, n_div_4_strided_a) {
25459 TEST_REQUIRES_X86_XOP;
25460 for (uint32_t n = 8; n <= 12; n += 4) {
25461 for (size_t k = 1; k <= 40; k += 9) {
25462 GemmMicrokernelTester()
25463 .mr(3)
25464 .nr(4)
25465 .kr(8)
25466 .sr(1)
25467 .m(3)
25468 .n(n)
25469 .k(k)
25470 .a_stride(43)
25471 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
25472 }
25473 }
25474 }
25475
25476 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, n_div_4_subtile) {
25477 TEST_REQUIRES_X86_XOP;
25478 for (uint32_t n = 8; n <= 12; n += 4) {
25479 for (size_t k = 1; k <= 40; k += 9) {
25480 for (uint32_t m = 1; m <= 3; m++) {
25481 GemmMicrokernelTester()
25482 .mr(3)
25483 .nr(4)
25484 .kr(8)
25485 .sr(1)
25486 .m(m)
25487 .n(n)
25488 .k(k)
25489 .iterations(1)
25490 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
25491 }
25492 }
25493 }
25494 }
25495
25496 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, strided_cm_subtile) {
25497 TEST_REQUIRES_X86_XOP;
25498 for (size_t k = 1; k <= 40; k += 9) {
25499 for (uint32_t m = 1; m <= 3; m++) {
25500 for (uint32_t n = 1; n <= 4; n++) {
25501 GemmMicrokernelTester()
25502 .mr(3)
25503 .nr(4)
25504 .kr(8)
25505 .sr(1)
25506 .m(m)
25507 .n(n)
25508 .k(k)
25509 .cm_stride(7)
25510 .iterations(1)
25511 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
25512 }
25513 }
25514 }
25515 }
25516
25517 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, qmin) {
25518 TEST_REQUIRES_X86_XOP;
25519 GemmMicrokernelTester()
25520 .mr(3)
25521 .nr(4)
25522 .kr(8)
25523 .sr(1)
25524 .m(3)
25525 .n(4)
25526 .k(8)
25527 .qmin(128)
25528 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
25529 }
25530
25531 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, qmax) {
25532 TEST_REQUIRES_X86_XOP;
25533 GemmMicrokernelTester()
25534 .mr(3)
25535 .nr(4)
25536 .kr(8)
25537 .sr(1)
25538 .m(3)
25539 .n(4)
25540 .k(8)
25541 .qmax(128)
25542 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
25543 }
25544
25545 TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, strided_cm) {
25546 TEST_REQUIRES_X86_XOP;
25547 GemmMicrokernelTester()
25548 .mr(3)
25549 .nr(4)
25550 .kr(8)
25551 .sr(1)
25552 .m(3)
25553 .n(4)
25554 .k(8)
25555 .cm_stride(7)
25556 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
25557 }
25558#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
25559
25560
25561#if XNN_ARCH_X86 || XNN_ARCH_X86_64
Marat Dukhan0b043742021-06-02 18:29:11 -070025562 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AVX2, k_eq_8) {
25563 TEST_REQUIRES_X86_AVX2;
25564 GemmMicrokernelTester()
25565 .mr(1)
25566 .nr(8)
25567 .kr(8)
25568 .sr(1)
25569 .m(1)
25570 .n(8)
25571 .k(8)
25572 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
25573 }
25574
25575 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AVX2, strided_cn) {
25576 TEST_REQUIRES_X86_AVX2;
25577 GemmMicrokernelTester()
25578 .mr(1)
25579 .nr(8)
25580 .kr(8)
25581 .sr(1)
25582 .m(1)
25583 .n(8)
25584 .k(8)
25585 .cn_stride(11)
25586 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
25587 }
25588
25589 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AVX2, k_eq_8_strided_a) {
25590 TEST_REQUIRES_X86_AVX2;
25591 GemmMicrokernelTester()
25592 .mr(1)
25593 .nr(8)
25594 .kr(8)
25595 .sr(1)
25596 .m(1)
25597 .n(8)
25598 .k(8)
25599 .a_stride(11)
25600 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
25601 }
25602
25603 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AVX2, k_eq_8_subtile) {
25604 TEST_REQUIRES_X86_AVX2;
25605 for (uint32_t m = 1; m <= 1; m++) {
25606 for (uint32_t n = 1; n <= 8; n++) {
25607 GemmMicrokernelTester()
25608 .mr(1)
25609 .nr(8)
25610 .kr(8)
25611 .sr(1)
25612 .m(m)
25613 .n(n)
25614 .k(8)
25615 .iterations(1)
25616 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
25617 }
25618 }
25619 }
25620
25621 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AVX2, k_eq_8_subtile_m) {
25622 TEST_REQUIRES_X86_AVX2;
25623 for (uint32_t m = 1; m <= 1; m++) {
25624 GemmMicrokernelTester()
25625 .mr(1)
25626 .nr(8)
25627 .kr(8)
25628 .sr(1)
25629 .m(m)
25630 .n(8)
25631 .k(8)
25632 .iterations(1)
25633 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
25634 }
25635 }
25636
25637 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AVX2, k_eq_8_subtile_n) {
25638 TEST_REQUIRES_X86_AVX2;
25639 for (uint32_t n = 1; n <= 8; n++) {
25640 GemmMicrokernelTester()
25641 .mr(1)
25642 .nr(8)
25643 .kr(8)
25644 .sr(1)
25645 .m(1)
25646 .n(n)
25647 .k(8)
25648 .iterations(1)
25649 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
25650 }
25651 }
25652
25653 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AVX2, k_lt_8) {
25654 TEST_REQUIRES_X86_AVX2;
25655 for (size_t k = 1; k < 8; k++) {
25656 GemmMicrokernelTester()
25657 .mr(1)
25658 .nr(8)
25659 .kr(8)
25660 .sr(1)
25661 .m(1)
25662 .n(8)
25663 .k(k)
25664 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
25665 }
25666 }
25667
25668 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AVX2, k_lt_8_strided_a) {
25669 TEST_REQUIRES_X86_AVX2;
25670 for (size_t k = 1; k < 8; k++) {
25671 GemmMicrokernelTester()
25672 .mr(1)
25673 .nr(8)
25674 .kr(8)
25675 .sr(1)
25676 .m(1)
25677 .n(8)
25678 .k(k)
25679 .a_stride(11)
25680 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
25681 }
25682 }
25683
25684 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AVX2, k_lt_8_subtile) {
25685 TEST_REQUIRES_X86_AVX2;
25686 for (size_t k = 1; k < 8; k++) {
25687 for (uint32_t m = 1; m <= 1; m++) {
25688 for (uint32_t n = 1; n <= 8; n++) {
25689 GemmMicrokernelTester()
25690 .mr(1)
25691 .nr(8)
25692 .kr(8)
25693 .sr(1)
25694 .m(m)
25695 .n(n)
25696 .k(k)
25697 .iterations(1)
25698 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
25699 }
25700 }
25701 }
25702 }
25703
25704 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AVX2, k_gt_8) {
25705 TEST_REQUIRES_X86_AVX2;
25706 for (size_t k = 9; k < 16; k++) {
25707 GemmMicrokernelTester()
25708 .mr(1)
25709 .nr(8)
25710 .kr(8)
25711 .sr(1)
25712 .m(1)
25713 .n(8)
25714 .k(k)
25715 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
25716 }
25717 }
25718
25719 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AVX2, k_gt_8_strided_a) {
25720 TEST_REQUIRES_X86_AVX2;
25721 for (size_t k = 9; k < 16; k++) {
25722 GemmMicrokernelTester()
25723 .mr(1)
25724 .nr(8)
25725 .kr(8)
25726 .sr(1)
25727 .m(1)
25728 .n(8)
25729 .k(k)
25730 .a_stride(19)
25731 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
25732 }
25733 }
25734
25735 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AVX2, k_gt_8_subtile) {
25736 TEST_REQUIRES_X86_AVX2;
25737 for (size_t k = 9; k < 16; k++) {
25738 for (uint32_t m = 1; m <= 1; m++) {
25739 for (uint32_t n = 1; n <= 8; n++) {
25740 GemmMicrokernelTester()
25741 .mr(1)
25742 .nr(8)
25743 .kr(8)
25744 .sr(1)
25745 .m(m)
25746 .n(n)
25747 .k(k)
25748 .iterations(1)
25749 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
25750 }
25751 }
25752 }
25753 }
25754
25755 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AVX2, k_div_8) {
25756 TEST_REQUIRES_X86_AVX2;
25757 for (size_t k = 16; k <= 80; k += 8) {
25758 GemmMicrokernelTester()
25759 .mr(1)
25760 .nr(8)
25761 .kr(8)
25762 .sr(1)
25763 .m(1)
25764 .n(8)
25765 .k(k)
25766 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
25767 }
25768 }
25769
25770 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AVX2, k_div_8_strided_a) {
25771 TEST_REQUIRES_X86_AVX2;
25772 for (size_t k = 16; k <= 80; k += 8) {
25773 GemmMicrokernelTester()
25774 .mr(1)
25775 .nr(8)
25776 .kr(8)
25777 .sr(1)
25778 .m(1)
25779 .n(8)
25780 .k(k)
25781 .a_stride(83)
25782 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
25783 }
25784 }
25785
25786 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AVX2, k_div_8_subtile) {
25787 TEST_REQUIRES_X86_AVX2;
25788 for (size_t k = 16; k <= 80; k += 8) {
25789 for (uint32_t m = 1; m <= 1; m++) {
25790 for (uint32_t n = 1; n <= 8; n++) {
25791 GemmMicrokernelTester()
25792 .mr(1)
25793 .nr(8)
25794 .kr(8)
25795 .sr(1)
25796 .m(m)
25797 .n(n)
25798 .k(k)
25799 .iterations(1)
25800 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
25801 }
25802 }
25803 }
25804 }
25805
25806 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AVX2, n_gt_8) {
25807 TEST_REQUIRES_X86_AVX2;
25808 for (uint32_t n = 9; n < 16; n++) {
25809 for (size_t k = 1; k <= 40; k += 9) {
25810 GemmMicrokernelTester()
25811 .mr(1)
25812 .nr(8)
25813 .kr(8)
25814 .sr(1)
25815 .m(1)
25816 .n(8)
25817 .k(k)
25818 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
25819 }
25820 }
25821 }
25822
25823 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AVX2, n_gt_8_strided_cn) {
25824 TEST_REQUIRES_X86_AVX2;
25825 for (uint32_t n = 9; n < 16; n++) {
25826 for (size_t k = 1; k <= 40; k += 9) {
25827 GemmMicrokernelTester()
25828 .mr(1)
25829 .nr(8)
25830 .kr(8)
25831 .sr(1)
25832 .m(1)
25833 .n(8)
25834 .k(k)
25835 .cn_stride(11)
25836 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
25837 }
25838 }
25839 }
25840
25841 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AVX2, n_gt_8_strided_a) {
25842 TEST_REQUIRES_X86_AVX2;
25843 for (uint32_t n = 9; n < 16; n++) {
25844 for (size_t k = 1; k <= 40; k += 9) {
25845 GemmMicrokernelTester()
25846 .mr(1)
25847 .nr(8)
25848 .kr(8)
25849 .sr(1)
25850 .m(1)
25851 .n(n)
25852 .k(k)
25853 .a_stride(43)
25854 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
25855 }
25856 }
25857 }
25858
25859 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AVX2, n_gt_8_subtile) {
25860 TEST_REQUIRES_X86_AVX2;
25861 for (uint32_t n = 9; n < 16; n++) {
25862 for (size_t k = 1; k <= 40; k += 9) {
25863 for (uint32_t m = 1; m <= 1; m++) {
25864 GemmMicrokernelTester()
25865 .mr(1)
25866 .nr(8)
25867 .kr(8)
25868 .sr(1)
25869 .m(m)
25870 .n(n)
25871 .k(k)
25872 .iterations(1)
25873 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
25874 }
25875 }
25876 }
25877 }
25878
25879 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AVX2, n_div_8) {
25880 TEST_REQUIRES_X86_AVX2;
25881 for (uint32_t n = 16; n <= 24; n += 8) {
25882 for (size_t k = 1; k <= 40; k += 9) {
25883 GemmMicrokernelTester()
25884 .mr(1)
25885 .nr(8)
25886 .kr(8)
25887 .sr(1)
25888 .m(1)
25889 .n(8)
25890 .k(k)
25891 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
25892 }
25893 }
25894 }
25895
25896 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AVX2, n_div_8_strided_cn) {
25897 TEST_REQUIRES_X86_AVX2;
25898 for (uint32_t n = 16; n <= 24; n += 8) {
25899 for (size_t k = 1; k <= 40; k += 9) {
25900 GemmMicrokernelTester()
25901 .mr(1)
25902 .nr(8)
25903 .kr(8)
25904 .sr(1)
25905 .m(1)
25906 .n(n)
25907 .k(k)
25908 .cn_stride(11)
25909 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
25910 }
25911 }
25912 }
25913
25914 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AVX2, n_div_8_strided_a) {
25915 TEST_REQUIRES_X86_AVX2;
25916 for (uint32_t n = 16; n <= 24; n += 8) {
25917 for (size_t k = 1; k <= 40; k += 9) {
25918 GemmMicrokernelTester()
25919 .mr(1)
25920 .nr(8)
25921 .kr(8)
25922 .sr(1)
25923 .m(1)
25924 .n(n)
25925 .k(k)
25926 .a_stride(43)
25927 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
25928 }
25929 }
25930 }
25931
25932 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AVX2, n_div_8_subtile) {
25933 TEST_REQUIRES_X86_AVX2;
25934 for (uint32_t n = 16; n <= 24; n += 8) {
25935 for (size_t k = 1; k <= 40; k += 9) {
25936 for (uint32_t m = 1; m <= 1; m++) {
25937 GemmMicrokernelTester()
25938 .mr(1)
25939 .nr(8)
25940 .kr(8)
25941 .sr(1)
25942 .m(m)
25943 .n(n)
25944 .k(k)
25945 .iterations(1)
25946 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
25947 }
25948 }
25949 }
25950 }
25951
25952 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AVX2, strided_cm_subtile) {
25953 TEST_REQUIRES_X86_AVX2;
25954 for (size_t k = 1; k <= 40; k += 9) {
25955 for (uint32_t m = 1; m <= 1; m++) {
25956 for (uint32_t n = 1; n <= 8; n++) {
25957 GemmMicrokernelTester()
25958 .mr(1)
25959 .nr(8)
25960 .kr(8)
25961 .sr(1)
25962 .m(m)
25963 .n(n)
25964 .k(k)
25965 .cm_stride(11)
25966 .iterations(1)
25967 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
25968 }
25969 }
25970 }
25971 }
25972
25973 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AVX2, qmin) {
25974 TEST_REQUIRES_X86_AVX2;
25975 GemmMicrokernelTester()
25976 .mr(1)
25977 .nr(8)
25978 .kr(8)
25979 .sr(1)
25980 .m(1)
25981 .n(8)
25982 .k(8)
25983 .qmin(128)
25984 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
25985 }
25986
25987 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AVX2, qmax) {
25988 TEST_REQUIRES_X86_AVX2;
25989 GemmMicrokernelTester()
25990 .mr(1)
25991 .nr(8)
25992 .kr(8)
25993 .sr(1)
25994 .m(1)
25995 .n(8)
25996 .k(8)
25997 .qmax(128)
25998 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
25999 }
26000
26001 TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AVX2, strided_cm) {
26002 TEST_REQUIRES_X86_AVX2;
26003 GemmMicrokernelTester()
26004 .mr(1)
26005 .nr(8)
26006 .kr(8)
26007 .sr(1)
26008 .m(1)
26009 .n(8)
26010 .k(8)
26011 .cm_stride(11)
26012 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
26013 }
26014#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
26015
26016
26017#if XNN_ARCH_X86 || XNN_ARCH_X86_64
26018 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AVX2, k_eq_8) {
26019 TEST_REQUIRES_X86_AVX2;
26020 GemmMicrokernelTester()
26021 .mr(2)
26022 .nr(8)
26023 .kr(8)
26024 .sr(1)
26025 .m(2)
26026 .n(8)
26027 .k(8)
26028 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
26029 }
26030
26031 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AVX2, strided_cn) {
26032 TEST_REQUIRES_X86_AVX2;
26033 GemmMicrokernelTester()
26034 .mr(2)
26035 .nr(8)
26036 .kr(8)
26037 .sr(1)
26038 .m(2)
26039 .n(8)
26040 .k(8)
26041 .cn_stride(11)
26042 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
26043 }
26044
26045 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AVX2, k_eq_8_strided_a) {
26046 TEST_REQUIRES_X86_AVX2;
26047 GemmMicrokernelTester()
26048 .mr(2)
26049 .nr(8)
26050 .kr(8)
26051 .sr(1)
26052 .m(2)
26053 .n(8)
26054 .k(8)
26055 .a_stride(11)
26056 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
26057 }
26058
26059 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AVX2, k_eq_8_subtile) {
26060 TEST_REQUIRES_X86_AVX2;
26061 for (uint32_t m = 1; m <= 2; m++) {
26062 for (uint32_t n = 1; n <= 8; n++) {
26063 GemmMicrokernelTester()
26064 .mr(2)
26065 .nr(8)
26066 .kr(8)
26067 .sr(1)
26068 .m(m)
26069 .n(n)
26070 .k(8)
26071 .iterations(1)
26072 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
26073 }
26074 }
26075 }
26076
26077 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AVX2, k_eq_8_subtile_m) {
26078 TEST_REQUIRES_X86_AVX2;
26079 for (uint32_t m = 1; m <= 2; m++) {
26080 GemmMicrokernelTester()
26081 .mr(2)
26082 .nr(8)
26083 .kr(8)
26084 .sr(1)
26085 .m(m)
26086 .n(8)
26087 .k(8)
26088 .iterations(1)
26089 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
26090 }
26091 }
26092
26093 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AVX2, k_eq_8_subtile_n) {
26094 TEST_REQUIRES_X86_AVX2;
26095 for (uint32_t n = 1; n <= 8; n++) {
26096 GemmMicrokernelTester()
26097 .mr(2)
26098 .nr(8)
26099 .kr(8)
26100 .sr(1)
26101 .m(2)
26102 .n(n)
26103 .k(8)
26104 .iterations(1)
26105 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
26106 }
26107 }
26108
26109 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AVX2, k_lt_8) {
26110 TEST_REQUIRES_X86_AVX2;
26111 for (size_t k = 1; k < 8; k++) {
26112 GemmMicrokernelTester()
26113 .mr(2)
26114 .nr(8)
26115 .kr(8)
26116 .sr(1)
26117 .m(2)
26118 .n(8)
26119 .k(k)
26120 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
26121 }
26122 }
26123
26124 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AVX2, k_lt_8_strided_a) {
26125 TEST_REQUIRES_X86_AVX2;
26126 for (size_t k = 1; k < 8; k++) {
26127 GemmMicrokernelTester()
26128 .mr(2)
26129 .nr(8)
26130 .kr(8)
26131 .sr(1)
26132 .m(2)
26133 .n(8)
26134 .k(k)
26135 .a_stride(11)
26136 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
26137 }
26138 }
26139
26140 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AVX2, k_lt_8_subtile) {
26141 TEST_REQUIRES_X86_AVX2;
26142 for (size_t k = 1; k < 8; k++) {
26143 for (uint32_t m = 1; m <= 2; m++) {
26144 for (uint32_t n = 1; n <= 8; n++) {
26145 GemmMicrokernelTester()
26146 .mr(2)
26147 .nr(8)
26148 .kr(8)
26149 .sr(1)
26150 .m(m)
26151 .n(n)
26152 .k(k)
26153 .iterations(1)
26154 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
26155 }
26156 }
26157 }
26158 }
26159
26160 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AVX2, k_gt_8) {
26161 TEST_REQUIRES_X86_AVX2;
26162 for (size_t k = 9; k < 16; k++) {
26163 GemmMicrokernelTester()
26164 .mr(2)
26165 .nr(8)
26166 .kr(8)
26167 .sr(1)
26168 .m(2)
26169 .n(8)
26170 .k(k)
26171 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
26172 }
26173 }
26174
26175 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AVX2, k_gt_8_strided_a) {
26176 TEST_REQUIRES_X86_AVX2;
26177 for (size_t k = 9; k < 16; k++) {
26178 GemmMicrokernelTester()
26179 .mr(2)
26180 .nr(8)
26181 .kr(8)
26182 .sr(1)
26183 .m(2)
26184 .n(8)
26185 .k(k)
26186 .a_stride(19)
26187 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
26188 }
26189 }
26190
26191 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AVX2, k_gt_8_subtile) {
26192 TEST_REQUIRES_X86_AVX2;
26193 for (size_t k = 9; k < 16; k++) {
26194 for (uint32_t m = 1; m <= 2; m++) {
26195 for (uint32_t n = 1; n <= 8; n++) {
26196 GemmMicrokernelTester()
26197 .mr(2)
26198 .nr(8)
26199 .kr(8)
26200 .sr(1)
26201 .m(m)
26202 .n(n)
26203 .k(k)
26204 .iterations(1)
26205 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
26206 }
26207 }
26208 }
26209 }
26210
26211 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AVX2, k_div_8) {
26212 TEST_REQUIRES_X86_AVX2;
26213 for (size_t k = 16; k <= 80; k += 8) {
26214 GemmMicrokernelTester()
26215 .mr(2)
26216 .nr(8)
26217 .kr(8)
26218 .sr(1)
26219 .m(2)
26220 .n(8)
26221 .k(k)
26222 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
26223 }
26224 }
26225
26226 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AVX2, k_div_8_strided_a) {
26227 TEST_REQUIRES_X86_AVX2;
26228 for (size_t k = 16; k <= 80; k += 8) {
26229 GemmMicrokernelTester()
26230 .mr(2)
26231 .nr(8)
26232 .kr(8)
26233 .sr(1)
26234 .m(2)
26235 .n(8)
26236 .k(k)
26237 .a_stride(83)
26238 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
26239 }
26240 }
26241
26242 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AVX2, k_div_8_subtile) {
26243 TEST_REQUIRES_X86_AVX2;
26244 for (size_t k = 16; k <= 80; k += 8) {
26245 for (uint32_t m = 1; m <= 2; m++) {
26246 for (uint32_t n = 1; n <= 8; n++) {
26247 GemmMicrokernelTester()
26248 .mr(2)
26249 .nr(8)
26250 .kr(8)
26251 .sr(1)
26252 .m(m)
26253 .n(n)
26254 .k(k)
26255 .iterations(1)
26256 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
26257 }
26258 }
26259 }
26260 }
26261
26262 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AVX2, n_gt_8) {
26263 TEST_REQUIRES_X86_AVX2;
26264 for (uint32_t n = 9; n < 16; n++) {
26265 for (size_t k = 1; k <= 40; k += 9) {
26266 GemmMicrokernelTester()
26267 .mr(2)
26268 .nr(8)
26269 .kr(8)
26270 .sr(1)
26271 .m(2)
26272 .n(8)
26273 .k(k)
26274 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
26275 }
26276 }
26277 }
26278
26279 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AVX2, n_gt_8_strided_cn) {
26280 TEST_REQUIRES_X86_AVX2;
26281 for (uint32_t n = 9; n < 16; n++) {
26282 for (size_t k = 1; k <= 40; k += 9) {
26283 GemmMicrokernelTester()
26284 .mr(2)
26285 .nr(8)
26286 .kr(8)
26287 .sr(1)
26288 .m(2)
26289 .n(8)
26290 .k(k)
26291 .cn_stride(11)
26292 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
26293 }
26294 }
26295 }
26296
26297 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AVX2, n_gt_8_strided_a) {
26298 TEST_REQUIRES_X86_AVX2;
26299 for (uint32_t n = 9; n < 16; n++) {
26300 for (size_t k = 1; k <= 40; k += 9) {
26301 GemmMicrokernelTester()
26302 .mr(2)
26303 .nr(8)
26304 .kr(8)
26305 .sr(1)
26306 .m(2)
26307 .n(n)
26308 .k(k)
26309 .a_stride(43)
26310 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
26311 }
26312 }
26313 }
26314
26315 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AVX2, n_gt_8_subtile) {
26316 TEST_REQUIRES_X86_AVX2;
26317 for (uint32_t n = 9; n < 16; n++) {
26318 for (size_t k = 1; k <= 40; k += 9) {
26319 for (uint32_t m = 1; m <= 2; m++) {
26320 GemmMicrokernelTester()
26321 .mr(2)
26322 .nr(8)
26323 .kr(8)
26324 .sr(1)
26325 .m(m)
26326 .n(n)
26327 .k(k)
26328 .iterations(1)
26329 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
26330 }
26331 }
26332 }
26333 }
26334
26335 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AVX2, n_div_8) {
26336 TEST_REQUIRES_X86_AVX2;
26337 for (uint32_t n = 16; n <= 24; n += 8) {
26338 for (size_t k = 1; k <= 40; k += 9) {
26339 GemmMicrokernelTester()
26340 .mr(2)
26341 .nr(8)
26342 .kr(8)
26343 .sr(1)
26344 .m(2)
26345 .n(8)
26346 .k(k)
26347 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
26348 }
26349 }
26350 }
26351
26352 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AVX2, n_div_8_strided_cn) {
26353 TEST_REQUIRES_X86_AVX2;
26354 for (uint32_t n = 16; n <= 24; n += 8) {
26355 for (size_t k = 1; k <= 40; k += 9) {
26356 GemmMicrokernelTester()
26357 .mr(2)
26358 .nr(8)
26359 .kr(8)
26360 .sr(1)
26361 .m(2)
26362 .n(n)
26363 .k(k)
26364 .cn_stride(11)
26365 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
26366 }
26367 }
26368 }
26369
26370 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AVX2, n_div_8_strided_a) {
26371 TEST_REQUIRES_X86_AVX2;
26372 for (uint32_t n = 16; n <= 24; n += 8) {
26373 for (size_t k = 1; k <= 40; k += 9) {
26374 GemmMicrokernelTester()
26375 .mr(2)
26376 .nr(8)
26377 .kr(8)
26378 .sr(1)
26379 .m(2)
26380 .n(n)
26381 .k(k)
26382 .a_stride(43)
26383 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
26384 }
26385 }
26386 }
26387
26388 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AVX2, n_div_8_subtile) {
26389 TEST_REQUIRES_X86_AVX2;
26390 for (uint32_t n = 16; n <= 24; n += 8) {
26391 for (size_t k = 1; k <= 40; k += 9) {
26392 for (uint32_t m = 1; m <= 2; m++) {
26393 GemmMicrokernelTester()
26394 .mr(2)
26395 .nr(8)
26396 .kr(8)
26397 .sr(1)
26398 .m(m)
26399 .n(n)
26400 .k(k)
26401 .iterations(1)
26402 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
26403 }
26404 }
26405 }
26406 }
26407
26408 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AVX2, strided_cm_subtile) {
26409 TEST_REQUIRES_X86_AVX2;
26410 for (size_t k = 1; k <= 40; k += 9) {
26411 for (uint32_t m = 1; m <= 2; m++) {
26412 for (uint32_t n = 1; n <= 8; n++) {
26413 GemmMicrokernelTester()
26414 .mr(2)
26415 .nr(8)
26416 .kr(8)
26417 .sr(1)
26418 .m(m)
26419 .n(n)
26420 .k(k)
26421 .cm_stride(11)
26422 .iterations(1)
26423 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
26424 }
26425 }
26426 }
26427 }
26428
26429 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AVX2, qmin) {
26430 TEST_REQUIRES_X86_AVX2;
26431 GemmMicrokernelTester()
26432 .mr(2)
26433 .nr(8)
26434 .kr(8)
26435 .sr(1)
26436 .m(2)
26437 .n(8)
26438 .k(8)
26439 .qmin(128)
26440 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
26441 }
26442
26443 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AVX2, qmax) {
26444 TEST_REQUIRES_X86_AVX2;
26445 GemmMicrokernelTester()
26446 .mr(2)
26447 .nr(8)
26448 .kr(8)
26449 .sr(1)
26450 .m(2)
26451 .n(8)
26452 .k(8)
26453 .qmax(128)
26454 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
26455 }
26456
26457 TEST(QC8_GEMM_MINMAX_FP32_2X8C8__AVX2, strided_cm) {
26458 TEST_REQUIRES_X86_AVX2;
26459 GemmMicrokernelTester()
26460 .mr(2)
26461 .nr(8)
26462 .kr(8)
26463 .sr(1)
26464 .m(2)
26465 .n(8)
26466 .k(8)
26467 .cm_stride(11)
26468 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
26469 }
26470#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
26471
26472
26473#if XNN_ARCH_X86 || XNN_ARCH_X86_64
26474 TEST(QC8_GEMM_MINMAX_FP32_3X8C8__AVX2, k_eq_8) {
26475 TEST_REQUIRES_X86_AVX2;
26476 GemmMicrokernelTester()
26477 .mr(3)
26478 .nr(8)
26479 .kr(8)
26480 .sr(1)
26481 .m(3)
26482 .n(8)
26483 .k(8)
26484 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
26485 }
26486
26487 TEST(QC8_GEMM_MINMAX_FP32_3X8C8__AVX2, strided_cn) {
26488 TEST_REQUIRES_X86_AVX2;
26489 GemmMicrokernelTester()
26490 .mr(3)
26491 .nr(8)
26492 .kr(8)
26493 .sr(1)
26494 .m(3)
26495 .n(8)
26496 .k(8)
26497 .cn_stride(11)
26498 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
26499 }
26500
26501 TEST(QC8_GEMM_MINMAX_FP32_3X8C8__AVX2, k_eq_8_strided_a) {
26502 TEST_REQUIRES_X86_AVX2;
26503 GemmMicrokernelTester()
26504 .mr(3)
26505 .nr(8)
26506 .kr(8)
26507 .sr(1)
26508 .m(3)
26509 .n(8)
26510 .k(8)
26511 .a_stride(11)
26512 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
26513 }
26514
26515 TEST(QC8_GEMM_MINMAX_FP32_3X8C8__AVX2, k_eq_8_subtile) {
26516 TEST_REQUIRES_X86_AVX2;
26517 for (uint32_t m = 1; m <= 3; m++) {
26518 for (uint32_t n = 1; n <= 8; n++) {
26519 GemmMicrokernelTester()
26520 .mr(3)
26521 .nr(8)
26522 .kr(8)
26523 .sr(1)
26524 .m(m)
26525 .n(n)
26526 .k(8)
26527 .iterations(1)
26528 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
26529 }
26530 }
26531 }
26532
26533 TEST(QC8_GEMM_MINMAX_FP32_3X8C8__AVX2, k_eq_8_subtile_m) {
26534 TEST_REQUIRES_X86_AVX2;
26535 for (uint32_t m = 1; m <= 3; m++) {
26536 GemmMicrokernelTester()
26537 .mr(3)
26538 .nr(8)
26539 .kr(8)
26540 .sr(1)
26541 .m(m)
26542 .n(8)
26543 .k(8)
26544 .iterations(1)
26545 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
26546 }
26547 }
26548
26549 TEST(QC8_GEMM_MINMAX_FP32_3X8C8__AVX2, k_eq_8_subtile_n) {
26550 TEST_REQUIRES_X86_AVX2;
26551 for (uint32_t n = 1; n <= 8; n++) {
26552 GemmMicrokernelTester()
26553 .mr(3)
26554 .nr(8)
26555 .kr(8)
26556 .sr(1)
26557 .m(3)
26558 .n(n)
26559 .k(8)
26560 .iterations(1)
26561 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
26562 }
26563 }
26564
26565 TEST(QC8_GEMM_MINMAX_FP32_3X8C8__AVX2, k_lt_8) {
26566 TEST_REQUIRES_X86_AVX2;
26567 for (size_t k = 1; k < 8; k++) {
26568 GemmMicrokernelTester()
26569 .mr(3)
26570 .nr(8)
26571 .kr(8)
26572 .sr(1)
26573 .m(3)
26574 .n(8)
26575 .k(k)
26576 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
26577 }
26578 }
26579
26580 TEST(QC8_GEMM_MINMAX_FP32_3X8C8__AVX2, k_lt_8_strided_a) {
26581 TEST_REQUIRES_X86_AVX2;
26582 for (size_t k = 1; k < 8; k++) {
26583 GemmMicrokernelTester()
26584 .mr(3)
26585 .nr(8)
26586 .kr(8)
26587 .sr(1)
26588 .m(3)
26589 .n(8)
26590 .k(k)
26591 .a_stride(11)
26592 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
26593 }
26594 }
26595
26596 TEST(QC8_GEMM_MINMAX_FP32_3X8C8__AVX2, k_lt_8_subtile) {
26597 TEST_REQUIRES_X86_AVX2;
26598 for (size_t k = 1; k < 8; k++) {
26599 for (uint32_t m = 1; m <= 3; m++) {
26600 for (uint32_t n = 1; n <= 8; n++) {
26601 GemmMicrokernelTester()
26602 .mr(3)
26603 .nr(8)
26604 .kr(8)
26605 .sr(1)
26606 .m(m)
26607 .n(n)
26608 .k(k)
26609 .iterations(1)
26610 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
26611 }
26612 }
26613 }
26614 }
26615
26616 TEST(QC8_GEMM_MINMAX_FP32_3X8C8__AVX2, k_gt_8) {
26617 TEST_REQUIRES_X86_AVX2;
26618 for (size_t k = 9; k < 16; k++) {
26619 GemmMicrokernelTester()
26620 .mr(3)
26621 .nr(8)
26622 .kr(8)
26623 .sr(1)
26624 .m(3)
26625 .n(8)
26626 .k(k)
26627 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
26628 }
26629 }
26630
26631 TEST(QC8_GEMM_MINMAX_FP32_3X8C8__AVX2, k_gt_8_strided_a) {
26632 TEST_REQUIRES_X86_AVX2;
26633 for (size_t k = 9; k < 16; k++) {
26634 GemmMicrokernelTester()
26635 .mr(3)
26636 .nr(8)
26637 .kr(8)
26638 .sr(1)
26639 .m(3)
26640 .n(8)
26641 .k(k)
26642 .a_stride(19)
26643 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
26644 }
26645 }
26646
26647 TEST(QC8_GEMM_MINMAX_FP32_3X8C8__AVX2, k_gt_8_subtile) {
26648 TEST_REQUIRES_X86_AVX2;
26649 for (size_t k = 9; k < 16; k++) {
26650 for (uint32_t m = 1; m <= 3; m++) {
26651 for (uint32_t n = 1; n <= 8; n++) {
26652 GemmMicrokernelTester()
26653 .mr(3)
26654 .nr(8)
26655 .kr(8)
26656 .sr(1)
26657 .m(m)
26658 .n(n)
26659 .k(k)
26660 .iterations(1)
26661 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
26662 }
26663 }
26664 }
26665 }
26666
26667 TEST(QC8_GEMM_MINMAX_FP32_3X8C8__AVX2, k_div_8) {
26668 TEST_REQUIRES_X86_AVX2;
26669 for (size_t k = 16; k <= 80; k += 8) {
26670 GemmMicrokernelTester()
26671 .mr(3)
26672 .nr(8)
26673 .kr(8)
26674 .sr(1)
26675 .m(3)
26676 .n(8)
26677 .k(k)
26678 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
26679 }
26680 }
26681
26682 TEST(QC8_GEMM_MINMAX_FP32_3X8C8__AVX2, k_div_8_strided_a) {
26683 TEST_REQUIRES_X86_AVX2;
26684 for (size_t k = 16; k <= 80; k += 8) {
26685 GemmMicrokernelTester()
26686 .mr(3)
26687 .nr(8)
26688 .kr(8)
26689 .sr(1)
26690 .m(3)
26691 .n(8)
26692 .k(k)
26693 .a_stride(83)
26694 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
26695 }
26696 }
26697
26698 TEST(QC8_GEMM_MINMAX_FP32_3X8C8__AVX2, k_div_8_subtile) {
26699 TEST_REQUIRES_X86_AVX2;
26700 for (size_t k = 16; k <= 80; k += 8) {
26701 for (uint32_t m = 1; m <= 3; m++) {
26702 for (uint32_t n = 1; n <= 8; n++) {
26703 GemmMicrokernelTester()
26704 .mr(3)
26705 .nr(8)
26706 .kr(8)
26707 .sr(1)
26708 .m(m)
26709 .n(n)
26710 .k(k)
26711 .iterations(1)
26712 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
26713 }
26714 }
26715 }
26716 }
26717
26718 TEST(QC8_GEMM_MINMAX_FP32_3X8C8__AVX2, n_gt_8) {
26719 TEST_REQUIRES_X86_AVX2;
26720 for (uint32_t n = 9; n < 16; n++) {
26721 for (size_t k = 1; k <= 40; k += 9) {
26722 GemmMicrokernelTester()
26723 .mr(3)
26724 .nr(8)
26725 .kr(8)
26726 .sr(1)
26727 .m(3)
26728 .n(8)
26729 .k(k)
26730 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
26731 }
26732 }
26733 }
26734
26735 TEST(QC8_GEMM_MINMAX_FP32_3X8C8__AVX2, n_gt_8_strided_cn) {
26736 TEST_REQUIRES_X86_AVX2;
26737 for (uint32_t n = 9; n < 16; n++) {
26738 for (size_t k = 1; k <= 40; k += 9) {
26739 GemmMicrokernelTester()
26740 .mr(3)
26741 .nr(8)
26742 .kr(8)
26743 .sr(1)
26744 .m(3)
26745 .n(8)
26746 .k(k)
26747 .cn_stride(11)
26748 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
26749 }
26750 }
26751 }
26752
26753 TEST(QC8_GEMM_MINMAX_FP32_3X8C8__AVX2, n_gt_8_strided_a) {
26754 TEST_REQUIRES_X86_AVX2;
26755 for (uint32_t n = 9; n < 16; n++) {
26756 for (size_t k = 1; k <= 40; k += 9) {
26757 GemmMicrokernelTester()
26758 .mr(3)
26759 .nr(8)
26760 .kr(8)
26761 .sr(1)
26762 .m(3)
26763 .n(n)
26764 .k(k)
26765 .a_stride(43)
26766 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
26767 }
26768 }
26769 }
26770
26771 TEST(QC8_GEMM_MINMAX_FP32_3X8C8__AVX2, n_gt_8_subtile) {
26772 TEST_REQUIRES_X86_AVX2;
26773 for (uint32_t n = 9; n < 16; n++) {
26774 for (size_t k = 1; k <= 40; k += 9) {
26775 for (uint32_t m = 1; m <= 3; m++) {
26776 GemmMicrokernelTester()
26777 .mr(3)
26778 .nr(8)
26779 .kr(8)
26780 .sr(1)
26781 .m(m)
26782 .n(n)
26783 .k(k)
26784 .iterations(1)
26785 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
26786 }
26787 }
26788 }
26789 }
26790
26791 TEST(QC8_GEMM_MINMAX_FP32_3X8C8__AVX2, n_div_8) {
26792 TEST_REQUIRES_X86_AVX2;
26793 for (uint32_t n = 16; n <= 24; n += 8) {
26794 for (size_t k = 1; k <= 40; k += 9) {
26795 GemmMicrokernelTester()
26796 .mr(3)
26797 .nr(8)
26798 .kr(8)
26799 .sr(1)
26800 .m(3)
26801 .n(8)
26802 .k(k)
26803 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
26804 }
26805 }
26806 }
26807
26808 TEST(QC8_GEMM_MINMAX_FP32_3X8C8__AVX2, n_div_8_strided_cn) {
26809 TEST_REQUIRES_X86_AVX2;
26810 for (uint32_t n = 16; n <= 24; n += 8) {
26811 for (size_t k = 1; k <= 40; k += 9) {
26812 GemmMicrokernelTester()
26813 .mr(3)
26814 .nr(8)
26815 .kr(8)
26816 .sr(1)
26817 .m(3)
26818 .n(n)
26819 .k(k)
26820 .cn_stride(11)
26821 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
26822 }
26823 }
26824 }
26825
26826 TEST(QC8_GEMM_MINMAX_FP32_3X8C8__AVX2, n_div_8_strided_a) {
26827 TEST_REQUIRES_X86_AVX2;
26828 for (uint32_t n = 16; n <= 24; n += 8) {
26829 for (size_t k = 1; k <= 40; k += 9) {
26830 GemmMicrokernelTester()
26831 .mr(3)
26832 .nr(8)
26833 .kr(8)
26834 .sr(1)
26835 .m(3)
26836 .n(n)
26837 .k(k)
26838 .a_stride(43)
26839 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
26840 }
26841 }
26842 }
26843
26844 TEST(QC8_GEMM_MINMAX_FP32_3X8C8__AVX2, n_div_8_subtile) {
26845 TEST_REQUIRES_X86_AVX2;
26846 for (uint32_t n = 16; n <= 24; n += 8) {
26847 for (size_t k = 1; k <= 40; k += 9) {
26848 for (uint32_t m = 1; m <= 3; m++) {
26849 GemmMicrokernelTester()
26850 .mr(3)
26851 .nr(8)
26852 .kr(8)
26853 .sr(1)
26854 .m(m)
26855 .n(n)
26856 .k(k)
26857 .iterations(1)
26858 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
26859 }
26860 }
26861 }
26862 }
26863
26864 TEST(QC8_GEMM_MINMAX_FP32_3X8C8__AVX2, strided_cm_subtile) {
26865 TEST_REQUIRES_X86_AVX2;
26866 for (size_t k = 1; k <= 40; k += 9) {
26867 for (uint32_t m = 1; m <= 3; m++) {
26868 for (uint32_t n = 1; n <= 8; n++) {
26869 GemmMicrokernelTester()
26870 .mr(3)
26871 .nr(8)
26872 .kr(8)
26873 .sr(1)
26874 .m(m)
26875 .n(n)
26876 .k(k)
26877 .cm_stride(11)
26878 .iterations(1)
26879 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
26880 }
26881 }
26882 }
26883 }
26884
26885 TEST(QC8_GEMM_MINMAX_FP32_3X8C8__AVX2, qmin) {
26886 TEST_REQUIRES_X86_AVX2;
26887 GemmMicrokernelTester()
26888 .mr(3)
26889 .nr(8)
26890 .kr(8)
26891 .sr(1)
26892 .m(3)
26893 .n(8)
26894 .k(8)
26895 .qmin(128)
26896 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
26897 }
26898
26899 TEST(QC8_GEMM_MINMAX_FP32_3X8C8__AVX2, qmax) {
26900 TEST_REQUIRES_X86_AVX2;
26901 GemmMicrokernelTester()
26902 .mr(3)
26903 .nr(8)
26904 .kr(8)
26905 .sr(1)
26906 .m(3)
26907 .n(8)
26908 .k(8)
26909 .qmax(128)
26910 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
26911 }
26912
26913 TEST(QC8_GEMM_MINMAX_FP32_3X8C8__AVX2, strided_cm) {
26914 TEST_REQUIRES_X86_AVX2;
26915 GemmMicrokernelTester()
26916 .mr(3)
26917 .nr(8)
26918 .kr(8)
26919 .sr(1)
26920 .m(3)
26921 .n(8)
26922 .k(8)
26923 .cm_stride(11)
26924 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
26925 }
26926#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
26927
26928
26929#if XNN_ARCH_X86 || XNN_ARCH_X86_64
26930 TEST(QC8_GEMM_XW_MINMAX_FP32_1X8C8__AVX2, k_eq_8) {
26931 TEST_REQUIRES_X86_AVX2;
26932 GemmMicrokernelTester()
26933 .extended_weights(true)
26934 .mr(1)
26935 .nr(8)
26936 .kr(8)
26937 .sr(1)
26938 .m(1)
26939 .n(8)
26940 .k(8)
26941 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
26942 }
26943
26944 TEST(QC8_GEMM_XW_MINMAX_FP32_1X8C8__AVX2, strided_cn) {
26945 TEST_REQUIRES_X86_AVX2;
26946 GemmMicrokernelTester()
26947 .extended_weights(true)
26948 .mr(1)
26949 .nr(8)
26950 .kr(8)
26951 .sr(1)
26952 .m(1)
26953 .n(8)
26954 .k(8)
26955 .cn_stride(11)
26956 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
26957 }
26958
26959 TEST(QC8_GEMM_XW_MINMAX_FP32_1X8C8__AVX2, k_eq_8_strided_a) {
26960 TEST_REQUIRES_X86_AVX2;
26961 GemmMicrokernelTester()
26962 .extended_weights(true)
26963 .mr(1)
26964 .nr(8)
26965 .kr(8)
26966 .sr(1)
26967 .m(1)
26968 .n(8)
26969 .k(8)
26970 .a_stride(11)
26971 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
26972 }
26973
26974 TEST(QC8_GEMM_XW_MINMAX_FP32_1X8C8__AVX2, k_eq_8_subtile) {
26975 TEST_REQUIRES_X86_AVX2;
26976 for (uint32_t m = 1; m <= 1; m++) {
26977 for (uint32_t n = 1; n <= 8; n++) {
26978 GemmMicrokernelTester()
26979 .extended_weights(true)
26980 .mr(1)
26981 .nr(8)
26982 .kr(8)
26983 .sr(1)
26984 .m(m)
26985 .n(n)
26986 .k(8)
26987 .iterations(1)
26988 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
26989 }
26990 }
26991 }
26992
26993 TEST(QC8_GEMM_XW_MINMAX_FP32_1X8C8__AVX2, k_eq_8_subtile_m) {
26994 TEST_REQUIRES_X86_AVX2;
26995 for (uint32_t m = 1; m <= 1; m++) {
26996 GemmMicrokernelTester()
26997 .extended_weights(true)
26998 .mr(1)
26999 .nr(8)
27000 .kr(8)
27001 .sr(1)
27002 .m(m)
27003 .n(8)
27004 .k(8)
27005 .iterations(1)
27006 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
27007 }
27008 }
27009
27010 TEST(QC8_GEMM_XW_MINMAX_FP32_1X8C8__AVX2, k_eq_8_subtile_n) {
27011 TEST_REQUIRES_X86_AVX2;
27012 for (uint32_t n = 1; n <= 8; n++) {
27013 GemmMicrokernelTester()
27014 .extended_weights(true)
27015 .mr(1)
27016 .nr(8)
27017 .kr(8)
27018 .sr(1)
27019 .m(1)
27020 .n(n)
27021 .k(8)
27022 .iterations(1)
27023 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
27024 }
27025 }
27026
27027 TEST(QC8_GEMM_XW_MINMAX_FP32_1X8C8__AVX2, k_lt_8) {
27028 TEST_REQUIRES_X86_AVX2;
27029 for (size_t k = 1; k < 8; k++) {
27030 GemmMicrokernelTester()
27031 .extended_weights(true)
27032 .mr(1)
27033 .nr(8)
27034 .kr(8)
27035 .sr(1)
27036 .m(1)
27037 .n(8)
27038 .k(k)
27039 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
27040 }
27041 }
27042
27043 TEST(QC8_GEMM_XW_MINMAX_FP32_1X8C8__AVX2, k_lt_8_strided_a) {
27044 TEST_REQUIRES_X86_AVX2;
27045 for (size_t k = 1; k < 8; k++) {
27046 GemmMicrokernelTester()
27047 .extended_weights(true)
27048 .mr(1)
27049 .nr(8)
27050 .kr(8)
27051 .sr(1)
27052 .m(1)
27053 .n(8)
27054 .k(k)
27055 .a_stride(11)
27056 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
27057 }
27058 }
27059
27060 TEST(QC8_GEMM_XW_MINMAX_FP32_1X8C8__AVX2, k_lt_8_subtile) {
27061 TEST_REQUIRES_X86_AVX2;
27062 for (size_t k = 1; k < 8; k++) {
27063 for (uint32_t m = 1; m <= 1; m++) {
27064 for (uint32_t n = 1; n <= 8; n++) {
27065 GemmMicrokernelTester()
27066 .extended_weights(true)
27067 .mr(1)
27068 .nr(8)
27069 .kr(8)
27070 .sr(1)
27071 .m(m)
27072 .n(n)
27073 .k(k)
27074 .iterations(1)
27075 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
27076 }
27077 }
27078 }
27079 }
27080
27081 TEST(QC8_GEMM_XW_MINMAX_FP32_1X8C8__AVX2, k_gt_8) {
27082 TEST_REQUIRES_X86_AVX2;
27083 for (size_t k = 9; k < 16; k++) {
27084 GemmMicrokernelTester()
27085 .extended_weights(true)
27086 .mr(1)
27087 .nr(8)
27088 .kr(8)
27089 .sr(1)
27090 .m(1)
27091 .n(8)
27092 .k(k)
27093 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
27094 }
27095 }
27096
27097 TEST(QC8_GEMM_XW_MINMAX_FP32_1X8C8__AVX2, k_gt_8_strided_a) {
27098 TEST_REQUIRES_X86_AVX2;
27099 for (size_t k = 9; k < 16; k++) {
27100 GemmMicrokernelTester()
27101 .extended_weights(true)
27102 .mr(1)
27103 .nr(8)
27104 .kr(8)
27105 .sr(1)
27106 .m(1)
27107 .n(8)
27108 .k(k)
27109 .a_stride(19)
27110 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
27111 }
27112 }
27113
27114 TEST(QC8_GEMM_XW_MINMAX_FP32_1X8C8__AVX2, k_gt_8_subtile) {
27115 TEST_REQUIRES_X86_AVX2;
27116 for (size_t k = 9; k < 16; k++) {
27117 for (uint32_t m = 1; m <= 1; m++) {
27118 for (uint32_t n = 1; n <= 8; n++) {
27119 GemmMicrokernelTester()
27120 .extended_weights(true)
27121 .mr(1)
27122 .nr(8)
27123 .kr(8)
27124 .sr(1)
27125 .m(m)
27126 .n(n)
27127 .k(k)
27128 .iterations(1)
27129 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
27130 }
27131 }
27132 }
27133 }
27134
27135 TEST(QC8_GEMM_XW_MINMAX_FP32_1X8C8__AVX2, k_div_8) {
27136 TEST_REQUIRES_X86_AVX2;
27137 for (size_t k = 16; k <= 80; k += 8) {
27138 GemmMicrokernelTester()
27139 .extended_weights(true)
27140 .mr(1)
27141 .nr(8)
27142 .kr(8)
27143 .sr(1)
27144 .m(1)
27145 .n(8)
27146 .k(k)
27147 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
27148 }
27149 }
27150
27151 TEST(QC8_GEMM_XW_MINMAX_FP32_1X8C8__AVX2, k_div_8_strided_a) {
27152 TEST_REQUIRES_X86_AVX2;
27153 for (size_t k = 16; k <= 80; k += 8) {
27154 GemmMicrokernelTester()
27155 .extended_weights(true)
27156 .mr(1)
27157 .nr(8)
27158 .kr(8)
27159 .sr(1)
27160 .m(1)
27161 .n(8)
27162 .k(k)
27163 .a_stride(83)
27164 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
27165 }
27166 }
27167
27168 TEST(QC8_GEMM_XW_MINMAX_FP32_1X8C8__AVX2, k_div_8_subtile) {
27169 TEST_REQUIRES_X86_AVX2;
27170 for (size_t k = 16; k <= 80; k += 8) {
27171 for (uint32_t m = 1; m <= 1; m++) {
27172 for (uint32_t n = 1; n <= 8; n++) {
27173 GemmMicrokernelTester()
27174 .extended_weights(true)
27175 .mr(1)
27176 .nr(8)
27177 .kr(8)
27178 .sr(1)
27179 .m(m)
27180 .n(n)
27181 .k(k)
27182 .iterations(1)
27183 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
27184 }
27185 }
27186 }
27187 }
27188
27189 TEST(QC8_GEMM_XW_MINMAX_FP32_1X8C8__AVX2, n_gt_8) {
27190 TEST_REQUIRES_X86_AVX2;
27191 for (uint32_t n = 9; n < 16; n++) {
27192 for (size_t k = 1; k <= 40; k += 9) {
27193 GemmMicrokernelTester()
27194 .extended_weights(true)
27195 .mr(1)
27196 .nr(8)
27197 .kr(8)
27198 .sr(1)
27199 .m(1)
27200 .n(8)
27201 .k(k)
27202 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
27203 }
27204 }
27205 }
27206
27207 TEST(QC8_GEMM_XW_MINMAX_FP32_1X8C8__AVX2, n_gt_8_strided_cn) {
27208 TEST_REQUIRES_X86_AVX2;
27209 for (uint32_t n = 9; n < 16; n++) {
27210 for (size_t k = 1; k <= 40; k += 9) {
27211 GemmMicrokernelTester()
27212 .extended_weights(true)
27213 .mr(1)
27214 .nr(8)
27215 .kr(8)
27216 .sr(1)
27217 .m(1)
27218 .n(8)
27219 .k(k)
27220 .cn_stride(11)
27221 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
27222 }
27223 }
27224 }
27225
27226 TEST(QC8_GEMM_XW_MINMAX_FP32_1X8C8__AVX2, n_gt_8_strided_a) {
27227 TEST_REQUIRES_X86_AVX2;
27228 for (uint32_t n = 9; n < 16; n++) {
27229 for (size_t k = 1; k <= 40; k += 9) {
27230 GemmMicrokernelTester()
27231 .extended_weights(true)
27232 .mr(1)
27233 .nr(8)
27234 .kr(8)
27235 .sr(1)
27236 .m(1)
27237 .n(n)
27238 .k(k)
27239 .a_stride(43)
27240 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
27241 }
27242 }
27243 }
27244
27245 TEST(QC8_GEMM_XW_MINMAX_FP32_1X8C8__AVX2, n_gt_8_subtile) {
27246 TEST_REQUIRES_X86_AVX2;
27247 for (uint32_t n = 9; n < 16; n++) {
27248 for (size_t k = 1; k <= 40; k += 9) {
27249 for (uint32_t m = 1; m <= 1; m++) {
27250 GemmMicrokernelTester()
27251 .extended_weights(true)
27252 .mr(1)
27253 .nr(8)
27254 .kr(8)
27255 .sr(1)
27256 .m(m)
27257 .n(n)
27258 .k(k)
27259 .iterations(1)
27260 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
27261 }
27262 }
27263 }
27264 }
27265
27266 TEST(QC8_GEMM_XW_MINMAX_FP32_1X8C8__AVX2, n_div_8) {
27267 TEST_REQUIRES_X86_AVX2;
27268 for (uint32_t n = 16; n <= 24; n += 8) {
27269 for (size_t k = 1; k <= 40; k += 9) {
27270 GemmMicrokernelTester()
27271 .extended_weights(true)
27272 .mr(1)
27273 .nr(8)
27274 .kr(8)
27275 .sr(1)
27276 .m(1)
27277 .n(8)
27278 .k(k)
27279 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
27280 }
27281 }
27282 }
27283
27284 TEST(QC8_GEMM_XW_MINMAX_FP32_1X8C8__AVX2, n_div_8_strided_cn) {
27285 TEST_REQUIRES_X86_AVX2;
27286 for (uint32_t n = 16; n <= 24; n += 8) {
27287 for (size_t k = 1; k <= 40; k += 9) {
27288 GemmMicrokernelTester()
27289 .extended_weights(true)
27290 .mr(1)
27291 .nr(8)
27292 .kr(8)
27293 .sr(1)
27294 .m(1)
27295 .n(n)
27296 .k(k)
27297 .cn_stride(11)
27298 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
27299 }
27300 }
27301 }
27302
27303 TEST(QC8_GEMM_XW_MINMAX_FP32_1X8C8__AVX2, n_div_8_strided_a) {
27304 TEST_REQUIRES_X86_AVX2;
27305 for (uint32_t n = 16; n <= 24; n += 8) {
27306 for (size_t k = 1; k <= 40; k += 9) {
27307 GemmMicrokernelTester()
27308 .extended_weights(true)
27309 .mr(1)
27310 .nr(8)
27311 .kr(8)
27312 .sr(1)
27313 .m(1)
27314 .n(n)
27315 .k(k)
27316 .a_stride(43)
27317 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
27318 }
27319 }
27320 }
27321
27322 TEST(QC8_GEMM_XW_MINMAX_FP32_1X8C8__AVX2, n_div_8_subtile) {
27323 TEST_REQUIRES_X86_AVX2;
27324 for (uint32_t n = 16; n <= 24; n += 8) {
27325 for (size_t k = 1; k <= 40; k += 9) {
27326 for (uint32_t m = 1; m <= 1; m++) {
27327 GemmMicrokernelTester()
27328 .extended_weights(true)
27329 .mr(1)
27330 .nr(8)
27331 .kr(8)
27332 .sr(1)
27333 .m(m)
27334 .n(n)
27335 .k(k)
27336 .iterations(1)
27337 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
27338 }
27339 }
27340 }
27341 }
27342
27343 TEST(QC8_GEMM_XW_MINMAX_FP32_1X8C8__AVX2, strided_cm_subtile) {
27344 TEST_REQUIRES_X86_AVX2;
27345 for (size_t k = 1; k <= 40; k += 9) {
27346 for (uint32_t m = 1; m <= 1; m++) {
27347 for (uint32_t n = 1; n <= 8; n++) {
27348 GemmMicrokernelTester()
27349 .extended_weights(true)
27350 .mr(1)
27351 .nr(8)
27352 .kr(8)
27353 .sr(1)
27354 .m(m)
27355 .n(n)
27356 .k(k)
27357 .cm_stride(11)
27358 .iterations(1)
27359 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
27360 }
27361 }
27362 }
27363 }
27364
27365 TEST(QC8_GEMM_XW_MINMAX_FP32_1X8C8__AVX2, strided_cm) {
27366 TEST_REQUIRES_X86_AVX2;
27367 GemmMicrokernelTester()
27368 .extended_weights(true)
27369 .mr(1)
27370 .nr(8)
27371 .kr(8)
27372 .sr(1)
27373 .m(1)
27374 .n(8)
27375 .k(8)
27376 .cm_stride(11)
27377 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_1x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
27378 }
27379#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
27380
27381
27382#if XNN_ARCH_X86 || XNN_ARCH_X86_64
27383 TEST(QC8_GEMM_XW_MINMAX_FP32_2X8C8__AVX2, k_eq_8) {
27384 TEST_REQUIRES_X86_AVX2;
27385 GemmMicrokernelTester()
27386 .extended_weights(true)
27387 .mr(2)
27388 .nr(8)
27389 .kr(8)
27390 .sr(1)
27391 .m(2)
27392 .n(8)
27393 .k(8)
27394 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
27395 }
27396
27397 TEST(QC8_GEMM_XW_MINMAX_FP32_2X8C8__AVX2, strided_cn) {
27398 TEST_REQUIRES_X86_AVX2;
27399 GemmMicrokernelTester()
27400 .extended_weights(true)
27401 .mr(2)
27402 .nr(8)
27403 .kr(8)
27404 .sr(1)
27405 .m(2)
27406 .n(8)
27407 .k(8)
27408 .cn_stride(11)
27409 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
27410 }
27411
27412 TEST(QC8_GEMM_XW_MINMAX_FP32_2X8C8__AVX2, k_eq_8_strided_a) {
27413 TEST_REQUIRES_X86_AVX2;
27414 GemmMicrokernelTester()
27415 .extended_weights(true)
27416 .mr(2)
27417 .nr(8)
27418 .kr(8)
27419 .sr(1)
27420 .m(2)
27421 .n(8)
27422 .k(8)
27423 .a_stride(11)
27424 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
27425 }
27426
27427 TEST(QC8_GEMM_XW_MINMAX_FP32_2X8C8__AVX2, k_eq_8_subtile) {
27428 TEST_REQUIRES_X86_AVX2;
27429 for (uint32_t m = 1; m <= 2; m++) {
27430 for (uint32_t n = 1; n <= 8; n++) {
27431 GemmMicrokernelTester()
27432 .extended_weights(true)
27433 .mr(2)
27434 .nr(8)
27435 .kr(8)
27436 .sr(1)
27437 .m(m)
27438 .n(n)
27439 .k(8)
27440 .iterations(1)
27441 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
27442 }
27443 }
27444 }
27445
27446 TEST(QC8_GEMM_XW_MINMAX_FP32_2X8C8__AVX2, k_eq_8_subtile_m) {
27447 TEST_REQUIRES_X86_AVX2;
27448 for (uint32_t m = 1; m <= 2; m++) {
27449 GemmMicrokernelTester()
27450 .extended_weights(true)
27451 .mr(2)
27452 .nr(8)
27453 .kr(8)
27454 .sr(1)
27455 .m(m)
27456 .n(8)
27457 .k(8)
27458 .iterations(1)
27459 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
27460 }
27461 }
27462
27463 TEST(QC8_GEMM_XW_MINMAX_FP32_2X8C8__AVX2, k_eq_8_subtile_n) {
27464 TEST_REQUIRES_X86_AVX2;
27465 for (uint32_t n = 1; n <= 8; n++) {
27466 GemmMicrokernelTester()
27467 .extended_weights(true)
27468 .mr(2)
27469 .nr(8)
27470 .kr(8)
27471 .sr(1)
27472 .m(2)
27473 .n(n)
27474 .k(8)
27475 .iterations(1)
27476 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
27477 }
27478 }
27479
27480 TEST(QC8_GEMM_XW_MINMAX_FP32_2X8C8__AVX2, k_lt_8) {
27481 TEST_REQUIRES_X86_AVX2;
27482 for (size_t k = 1; k < 8; k++) {
27483 GemmMicrokernelTester()
27484 .extended_weights(true)
27485 .mr(2)
27486 .nr(8)
27487 .kr(8)
27488 .sr(1)
27489 .m(2)
27490 .n(8)
27491 .k(k)
27492 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
27493 }
27494 }
27495
27496 TEST(QC8_GEMM_XW_MINMAX_FP32_2X8C8__AVX2, k_lt_8_strided_a) {
27497 TEST_REQUIRES_X86_AVX2;
27498 for (size_t k = 1; k < 8; k++) {
27499 GemmMicrokernelTester()
27500 .extended_weights(true)
27501 .mr(2)
27502 .nr(8)
27503 .kr(8)
27504 .sr(1)
27505 .m(2)
27506 .n(8)
27507 .k(k)
27508 .a_stride(11)
27509 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
27510 }
27511 }
27512
27513 TEST(QC8_GEMM_XW_MINMAX_FP32_2X8C8__AVX2, k_lt_8_subtile) {
27514 TEST_REQUIRES_X86_AVX2;
27515 for (size_t k = 1; k < 8; k++) {
27516 for (uint32_t m = 1; m <= 2; m++) {
27517 for (uint32_t n = 1; n <= 8; n++) {
27518 GemmMicrokernelTester()
27519 .extended_weights(true)
27520 .mr(2)
27521 .nr(8)
27522 .kr(8)
27523 .sr(1)
27524 .m(m)
27525 .n(n)
27526 .k(k)
27527 .iterations(1)
27528 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
27529 }
27530 }
27531 }
27532 }
27533
27534 TEST(QC8_GEMM_XW_MINMAX_FP32_2X8C8__AVX2, k_gt_8) {
27535 TEST_REQUIRES_X86_AVX2;
27536 for (size_t k = 9; k < 16; k++) {
27537 GemmMicrokernelTester()
27538 .extended_weights(true)
27539 .mr(2)
27540 .nr(8)
27541 .kr(8)
27542 .sr(1)
27543 .m(2)
27544 .n(8)
27545 .k(k)
27546 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
27547 }
27548 }
27549
27550 TEST(QC8_GEMM_XW_MINMAX_FP32_2X8C8__AVX2, k_gt_8_strided_a) {
27551 TEST_REQUIRES_X86_AVX2;
27552 for (size_t k = 9; k < 16; k++) {
27553 GemmMicrokernelTester()
27554 .extended_weights(true)
27555 .mr(2)
27556 .nr(8)
27557 .kr(8)
27558 .sr(1)
27559 .m(2)
27560 .n(8)
27561 .k(k)
27562 .a_stride(19)
27563 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
27564 }
27565 }
27566
27567 TEST(QC8_GEMM_XW_MINMAX_FP32_2X8C8__AVX2, k_gt_8_subtile) {
27568 TEST_REQUIRES_X86_AVX2;
27569 for (size_t k = 9; k < 16; k++) {
27570 for (uint32_t m = 1; m <= 2; m++) {
27571 for (uint32_t n = 1; n <= 8; n++) {
27572 GemmMicrokernelTester()
27573 .extended_weights(true)
27574 .mr(2)
27575 .nr(8)
27576 .kr(8)
27577 .sr(1)
27578 .m(m)
27579 .n(n)
27580 .k(k)
27581 .iterations(1)
27582 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
27583 }
27584 }
27585 }
27586 }
27587
27588 TEST(QC8_GEMM_XW_MINMAX_FP32_2X8C8__AVX2, k_div_8) {
27589 TEST_REQUIRES_X86_AVX2;
27590 for (size_t k = 16; k <= 80; k += 8) {
27591 GemmMicrokernelTester()
27592 .extended_weights(true)
27593 .mr(2)
27594 .nr(8)
27595 .kr(8)
27596 .sr(1)
27597 .m(2)
27598 .n(8)
27599 .k(k)
27600 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
27601 }
27602 }
27603
27604 TEST(QC8_GEMM_XW_MINMAX_FP32_2X8C8__AVX2, k_div_8_strided_a) {
27605 TEST_REQUIRES_X86_AVX2;
27606 for (size_t k = 16; k <= 80; k += 8) {
27607 GemmMicrokernelTester()
27608 .extended_weights(true)
27609 .mr(2)
27610 .nr(8)
27611 .kr(8)
27612 .sr(1)
27613 .m(2)
27614 .n(8)
27615 .k(k)
27616 .a_stride(83)
27617 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
27618 }
27619 }
27620
27621 TEST(QC8_GEMM_XW_MINMAX_FP32_2X8C8__AVX2, k_div_8_subtile) {
27622 TEST_REQUIRES_X86_AVX2;
27623 for (size_t k = 16; k <= 80; k += 8) {
27624 for (uint32_t m = 1; m <= 2; m++) {
27625 for (uint32_t n = 1; n <= 8; n++) {
27626 GemmMicrokernelTester()
27627 .extended_weights(true)
27628 .mr(2)
27629 .nr(8)
27630 .kr(8)
27631 .sr(1)
27632 .m(m)
27633 .n(n)
27634 .k(k)
27635 .iterations(1)
27636 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
27637 }
27638 }
27639 }
27640 }
27641
27642 TEST(QC8_GEMM_XW_MINMAX_FP32_2X8C8__AVX2, n_gt_8) {
27643 TEST_REQUIRES_X86_AVX2;
27644 for (uint32_t n = 9; n < 16; n++) {
27645 for (size_t k = 1; k <= 40; k += 9) {
27646 GemmMicrokernelTester()
27647 .extended_weights(true)
27648 .mr(2)
27649 .nr(8)
27650 .kr(8)
27651 .sr(1)
27652 .m(2)
27653 .n(8)
27654 .k(k)
27655 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
27656 }
27657 }
27658 }
27659
27660 TEST(QC8_GEMM_XW_MINMAX_FP32_2X8C8__AVX2, n_gt_8_strided_cn) {
27661 TEST_REQUIRES_X86_AVX2;
27662 for (uint32_t n = 9; n < 16; n++) {
27663 for (size_t k = 1; k <= 40; k += 9) {
27664 GemmMicrokernelTester()
27665 .extended_weights(true)
27666 .mr(2)
27667 .nr(8)
27668 .kr(8)
27669 .sr(1)
27670 .m(2)
27671 .n(8)
27672 .k(k)
27673 .cn_stride(11)
27674 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
27675 }
27676 }
27677 }
27678
27679 TEST(QC8_GEMM_XW_MINMAX_FP32_2X8C8__AVX2, n_gt_8_strided_a) {
27680 TEST_REQUIRES_X86_AVX2;
27681 for (uint32_t n = 9; n < 16; n++) {
27682 for (size_t k = 1; k <= 40; k += 9) {
27683 GemmMicrokernelTester()
27684 .extended_weights(true)
27685 .mr(2)
27686 .nr(8)
27687 .kr(8)
27688 .sr(1)
27689 .m(2)
27690 .n(n)
27691 .k(k)
27692 .a_stride(43)
27693 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
27694 }
27695 }
27696 }
27697
27698 TEST(QC8_GEMM_XW_MINMAX_FP32_2X8C8__AVX2, n_gt_8_subtile) {
27699 TEST_REQUIRES_X86_AVX2;
27700 for (uint32_t n = 9; n < 16; n++) {
27701 for (size_t k = 1; k <= 40; k += 9) {
27702 for (uint32_t m = 1; m <= 2; m++) {
27703 GemmMicrokernelTester()
27704 .extended_weights(true)
27705 .mr(2)
27706 .nr(8)
27707 .kr(8)
27708 .sr(1)
27709 .m(m)
27710 .n(n)
27711 .k(k)
27712 .iterations(1)
27713 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
27714 }
27715 }
27716 }
27717 }
27718
27719 TEST(QC8_GEMM_XW_MINMAX_FP32_2X8C8__AVX2, n_div_8) {
27720 TEST_REQUIRES_X86_AVX2;
27721 for (uint32_t n = 16; n <= 24; n += 8) {
27722 for (size_t k = 1; k <= 40; k += 9) {
27723 GemmMicrokernelTester()
27724 .extended_weights(true)
27725 .mr(2)
27726 .nr(8)
27727 .kr(8)
27728 .sr(1)
27729 .m(2)
27730 .n(8)
27731 .k(k)
27732 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
27733 }
27734 }
27735 }
27736
27737 TEST(QC8_GEMM_XW_MINMAX_FP32_2X8C8__AVX2, n_div_8_strided_cn) {
27738 TEST_REQUIRES_X86_AVX2;
27739 for (uint32_t n = 16; n <= 24; n += 8) {
27740 for (size_t k = 1; k <= 40; k += 9) {
27741 GemmMicrokernelTester()
27742 .extended_weights(true)
27743 .mr(2)
27744 .nr(8)
27745 .kr(8)
27746 .sr(1)
27747 .m(2)
27748 .n(n)
27749 .k(k)
27750 .cn_stride(11)
27751 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
27752 }
27753 }
27754 }
27755
27756 TEST(QC8_GEMM_XW_MINMAX_FP32_2X8C8__AVX2, n_div_8_strided_a) {
27757 TEST_REQUIRES_X86_AVX2;
27758 for (uint32_t n = 16; n <= 24; n += 8) {
27759 for (size_t k = 1; k <= 40; k += 9) {
27760 GemmMicrokernelTester()
27761 .extended_weights(true)
27762 .mr(2)
27763 .nr(8)
27764 .kr(8)
27765 .sr(1)
27766 .m(2)
27767 .n(n)
27768 .k(k)
27769 .a_stride(43)
27770 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
27771 }
27772 }
27773 }
27774
27775 TEST(QC8_GEMM_XW_MINMAX_FP32_2X8C8__AVX2, n_div_8_subtile) {
27776 TEST_REQUIRES_X86_AVX2;
27777 for (uint32_t n = 16; n <= 24; n += 8) {
27778 for (size_t k = 1; k <= 40; k += 9) {
27779 for (uint32_t m = 1; m <= 2; m++) {
27780 GemmMicrokernelTester()
27781 .extended_weights(true)
27782 .mr(2)
27783 .nr(8)
27784 .kr(8)
27785 .sr(1)
27786 .m(m)
27787 .n(n)
27788 .k(k)
27789 .iterations(1)
27790 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
27791 }
27792 }
27793 }
27794 }
27795
27796 TEST(QC8_GEMM_XW_MINMAX_FP32_2X8C8__AVX2, strided_cm_subtile) {
27797 TEST_REQUIRES_X86_AVX2;
27798 for (size_t k = 1; k <= 40; k += 9) {
27799 for (uint32_t m = 1; m <= 2; m++) {
27800 for (uint32_t n = 1; n <= 8; n++) {
27801 GemmMicrokernelTester()
27802 .extended_weights(true)
27803 .mr(2)
27804 .nr(8)
27805 .kr(8)
27806 .sr(1)
27807 .m(m)
27808 .n(n)
27809 .k(k)
27810 .cm_stride(11)
27811 .iterations(1)
27812 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
27813 }
27814 }
27815 }
27816 }
27817
27818 TEST(QC8_GEMM_XW_MINMAX_FP32_2X8C8__AVX2, strided_cm) {
27819 TEST_REQUIRES_X86_AVX2;
27820 GemmMicrokernelTester()
27821 .extended_weights(true)
27822 .mr(2)
27823 .nr(8)
27824 .kr(8)
27825 .sr(1)
27826 .m(2)
27827 .n(8)
27828 .k(8)
27829 .cm_stride(11)
27830 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
27831 }
27832#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
27833
27834
27835#if XNN_ARCH_X86 || XNN_ARCH_X86_64
27836 TEST(QC8_GEMM_XW_MINMAX_FP32_3X8C8__AVX2, k_eq_8) {
27837 TEST_REQUIRES_X86_AVX2;
27838 GemmMicrokernelTester()
27839 .extended_weights(true)
27840 .mr(3)
27841 .nr(8)
27842 .kr(8)
27843 .sr(1)
27844 .m(3)
27845 .n(8)
27846 .k(8)
27847 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
27848 }
27849
27850 TEST(QC8_GEMM_XW_MINMAX_FP32_3X8C8__AVX2, strided_cn) {
27851 TEST_REQUIRES_X86_AVX2;
27852 GemmMicrokernelTester()
27853 .extended_weights(true)
27854 .mr(3)
27855 .nr(8)
27856 .kr(8)
27857 .sr(1)
27858 .m(3)
27859 .n(8)
27860 .k(8)
27861 .cn_stride(11)
27862 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
27863 }
27864
27865 TEST(QC8_GEMM_XW_MINMAX_FP32_3X8C8__AVX2, k_eq_8_strided_a) {
27866 TEST_REQUIRES_X86_AVX2;
27867 GemmMicrokernelTester()
27868 .extended_weights(true)
27869 .mr(3)
27870 .nr(8)
27871 .kr(8)
27872 .sr(1)
27873 .m(3)
27874 .n(8)
27875 .k(8)
27876 .a_stride(11)
27877 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
27878 }
27879
27880 TEST(QC8_GEMM_XW_MINMAX_FP32_3X8C8__AVX2, k_eq_8_subtile) {
27881 TEST_REQUIRES_X86_AVX2;
27882 for (uint32_t m = 1; m <= 3; m++) {
27883 for (uint32_t n = 1; n <= 8; n++) {
27884 GemmMicrokernelTester()
27885 .extended_weights(true)
27886 .mr(3)
27887 .nr(8)
27888 .kr(8)
27889 .sr(1)
27890 .m(m)
27891 .n(n)
27892 .k(8)
27893 .iterations(1)
27894 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
27895 }
27896 }
27897 }
27898
27899 TEST(QC8_GEMM_XW_MINMAX_FP32_3X8C8__AVX2, k_eq_8_subtile_m) {
27900 TEST_REQUIRES_X86_AVX2;
27901 for (uint32_t m = 1; m <= 3; m++) {
27902 GemmMicrokernelTester()
27903 .extended_weights(true)
27904 .mr(3)
27905 .nr(8)
27906 .kr(8)
27907 .sr(1)
27908 .m(m)
27909 .n(8)
27910 .k(8)
27911 .iterations(1)
27912 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
27913 }
27914 }
27915
27916 TEST(QC8_GEMM_XW_MINMAX_FP32_3X8C8__AVX2, k_eq_8_subtile_n) {
27917 TEST_REQUIRES_X86_AVX2;
27918 for (uint32_t n = 1; n <= 8; n++) {
27919 GemmMicrokernelTester()
27920 .extended_weights(true)
27921 .mr(3)
27922 .nr(8)
27923 .kr(8)
27924 .sr(1)
27925 .m(3)
27926 .n(n)
27927 .k(8)
27928 .iterations(1)
27929 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
27930 }
27931 }
27932
27933 TEST(QC8_GEMM_XW_MINMAX_FP32_3X8C8__AVX2, k_lt_8) {
27934 TEST_REQUIRES_X86_AVX2;
27935 for (size_t k = 1; k < 8; k++) {
27936 GemmMicrokernelTester()
27937 .extended_weights(true)
27938 .mr(3)
27939 .nr(8)
27940 .kr(8)
27941 .sr(1)
27942 .m(3)
27943 .n(8)
27944 .k(k)
27945 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
27946 }
27947 }
27948
27949 TEST(QC8_GEMM_XW_MINMAX_FP32_3X8C8__AVX2, k_lt_8_strided_a) {
27950 TEST_REQUIRES_X86_AVX2;
27951 for (size_t k = 1; k < 8; k++) {
27952 GemmMicrokernelTester()
27953 .extended_weights(true)
27954 .mr(3)
27955 .nr(8)
27956 .kr(8)
27957 .sr(1)
27958 .m(3)
27959 .n(8)
27960 .k(k)
27961 .a_stride(11)
27962 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
27963 }
27964 }
27965
27966 TEST(QC8_GEMM_XW_MINMAX_FP32_3X8C8__AVX2, k_lt_8_subtile) {
27967 TEST_REQUIRES_X86_AVX2;
27968 for (size_t k = 1; k < 8; k++) {
27969 for (uint32_t m = 1; m <= 3; m++) {
27970 for (uint32_t n = 1; n <= 8; n++) {
27971 GemmMicrokernelTester()
27972 .extended_weights(true)
27973 .mr(3)
27974 .nr(8)
27975 .kr(8)
27976 .sr(1)
27977 .m(m)
27978 .n(n)
27979 .k(k)
27980 .iterations(1)
27981 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
27982 }
27983 }
27984 }
27985 }
27986
27987 TEST(QC8_GEMM_XW_MINMAX_FP32_3X8C8__AVX2, k_gt_8) {
27988 TEST_REQUIRES_X86_AVX2;
27989 for (size_t k = 9; k < 16; k++) {
27990 GemmMicrokernelTester()
27991 .extended_weights(true)
27992 .mr(3)
27993 .nr(8)
27994 .kr(8)
27995 .sr(1)
27996 .m(3)
27997 .n(8)
27998 .k(k)
27999 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
28000 }
28001 }
28002
28003 TEST(QC8_GEMM_XW_MINMAX_FP32_3X8C8__AVX2, k_gt_8_strided_a) {
28004 TEST_REQUIRES_X86_AVX2;
28005 for (size_t k = 9; k < 16; k++) {
28006 GemmMicrokernelTester()
28007 .extended_weights(true)
28008 .mr(3)
28009 .nr(8)
28010 .kr(8)
28011 .sr(1)
28012 .m(3)
28013 .n(8)
28014 .k(k)
28015 .a_stride(19)
28016 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
28017 }
28018 }
28019
28020 TEST(QC8_GEMM_XW_MINMAX_FP32_3X8C8__AVX2, k_gt_8_subtile) {
28021 TEST_REQUIRES_X86_AVX2;
28022 for (size_t k = 9; k < 16; k++) {
28023 for (uint32_t m = 1; m <= 3; m++) {
28024 for (uint32_t n = 1; n <= 8; n++) {
28025 GemmMicrokernelTester()
28026 .extended_weights(true)
28027 .mr(3)
28028 .nr(8)
28029 .kr(8)
28030 .sr(1)
28031 .m(m)
28032 .n(n)
28033 .k(k)
28034 .iterations(1)
28035 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
28036 }
28037 }
28038 }
28039 }
28040
28041 TEST(QC8_GEMM_XW_MINMAX_FP32_3X8C8__AVX2, k_div_8) {
28042 TEST_REQUIRES_X86_AVX2;
28043 for (size_t k = 16; k <= 80; k += 8) {
28044 GemmMicrokernelTester()
28045 .extended_weights(true)
28046 .mr(3)
28047 .nr(8)
28048 .kr(8)
28049 .sr(1)
28050 .m(3)
28051 .n(8)
28052 .k(k)
28053 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
28054 }
28055 }
28056
28057 TEST(QC8_GEMM_XW_MINMAX_FP32_3X8C8__AVX2, k_div_8_strided_a) {
28058 TEST_REQUIRES_X86_AVX2;
28059 for (size_t k = 16; k <= 80; k += 8) {
28060 GemmMicrokernelTester()
28061 .extended_weights(true)
28062 .mr(3)
28063 .nr(8)
28064 .kr(8)
28065 .sr(1)
28066 .m(3)
28067 .n(8)
28068 .k(k)
28069 .a_stride(83)
28070 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
28071 }
28072 }
28073
28074 TEST(QC8_GEMM_XW_MINMAX_FP32_3X8C8__AVX2, k_div_8_subtile) {
28075 TEST_REQUIRES_X86_AVX2;
28076 for (size_t k = 16; k <= 80; k += 8) {
28077 for (uint32_t m = 1; m <= 3; m++) {
28078 for (uint32_t n = 1; n <= 8; n++) {
28079 GemmMicrokernelTester()
28080 .extended_weights(true)
28081 .mr(3)
28082 .nr(8)
28083 .kr(8)
28084 .sr(1)
28085 .m(m)
28086 .n(n)
28087 .k(k)
28088 .iterations(1)
28089 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
28090 }
28091 }
28092 }
28093 }
28094
28095 TEST(QC8_GEMM_XW_MINMAX_FP32_3X8C8__AVX2, n_gt_8) {
28096 TEST_REQUIRES_X86_AVX2;
28097 for (uint32_t n = 9; n < 16; n++) {
28098 for (size_t k = 1; k <= 40; k += 9) {
28099 GemmMicrokernelTester()
28100 .extended_weights(true)
28101 .mr(3)
28102 .nr(8)
28103 .kr(8)
28104 .sr(1)
28105 .m(3)
28106 .n(8)
28107 .k(k)
28108 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
28109 }
28110 }
28111 }
28112
28113 TEST(QC8_GEMM_XW_MINMAX_FP32_3X8C8__AVX2, n_gt_8_strided_cn) {
28114 TEST_REQUIRES_X86_AVX2;
28115 for (uint32_t n = 9; n < 16; n++) {
28116 for (size_t k = 1; k <= 40; k += 9) {
28117 GemmMicrokernelTester()
28118 .extended_weights(true)
28119 .mr(3)
28120 .nr(8)
28121 .kr(8)
28122 .sr(1)
28123 .m(3)
28124 .n(8)
28125 .k(k)
28126 .cn_stride(11)
28127 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
28128 }
28129 }
28130 }
28131
28132 TEST(QC8_GEMM_XW_MINMAX_FP32_3X8C8__AVX2, n_gt_8_strided_a) {
28133 TEST_REQUIRES_X86_AVX2;
28134 for (uint32_t n = 9; n < 16; n++) {
28135 for (size_t k = 1; k <= 40; k += 9) {
28136 GemmMicrokernelTester()
28137 .extended_weights(true)
28138 .mr(3)
28139 .nr(8)
28140 .kr(8)
28141 .sr(1)
28142 .m(3)
28143 .n(n)
28144 .k(k)
28145 .a_stride(43)
28146 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
28147 }
28148 }
28149 }
28150
28151 TEST(QC8_GEMM_XW_MINMAX_FP32_3X8C8__AVX2, n_gt_8_subtile) {
28152 TEST_REQUIRES_X86_AVX2;
28153 for (uint32_t n = 9; n < 16; n++) {
28154 for (size_t k = 1; k <= 40; k += 9) {
28155 for (uint32_t m = 1; m <= 3; m++) {
28156 GemmMicrokernelTester()
28157 .extended_weights(true)
28158 .mr(3)
28159 .nr(8)
28160 .kr(8)
28161 .sr(1)
28162 .m(m)
28163 .n(n)
28164 .k(k)
28165 .iterations(1)
28166 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
28167 }
28168 }
28169 }
28170 }
28171
28172 TEST(QC8_GEMM_XW_MINMAX_FP32_3X8C8__AVX2, n_div_8) {
28173 TEST_REQUIRES_X86_AVX2;
28174 for (uint32_t n = 16; n <= 24; n += 8) {
28175 for (size_t k = 1; k <= 40; k += 9) {
28176 GemmMicrokernelTester()
28177 .extended_weights(true)
28178 .mr(3)
28179 .nr(8)
28180 .kr(8)
28181 .sr(1)
28182 .m(3)
28183 .n(8)
28184 .k(k)
28185 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
28186 }
28187 }
28188 }
28189
28190 TEST(QC8_GEMM_XW_MINMAX_FP32_3X8C8__AVX2, n_div_8_strided_cn) {
28191 TEST_REQUIRES_X86_AVX2;
28192 for (uint32_t n = 16; n <= 24; n += 8) {
28193 for (size_t k = 1; k <= 40; k += 9) {
28194 GemmMicrokernelTester()
28195 .extended_weights(true)
28196 .mr(3)
28197 .nr(8)
28198 .kr(8)
28199 .sr(1)
28200 .m(3)
28201 .n(n)
28202 .k(k)
28203 .cn_stride(11)
28204 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
28205 }
28206 }
28207 }
28208
28209 TEST(QC8_GEMM_XW_MINMAX_FP32_3X8C8__AVX2, n_div_8_strided_a) {
28210 TEST_REQUIRES_X86_AVX2;
28211 for (uint32_t n = 16; n <= 24; n += 8) {
28212 for (size_t k = 1; k <= 40; k += 9) {
28213 GemmMicrokernelTester()
28214 .extended_weights(true)
28215 .mr(3)
28216 .nr(8)
28217 .kr(8)
28218 .sr(1)
28219 .m(3)
28220 .n(n)
28221 .k(k)
28222 .a_stride(43)
28223 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
28224 }
28225 }
28226 }
28227
28228 TEST(QC8_GEMM_XW_MINMAX_FP32_3X8C8__AVX2, n_div_8_subtile) {
28229 TEST_REQUIRES_X86_AVX2;
28230 for (uint32_t n = 16; n <= 24; n += 8) {
28231 for (size_t k = 1; k <= 40; k += 9) {
28232 for (uint32_t m = 1; m <= 3; m++) {
28233 GemmMicrokernelTester()
28234 .extended_weights(true)
28235 .mr(3)
28236 .nr(8)
28237 .kr(8)
28238 .sr(1)
28239 .m(m)
28240 .n(n)
28241 .k(k)
28242 .iterations(1)
28243 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
28244 }
28245 }
28246 }
28247 }
28248
28249 TEST(QC8_GEMM_XW_MINMAX_FP32_3X8C8__AVX2, strided_cm_subtile) {
28250 TEST_REQUIRES_X86_AVX2;
28251 for (size_t k = 1; k <= 40; k += 9) {
28252 for (uint32_t m = 1; m <= 3; m++) {
28253 for (uint32_t n = 1; n <= 8; n++) {
28254 GemmMicrokernelTester()
28255 .extended_weights(true)
28256 .mr(3)
28257 .nr(8)
28258 .kr(8)
28259 .sr(1)
28260 .m(m)
28261 .n(n)
28262 .k(k)
28263 .cm_stride(11)
28264 .iterations(1)
28265 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
28266 }
28267 }
28268 }
28269 }
28270
28271 TEST(QC8_GEMM_XW_MINMAX_FP32_3X8C8__AVX2, strided_cm) {
28272 TEST_REQUIRES_X86_AVX2;
28273 GemmMicrokernelTester()
28274 .extended_weights(true)
28275 .mr(3)
28276 .nr(8)
28277 .kr(8)
28278 .sr(1)
28279 .m(3)
28280 .n(8)
28281 .k(8)
28282 .cm_stride(11)
28283 .Test(xnn_qc8_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2, xnn_init_qs8_minmax_avx2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
28284 }
28285#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
Marat Dukhanc3e3f1c2021-06-03 09:56:16 -070028286
28287
28288#if XNN_ARCH_X86 || XNN_ARCH_X86_64
28289 TEST(QC8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, k_eq_8) {
28290 TEST_REQUIRES_X86_AVX512SKX;
28291 GemmMicrokernelTester()
28292 .mr(1)
28293 .nr(16)
28294 .kr(8)
28295 .sr(1)
28296 .m(1)
28297 .n(16)
28298 .k(8)
28299 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
28300 }
28301
28302 TEST(QC8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, strided_cn) {
28303 TEST_REQUIRES_X86_AVX512SKX;
28304 GemmMicrokernelTester()
28305 .mr(1)
28306 .nr(16)
28307 .kr(8)
28308 .sr(1)
28309 .m(1)
28310 .n(16)
28311 .k(8)
28312 .cn_stride(19)
28313 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
28314 }
28315
28316 TEST(QC8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, k_eq_8_strided_a) {
28317 TEST_REQUIRES_X86_AVX512SKX;
28318 GemmMicrokernelTester()
28319 .mr(1)
28320 .nr(16)
28321 .kr(8)
28322 .sr(1)
28323 .m(1)
28324 .n(16)
28325 .k(8)
28326 .a_stride(11)
28327 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
28328 }
28329
28330 TEST(QC8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, k_eq_8_subtile) {
28331 TEST_REQUIRES_X86_AVX512SKX;
28332 for (uint32_t m = 1; m <= 1; m++) {
28333 for (uint32_t n = 1; n <= 16; n++) {
28334 GemmMicrokernelTester()
28335 .mr(1)
28336 .nr(16)
28337 .kr(8)
28338 .sr(1)
28339 .m(m)
28340 .n(n)
28341 .k(8)
28342 .iterations(1)
28343 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
28344 }
28345 }
28346 }
28347
28348 TEST(QC8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, k_eq_8_subtile_m) {
28349 TEST_REQUIRES_X86_AVX512SKX;
28350 for (uint32_t m = 1; m <= 1; m++) {
28351 GemmMicrokernelTester()
28352 .mr(1)
28353 .nr(16)
28354 .kr(8)
28355 .sr(1)
28356 .m(m)
28357 .n(16)
28358 .k(8)
28359 .iterations(1)
28360 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
28361 }
28362 }
28363
28364 TEST(QC8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, k_eq_8_subtile_n) {
28365 TEST_REQUIRES_X86_AVX512SKX;
28366 for (uint32_t n = 1; n <= 16; n++) {
28367 GemmMicrokernelTester()
28368 .mr(1)
28369 .nr(16)
28370 .kr(8)
28371 .sr(1)
28372 .m(1)
28373 .n(n)
28374 .k(8)
28375 .iterations(1)
28376 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
28377 }
28378 }
28379
28380 TEST(QC8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, k_lt_8) {
28381 TEST_REQUIRES_X86_AVX512SKX;
28382 for (size_t k = 1; k < 8; k++) {
28383 GemmMicrokernelTester()
28384 .mr(1)
28385 .nr(16)
28386 .kr(8)
28387 .sr(1)
28388 .m(1)
28389 .n(16)
28390 .k(k)
28391 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
28392 }
28393 }
28394
28395 TEST(QC8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, k_lt_8_strided_a) {
28396 TEST_REQUIRES_X86_AVX512SKX;
28397 for (size_t k = 1; k < 8; k++) {
28398 GemmMicrokernelTester()
28399 .mr(1)
28400 .nr(16)
28401 .kr(8)
28402 .sr(1)
28403 .m(1)
28404 .n(16)
28405 .k(k)
28406 .a_stride(11)
28407 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
28408 }
28409 }
28410
28411 TEST(QC8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, k_lt_8_subtile) {
28412 TEST_REQUIRES_X86_AVX512SKX;
28413 for (size_t k = 1; k < 8; k++) {
28414 for (uint32_t m = 1; m <= 1; m++) {
28415 for (uint32_t n = 1; n <= 16; n++) {
28416 GemmMicrokernelTester()
28417 .mr(1)
28418 .nr(16)
28419 .kr(8)
28420 .sr(1)
28421 .m(m)
28422 .n(n)
28423 .k(k)
28424 .iterations(1)
28425 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
28426 }
28427 }
28428 }
28429 }
28430
28431 TEST(QC8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, k_gt_8) {
28432 TEST_REQUIRES_X86_AVX512SKX;
28433 for (size_t k = 9; k < 16; k++) {
28434 GemmMicrokernelTester()
28435 .mr(1)
28436 .nr(16)
28437 .kr(8)
28438 .sr(1)
28439 .m(1)
28440 .n(16)
28441 .k(k)
28442 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
28443 }
28444 }
28445
28446 TEST(QC8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, k_gt_8_strided_a) {
28447 TEST_REQUIRES_X86_AVX512SKX;
28448 for (size_t k = 9; k < 16; k++) {
28449 GemmMicrokernelTester()
28450 .mr(1)
28451 .nr(16)
28452 .kr(8)
28453 .sr(1)
28454 .m(1)
28455 .n(16)
28456 .k(k)
28457 .a_stride(19)
28458 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
28459 }
28460 }
28461
28462 TEST(QC8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, k_gt_8_subtile) {
28463 TEST_REQUIRES_X86_AVX512SKX;
28464 for (size_t k = 9; k < 16; k++) {
28465 for (uint32_t m = 1; m <= 1; m++) {
28466 for (uint32_t n = 1; n <= 16; n++) {
28467 GemmMicrokernelTester()
28468 .mr(1)
28469 .nr(16)
28470 .kr(8)
28471 .sr(1)
28472 .m(m)
28473 .n(n)
28474 .k(k)
28475 .iterations(1)
28476 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
28477 }
28478 }
28479 }
28480 }
28481
28482 TEST(QC8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, k_div_8) {
28483 TEST_REQUIRES_X86_AVX512SKX;
28484 for (size_t k = 16; k <= 80; k += 8) {
28485 GemmMicrokernelTester()
28486 .mr(1)
28487 .nr(16)
28488 .kr(8)
28489 .sr(1)
28490 .m(1)
28491 .n(16)
28492 .k(k)
28493 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
28494 }
28495 }
28496
28497 TEST(QC8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, k_div_8_strided_a) {
28498 TEST_REQUIRES_X86_AVX512SKX;
28499 for (size_t k = 16; k <= 80; k += 8) {
28500 GemmMicrokernelTester()
28501 .mr(1)
28502 .nr(16)
28503 .kr(8)
28504 .sr(1)
28505 .m(1)
28506 .n(16)
28507 .k(k)
28508 .a_stride(83)
28509 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
28510 }
28511 }
28512
28513 TEST(QC8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, k_div_8_subtile) {
28514 TEST_REQUIRES_X86_AVX512SKX;
28515 for (size_t k = 16; k <= 80; k += 8) {
28516 for (uint32_t m = 1; m <= 1; m++) {
28517 for (uint32_t n = 1; n <= 16; n++) {
28518 GemmMicrokernelTester()
28519 .mr(1)
28520 .nr(16)
28521 .kr(8)
28522 .sr(1)
28523 .m(m)
28524 .n(n)
28525 .k(k)
28526 .iterations(1)
28527 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
28528 }
28529 }
28530 }
28531 }
28532
28533 TEST(QC8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, n_gt_16) {
28534 TEST_REQUIRES_X86_AVX512SKX;
28535 for (uint32_t n = 17; n < 32; n++) {
28536 for (size_t k = 1; k <= 40; k += 9) {
28537 GemmMicrokernelTester()
28538 .mr(1)
28539 .nr(16)
28540 .kr(8)
28541 .sr(1)
28542 .m(1)
28543 .n(16)
28544 .k(k)
28545 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
28546 }
28547 }
28548 }
28549
28550 TEST(QC8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, n_gt_16_strided_cn) {
28551 TEST_REQUIRES_X86_AVX512SKX;
28552 for (uint32_t n = 17; n < 32; n++) {
28553 for (size_t k = 1; k <= 40; k += 9) {
28554 GemmMicrokernelTester()
28555 .mr(1)
28556 .nr(16)
28557 .kr(8)
28558 .sr(1)
28559 .m(1)
28560 .n(16)
28561 .k(k)
28562 .cn_stride(19)
28563 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
28564 }
28565 }
28566 }
28567
28568 TEST(QC8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, n_gt_16_strided_a) {
28569 TEST_REQUIRES_X86_AVX512SKX;
28570 for (uint32_t n = 17; n < 32; n++) {
28571 for (size_t k = 1; k <= 40; k += 9) {
28572 GemmMicrokernelTester()
28573 .mr(1)
28574 .nr(16)
28575 .kr(8)
28576 .sr(1)
28577 .m(1)
28578 .n(n)
28579 .k(k)
28580 .a_stride(43)
28581 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
28582 }
28583 }
28584 }
28585
28586 TEST(QC8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, n_gt_16_subtile) {
28587 TEST_REQUIRES_X86_AVX512SKX;
28588 for (uint32_t n = 17; n < 32; n++) {
28589 for (size_t k = 1; k <= 40; k += 9) {
28590 for (uint32_t m = 1; m <= 1; m++) {
28591 GemmMicrokernelTester()
28592 .mr(1)
28593 .nr(16)
28594 .kr(8)
28595 .sr(1)
28596 .m(m)
28597 .n(n)
28598 .k(k)
28599 .iterations(1)
28600 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
28601 }
28602 }
28603 }
28604 }
28605
28606 TEST(QC8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, n_div_16) {
28607 TEST_REQUIRES_X86_AVX512SKX;
28608 for (uint32_t n = 32; n <= 48; n += 16) {
28609 for (size_t k = 1; k <= 40; k += 9) {
28610 GemmMicrokernelTester()
28611 .mr(1)
28612 .nr(16)
28613 .kr(8)
28614 .sr(1)
28615 .m(1)
28616 .n(16)
28617 .k(k)
28618 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
28619 }
28620 }
28621 }
28622
28623 TEST(QC8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, n_div_16_strided_cn) {
28624 TEST_REQUIRES_X86_AVX512SKX;
28625 for (uint32_t n = 32; n <= 48; n += 16) {
28626 for (size_t k = 1; k <= 40; k += 9) {
28627 GemmMicrokernelTester()
28628 .mr(1)
28629 .nr(16)
28630 .kr(8)
28631 .sr(1)
28632 .m(1)
28633 .n(n)
28634 .k(k)
28635 .cn_stride(19)
28636 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
28637 }
28638 }
28639 }
28640
28641 TEST(QC8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, n_div_16_strided_a) {
28642 TEST_REQUIRES_X86_AVX512SKX;
28643 for (uint32_t n = 32; n <= 48; n += 16) {
28644 for (size_t k = 1; k <= 40; k += 9) {
28645 GemmMicrokernelTester()
28646 .mr(1)
28647 .nr(16)
28648 .kr(8)
28649 .sr(1)
28650 .m(1)
28651 .n(n)
28652 .k(k)
28653 .a_stride(43)
28654 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
28655 }
28656 }
28657 }
28658
28659 TEST(QC8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, n_div_16_subtile) {
28660 TEST_REQUIRES_X86_AVX512SKX;
28661 for (uint32_t n = 32; n <= 48; n += 16) {
28662 for (size_t k = 1; k <= 40; k += 9) {
28663 for (uint32_t m = 1; m <= 1; m++) {
28664 GemmMicrokernelTester()
28665 .mr(1)
28666 .nr(16)
28667 .kr(8)
28668 .sr(1)
28669 .m(m)
28670 .n(n)
28671 .k(k)
28672 .iterations(1)
28673 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
28674 }
28675 }
28676 }
28677 }
28678
28679 TEST(QC8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, strided_cm_subtile) {
28680 TEST_REQUIRES_X86_AVX512SKX;
28681 for (size_t k = 1; k <= 40; k += 9) {
28682 for (uint32_t m = 1; m <= 1; m++) {
28683 for (uint32_t n = 1; n <= 16; n++) {
28684 GemmMicrokernelTester()
28685 .mr(1)
28686 .nr(16)
28687 .kr(8)
28688 .sr(1)
28689 .m(m)
28690 .n(n)
28691 .k(k)
28692 .cm_stride(19)
28693 .iterations(1)
28694 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
28695 }
28696 }
28697 }
28698 }
28699
28700 TEST(QC8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, qmin) {
28701 TEST_REQUIRES_X86_AVX512SKX;
28702 GemmMicrokernelTester()
28703 .mr(1)
28704 .nr(16)
28705 .kr(8)
28706 .sr(1)
28707 .m(1)
28708 .n(16)
28709 .k(8)
28710 .qmin(128)
28711 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
28712 }
28713
28714 TEST(QC8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, qmax) {
28715 TEST_REQUIRES_X86_AVX512SKX;
28716 GemmMicrokernelTester()
28717 .mr(1)
28718 .nr(16)
28719 .kr(8)
28720 .sr(1)
28721 .m(1)
28722 .n(16)
28723 .k(8)
28724 .qmax(128)
28725 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
28726 }
28727
28728 TEST(QC8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, strided_cm) {
28729 TEST_REQUIRES_X86_AVX512SKX;
28730 GemmMicrokernelTester()
28731 .mr(1)
28732 .nr(16)
28733 .kr(8)
28734 .sr(1)
28735 .m(1)
28736 .n(16)
28737 .k(8)
28738 .cm_stride(19)
28739 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
28740 }
28741#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
28742
28743
28744#if XNN_ARCH_X86 || XNN_ARCH_X86_64
28745 TEST(QC8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, k_eq_8) {
28746 TEST_REQUIRES_X86_AVX512SKX;
28747 GemmMicrokernelTester()
28748 .mr(2)
28749 .nr(16)
28750 .kr(8)
28751 .sr(1)
28752 .m(2)
28753 .n(16)
28754 .k(8)
28755 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
28756 }
28757
28758 TEST(QC8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, strided_cn) {
28759 TEST_REQUIRES_X86_AVX512SKX;
28760 GemmMicrokernelTester()
28761 .mr(2)
28762 .nr(16)
28763 .kr(8)
28764 .sr(1)
28765 .m(2)
28766 .n(16)
28767 .k(8)
28768 .cn_stride(19)
28769 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
28770 }
28771
28772 TEST(QC8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, k_eq_8_strided_a) {
28773 TEST_REQUIRES_X86_AVX512SKX;
28774 GemmMicrokernelTester()
28775 .mr(2)
28776 .nr(16)
28777 .kr(8)
28778 .sr(1)
28779 .m(2)
28780 .n(16)
28781 .k(8)
28782 .a_stride(11)
28783 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
28784 }
28785
28786 TEST(QC8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, k_eq_8_subtile) {
28787 TEST_REQUIRES_X86_AVX512SKX;
28788 for (uint32_t m = 1; m <= 2; m++) {
28789 for (uint32_t n = 1; n <= 16; n++) {
28790 GemmMicrokernelTester()
28791 .mr(2)
28792 .nr(16)
28793 .kr(8)
28794 .sr(1)
28795 .m(m)
28796 .n(n)
28797 .k(8)
28798 .iterations(1)
28799 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
28800 }
28801 }
28802 }
28803
28804 TEST(QC8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, k_eq_8_subtile_m) {
28805 TEST_REQUIRES_X86_AVX512SKX;
28806 for (uint32_t m = 1; m <= 2; m++) {
28807 GemmMicrokernelTester()
28808 .mr(2)
28809 .nr(16)
28810 .kr(8)
28811 .sr(1)
28812 .m(m)
28813 .n(16)
28814 .k(8)
28815 .iterations(1)
28816 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
28817 }
28818 }
28819
28820 TEST(QC8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, k_eq_8_subtile_n) {
28821 TEST_REQUIRES_X86_AVX512SKX;
28822 for (uint32_t n = 1; n <= 16; n++) {
28823 GemmMicrokernelTester()
28824 .mr(2)
28825 .nr(16)
28826 .kr(8)
28827 .sr(1)
28828 .m(2)
28829 .n(n)
28830 .k(8)
28831 .iterations(1)
28832 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
28833 }
28834 }
28835
28836 TEST(QC8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, k_lt_8) {
28837 TEST_REQUIRES_X86_AVX512SKX;
28838 for (size_t k = 1; k < 8; k++) {
28839 GemmMicrokernelTester()
28840 .mr(2)
28841 .nr(16)
28842 .kr(8)
28843 .sr(1)
28844 .m(2)
28845 .n(16)
28846 .k(k)
28847 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
28848 }
28849 }
28850
28851 TEST(QC8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, k_lt_8_strided_a) {
28852 TEST_REQUIRES_X86_AVX512SKX;
28853 for (size_t k = 1; k < 8; k++) {
28854 GemmMicrokernelTester()
28855 .mr(2)
28856 .nr(16)
28857 .kr(8)
28858 .sr(1)
28859 .m(2)
28860 .n(16)
28861 .k(k)
28862 .a_stride(11)
28863 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
28864 }
28865 }
28866
28867 TEST(QC8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, k_lt_8_subtile) {
28868 TEST_REQUIRES_X86_AVX512SKX;
28869 for (size_t k = 1; k < 8; k++) {
28870 for (uint32_t m = 1; m <= 2; m++) {
28871 for (uint32_t n = 1; n <= 16; n++) {
28872 GemmMicrokernelTester()
28873 .mr(2)
28874 .nr(16)
28875 .kr(8)
28876 .sr(1)
28877 .m(m)
28878 .n(n)
28879 .k(k)
28880 .iterations(1)
28881 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
28882 }
28883 }
28884 }
28885 }
28886
28887 TEST(QC8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, k_gt_8) {
28888 TEST_REQUIRES_X86_AVX512SKX;
28889 for (size_t k = 9; k < 16; k++) {
28890 GemmMicrokernelTester()
28891 .mr(2)
28892 .nr(16)
28893 .kr(8)
28894 .sr(1)
28895 .m(2)
28896 .n(16)
28897 .k(k)
28898 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
28899 }
28900 }
28901
28902 TEST(QC8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, k_gt_8_strided_a) {
28903 TEST_REQUIRES_X86_AVX512SKX;
28904 for (size_t k = 9; k < 16; k++) {
28905 GemmMicrokernelTester()
28906 .mr(2)
28907 .nr(16)
28908 .kr(8)
28909 .sr(1)
28910 .m(2)
28911 .n(16)
28912 .k(k)
28913 .a_stride(19)
28914 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
28915 }
28916 }
28917
28918 TEST(QC8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, k_gt_8_subtile) {
28919 TEST_REQUIRES_X86_AVX512SKX;
28920 for (size_t k = 9; k < 16; k++) {
28921 for (uint32_t m = 1; m <= 2; m++) {
28922 for (uint32_t n = 1; n <= 16; n++) {
28923 GemmMicrokernelTester()
28924 .mr(2)
28925 .nr(16)
28926 .kr(8)
28927 .sr(1)
28928 .m(m)
28929 .n(n)
28930 .k(k)
28931 .iterations(1)
28932 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
28933 }
28934 }
28935 }
28936 }
28937
28938 TEST(QC8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, k_div_8) {
28939 TEST_REQUIRES_X86_AVX512SKX;
28940 for (size_t k = 16; k <= 80; k += 8) {
28941 GemmMicrokernelTester()
28942 .mr(2)
28943 .nr(16)
28944 .kr(8)
28945 .sr(1)
28946 .m(2)
28947 .n(16)
28948 .k(k)
28949 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
28950 }
28951 }
28952
28953 TEST(QC8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, k_div_8_strided_a) {
28954 TEST_REQUIRES_X86_AVX512SKX;
28955 for (size_t k = 16; k <= 80; k += 8) {
28956 GemmMicrokernelTester()
28957 .mr(2)
28958 .nr(16)
28959 .kr(8)
28960 .sr(1)
28961 .m(2)
28962 .n(16)
28963 .k(k)
28964 .a_stride(83)
28965 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
28966 }
28967 }
28968
28969 TEST(QC8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, k_div_8_subtile) {
28970 TEST_REQUIRES_X86_AVX512SKX;
28971 for (size_t k = 16; k <= 80; k += 8) {
28972 for (uint32_t m = 1; m <= 2; m++) {
28973 for (uint32_t n = 1; n <= 16; n++) {
28974 GemmMicrokernelTester()
28975 .mr(2)
28976 .nr(16)
28977 .kr(8)
28978 .sr(1)
28979 .m(m)
28980 .n(n)
28981 .k(k)
28982 .iterations(1)
28983 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
28984 }
28985 }
28986 }
28987 }
28988
28989 TEST(QC8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, n_gt_16) {
28990 TEST_REQUIRES_X86_AVX512SKX;
28991 for (uint32_t n = 17; n < 32; n++) {
28992 for (size_t k = 1; k <= 40; k += 9) {
28993 GemmMicrokernelTester()
28994 .mr(2)
28995 .nr(16)
28996 .kr(8)
28997 .sr(1)
28998 .m(2)
28999 .n(16)
29000 .k(k)
29001 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
29002 }
29003 }
29004 }
29005
29006 TEST(QC8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, n_gt_16_strided_cn) {
29007 TEST_REQUIRES_X86_AVX512SKX;
29008 for (uint32_t n = 17; n < 32; n++) {
29009 for (size_t k = 1; k <= 40; k += 9) {
29010 GemmMicrokernelTester()
29011 .mr(2)
29012 .nr(16)
29013 .kr(8)
29014 .sr(1)
29015 .m(2)
29016 .n(16)
29017 .k(k)
29018 .cn_stride(19)
29019 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
29020 }
29021 }
29022 }
29023
29024 TEST(QC8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, n_gt_16_strided_a) {
29025 TEST_REQUIRES_X86_AVX512SKX;
29026 for (uint32_t n = 17; n < 32; n++) {
29027 for (size_t k = 1; k <= 40; k += 9) {
29028 GemmMicrokernelTester()
29029 .mr(2)
29030 .nr(16)
29031 .kr(8)
29032 .sr(1)
29033 .m(2)
29034 .n(n)
29035 .k(k)
29036 .a_stride(43)
29037 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
29038 }
29039 }
29040 }
29041
29042 TEST(QC8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, n_gt_16_subtile) {
29043 TEST_REQUIRES_X86_AVX512SKX;
29044 for (uint32_t n = 17; n < 32; n++) {
29045 for (size_t k = 1; k <= 40; k += 9) {
29046 for (uint32_t m = 1; m <= 2; m++) {
29047 GemmMicrokernelTester()
29048 .mr(2)
29049 .nr(16)
29050 .kr(8)
29051 .sr(1)
29052 .m(m)
29053 .n(n)
29054 .k(k)
29055 .iterations(1)
29056 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
29057 }
29058 }
29059 }
29060 }
29061
29062 TEST(QC8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, n_div_16) {
29063 TEST_REQUIRES_X86_AVX512SKX;
29064 for (uint32_t n = 32; n <= 48; n += 16) {
29065 for (size_t k = 1; k <= 40; k += 9) {
29066 GemmMicrokernelTester()
29067 .mr(2)
29068 .nr(16)
29069 .kr(8)
29070 .sr(1)
29071 .m(2)
29072 .n(16)
29073 .k(k)
29074 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
29075 }
29076 }
29077 }
29078
29079 TEST(QC8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, n_div_16_strided_cn) {
29080 TEST_REQUIRES_X86_AVX512SKX;
29081 for (uint32_t n = 32; n <= 48; n += 16) {
29082 for (size_t k = 1; k <= 40; k += 9) {
29083 GemmMicrokernelTester()
29084 .mr(2)
29085 .nr(16)
29086 .kr(8)
29087 .sr(1)
29088 .m(2)
29089 .n(n)
29090 .k(k)
29091 .cn_stride(19)
29092 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
29093 }
29094 }
29095 }
29096
29097 TEST(QC8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, n_div_16_strided_a) {
29098 TEST_REQUIRES_X86_AVX512SKX;
29099 for (uint32_t n = 32; n <= 48; n += 16) {
29100 for (size_t k = 1; k <= 40; k += 9) {
29101 GemmMicrokernelTester()
29102 .mr(2)
29103 .nr(16)
29104 .kr(8)
29105 .sr(1)
29106 .m(2)
29107 .n(n)
29108 .k(k)
29109 .a_stride(43)
29110 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
29111 }
29112 }
29113 }
29114
29115 TEST(QC8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, n_div_16_subtile) {
29116 TEST_REQUIRES_X86_AVX512SKX;
29117 for (uint32_t n = 32; n <= 48; n += 16) {
29118 for (size_t k = 1; k <= 40; k += 9) {
29119 for (uint32_t m = 1; m <= 2; m++) {
29120 GemmMicrokernelTester()
29121 .mr(2)
29122 .nr(16)
29123 .kr(8)
29124 .sr(1)
29125 .m(m)
29126 .n(n)
29127 .k(k)
29128 .iterations(1)
29129 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
29130 }
29131 }
29132 }
29133 }
29134
29135 TEST(QC8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, strided_cm_subtile) {
29136 TEST_REQUIRES_X86_AVX512SKX;
29137 for (size_t k = 1; k <= 40; k += 9) {
29138 for (uint32_t m = 1; m <= 2; m++) {
29139 for (uint32_t n = 1; n <= 16; n++) {
29140 GemmMicrokernelTester()
29141 .mr(2)
29142 .nr(16)
29143 .kr(8)
29144 .sr(1)
29145 .m(m)
29146 .n(n)
29147 .k(k)
29148 .cm_stride(19)
29149 .iterations(1)
29150 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
29151 }
29152 }
29153 }
29154 }
29155
29156 TEST(QC8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, qmin) {
29157 TEST_REQUIRES_X86_AVX512SKX;
29158 GemmMicrokernelTester()
29159 .mr(2)
29160 .nr(16)
29161 .kr(8)
29162 .sr(1)
29163 .m(2)
29164 .n(16)
29165 .k(8)
29166 .qmin(128)
29167 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
29168 }
29169
29170 TEST(QC8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, qmax) {
29171 TEST_REQUIRES_X86_AVX512SKX;
29172 GemmMicrokernelTester()
29173 .mr(2)
29174 .nr(16)
29175 .kr(8)
29176 .sr(1)
29177 .m(2)
29178 .n(16)
29179 .k(8)
29180 .qmax(128)
29181 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
29182 }
29183
29184 TEST(QC8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, strided_cm) {
29185 TEST_REQUIRES_X86_AVX512SKX;
29186 GemmMicrokernelTester()
29187 .mr(2)
29188 .nr(16)
29189 .kr(8)
29190 .sr(1)
29191 .m(2)
29192 .n(16)
29193 .k(8)
29194 .cm_stride(19)
29195 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
29196 }
29197#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
29198
29199
29200#if XNN_ARCH_X86 || XNN_ARCH_X86_64
29201 TEST(QC8_GEMM_MINMAX_FP32_3X16C8__AVX512SKX, k_eq_8) {
29202 TEST_REQUIRES_X86_AVX512SKX;
29203 GemmMicrokernelTester()
29204 .mr(3)
29205 .nr(16)
29206 .kr(8)
29207 .sr(1)
29208 .m(3)
29209 .n(16)
29210 .k(8)
29211 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
29212 }
29213
29214 TEST(QC8_GEMM_MINMAX_FP32_3X16C8__AVX512SKX, strided_cn) {
29215 TEST_REQUIRES_X86_AVX512SKX;
29216 GemmMicrokernelTester()
29217 .mr(3)
29218 .nr(16)
29219 .kr(8)
29220 .sr(1)
29221 .m(3)
29222 .n(16)
29223 .k(8)
29224 .cn_stride(19)
29225 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
29226 }
29227
29228 TEST(QC8_GEMM_MINMAX_FP32_3X16C8__AVX512SKX, k_eq_8_strided_a) {
29229 TEST_REQUIRES_X86_AVX512SKX;
29230 GemmMicrokernelTester()
29231 .mr(3)
29232 .nr(16)
29233 .kr(8)
29234 .sr(1)
29235 .m(3)
29236 .n(16)
29237 .k(8)
29238 .a_stride(11)
29239 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
29240 }
29241
29242 TEST(QC8_GEMM_MINMAX_FP32_3X16C8__AVX512SKX, k_eq_8_subtile) {
29243 TEST_REQUIRES_X86_AVX512SKX;
29244 for (uint32_t m = 1; m <= 3; m++) {
29245 for (uint32_t n = 1; n <= 16; n++) {
29246 GemmMicrokernelTester()
29247 .mr(3)
29248 .nr(16)
29249 .kr(8)
29250 .sr(1)
29251 .m(m)
29252 .n(n)
29253 .k(8)
29254 .iterations(1)
29255 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
29256 }
29257 }
29258 }
29259
29260 TEST(QC8_GEMM_MINMAX_FP32_3X16C8__AVX512SKX, k_eq_8_subtile_m) {
29261 TEST_REQUIRES_X86_AVX512SKX;
29262 for (uint32_t m = 1; m <= 3; m++) {
29263 GemmMicrokernelTester()
29264 .mr(3)
29265 .nr(16)
29266 .kr(8)
29267 .sr(1)
29268 .m(m)
29269 .n(16)
29270 .k(8)
29271 .iterations(1)
29272 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
29273 }
29274 }
29275
29276 TEST(QC8_GEMM_MINMAX_FP32_3X16C8__AVX512SKX, k_eq_8_subtile_n) {
29277 TEST_REQUIRES_X86_AVX512SKX;
29278 for (uint32_t n = 1; n <= 16; n++) {
29279 GemmMicrokernelTester()
29280 .mr(3)
29281 .nr(16)
29282 .kr(8)
29283 .sr(1)
29284 .m(3)
29285 .n(n)
29286 .k(8)
29287 .iterations(1)
29288 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
29289 }
29290 }
29291
29292 TEST(QC8_GEMM_MINMAX_FP32_3X16C8__AVX512SKX, k_lt_8) {
29293 TEST_REQUIRES_X86_AVX512SKX;
29294 for (size_t k = 1; k < 8; k++) {
29295 GemmMicrokernelTester()
29296 .mr(3)
29297 .nr(16)
29298 .kr(8)
29299 .sr(1)
29300 .m(3)
29301 .n(16)
29302 .k(k)
29303 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
29304 }
29305 }
29306
29307 TEST(QC8_GEMM_MINMAX_FP32_3X16C8__AVX512SKX, k_lt_8_strided_a) {
29308 TEST_REQUIRES_X86_AVX512SKX;
29309 for (size_t k = 1; k < 8; k++) {
29310 GemmMicrokernelTester()
29311 .mr(3)
29312 .nr(16)
29313 .kr(8)
29314 .sr(1)
29315 .m(3)
29316 .n(16)
29317 .k(k)
29318 .a_stride(11)
29319 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
29320 }
29321 }
29322
29323 TEST(QC8_GEMM_MINMAX_FP32_3X16C8__AVX512SKX, k_lt_8_subtile) {
29324 TEST_REQUIRES_X86_AVX512SKX;
29325 for (size_t k = 1; k < 8; k++) {
29326 for (uint32_t m = 1; m <= 3; m++) {
29327 for (uint32_t n = 1; n <= 16; n++) {
29328 GemmMicrokernelTester()
29329 .mr(3)
29330 .nr(16)
29331 .kr(8)
29332 .sr(1)
29333 .m(m)
29334 .n(n)
29335 .k(k)
29336 .iterations(1)
29337 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
29338 }
29339 }
29340 }
29341 }
29342
29343 TEST(QC8_GEMM_MINMAX_FP32_3X16C8__AVX512SKX, k_gt_8) {
29344 TEST_REQUIRES_X86_AVX512SKX;
29345 for (size_t k = 9; k < 16; k++) {
29346 GemmMicrokernelTester()
29347 .mr(3)
29348 .nr(16)
29349 .kr(8)
29350 .sr(1)
29351 .m(3)
29352 .n(16)
29353 .k(k)
29354 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
29355 }
29356 }
29357
29358 TEST(QC8_GEMM_MINMAX_FP32_3X16C8__AVX512SKX, k_gt_8_strided_a) {
29359 TEST_REQUIRES_X86_AVX512SKX;
29360 for (size_t k = 9; k < 16; k++) {
29361 GemmMicrokernelTester()
29362 .mr(3)
29363 .nr(16)
29364 .kr(8)
29365 .sr(1)
29366 .m(3)
29367 .n(16)
29368 .k(k)
29369 .a_stride(19)
29370 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
29371 }
29372 }
29373
29374 TEST(QC8_GEMM_MINMAX_FP32_3X16C8__AVX512SKX, k_gt_8_subtile) {
29375 TEST_REQUIRES_X86_AVX512SKX;
29376 for (size_t k = 9; k < 16; k++) {
29377 for (uint32_t m = 1; m <= 3; m++) {
29378 for (uint32_t n = 1; n <= 16; n++) {
29379 GemmMicrokernelTester()
29380 .mr(3)
29381 .nr(16)
29382 .kr(8)
29383 .sr(1)
29384 .m(m)
29385 .n(n)
29386 .k(k)
29387 .iterations(1)
29388 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
29389 }
29390 }
29391 }
29392 }
29393
29394 TEST(QC8_GEMM_MINMAX_FP32_3X16C8__AVX512SKX, k_div_8) {
29395 TEST_REQUIRES_X86_AVX512SKX;
29396 for (size_t k = 16; k <= 80; k += 8) {
29397 GemmMicrokernelTester()
29398 .mr(3)
29399 .nr(16)
29400 .kr(8)
29401 .sr(1)
29402 .m(3)
29403 .n(16)
29404 .k(k)
29405 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
29406 }
29407 }
29408
29409 TEST(QC8_GEMM_MINMAX_FP32_3X16C8__AVX512SKX, k_div_8_strided_a) {
29410 TEST_REQUIRES_X86_AVX512SKX;
29411 for (size_t k = 16; k <= 80; k += 8) {
29412 GemmMicrokernelTester()
29413 .mr(3)
29414 .nr(16)
29415 .kr(8)
29416 .sr(1)
29417 .m(3)
29418 .n(16)
29419 .k(k)
29420 .a_stride(83)
29421 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
29422 }
29423 }
29424
29425 TEST(QC8_GEMM_MINMAX_FP32_3X16C8__AVX512SKX, k_div_8_subtile) {
29426 TEST_REQUIRES_X86_AVX512SKX;
29427 for (size_t k = 16; k <= 80; k += 8) {
29428 for (uint32_t m = 1; m <= 3; m++) {
29429 for (uint32_t n = 1; n <= 16; n++) {
29430 GemmMicrokernelTester()
29431 .mr(3)
29432 .nr(16)
29433 .kr(8)
29434 .sr(1)
29435 .m(m)
29436 .n(n)
29437 .k(k)
29438 .iterations(1)
29439 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
29440 }
29441 }
29442 }
29443 }
29444
29445 TEST(QC8_GEMM_MINMAX_FP32_3X16C8__AVX512SKX, n_gt_16) {
29446 TEST_REQUIRES_X86_AVX512SKX;
29447 for (uint32_t n = 17; n < 32; n++) {
29448 for (size_t k = 1; k <= 40; k += 9) {
29449 GemmMicrokernelTester()
29450 .mr(3)
29451 .nr(16)
29452 .kr(8)
29453 .sr(1)
29454 .m(3)
29455 .n(16)
29456 .k(k)
29457 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
29458 }
29459 }
29460 }
29461
29462 TEST(QC8_GEMM_MINMAX_FP32_3X16C8__AVX512SKX, n_gt_16_strided_cn) {
29463 TEST_REQUIRES_X86_AVX512SKX;
29464 for (uint32_t n = 17; n < 32; n++) {
29465 for (size_t k = 1; k <= 40; k += 9) {
29466 GemmMicrokernelTester()
29467 .mr(3)
29468 .nr(16)
29469 .kr(8)
29470 .sr(1)
29471 .m(3)
29472 .n(16)
29473 .k(k)
29474 .cn_stride(19)
29475 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
29476 }
29477 }
29478 }
29479
29480 TEST(QC8_GEMM_MINMAX_FP32_3X16C8__AVX512SKX, n_gt_16_strided_a) {
29481 TEST_REQUIRES_X86_AVX512SKX;
29482 for (uint32_t n = 17; n < 32; n++) {
29483 for (size_t k = 1; k <= 40; k += 9) {
29484 GemmMicrokernelTester()
29485 .mr(3)
29486 .nr(16)
29487 .kr(8)
29488 .sr(1)
29489 .m(3)
29490 .n(n)
29491 .k(k)
29492 .a_stride(43)
29493 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
29494 }
29495 }
29496 }
29497
29498 TEST(QC8_GEMM_MINMAX_FP32_3X16C8__AVX512SKX, n_gt_16_subtile) {
29499 TEST_REQUIRES_X86_AVX512SKX;
29500 for (uint32_t n = 17; n < 32; n++) {
29501 for (size_t k = 1; k <= 40; k += 9) {
29502 for (uint32_t m = 1; m <= 3; m++) {
29503 GemmMicrokernelTester()
29504 .mr(3)
29505 .nr(16)
29506 .kr(8)
29507 .sr(1)
29508 .m(m)
29509 .n(n)
29510 .k(k)
29511 .iterations(1)
29512 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
29513 }
29514 }
29515 }
29516 }
29517
29518 TEST(QC8_GEMM_MINMAX_FP32_3X16C8__AVX512SKX, n_div_16) {
29519 TEST_REQUIRES_X86_AVX512SKX;
29520 for (uint32_t n = 32; n <= 48; n += 16) {
29521 for (size_t k = 1; k <= 40; k += 9) {
29522 GemmMicrokernelTester()
29523 .mr(3)
29524 .nr(16)
29525 .kr(8)
29526 .sr(1)
29527 .m(3)
29528 .n(16)
29529 .k(k)
29530 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
29531 }
29532 }
29533 }
29534
29535 TEST(QC8_GEMM_MINMAX_FP32_3X16C8__AVX512SKX, n_div_16_strided_cn) {
29536 TEST_REQUIRES_X86_AVX512SKX;
29537 for (uint32_t n = 32; n <= 48; n += 16) {
29538 for (size_t k = 1; k <= 40; k += 9) {
29539 GemmMicrokernelTester()
29540 .mr(3)
29541 .nr(16)
29542 .kr(8)
29543 .sr(1)
29544 .m(3)
29545 .n(n)
29546 .k(k)
29547 .cn_stride(19)
29548 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
29549 }
29550 }
29551 }
29552
29553 TEST(QC8_GEMM_MINMAX_FP32_3X16C8__AVX512SKX, n_div_16_strided_a) {
29554 TEST_REQUIRES_X86_AVX512SKX;
29555 for (uint32_t n = 32; n <= 48; n += 16) {
29556 for (size_t k = 1; k <= 40; k += 9) {
29557 GemmMicrokernelTester()
29558 .mr(3)
29559 .nr(16)
29560 .kr(8)
29561 .sr(1)
29562 .m(3)
29563 .n(n)
29564 .k(k)
29565 .a_stride(43)
29566 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
29567 }
29568 }
29569 }
29570
29571 TEST(QC8_GEMM_MINMAX_FP32_3X16C8__AVX512SKX, n_div_16_subtile) {
29572 TEST_REQUIRES_X86_AVX512SKX;
29573 for (uint32_t n = 32; n <= 48; n += 16) {
29574 for (size_t k = 1; k <= 40; k += 9) {
29575 for (uint32_t m = 1; m <= 3; m++) {
29576 GemmMicrokernelTester()
29577 .mr(3)
29578 .nr(16)
29579 .kr(8)
29580 .sr(1)
29581 .m(m)
29582 .n(n)
29583 .k(k)
29584 .iterations(1)
29585 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
29586 }
29587 }
29588 }
29589 }
29590
29591 TEST(QC8_GEMM_MINMAX_FP32_3X16C8__AVX512SKX, strided_cm_subtile) {
29592 TEST_REQUIRES_X86_AVX512SKX;
29593 for (size_t k = 1; k <= 40; k += 9) {
29594 for (uint32_t m = 1; m <= 3; m++) {
29595 for (uint32_t n = 1; n <= 16; n++) {
29596 GemmMicrokernelTester()
29597 .mr(3)
29598 .nr(16)
29599 .kr(8)
29600 .sr(1)
29601 .m(m)
29602 .n(n)
29603 .k(k)
29604 .cm_stride(19)
29605 .iterations(1)
29606 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
29607 }
29608 }
29609 }
29610 }
29611
29612 TEST(QC8_GEMM_MINMAX_FP32_3X16C8__AVX512SKX, qmin) {
29613 TEST_REQUIRES_X86_AVX512SKX;
29614 GemmMicrokernelTester()
29615 .mr(3)
29616 .nr(16)
29617 .kr(8)
29618 .sr(1)
29619 .m(3)
29620 .n(16)
29621 .k(8)
29622 .qmin(128)
29623 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
29624 }
29625
29626 TEST(QC8_GEMM_MINMAX_FP32_3X16C8__AVX512SKX, qmax) {
29627 TEST_REQUIRES_X86_AVX512SKX;
29628 GemmMicrokernelTester()
29629 .mr(3)
29630 .nr(16)
29631 .kr(8)
29632 .sr(1)
29633 .m(3)
29634 .n(16)
29635 .k(8)
29636 .qmax(128)
29637 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
29638 }
29639
29640 TEST(QC8_GEMM_MINMAX_FP32_3X16C8__AVX512SKX, strided_cm) {
29641 TEST_REQUIRES_X86_AVX512SKX;
29642 GemmMicrokernelTester()
29643 .mr(3)
29644 .nr(16)
29645 .kr(8)
29646 .sr(1)
29647 .m(3)
29648 .n(16)
29649 .k(8)
29650 .cm_stride(19)
29651 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
29652 }
29653#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
29654
29655
29656#if XNN_ARCH_X86 || XNN_ARCH_X86_64
29657 TEST(QC8_GEMM_MINMAX_FP32_4X16C8__AVX512SKX, k_eq_8) {
29658 TEST_REQUIRES_X86_AVX512SKX;
29659 GemmMicrokernelTester()
29660 .mr(4)
29661 .nr(16)
29662 .kr(8)
29663 .sr(1)
29664 .m(4)
29665 .n(16)
29666 .k(8)
29667 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
29668 }
29669
29670 TEST(QC8_GEMM_MINMAX_FP32_4X16C8__AVX512SKX, strided_cn) {
29671 TEST_REQUIRES_X86_AVX512SKX;
29672 GemmMicrokernelTester()
29673 .mr(4)
29674 .nr(16)
29675 .kr(8)
29676 .sr(1)
29677 .m(4)
29678 .n(16)
29679 .k(8)
29680 .cn_stride(19)
29681 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
29682 }
29683
29684 TEST(QC8_GEMM_MINMAX_FP32_4X16C8__AVX512SKX, k_eq_8_strided_a) {
29685 TEST_REQUIRES_X86_AVX512SKX;
29686 GemmMicrokernelTester()
29687 .mr(4)
29688 .nr(16)
29689 .kr(8)
29690 .sr(1)
29691 .m(4)
29692 .n(16)
29693 .k(8)
29694 .a_stride(11)
29695 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
29696 }
29697
29698 TEST(QC8_GEMM_MINMAX_FP32_4X16C8__AVX512SKX, k_eq_8_subtile) {
29699 TEST_REQUIRES_X86_AVX512SKX;
29700 for (uint32_t m = 1; m <= 4; m++) {
29701 for (uint32_t n = 1; n <= 16; n++) {
29702 GemmMicrokernelTester()
29703 .mr(4)
29704 .nr(16)
29705 .kr(8)
29706 .sr(1)
29707 .m(m)
29708 .n(n)
29709 .k(8)
29710 .iterations(1)
29711 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
29712 }
29713 }
29714 }
29715
29716 TEST(QC8_GEMM_MINMAX_FP32_4X16C8__AVX512SKX, k_eq_8_subtile_m) {
29717 TEST_REQUIRES_X86_AVX512SKX;
29718 for (uint32_t m = 1; m <= 4; m++) {
29719 GemmMicrokernelTester()
29720 .mr(4)
29721 .nr(16)
29722 .kr(8)
29723 .sr(1)
29724 .m(m)
29725 .n(16)
29726 .k(8)
29727 .iterations(1)
29728 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
29729 }
29730 }
29731
29732 TEST(QC8_GEMM_MINMAX_FP32_4X16C8__AVX512SKX, k_eq_8_subtile_n) {
29733 TEST_REQUIRES_X86_AVX512SKX;
29734 for (uint32_t n = 1; n <= 16; n++) {
29735 GemmMicrokernelTester()
29736 .mr(4)
29737 .nr(16)
29738 .kr(8)
29739 .sr(1)
29740 .m(4)
29741 .n(n)
29742 .k(8)
29743 .iterations(1)
29744 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
29745 }
29746 }
29747
29748 TEST(QC8_GEMM_MINMAX_FP32_4X16C8__AVX512SKX, k_lt_8) {
29749 TEST_REQUIRES_X86_AVX512SKX;
29750 for (size_t k = 1; k < 8; k++) {
29751 GemmMicrokernelTester()
29752 .mr(4)
29753 .nr(16)
29754 .kr(8)
29755 .sr(1)
29756 .m(4)
29757 .n(16)
29758 .k(k)
29759 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
29760 }
29761 }
29762
29763 TEST(QC8_GEMM_MINMAX_FP32_4X16C8__AVX512SKX, k_lt_8_strided_a) {
29764 TEST_REQUIRES_X86_AVX512SKX;
29765 for (size_t k = 1; k < 8; k++) {
29766 GemmMicrokernelTester()
29767 .mr(4)
29768 .nr(16)
29769 .kr(8)
29770 .sr(1)
29771 .m(4)
29772 .n(16)
29773 .k(k)
29774 .a_stride(11)
29775 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
29776 }
29777 }
29778
29779 TEST(QC8_GEMM_MINMAX_FP32_4X16C8__AVX512SKX, k_lt_8_subtile) {
29780 TEST_REQUIRES_X86_AVX512SKX;
29781 for (size_t k = 1; k < 8; k++) {
29782 for (uint32_t m = 1; m <= 4; m++) {
29783 for (uint32_t n = 1; n <= 16; n++) {
29784 GemmMicrokernelTester()
29785 .mr(4)
29786 .nr(16)
29787 .kr(8)
29788 .sr(1)
29789 .m(m)
29790 .n(n)
29791 .k(k)
29792 .iterations(1)
29793 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
29794 }
29795 }
29796 }
29797 }
29798
29799 TEST(QC8_GEMM_MINMAX_FP32_4X16C8__AVX512SKX, k_gt_8) {
29800 TEST_REQUIRES_X86_AVX512SKX;
29801 for (size_t k = 9; k < 16; k++) {
29802 GemmMicrokernelTester()
29803 .mr(4)
29804 .nr(16)
29805 .kr(8)
29806 .sr(1)
29807 .m(4)
29808 .n(16)
29809 .k(k)
29810 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
29811 }
29812 }
29813
29814 TEST(QC8_GEMM_MINMAX_FP32_4X16C8__AVX512SKX, k_gt_8_strided_a) {
29815 TEST_REQUIRES_X86_AVX512SKX;
29816 for (size_t k = 9; k < 16; k++) {
29817 GemmMicrokernelTester()
29818 .mr(4)
29819 .nr(16)
29820 .kr(8)
29821 .sr(1)
29822 .m(4)
29823 .n(16)
29824 .k(k)
29825 .a_stride(19)
29826 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
29827 }
29828 }
29829
29830 TEST(QC8_GEMM_MINMAX_FP32_4X16C8__AVX512SKX, k_gt_8_subtile) {
29831 TEST_REQUIRES_X86_AVX512SKX;
29832 for (size_t k = 9; k < 16; k++) {
29833 for (uint32_t m = 1; m <= 4; m++) {
29834 for (uint32_t n = 1; n <= 16; n++) {
29835 GemmMicrokernelTester()
29836 .mr(4)
29837 .nr(16)
29838 .kr(8)
29839 .sr(1)
29840 .m(m)
29841 .n(n)
29842 .k(k)
29843 .iterations(1)
29844 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
29845 }
29846 }
29847 }
29848 }
29849
29850 TEST(QC8_GEMM_MINMAX_FP32_4X16C8__AVX512SKX, k_div_8) {
29851 TEST_REQUIRES_X86_AVX512SKX;
29852 for (size_t k = 16; k <= 80; k += 8) {
29853 GemmMicrokernelTester()
29854 .mr(4)
29855 .nr(16)
29856 .kr(8)
29857 .sr(1)
29858 .m(4)
29859 .n(16)
29860 .k(k)
29861 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
29862 }
29863 }
29864
29865 TEST(QC8_GEMM_MINMAX_FP32_4X16C8__AVX512SKX, k_div_8_strided_a) {
29866 TEST_REQUIRES_X86_AVX512SKX;
29867 for (size_t k = 16; k <= 80; k += 8) {
29868 GemmMicrokernelTester()
29869 .mr(4)
29870 .nr(16)
29871 .kr(8)
29872 .sr(1)
29873 .m(4)
29874 .n(16)
29875 .k(k)
29876 .a_stride(83)
29877 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
29878 }
29879 }
29880
29881 TEST(QC8_GEMM_MINMAX_FP32_4X16C8__AVX512SKX, k_div_8_subtile) {
29882 TEST_REQUIRES_X86_AVX512SKX;
29883 for (size_t k = 16; k <= 80; k += 8) {
29884 for (uint32_t m = 1; m <= 4; m++) {
29885 for (uint32_t n = 1; n <= 16; n++) {
29886 GemmMicrokernelTester()
29887 .mr(4)
29888 .nr(16)
29889 .kr(8)
29890 .sr(1)
29891 .m(m)
29892 .n(n)
29893 .k(k)
29894 .iterations(1)
29895 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
29896 }
29897 }
29898 }
29899 }
29900
29901 TEST(QC8_GEMM_MINMAX_FP32_4X16C8__AVX512SKX, n_gt_16) {
29902 TEST_REQUIRES_X86_AVX512SKX;
29903 for (uint32_t n = 17; n < 32; n++) {
29904 for (size_t k = 1; k <= 40; k += 9) {
29905 GemmMicrokernelTester()
29906 .mr(4)
29907 .nr(16)
29908 .kr(8)
29909 .sr(1)
29910 .m(4)
29911 .n(16)
29912 .k(k)
29913 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
29914 }
29915 }
29916 }
29917
29918 TEST(QC8_GEMM_MINMAX_FP32_4X16C8__AVX512SKX, n_gt_16_strided_cn) {
29919 TEST_REQUIRES_X86_AVX512SKX;
29920 for (uint32_t n = 17; n < 32; n++) {
29921 for (size_t k = 1; k <= 40; k += 9) {
29922 GemmMicrokernelTester()
29923 .mr(4)
29924 .nr(16)
29925 .kr(8)
29926 .sr(1)
29927 .m(4)
29928 .n(16)
29929 .k(k)
29930 .cn_stride(19)
29931 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
29932 }
29933 }
29934 }
29935
29936 TEST(QC8_GEMM_MINMAX_FP32_4X16C8__AVX512SKX, n_gt_16_strided_a) {
29937 TEST_REQUIRES_X86_AVX512SKX;
29938 for (uint32_t n = 17; n < 32; n++) {
29939 for (size_t k = 1; k <= 40; k += 9) {
29940 GemmMicrokernelTester()
29941 .mr(4)
29942 .nr(16)
29943 .kr(8)
29944 .sr(1)
29945 .m(4)
29946 .n(n)
29947 .k(k)
29948 .a_stride(43)
29949 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
29950 }
29951 }
29952 }
29953
29954 TEST(QC8_GEMM_MINMAX_FP32_4X16C8__AVX512SKX, n_gt_16_subtile) {
29955 TEST_REQUIRES_X86_AVX512SKX;
29956 for (uint32_t n = 17; n < 32; n++) {
29957 for (size_t k = 1; k <= 40; k += 9) {
29958 for (uint32_t m = 1; m <= 4; m++) {
29959 GemmMicrokernelTester()
29960 .mr(4)
29961 .nr(16)
29962 .kr(8)
29963 .sr(1)
29964 .m(m)
29965 .n(n)
29966 .k(k)
29967 .iterations(1)
29968 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
29969 }
29970 }
29971 }
29972 }
29973
29974 TEST(QC8_GEMM_MINMAX_FP32_4X16C8__AVX512SKX, n_div_16) {
29975 TEST_REQUIRES_X86_AVX512SKX;
29976 for (uint32_t n = 32; n <= 48; n += 16) {
29977 for (size_t k = 1; k <= 40; k += 9) {
29978 GemmMicrokernelTester()
29979 .mr(4)
29980 .nr(16)
29981 .kr(8)
29982 .sr(1)
29983 .m(4)
29984 .n(16)
29985 .k(k)
29986 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
29987 }
29988 }
29989 }
29990
29991 TEST(QC8_GEMM_MINMAX_FP32_4X16C8__AVX512SKX, n_div_16_strided_cn) {
29992 TEST_REQUIRES_X86_AVX512SKX;
29993 for (uint32_t n = 32; n <= 48; n += 16) {
29994 for (size_t k = 1; k <= 40; k += 9) {
29995 GemmMicrokernelTester()
29996 .mr(4)
29997 .nr(16)
29998 .kr(8)
29999 .sr(1)
30000 .m(4)
30001 .n(n)
30002 .k(k)
30003 .cn_stride(19)
30004 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
30005 }
30006 }
30007 }
30008
30009 TEST(QC8_GEMM_MINMAX_FP32_4X16C8__AVX512SKX, n_div_16_strided_a) {
30010 TEST_REQUIRES_X86_AVX512SKX;
30011 for (uint32_t n = 32; n <= 48; n += 16) {
30012 for (size_t k = 1; k <= 40; k += 9) {
30013 GemmMicrokernelTester()
30014 .mr(4)
30015 .nr(16)
30016 .kr(8)
30017 .sr(1)
30018 .m(4)
30019 .n(n)
30020 .k(k)
30021 .a_stride(43)
30022 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
30023 }
30024 }
30025 }
30026
30027 TEST(QC8_GEMM_MINMAX_FP32_4X16C8__AVX512SKX, n_div_16_subtile) {
30028 TEST_REQUIRES_X86_AVX512SKX;
30029 for (uint32_t n = 32; n <= 48; n += 16) {
30030 for (size_t k = 1; k <= 40; k += 9) {
30031 for (uint32_t m = 1; m <= 4; m++) {
30032 GemmMicrokernelTester()
30033 .mr(4)
30034 .nr(16)
30035 .kr(8)
30036 .sr(1)
30037 .m(m)
30038 .n(n)
30039 .k(k)
30040 .iterations(1)
30041 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
30042 }
30043 }
30044 }
30045 }
30046
30047 TEST(QC8_GEMM_MINMAX_FP32_4X16C8__AVX512SKX, strided_cm_subtile) {
30048 TEST_REQUIRES_X86_AVX512SKX;
30049 for (size_t k = 1; k <= 40; k += 9) {
30050 for (uint32_t m = 1; m <= 4; m++) {
30051 for (uint32_t n = 1; n <= 16; n++) {
30052 GemmMicrokernelTester()
30053 .mr(4)
30054 .nr(16)
30055 .kr(8)
30056 .sr(1)
30057 .m(m)
30058 .n(n)
30059 .k(k)
30060 .cm_stride(19)
30061 .iterations(1)
30062 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
30063 }
30064 }
30065 }
30066 }
30067
30068 TEST(QC8_GEMM_MINMAX_FP32_4X16C8__AVX512SKX, qmin) {
30069 TEST_REQUIRES_X86_AVX512SKX;
30070 GemmMicrokernelTester()
30071 .mr(4)
30072 .nr(16)
30073 .kr(8)
30074 .sr(1)
30075 .m(4)
30076 .n(16)
30077 .k(8)
30078 .qmin(128)
30079 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
30080 }
30081
30082 TEST(QC8_GEMM_MINMAX_FP32_4X16C8__AVX512SKX, qmax) {
30083 TEST_REQUIRES_X86_AVX512SKX;
30084 GemmMicrokernelTester()
30085 .mr(4)
30086 .nr(16)
30087 .kr(8)
30088 .sr(1)
30089 .m(4)
30090 .n(16)
30091 .k(8)
30092 .qmax(128)
30093 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
30094 }
30095
30096 TEST(QC8_GEMM_MINMAX_FP32_4X16C8__AVX512SKX, strided_cm) {
30097 TEST_REQUIRES_X86_AVX512SKX;
30098 GemmMicrokernelTester()
30099 .mr(4)
30100 .nr(16)
30101 .kr(8)
30102 .sr(1)
30103 .m(4)
30104 .n(16)
30105 .k(8)
30106 .cm_stride(19)
30107 .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx, xnn_init_qs8_minmax_avx512_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
30108 }
30109#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64