Fix QS8 IGEMM with FP32 requantization for SSE/AVX/XOP

PiperOrigin-RevId: 377211275
diff --git a/src/qs8-igemm/MRx4c2-sse.c.in b/src/qs8-igemm/MRx4c2-sse.c.in
index 342721b..59eb3bd 100644
--- a/src/qs8-igemm/MRx4c2-sse.c.in
+++ b/src/qs8-igemm/MRx4c2-sse.c.in
@@ -288,8 +288,8 @@
       __m128i vacc${M}${min(M+1, MR-1)}x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc${M}x0123, vacc${min(M+1, MR-1)}x0123), voutput_zero_point);
 
     $if SSE < 4:
-      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_min);
-      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_max);
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_min);
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_max);
       $for M in range(0, MR, 2):
         vacc${M}${min(M+1, MR-1)}x0123 = _mm_min_epi16(_mm_max_epi16(vacc${M}${min(M+1, MR-1)}x0123, voutput_min), voutput_max);
 
@@ -299,8 +299,8 @@
       __m128i vout = _mm_packs_epi16(vacc0${min(1, MR-1)}x0123, vacc0${min(1, MR-1)}x0123);
 
     $if SSE == 4:
-      vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
-      vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+      vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_min));
+      vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_max));
 
     if (nc >= 4) {
       $for M in reversed(range(1, MR)):
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-fp32-avx-ld128.c b/src/qs8-igemm/gen/1x4c2-minmax-fp32-avx-ld128.c
index 1d8f14c..37bde25 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-fp32-avx-ld128.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-fp32-avx-ld128.c
@@ -131,8 +131,8 @@
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
-    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
-    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
 
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-fp32-avx-ld64.c b/src/qs8-igemm/gen/1x4c2-minmax-fp32-avx-ld64.c
index 66ea6ea..0b84a74 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-fp32-avx-ld64.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-fp32-avx-ld64.c
@@ -131,8 +131,8 @@
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
-    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
-    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
 
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-fp32-sse2-ld128.c b/src/qs8-igemm/gen/1x4c2-minmax-fp32-sse2-ld128.c
index ac4c742..a5cbc1d 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-fp32-sse2-ld128.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-fp32-sse2-ld128.c
@@ -128,8 +128,8 @@
     const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
     __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_max);
+    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
+    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->fp32_sse2.output_max);
     vacc00x0123 = _mm_min_epi16(_mm_max_epi16(vacc00x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-fp32-sse2-ld64.c b/src/qs8-igemm/gen/1x4c2-minmax-fp32-sse2-ld64.c
index 6c892af..55e4b0c 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-fp32-sse2-ld64.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-fp32-sse2-ld64.c
@@ -128,8 +128,8 @@
     const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
     __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_max);
+    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
+    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->fp32_sse2.output_max);
     vacc00x0123 = _mm_min_epi16(_mm_max_epi16(vacc00x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-fp32-sse41-ld128.c b/src/qs8-igemm/gen/1x4c2-minmax-fp32-sse41-ld128.c
index 866740d..f9324e3 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-fp32-sse41-ld128.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-fp32-sse41-ld128.c
@@ -131,8 +131,8 @@
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
-    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
-    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
 
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-fp32-sse41-ld64.c b/src/qs8-igemm/gen/1x4c2-minmax-fp32-sse41-ld64.c
index 091c8f0..abed29a 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-fp32-sse41-ld64.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-fp32-sse41-ld64.c
@@ -131,8 +131,8 @@
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
-    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
-    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
 
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-fp32-xop-ld128.c b/src/qs8-igemm/gen/1x4c2-minmax-fp32-xop-ld128.c
index eaa6e92..f26464a 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-fp32-xop-ld128.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-fp32-xop-ld128.c
@@ -136,8 +136,8 @@
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
-    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
-    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
 
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-fp32-xop-ld64.c b/src/qs8-igemm/gen/1x4c2-minmax-fp32-xop-ld64.c
index 1cb8c87..3170471 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-fp32-xop-ld64.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-fp32-xop-ld64.c
@@ -136,8 +136,8 @@
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
-    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
-    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
 
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-fp32-avx-ld128.c b/src/qs8-igemm/gen/2x4c2-minmax-fp32-avx-ld128.c
index ddc1725..2a6314a 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-fp32-avx-ld128.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-fp32-avx-ld128.c
@@ -163,8 +163,8 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
-    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
-    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
 
     if (nc >= 4) {
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-fp32-avx-ld64.c b/src/qs8-igemm/gen/2x4c2-minmax-fp32-avx-ld64.c
index a63c500..4ade5aa 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-fp32-avx-ld64.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-fp32-avx-ld64.c
@@ -163,8 +163,8 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
-    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
-    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
 
     if (nc >= 4) {
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-fp32-sse2-ld128.c b/src/qs8-igemm/gen/2x4c2-minmax-fp32-sse2-ld128.c
index 51cd3f6..2cebe9c 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-fp32-sse2-ld128.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-fp32-sse2-ld128.c
@@ -160,8 +160,8 @@
     const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_max);
+    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
+    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->fp32_sse2.output_max);
     vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-fp32-sse2-ld64.c b/src/qs8-igemm/gen/2x4c2-minmax-fp32-sse2-ld64.c
index 6f16a59..a373cc6 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-fp32-sse2-ld64.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-fp32-sse2-ld64.c
@@ -160,8 +160,8 @@
     const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_max);
+    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
+    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->fp32_sse2.output_max);
     vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-fp32-sse41-ld128.c b/src/qs8-igemm/gen/2x4c2-minmax-fp32-sse41-ld128.c
index 1f99b51..b2232e7 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-fp32-sse41-ld128.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-fp32-sse41-ld128.c
@@ -163,8 +163,8 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
-    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
-    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
 
     if (nc >= 4) {
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-fp32-sse41-ld64.c b/src/qs8-igemm/gen/2x4c2-minmax-fp32-sse41-ld64.c
index 3f96f5d..074b5d0 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-fp32-sse41-ld64.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-fp32-sse41-ld64.c
@@ -163,8 +163,8 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
-    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
-    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
 
     if (nc >= 4) {
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-fp32-xop-ld128.c b/src/qs8-igemm/gen/2x4c2-minmax-fp32-xop-ld128.c
index 00a4b2c..a53f14d 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-fp32-xop-ld128.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-fp32-xop-ld128.c
@@ -168,8 +168,8 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
-    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
-    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
 
     if (nc >= 4) {
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-fp32-xop-ld64.c b/src/qs8-igemm/gen/2x4c2-minmax-fp32-xop-ld64.c
index 3710f0f..a5349ee 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-fp32-xop-ld64.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-fp32-xop-ld64.c
@@ -168,8 +168,8 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
-    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
-    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
 
     if (nc >= 4) {
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-fp32-avx-ld128.c b/src/qs8-igemm/gen/3x4c2-minmax-fp32-avx-ld128.c
index cde0021..76801ec 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-fp32-avx-ld128.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-fp32-avx-ld128.c
@@ -196,8 +196,8 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
-    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
-    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
 
     if (nc >= 4) {
       *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-fp32-avx-ld64.c b/src/qs8-igemm/gen/3x4c2-minmax-fp32-avx-ld64.c
index cb62d7a..60b05d1 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-fp32-avx-ld64.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-fp32-avx-ld64.c
@@ -196,8 +196,8 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
-    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
-    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
 
     if (nc >= 4) {
       *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-fp32-sse2-ld128.c b/src/qs8-igemm/gen/3x4c2-minmax-fp32-sse2-ld128.c
index 6385818..1ec2f1e 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-fp32-sse2-ld128.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-fp32-sse2-ld128.c
@@ -193,8 +193,8 @@
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
     __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_max);
+    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
+    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->fp32_sse2.output_max);
     vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
     vacc22x0123 = _mm_min_epi16(_mm_max_epi16(vacc22x0123, voutput_min), voutput_max);
 
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-fp32-sse2-ld64.c b/src/qs8-igemm/gen/3x4c2-minmax-fp32-sse2-ld64.c
index e5d00ed..90828cd 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-fp32-sse2-ld64.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-fp32-sse2-ld64.c
@@ -193,8 +193,8 @@
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
     __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_max);
+    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
+    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->fp32_sse2.output_max);
     vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
     vacc22x0123 = _mm_min_epi16(_mm_max_epi16(vacc22x0123, voutput_min), voutput_max);
 
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-fp32-sse41-ld128.c b/src/qs8-igemm/gen/3x4c2-minmax-fp32-sse41-ld128.c
index 02ef5f2..72b509c 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-fp32-sse41-ld128.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-fp32-sse41-ld128.c
@@ -196,8 +196,8 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
-    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
-    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
 
     if (nc >= 4) {
       *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-fp32-sse41-ld64.c b/src/qs8-igemm/gen/3x4c2-minmax-fp32-sse41-ld64.c
index 1ba142c..0fba4e3 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-fp32-sse41-ld64.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-fp32-sse41-ld64.c
@@ -196,8 +196,8 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
-    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
-    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
 
     if (nc >= 4) {
       *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-fp32-xop-ld128.c b/src/qs8-igemm/gen/3x4c2-minmax-fp32-xop-ld128.c
index 0204f3a..cfc480e 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-fp32-xop-ld128.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-fp32-xop-ld128.c
@@ -201,8 +201,8 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
-    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
-    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
 
     if (nc >= 4) {
       *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-fp32-xop-ld64.c b/src/qs8-igemm/gen/3x4c2-minmax-fp32-xop-ld64.c
index 2b600be..cf86f08 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-fp32-xop-ld64.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-fp32-xop-ld64.c
@@ -201,8 +201,8 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
-    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
-    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
 
     if (nc >= 4) {
       *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-fp32-avx-ld128.c b/src/qs8-igemm/gen/4x4c2-minmax-fp32-avx-ld128.c
index 10c39c7..20e55bd 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-fp32-avx-ld128.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-fp32-avx-ld128.c
@@ -228,8 +228,8 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc23x0123);
 
-    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
-    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
 
     if (nc >= 4) {
       *((uint32_t*) c3) = (uint32_t) _mm_extract_epi32(vout, 3);
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-fp32-avx-ld64.c b/src/qs8-igemm/gen/4x4c2-minmax-fp32-avx-ld64.c
index c1fd952..d5e1122 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-fp32-avx-ld64.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-fp32-avx-ld64.c
@@ -228,8 +228,8 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc23x0123);
 
-    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
-    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
 
     if (nc >= 4) {
       *((uint32_t*) c3) = (uint32_t) _mm_extract_epi32(vout, 3);
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-fp32-sse2-ld128.c b/src/qs8-igemm/gen/4x4c2-minmax-fp32-sse2-ld128.c
index 31299ee..e8144fe 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-fp32-sse2-ld128.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-fp32-sse2-ld128.c
@@ -225,8 +225,8 @@
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
     __m128i vacc23x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc3x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_max);
+    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
+    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->fp32_sse2.output_max);
     vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
     vacc23x0123 = _mm_min_epi16(_mm_max_epi16(vacc23x0123, voutput_min), voutput_max);
 
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-fp32-sse2-ld64.c b/src/qs8-igemm/gen/4x4c2-minmax-fp32-sse2-ld64.c
index ea52094..b305c8d 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-fp32-sse2-ld64.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-fp32-sse2-ld64.c
@@ -225,8 +225,8 @@
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
     __m128i vacc23x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc3x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_max);
+    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
+    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->fp32_sse2.output_max);
     vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
     vacc23x0123 = _mm_min_epi16(_mm_max_epi16(vacc23x0123, voutput_min), voutput_max);
 
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-fp32-sse41-ld128.c b/src/qs8-igemm/gen/4x4c2-minmax-fp32-sse41-ld128.c
index 1ba4d7d..48ff03e 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-fp32-sse41-ld128.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-fp32-sse41-ld128.c
@@ -228,8 +228,8 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc23x0123);
 
-    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
-    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
 
     if (nc >= 4) {
       *((uint32_t*) c3) = (uint32_t) _mm_extract_epi32(vout, 3);
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-fp32-sse41-ld64.c b/src/qs8-igemm/gen/4x4c2-minmax-fp32-sse41-ld64.c
index f383fa7..cb3166d 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-fp32-sse41-ld64.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-fp32-sse41-ld64.c
@@ -228,8 +228,8 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc23x0123);
 
-    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
-    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
 
     if (nc >= 4) {
       *((uint32_t*) c3) = (uint32_t) _mm_extract_epi32(vout, 3);
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-fp32-xop-ld128.c b/src/qs8-igemm/gen/4x4c2-minmax-fp32-xop-ld128.c
index 08bc5cd..48d99be 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-fp32-xop-ld128.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-fp32-xop-ld128.c
@@ -233,8 +233,8 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc23x0123);
 
-    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
-    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
 
     if (nc >= 4) {
       *((uint32_t*) c3) = (uint32_t) _mm_extract_epi32(vout, 3);
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-fp32-xop-ld64.c b/src/qs8-igemm/gen/4x4c2-minmax-fp32-xop-ld64.c
index f660147..9387431 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-fp32-xop-ld64.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-fp32-xop-ld64.c
@@ -233,8 +233,8 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc23x0123);
 
-    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
-    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
 
     if (nc >= 4) {
       *((uint32_t*) c3) = (uint32_t) _mm_extract_epi32(vout, 3);
diff --git a/test/qs8-igemm-minmax-fp32.cc b/test/qs8-igemm-minmax-fp32.cc
index 319f854..4479947 100644
--- a/test/qs8-igemm-minmax-fp32.cc
+++ b/test/qs8-igemm-minmax-fp32.cc
@@ -23,7 +23,7 @@
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_eq_8) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(1)
@@ -33,10 +33,10 @@
       .m(1)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD64, strided_cn) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(1)
@@ -47,24 +47,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_eq_8_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -77,12 +63,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -94,11 +80,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -110,11 +96,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_lt_8) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -125,27 +111,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_lt_8_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -159,13 +129,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_gt_8) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -176,27 +146,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_gt_8_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -210,13 +164,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_div_8) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -227,27 +181,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_div_8_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -261,13 +199,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD64, n_gt_4) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -279,12 +217,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD64, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -297,30 +235,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD64, n_gt_4_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -334,13 +254,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD64, n_div_4) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -352,12 +272,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD64, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -370,30 +290,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD64, n_div_4_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -407,13 +309,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD64, small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD64, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD64, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD64, strided_cm_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -428,13 +403,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD64, a_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(43)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD64, zero) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t mz = 0; mz < 1; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(43)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD64, qmin) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(1)
@@ -445,10 +457,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD64, qmax) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(1)
@@ -459,10 +471,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD64, strided_cm) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(1)
@@ -473,13 +485,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_eq_8) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(2)
@@ -489,10 +501,10 @@
       .m(2)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD64, strided_cn) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(2)
@@ -503,24 +515,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(2)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_eq_8_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -533,12 +531,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -550,11 +548,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -566,11 +564,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_lt_8) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -581,27 +579,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_lt_8_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -615,13 +597,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_gt_8) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -632,27 +614,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_gt_8_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -666,13 +632,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_div_8) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -683,27 +649,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_div_8_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -717,13 +667,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD64, n_gt_4) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -735,12 +685,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD64, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -753,30 +703,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD64, n_gt_4_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -790,13 +722,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD64, n_div_4) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -808,12 +740,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD64, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -826,30 +758,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD64, n_div_4_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -863,13 +777,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD64, small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD64, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD64, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD64, strided_cm_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -884,13 +871,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD64, a_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(83)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD64, zero) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t mz = 0; mz < 2; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(83)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD64, qmin) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(2)
@@ -901,10 +925,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD64, qmax) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(2)
@@ -915,10 +939,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD64, strided_cm) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(2)
@@ -929,13 +953,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_eq_8) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(3)
@@ -945,10 +969,10 @@
       .m(3)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD64, strided_cn) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(3)
@@ -959,24 +983,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(3)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_eq_8_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -989,12 +999,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -1006,11 +1016,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -1022,11 +1032,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_lt_8) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -1037,27 +1047,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_lt_8_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -1071,13 +1065,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_gt_8) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -1088,27 +1082,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_gt_8_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -1122,13 +1100,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_div_8) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -1139,27 +1117,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_div_8_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -1173,13 +1135,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD64, n_gt_4) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -1191,12 +1153,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD64, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -1209,30 +1171,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD64, n_gt_4_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -1246,13 +1190,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD64, n_div_4) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -1264,12 +1208,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD64, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -1282,30 +1226,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD64, n_div_4_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -1319,13 +1245,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD64, small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD64, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD64, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD64, strided_cm_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -1340,13 +1339,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD64, a_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(127)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD64, zero) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t mz = 0; mz < 3; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(127)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD64, qmin) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(3)
@@ -1357,10 +1393,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD64, qmax) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(3)
@@ -1371,10 +1407,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD64, strided_cm) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(3)
@@ -1385,13 +1421,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_eq_8) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(4)
@@ -1401,10 +1437,10 @@
       .m(4)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD64, strided_cn) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(4)
@@ -1415,24 +1451,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(4)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_eq_8_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -1445,12 +1467,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -1462,11 +1484,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -1478,11 +1500,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_lt_8) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -1493,27 +1515,11 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_lt_8_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -1527,13 +1533,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_gt_8) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -1544,27 +1550,11 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_gt_8_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -1578,13 +1568,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_div_8) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -1595,27 +1585,11 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_div_8_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -1629,13 +1603,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD64, n_gt_4) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -1647,12 +1621,12 @@
           .m(4)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD64, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -1665,30 +1639,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD64, n_gt_4_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -1702,13 +1658,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD64, n_div_4) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -1720,12 +1676,12 @@
           .m(4)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD64, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -1738,30 +1694,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD64, n_div_4_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -1775,13 +1713,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD64, small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD64, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD64, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD64, strided_cm_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -1796,13 +1807,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD64, a_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(163)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD64, zero) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t mz = 0; mz < 4; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(163)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD64, qmin) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(4)
@@ -1813,10 +1861,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD64, qmax) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(4)
@@ -1827,10 +1875,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD64, strided_cm) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(4)
@@ -1841,13 +1889,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_eq_8) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(1)
@@ -1857,10 +1905,10 @@
       .m(1)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD64, strided_cn) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(1)
@@ -1871,24 +1919,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_eq_8_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -1901,12 +1935,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -1918,11 +1952,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -1934,11 +1968,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_lt_8) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -1949,27 +1983,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_lt_8_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -1983,13 +2001,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_gt_8) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -2000,27 +2018,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_gt_8_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -2034,13 +2036,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_div_8) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -2051,27 +2053,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_div_8_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -2085,13 +2071,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD64, n_gt_4) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -2103,12 +2089,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD64, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -2121,30 +2107,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD64, n_gt_4_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -2158,13 +2126,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD64, n_div_4) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -2176,12 +2144,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD64, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -2194,30 +2162,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD64, n_div_4_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -2231,13 +2181,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD64, small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD64, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD64, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD64, strided_cm_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -2252,13 +2275,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD64, a_offset) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(43)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD64, zero) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t mz = 0; mz < 1; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(43)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD64, qmin) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(1)
@@ -2269,10 +2329,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD64, qmax) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(1)
@@ -2283,10 +2343,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD64, strided_cm) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(1)
@@ -2297,13 +2357,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_eq_8) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(2)
@@ -2313,10 +2373,10 @@
       .m(2)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD64, strided_cn) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(2)
@@ -2327,24 +2387,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(2)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_eq_8_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -2357,12 +2403,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -2374,11 +2420,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -2390,11 +2436,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_lt_8) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -2405,27 +2451,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_lt_8_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -2439,13 +2469,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_gt_8) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -2456,27 +2486,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_gt_8_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -2490,13 +2504,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_div_8) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -2507,27 +2521,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_div_8_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -2541,13 +2539,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD64, n_gt_4) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -2559,12 +2557,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD64, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -2577,30 +2575,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD64, n_gt_4_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -2614,13 +2594,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD64, n_div_4) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -2632,12 +2612,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD64, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -2650,30 +2630,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD64, n_div_4_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -2687,13 +2649,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD64, small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD64, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD64, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD64, strided_cm_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -2708,13 +2743,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD64, a_offset) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(83)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD64, zero) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t mz = 0; mz < 2; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(83)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD64, qmin) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(2)
@@ -2725,10 +2797,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD64, qmax) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(2)
@@ -2739,10 +2811,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD64, strided_cm) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(2)
@@ -2753,13 +2825,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_eq_8) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(3)
@@ -2769,10 +2841,10 @@
       .m(3)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD64, strided_cn) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(3)
@@ -2783,24 +2855,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(3)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_eq_8_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -2813,12 +2871,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -2830,11 +2888,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -2846,11 +2904,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_lt_8) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -2861,27 +2919,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_lt_8_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -2895,13 +2937,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_gt_8) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -2912,27 +2954,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_gt_8_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -2946,13 +2972,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_div_8) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -2963,27 +2989,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_div_8_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -2997,13 +3007,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD64, n_gt_4) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -3015,12 +3025,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD64, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -3033,30 +3043,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD64, n_gt_4_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -3070,13 +3062,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD64, n_div_4) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -3088,12 +3080,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD64, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -3106,30 +3098,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD64, n_div_4_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -3143,13 +3117,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD64, small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD64, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD64, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD64, strided_cm_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -3164,13 +3211,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD64, a_offset) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(127)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD64, zero) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t mz = 0; mz < 3; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(127)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD64, qmin) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(3)
@@ -3181,10 +3265,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD64, qmax) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(3)
@@ -3195,10 +3279,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD64, strided_cm) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(3)
@@ -3209,13 +3293,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_eq_8) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(4)
@@ -3225,10 +3309,10 @@
       .m(4)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD64, strided_cn) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(4)
@@ -3239,24 +3323,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(4)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_eq_8_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -3269,12 +3339,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -3286,11 +3356,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -3302,11 +3372,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_lt_8) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -3317,27 +3387,11 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_lt_8_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -3351,13 +3405,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_gt_8) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -3368,27 +3422,11 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_gt_8_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -3402,13 +3440,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_div_8) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -3419,27 +3457,11 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_div_8_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -3453,13 +3475,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD64, n_gt_4) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -3471,12 +3493,12 @@
           .m(4)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD64, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -3489,30 +3511,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD64, n_gt_4_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -3526,13 +3530,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD64, n_div_4) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -3544,12 +3548,12 @@
           .m(4)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD64, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -3562,30 +3566,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD64, n_div_4_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -3599,13 +3585,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD64, small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD64, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD64, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD64, strided_cm_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -3620,13 +3679,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD64, a_offset) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(163)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD64, zero) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t mz = 0; mz < 4; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(163)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD64, qmin) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(4)
@@ -3637,10 +3733,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD64, qmax) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(4)
@@ -3651,10 +3747,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD64, strided_cm) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(4)
@@ -3665,13 +3761,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_eq_8) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(1)
@@ -3681,10 +3777,10 @@
       .m(1)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD64, strided_cn) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(1)
@@ -3695,24 +3791,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_eq_8_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -3725,12 +3807,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -3742,11 +3824,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -3758,11 +3840,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_lt_8) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -3773,27 +3855,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_lt_8_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -3807,13 +3873,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_gt_8) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -3824,27 +3890,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_gt_8_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -3858,13 +3908,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_div_8) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -3875,27 +3925,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_div_8_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -3909,13 +3943,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD64, n_gt_4) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -3927,12 +3961,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD64, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -3945,30 +3979,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD64, n_gt_4_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -3982,13 +3998,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD64, n_div_4) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -4000,12 +4016,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD64, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -4018,30 +4034,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD64, n_div_4_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -4055,13 +4053,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD64, small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD64, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD64, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD64, strided_cm_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -4076,13 +4147,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD64, a_offset) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(43)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD64, zero) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t mz = 0; mz < 1; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(43)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD64, qmin) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(1)
@@ -4093,10 +4201,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD64, qmax) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(1)
@@ -4107,10 +4215,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD64, strided_cm) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(1)
@@ -4121,13 +4229,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_eq_8) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(2)
@@ -4137,10 +4245,10 @@
       .m(2)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD64, strided_cn) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(2)
@@ -4151,24 +4259,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(2)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_eq_8_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -4181,12 +4275,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -4198,11 +4292,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -4214,11 +4308,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_lt_8) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -4229,27 +4323,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_lt_8_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -4263,13 +4341,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_gt_8) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -4280,27 +4358,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_gt_8_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -4314,13 +4376,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_div_8) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -4331,27 +4393,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_div_8_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -4365,13 +4411,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD64, n_gt_4) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -4383,12 +4429,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD64, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -4401,30 +4447,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD64, n_gt_4_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -4438,13 +4466,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD64, n_div_4) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -4456,12 +4484,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD64, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -4474,30 +4502,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD64, n_div_4_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -4511,13 +4521,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD64, small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD64, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD64, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD64, strided_cm_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -4532,13 +4615,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD64, a_offset) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(83)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD64, zero) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t mz = 0; mz < 2; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(83)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD64, qmin) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(2)
@@ -4549,10 +4669,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD64, qmax) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(2)
@@ -4563,10 +4683,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD64, strided_cm) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(2)
@@ -4577,13 +4697,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_eq_8) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(3)
@@ -4593,10 +4713,10 @@
       .m(3)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD64, strided_cn) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(3)
@@ -4607,24 +4727,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(3)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_eq_8_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -4637,12 +4743,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -4654,11 +4760,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -4670,11 +4776,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_lt_8) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -4685,27 +4791,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_lt_8_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -4719,13 +4809,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_gt_8) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -4736,27 +4826,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_gt_8_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -4770,13 +4844,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_div_8) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -4787,27 +4861,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_div_8_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -4821,13 +4879,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD64, n_gt_4) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -4839,12 +4897,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD64, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -4857,30 +4915,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD64, n_gt_4_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -4894,13 +4934,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD64, n_div_4) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -4912,12 +4952,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD64, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -4930,30 +4970,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD64, n_div_4_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -4967,13 +4989,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD64, small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD64, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD64, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD64, strided_cm_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -4988,13 +5083,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD64, a_offset) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(127)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD64, zero) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t mz = 0; mz < 3; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(127)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD64, qmin) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(3)
@@ -5005,10 +5137,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD64, qmax) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(3)
@@ -5019,10 +5151,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD64, strided_cm) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(3)
@@ -5033,13 +5165,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_eq_8) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(4)
@@ -5049,10 +5181,10 @@
       .m(4)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD64, strided_cn) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(4)
@@ -5063,24 +5195,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(4)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_eq_8_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -5093,12 +5211,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -5110,11 +5228,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -5126,11 +5244,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_lt_8) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -5141,27 +5259,11 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_lt_8_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -5175,13 +5277,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_gt_8) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -5192,27 +5294,11 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_gt_8_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -5226,13 +5312,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_div_8) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -5243,27 +5329,11 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_div_8_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -5277,13 +5347,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD64, n_gt_4) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -5295,12 +5365,12 @@
           .m(4)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD64, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -5313,30 +5383,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD64, n_gt_4_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -5350,13 +5402,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD64, n_div_4) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -5368,12 +5420,12 @@
           .m(4)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD64, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -5386,30 +5438,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD64, n_div_4_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -5423,13 +5457,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD64, small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD64, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD64, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD64, strided_cm_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -5444,13 +5551,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD64, a_offset) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(163)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD64, zero) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t mz = 0; mz < 4; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(163)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD64, qmin) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(4)
@@ -5461,10 +5605,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD64, qmax) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(4)
@@ -5475,10 +5619,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD64, strided_cm) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(4)
@@ -5489,13 +5633,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_eq_8) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(1)
@@ -5505,10 +5649,10 @@
       .m(1)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD64, strided_cn) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(1)
@@ -5519,24 +5663,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_eq_8_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -5549,12 +5679,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -5566,11 +5696,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -5582,11 +5712,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_lt_8) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -5597,27 +5727,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_lt_8_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -5631,13 +5745,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_gt_8) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -5648,27 +5762,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_gt_8_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -5682,13 +5780,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_div_8) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -5699,27 +5797,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_div_8_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -5733,13 +5815,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD64, n_gt_4) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -5751,12 +5833,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD64, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -5769,30 +5851,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD64, n_gt_4_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -5806,13 +5870,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD64, n_div_4) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -5824,12 +5888,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD64, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -5842,30 +5906,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD64, n_div_4_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -5879,13 +5925,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD64, small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD64, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD64, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD64, strided_cm_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -5900,13 +6019,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD64, a_offset) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(43)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD64, zero) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t mz = 0; mz < 1; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(43)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD64, qmin) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(1)
@@ -5917,10 +6073,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD64, qmax) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(1)
@@ -5931,10 +6087,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD64, strided_cm) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(1)
@@ -5945,13 +6101,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_eq_8) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(2)
@@ -5961,10 +6117,10 @@
       .m(2)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD64, strided_cn) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(2)
@@ -5975,24 +6131,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(2)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_eq_8_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -6005,12 +6147,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -6022,11 +6164,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -6038,11 +6180,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_lt_8) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -6053,27 +6195,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_lt_8_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -6087,13 +6213,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_gt_8) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -6104,27 +6230,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_gt_8_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -6138,13 +6248,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_div_8) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -6155,27 +6265,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_div_8_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -6189,13 +6283,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD64, n_gt_4) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -6207,12 +6301,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD64, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -6225,30 +6319,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD64, n_gt_4_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -6262,13 +6338,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD64, n_div_4) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -6280,12 +6356,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD64, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -6298,30 +6374,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD64, n_div_4_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -6335,13 +6393,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD64, small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD64, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD64, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD64, strided_cm_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -6356,13 +6487,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD64, a_offset) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(83)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD64, zero) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t mz = 0; mz < 2; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(83)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD64, qmin) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(2)
@@ -6373,10 +6541,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD64, qmax) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(2)
@@ -6387,10 +6555,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD64, strided_cm) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(2)
@@ -6401,13 +6569,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_eq_8) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(3)
@@ -6417,10 +6585,10 @@
       .m(3)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD64, strided_cn) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(3)
@@ -6431,24 +6599,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(3)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_eq_8_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -6461,12 +6615,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -6478,11 +6632,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -6494,11 +6648,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_lt_8) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -6509,27 +6663,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_lt_8_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -6543,13 +6681,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_gt_8) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -6560,27 +6698,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_gt_8_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -6594,13 +6716,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_div_8) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -6611,27 +6733,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_div_8_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -6645,13 +6751,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD64, n_gt_4) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -6663,12 +6769,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD64, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -6681,30 +6787,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD64, n_gt_4_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -6718,13 +6806,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD64, n_div_4) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -6736,12 +6824,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD64, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -6754,30 +6842,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD64, n_div_4_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -6791,13 +6861,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD64, small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD64, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD64, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD64, strided_cm_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -6812,13 +6955,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD64, a_offset) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(127)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD64, zero) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t mz = 0; mz < 3; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(127)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD64, qmin) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(3)
@@ -6829,10 +7009,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD64, qmax) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(3)
@@ -6843,10 +7023,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD64, strided_cm) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(3)
@@ -6857,13 +7037,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_eq_8) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(4)
@@ -6873,10 +7053,10 @@
       .m(4)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD64, strided_cn) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(4)
@@ -6887,24 +7067,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(4)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_eq_8_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -6917,12 +7083,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -6934,11 +7100,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -6950,11 +7116,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_lt_8) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -6965,27 +7131,11 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_lt_8_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -6999,13 +7149,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_gt_8) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -7016,27 +7166,11 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_gt_8_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -7050,13 +7184,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_div_8) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -7067,27 +7201,11 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_div_8_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -7101,13 +7219,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD64, n_gt_4) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -7119,12 +7237,12 @@
           .m(4)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD64, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -7137,30 +7255,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD64, n_gt_4_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -7174,13 +7274,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD64, n_div_4) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -7192,12 +7292,12 @@
           .m(4)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD64, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -7210,30 +7310,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD64, n_div_4_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -7247,13 +7329,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD64, small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD64, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD64, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD64, strided_cm_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -7268,13 +7423,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD64, a_offset) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(163)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD64, zero) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t mz = 0; mz < 4; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(163)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD64, qmin) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(4)
@@ -7285,10 +7477,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD64, qmax) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(4)
@@ -7299,10 +7491,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD64, strided_cm) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(4)
@@ -7313,13 +7505,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_eq_8) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(1)
@@ -7329,10 +7521,10 @@
       .m(1)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD128, strided_cn) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(1)
@@ -7343,24 +7535,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_eq_8_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -7373,12 +7551,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -7390,11 +7568,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -7406,11 +7584,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_lt_8) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -7421,27 +7599,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_lt_8_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -7455,13 +7617,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_gt_8) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -7472,27 +7634,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_gt_8_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -7506,13 +7652,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_div_8) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -7523,27 +7669,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_div_8_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -7557,13 +7687,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD128, n_gt_4) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -7575,12 +7705,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD128, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -7593,30 +7723,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD128, n_gt_4_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -7630,13 +7742,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD128, n_div_4) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -7648,12 +7760,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD128, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -7666,30 +7778,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD128, n_div_4_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -7703,13 +7797,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD128, small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD128, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD128, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD128, strided_cm_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -7724,13 +7891,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD128, a_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(43)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD128, zero) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t mz = 0; mz < 1; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(43)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD128, qmin) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(1)
@@ -7741,10 +7945,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD128, qmax) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(1)
@@ -7755,10 +7959,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD128, strided_cm) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(1)
@@ -7769,13 +7973,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_eq_8) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(2)
@@ -7785,10 +7989,10 @@
       .m(2)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD128, strided_cn) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(2)
@@ -7799,24 +8003,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(2)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_eq_8_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -7829,12 +8019,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -7846,11 +8036,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -7862,11 +8052,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_lt_8) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -7877,27 +8067,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_lt_8_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -7911,13 +8085,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_gt_8) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -7928,27 +8102,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_gt_8_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -7962,13 +8120,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_div_8) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -7979,27 +8137,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_div_8_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -8013,13 +8155,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD128, n_gt_4) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8031,12 +8173,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD128, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8049,30 +8191,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD128, n_gt_4_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8086,13 +8210,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD128, n_div_4) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8104,12 +8228,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD128, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8122,30 +8246,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD128, n_div_4_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8159,13 +8265,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD128, small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD128, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD128, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD128, strided_cm_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -8180,13 +8359,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD128, a_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(83)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD128, zero) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t mz = 0; mz < 2; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(83)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD128, qmin) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(2)
@@ -8197,10 +8413,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD128, qmax) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(2)
@@ -8211,10 +8427,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD128, strided_cm) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(2)
@@ -8225,13 +8441,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_eq_8) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(3)
@@ -8241,10 +8457,10 @@
       .m(3)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD128, strided_cn) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(3)
@@ -8255,24 +8471,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(3)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_eq_8_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -8285,12 +8487,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -8302,11 +8504,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -8318,11 +8520,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_lt_8) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -8333,27 +8535,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_lt_8_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -8367,13 +8553,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_gt_8) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -8384,27 +8570,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_gt_8_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -8418,13 +8588,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_div_8) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -8435,27 +8605,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_div_8_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -8469,13 +8623,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD128, n_gt_4) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8487,12 +8641,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD128, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8505,30 +8659,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD128, n_gt_4_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8542,13 +8678,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD128, n_div_4) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8560,12 +8696,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD128, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8578,30 +8714,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD128, n_div_4_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8615,13 +8733,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD128, small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD128, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD128, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD128, strided_cm_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -8636,13 +8827,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD128, a_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(127)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD128, zero) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t mz = 0; mz < 3; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(127)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD128, qmin) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(3)
@@ -8653,10 +8881,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD128, qmax) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(3)
@@ -8667,10 +8895,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD128, strided_cm) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(3)
@@ -8681,13 +8909,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_eq_8) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(4)
@@ -8697,10 +8925,10 @@
       .m(4)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD128, strided_cn) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(4)
@@ -8711,24 +8939,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(4)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_eq_8_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -8741,12 +8955,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -8758,11 +8972,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -8774,11 +8988,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_lt_8) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -8789,27 +9003,11 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_lt_8_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -8823,13 +9021,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_gt_8) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -8840,27 +9038,11 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_gt_8_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -8874,13 +9056,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_div_8) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -8891,27 +9073,11 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_div_8_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -8925,13 +9091,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD128, n_gt_4) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8943,12 +9109,12 @@
           .m(4)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD128, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8961,30 +9127,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD128, n_gt_4_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -8998,13 +9146,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD128, n_div_4) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -9016,12 +9164,12 @@
           .m(4)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD128, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -9034,30 +9182,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD128, n_div_4_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -9071,13 +9201,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD128, small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD128, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD128, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD128, strided_cm_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -9092,13 +9295,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD128, a_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(163)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD128, zero) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t mz = 0; mz < 4; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(163)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD128, qmin) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(4)
@@ -9109,10 +9349,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD128, qmax) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(4)
@@ -9123,10 +9363,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD128, strided_cm) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(4)
@@ -9137,13 +9377,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_eq_8) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(1)
@@ -9153,10 +9393,10 @@
       .m(1)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD128, strided_cn) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(1)
@@ -9167,24 +9407,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_eq_8_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -9197,12 +9423,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -9214,11 +9440,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -9230,11 +9456,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_lt_8) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -9245,27 +9471,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_lt_8_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -9279,13 +9489,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_gt_8) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -9296,27 +9506,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_gt_8_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -9330,13 +9524,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_div_8) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -9347,27 +9541,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_div_8_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -9381,13 +9559,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD128, n_gt_4) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -9399,12 +9577,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD128, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -9417,30 +9595,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD128, n_gt_4_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -9454,13 +9614,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD128, n_div_4) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -9472,12 +9632,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD128, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -9490,30 +9650,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD128, n_div_4_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -9527,13 +9669,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD128, small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD128, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD128, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD128, strided_cm_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -9548,13 +9763,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD128, a_offset) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(43)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD128, zero) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t mz = 0; mz < 1; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(43)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD128, qmin) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(1)
@@ -9565,10 +9817,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD128, qmax) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(1)
@@ -9579,10 +9831,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD128, strided_cm) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(1)
@@ -9593,13 +9845,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_eq_8) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(2)
@@ -9609,10 +9861,10 @@
       .m(2)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD128, strided_cn) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(2)
@@ -9623,24 +9875,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(2)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_eq_8_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -9653,12 +9891,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -9670,11 +9908,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -9686,11 +9924,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_lt_8) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -9701,27 +9939,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_lt_8_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -9735,13 +9957,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_gt_8) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -9752,27 +9974,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_gt_8_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -9786,13 +9992,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_div_8) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -9803,27 +10009,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_div_8_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -9837,13 +10027,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD128, n_gt_4) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -9855,12 +10045,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD128, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -9873,30 +10063,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD128, n_gt_4_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -9910,13 +10082,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD128, n_div_4) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -9928,12 +10100,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD128, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -9946,30 +10118,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD128, n_div_4_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -9983,13 +10137,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD128, small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD128, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD128, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD128, strided_cm_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -10004,13 +10231,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD128, a_offset) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(83)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD128, zero) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t mz = 0; mz < 2; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(83)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD128, qmin) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(2)
@@ -10021,10 +10285,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD128, qmax) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(2)
@@ -10035,10 +10299,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD128, strided_cm) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(2)
@@ -10049,13 +10313,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_eq_8) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(3)
@@ -10065,10 +10329,10 @@
       .m(3)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD128, strided_cn) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(3)
@@ -10079,24 +10343,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(3)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_eq_8_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -10109,12 +10359,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -10126,11 +10376,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -10142,11 +10392,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_lt_8) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -10157,27 +10407,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_lt_8_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -10191,13 +10425,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_gt_8) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -10208,27 +10442,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_gt_8_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -10242,13 +10460,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_div_8) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -10259,27 +10477,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_div_8_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -10293,13 +10495,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD128, n_gt_4) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -10311,12 +10513,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD128, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -10329,30 +10531,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD128, n_gt_4_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -10366,13 +10550,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD128, n_div_4) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -10384,12 +10568,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD128, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -10402,30 +10586,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD128, n_div_4_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -10439,13 +10605,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD128, small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD128, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD128, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD128, strided_cm_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -10460,13 +10699,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD128, a_offset) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(127)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD128, zero) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t mz = 0; mz < 3; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(127)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD128, qmin) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(3)
@@ -10477,10 +10753,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD128, qmax) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(3)
@@ -10491,10 +10767,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD128, strided_cm) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(3)
@@ -10505,13 +10781,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_eq_8) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(4)
@@ -10521,10 +10797,10 @@
       .m(4)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD128, strided_cn) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(4)
@@ -10535,24 +10811,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(4)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_eq_8_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -10565,12 +10827,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -10582,11 +10844,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -10598,11 +10860,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_lt_8) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -10613,27 +10875,11 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_lt_8_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -10647,13 +10893,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_gt_8) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -10664,27 +10910,11 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_gt_8_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -10698,13 +10928,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_div_8) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -10715,27 +10945,11 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_div_8_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -10749,13 +10963,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD128, n_gt_4) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -10767,12 +10981,12 @@
           .m(4)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD128, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -10785,30 +10999,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD128, n_gt_4_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -10822,13 +11018,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD128, n_div_4) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -10840,12 +11036,12 @@
           .m(4)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD128, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -10858,30 +11054,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD128, n_div_4_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -10895,13 +11073,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD128, small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD128, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD128, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD128, strided_cm_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -10916,13 +11167,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD128, a_offset) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(163)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD128, zero) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t mz = 0; mz < 4; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(163)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD128, qmin) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(4)
@@ -10933,10 +11221,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD128, qmax) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(4)
@@ -10947,10 +11235,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD128, strided_cm) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(4)
@@ -10961,13 +11249,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_eq_8) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(1)
@@ -10977,10 +11265,10 @@
       .m(1)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD128, strided_cn) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(1)
@@ -10991,24 +11279,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_eq_8_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -11021,12 +11295,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -11038,11 +11312,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -11054,11 +11328,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_lt_8) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -11069,27 +11343,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_lt_8_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -11103,13 +11361,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_gt_8) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -11120,27 +11378,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_gt_8_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -11154,13 +11396,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_div_8) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -11171,27 +11413,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_div_8_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -11205,13 +11431,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD128, n_gt_4) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -11223,12 +11449,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD128, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -11241,30 +11467,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD128, n_gt_4_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -11278,13 +11486,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD128, n_div_4) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -11296,12 +11504,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD128, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -11314,30 +11522,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD128, n_div_4_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -11351,13 +11541,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD128, small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD128, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD128, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD128, strided_cm_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -11372,13 +11635,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD128, a_offset) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(43)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD128, zero) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t mz = 0; mz < 1; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(43)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD128, qmin) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(1)
@@ -11389,10 +11689,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD128, qmax) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(1)
@@ -11403,10 +11703,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD128, strided_cm) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(1)
@@ -11417,13 +11717,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_eq_8) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(2)
@@ -11433,10 +11733,10 @@
       .m(2)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD128, strided_cn) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(2)
@@ -11447,24 +11747,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(2)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_eq_8_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -11477,12 +11763,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -11494,11 +11780,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -11510,11 +11796,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_lt_8) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -11525,27 +11811,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_lt_8_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -11559,13 +11829,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_gt_8) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -11576,27 +11846,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_gt_8_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -11610,13 +11864,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_div_8) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -11627,27 +11881,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_div_8_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -11661,13 +11899,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD128, n_gt_4) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -11679,12 +11917,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD128, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -11697,30 +11935,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD128, n_gt_4_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -11734,13 +11954,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD128, n_div_4) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -11752,12 +11972,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD128, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -11770,30 +11990,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD128, n_div_4_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -11807,13 +12009,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD128, small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD128, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD128, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD128, strided_cm_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -11828,13 +12103,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD128, a_offset) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(83)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD128, zero) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t mz = 0; mz < 2; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(83)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD128, qmin) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(2)
@@ -11845,10 +12157,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD128, qmax) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(2)
@@ -11859,10 +12171,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD128, strided_cm) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(2)
@@ -11873,13 +12185,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_eq_8) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(3)
@@ -11889,10 +12201,10 @@
       .m(3)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD128, strided_cn) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(3)
@@ -11903,24 +12215,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(3)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_eq_8_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -11933,12 +12231,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -11950,11 +12248,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -11966,11 +12264,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_lt_8) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -11981,27 +12279,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_lt_8_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -12015,13 +12297,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_gt_8) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -12032,27 +12314,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_gt_8_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -12066,13 +12332,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_div_8) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -12083,27 +12349,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_div_8_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -12117,13 +12367,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD128, n_gt_4) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -12135,12 +12385,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD128, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -12153,30 +12403,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD128, n_gt_4_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -12190,13 +12422,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD128, n_div_4) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -12208,12 +12440,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD128, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -12226,30 +12458,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD128, n_div_4_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -12263,13 +12477,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD128, small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD128, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD128, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD128, strided_cm_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -12284,13 +12571,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD128, a_offset) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(127)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD128, zero) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t mz = 0; mz < 3; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(127)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD128, qmin) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(3)
@@ -12301,10 +12625,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD128, qmax) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(3)
@@ -12315,10 +12639,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD128, strided_cm) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(3)
@@ -12329,13 +12653,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_eq_8) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(4)
@@ -12345,10 +12669,10 @@
       .m(4)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD128, strided_cn) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(4)
@@ -12359,24 +12683,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(4)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_eq_8_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -12389,12 +12699,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -12406,11 +12716,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -12422,11 +12732,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_lt_8) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -12437,27 +12747,11 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_lt_8_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -12471,13 +12765,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_gt_8) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -12488,27 +12782,11 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_gt_8_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -12522,13 +12800,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_div_8) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -12539,27 +12817,11 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_div_8_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -12573,13 +12835,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD128, n_gt_4) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -12591,12 +12853,12 @@
           .m(4)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD128, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -12609,30 +12871,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD128, n_gt_4_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -12646,13 +12890,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD128, n_div_4) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -12664,12 +12908,12 @@
           .m(4)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD128, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -12682,30 +12926,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD128, n_div_4_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -12719,13 +12945,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD128, small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD128, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD128, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD128, strided_cm_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -12740,13 +13039,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD128, a_offset) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(163)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD128, zero) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t mz = 0; mz < 4; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(163)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD128, qmin) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(4)
@@ -12757,10 +13093,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD128, qmax) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(4)
@@ -12771,10 +13107,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD128, strided_cm) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(4)
@@ -12785,13 +13121,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_eq_8) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(1)
@@ -12801,10 +13137,10 @@
       .m(1)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD128, strided_cn) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(1)
@@ -12815,24 +13151,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_eq_8_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -12845,12 +13167,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -12862,11 +13184,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -12878,11 +13200,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_lt_8) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -12893,27 +13215,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_lt_8_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -12927,13 +13233,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_gt_8) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -12944,27 +13250,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_gt_8_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -12978,13 +13268,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_div_8) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -12995,27 +13285,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_div_8_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -13029,13 +13303,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD128, n_gt_4) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -13047,12 +13321,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD128, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -13065,30 +13339,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD128, n_gt_4_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -13102,13 +13358,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD128, n_div_4) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -13120,12 +13376,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD128, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -13138,30 +13394,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD128, n_div_4_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -13175,13 +13413,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD128, small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD128, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD128, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD128, strided_cm_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -13196,13 +13507,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD128, a_offset) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(43)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD128, zero) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t mz = 0; mz < 1; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(43)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD128, qmin) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(1)
@@ -13213,10 +13561,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD128, qmax) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(1)
@@ -13227,10 +13575,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD128, strided_cm) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(1)
@@ -13241,13 +13589,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_eq_8) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(2)
@@ -13257,10 +13605,10 @@
       .m(2)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD128, strided_cn) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(2)
@@ -13271,24 +13619,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(2)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_eq_8_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -13301,12 +13635,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -13318,11 +13652,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -13334,11 +13668,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_lt_8) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -13349,27 +13683,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_lt_8_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -13383,13 +13701,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_gt_8) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -13400,27 +13718,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_gt_8_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -13434,13 +13736,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_div_8) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -13451,27 +13753,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_div_8_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -13485,13 +13771,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD128, n_gt_4) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -13503,12 +13789,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD128, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -13521,30 +13807,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD128, n_gt_4_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -13558,13 +13826,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD128, n_div_4) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -13576,12 +13844,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD128, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -13594,30 +13862,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD128, n_div_4_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -13631,13 +13881,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD128, small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD128, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD128, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD128, strided_cm_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -13652,13 +13975,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD128, a_offset) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(83)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD128, zero) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t mz = 0; mz < 2; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(83)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD128, qmin) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(2)
@@ -13669,10 +14029,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD128, qmax) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(2)
@@ -13683,10 +14043,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD128, strided_cm) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(2)
@@ -13697,13 +14057,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_eq_8) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(3)
@@ -13713,10 +14073,10 @@
       .m(3)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD128, strided_cn) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(3)
@@ -13727,24 +14087,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(3)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_eq_8_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -13757,12 +14103,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -13774,11 +14120,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -13790,11 +14136,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_lt_8) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -13805,27 +14151,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_lt_8_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -13839,13 +14169,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_gt_8) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -13856,27 +14186,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_gt_8_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -13890,13 +14204,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_div_8) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -13907,27 +14221,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_div_8_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -13941,13 +14239,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD128, n_gt_4) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -13959,12 +14257,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD128, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -13977,30 +14275,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD128, n_gt_4_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -14014,13 +14294,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD128, n_div_4) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -14032,12 +14312,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD128, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -14050,30 +14330,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD128, n_div_4_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -14087,13 +14349,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD128, small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD128, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD128, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD128, strided_cm_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -14108,13 +14443,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD128, a_offset) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(127)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD128, zero) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t mz = 0; mz < 3; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(127)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD128, qmin) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(3)
@@ -14125,10 +14497,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD128, qmax) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(3)
@@ -14139,10 +14511,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD128, strided_cm) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(3)
@@ -14153,13 +14525,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_eq_8) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(4)
@@ -14169,10 +14541,10 @@
       .m(4)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD128, strided_cn) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(4)
@@ -14183,24 +14555,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(4)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_eq_8_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -14213,12 +14571,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -14230,11 +14588,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -14246,11 +14604,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_lt_8) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -14261,27 +14619,11 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_lt_8_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -14295,13 +14637,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_gt_8) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -14312,27 +14654,11 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_gt_8_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -14346,13 +14672,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_div_8) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -14363,27 +14689,11 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_div_8_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -14397,13 +14707,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD128, n_gt_4) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -14415,12 +14725,12 @@
           .m(4)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD128, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -14433,30 +14743,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD128, n_gt_4_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -14470,13 +14762,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD128, n_div_4) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -14488,12 +14780,12 @@
           .m(4)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD128, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -14506,30 +14798,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD128, n_div_4_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -14543,13 +14817,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD128, small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD128, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD128, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD128, strided_cm_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -14564,13 +14911,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD128, a_offset) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(163)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD128, zero) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t mz = 0; mz < 4; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(163)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD128, qmin) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(4)
@@ -14581,10 +14965,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD128, qmax) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(4)
@@ -14595,10 +14979,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD128, strided_cm) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(4)
@@ -14609,13 +14993,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_eq_8) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(1)
@@ -14625,10 +15009,10 @@
       .m(1)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD64, strided_cn) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(1)
@@ -14639,24 +15023,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_eq_8_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -14669,12 +15039,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -14686,11 +15056,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -14702,11 +15072,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_lt_8) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -14717,27 +15087,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_lt_8_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -14751,13 +15105,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_gt_8) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -14768,27 +15122,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_gt_8_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -14802,13 +15140,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_div_8) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -14819,27 +15157,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_div_8_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -14853,13 +15175,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD64, n_gt_4) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -14871,12 +15193,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD64, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -14889,30 +15211,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD64, n_gt_4_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -14926,13 +15230,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD64, n_div_4) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -14944,12 +15248,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD64, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -14962,30 +15266,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD64, n_div_4_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -14999,13 +15285,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD64, small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD64, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD64, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD64, strided_cm_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -15020,13 +15379,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD64, a_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(43)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD64, zero) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t mz = 0; mz < 1; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(43)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD64, qmin) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(1)
@@ -15037,10 +15433,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD64, qmax) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(1)
@@ -15051,10 +15447,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD64, strided_cm) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(1)
@@ -15065,13 +15461,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_eq_8) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(2)
@@ -15081,10 +15477,10 @@
       .m(2)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD64, strided_cn) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(2)
@@ -15095,24 +15491,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_eq_8_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -15125,12 +15507,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -15142,11 +15524,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -15158,11 +15540,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_lt_8) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -15173,27 +15555,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_lt_8_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -15207,13 +15573,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_gt_8) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -15224,27 +15590,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_gt_8_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -15258,13 +15608,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_div_8) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -15275,27 +15625,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_div_8_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -15309,13 +15643,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD64, n_gt_4) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15327,12 +15661,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD64, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15345,30 +15679,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD64, n_gt_4_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15382,13 +15698,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD64, n_div_4) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15400,12 +15716,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD64, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15418,30 +15734,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD64, n_div_4_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15455,13 +15753,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD64, small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD64, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD64, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD64, strided_cm_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -15476,13 +15847,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD64, a_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(83)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD64, zero) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t mz = 0; mz < 2; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(83)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD64, qmin) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(2)
@@ -15493,10 +15901,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD64, qmax) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(2)
@@ -15507,10 +15915,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD64, strided_cm) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(2)
@@ -15521,13 +15929,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_eq_8) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(3)
@@ -15537,10 +15945,10 @@
       .m(3)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD64, strided_cn) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(3)
@@ -15551,24 +15959,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(3)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_eq_8_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -15581,12 +15975,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -15598,11 +15992,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -15614,11 +16008,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_lt_8) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -15629,27 +16023,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_lt_8_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -15663,13 +16041,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_gt_8) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -15680,27 +16058,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_gt_8_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -15714,13 +16076,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_div_8) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -15731,27 +16093,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_div_8_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -15765,13 +16111,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD64, n_gt_4) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15783,12 +16129,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD64, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15801,30 +16147,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD64, n_gt_4_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15838,13 +16166,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD64, n_div_4) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15856,12 +16184,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD64, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15874,30 +16202,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD64, n_div_4_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -15911,13 +16221,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD64, small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD64, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD64, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD64, strided_cm_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -15932,13 +16315,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD64, a_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(127)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD64, zero) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t mz = 0; mz < 3; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(127)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD64, qmin) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(3)
@@ -15949,10 +16369,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD64, qmax) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(3)
@@ -15963,10 +16383,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD64, strided_cm) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(3)
@@ -15977,13 +16397,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSSE3_LD64, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSSE3_LD64, k_eq_8) {
     TEST_REQUIRES_X86_SSSE3;
     GemmMicrokernelTester()
       .mr(1)
@@ -15993,10 +16413,10 @@
       .m(1)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSSE3_LD64, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSSE3_LD64, strided_cn) {
     TEST_REQUIRES_X86_SSSE3;
     GemmMicrokernelTester()
       .mr(1)
@@ -16007,24 +16427,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSSE3_LD64, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSSE3_LD64, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSSE3_LD64, k_eq_8_subtile) {
     TEST_REQUIRES_X86_SSSE3;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -16037,12 +16443,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSSE3_LD64, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSSE3_LD64, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_SSSE3;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -16054,11 +16460,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSSE3_LD64, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSSE3_LD64, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_SSSE3;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -16070,11 +16476,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSSE3_LD64, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSSE3_LD64, k_lt_8) {
     TEST_REQUIRES_X86_SSSE3;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -16085,27 +16491,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSSE3_LD64, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSSE3_LD64, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSSE3_LD64, k_lt_8_subtile) {
     TEST_REQUIRES_X86_SSSE3;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -16119,13 +16509,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSSE3_LD64, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSSE3_LD64, k_gt_8) {
     TEST_REQUIRES_X86_SSSE3;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -16136,27 +16526,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSSE3_LD64, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSSE3_LD64, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSSE3_LD64, k_gt_8_subtile) {
     TEST_REQUIRES_X86_SSSE3;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -16170,13 +16544,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSSE3_LD64, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSSE3_LD64, k_div_8) {
     TEST_REQUIRES_X86_SSSE3;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -16187,27 +16561,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSSE3_LD64, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSSE3_LD64, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSSE3_LD64, k_div_8_subtile) {
     TEST_REQUIRES_X86_SSSE3;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -16221,13 +16579,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSSE3_LD64, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSSE3_LD64, n_gt_4) {
     TEST_REQUIRES_X86_SSSE3;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16239,12 +16597,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSSE3_LD64, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSSE3_LD64, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_SSSE3;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16257,30 +16615,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSSE3_LD64, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSSE3_LD64, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSSE3_LD64, n_gt_4_subtile) {
     TEST_REQUIRES_X86_SSSE3;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16294,13 +16634,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSSE3_LD64, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSSE3_LD64, n_div_4) {
     TEST_REQUIRES_X86_SSSE3;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16312,12 +16652,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSSE3_LD64, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSSE3_LD64, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_SSSE3;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16330,30 +16670,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSSE3_LD64, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSSE3_LD64, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSSE3_LD64, n_div_4_subtile) {
     TEST_REQUIRES_X86_SSSE3;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16367,13 +16689,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSSE3_LD64, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSSE3_LD64, small_kernel) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSSE3_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSSE3_LD64, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSSE3_LD64, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSSE3_LD64, strided_cm_subtile) {
     TEST_REQUIRES_X86_SSSE3;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -16388,13 +16783,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSSE3_LD64, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSSE3_LD64, a_offset) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(43)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSSE3_LD64, zero) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (uint32_t mz = 0; mz < 1; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(43)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSSE3_LD64, qmin) {
     TEST_REQUIRES_X86_SSSE3;
     GemmMicrokernelTester()
       .mr(1)
@@ -16405,10 +16837,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSSE3_LD64, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSSE3_LD64, qmax) {
     TEST_REQUIRES_X86_SSSE3;
     GemmMicrokernelTester()
       .mr(1)
@@ -16419,10 +16851,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSSE3_LD64, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSSE3_LD64, strided_cm) {
     TEST_REQUIRES_X86_SSSE3;
     GemmMicrokernelTester()
       .mr(1)
@@ -16433,13 +16865,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSSE3_LD64, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSSE3_LD64, k_eq_8) {
     TEST_REQUIRES_X86_SSSE3;
     GemmMicrokernelTester()
       .mr(2)
@@ -16449,10 +16881,10 @@
       .m(2)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSSE3_LD64, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSSE3_LD64, strided_cn) {
     TEST_REQUIRES_X86_SSSE3;
     GemmMicrokernelTester()
       .mr(2)
@@ -16463,24 +16895,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSSE3_LD64, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSSE3_LD64, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSSE3_LD64, k_eq_8_subtile) {
     TEST_REQUIRES_X86_SSSE3;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -16493,12 +16911,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSSE3_LD64, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSSE3_LD64, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_SSSE3;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -16510,11 +16928,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSSE3_LD64, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSSE3_LD64, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_SSSE3;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -16526,11 +16944,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSSE3_LD64, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSSE3_LD64, k_lt_8) {
     TEST_REQUIRES_X86_SSSE3;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -16541,27 +16959,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSSE3_LD64, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSSE3_LD64, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSSE3_LD64, k_lt_8_subtile) {
     TEST_REQUIRES_X86_SSSE3;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -16575,13 +16977,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSSE3_LD64, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSSE3_LD64, k_gt_8) {
     TEST_REQUIRES_X86_SSSE3;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -16592,27 +16994,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSSE3_LD64, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSSE3_LD64, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSSE3_LD64, k_gt_8_subtile) {
     TEST_REQUIRES_X86_SSSE3;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -16626,13 +17012,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSSE3_LD64, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSSE3_LD64, k_div_8) {
     TEST_REQUIRES_X86_SSSE3;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -16643,27 +17029,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSSE3_LD64, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSSE3_LD64, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSSE3_LD64, k_div_8_subtile) {
     TEST_REQUIRES_X86_SSSE3;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -16677,13 +17047,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSSE3_LD64, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSSE3_LD64, n_gt_4) {
     TEST_REQUIRES_X86_SSSE3;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16695,12 +17065,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSSE3_LD64, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSSE3_LD64, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_SSSE3;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16713,30 +17083,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSSE3_LD64, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSSE3_LD64, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSSE3_LD64, n_gt_4_subtile) {
     TEST_REQUIRES_X86_SSSE3;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16750,13 +17102,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSSE3_LD64, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSSE3_LD64, n_div_4) {
     TEST_REQUIRES_X86_SSSE3;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16768,12 +17120,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSSE3_LD64, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSSE3_LD64, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_SSSE3;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16786,30 +17138,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSSE3_LD64, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSSE3_LD64, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSSE3_LD64, n_div_4_subtile) {
     TEST_REQUIRES_X86_SSSE3;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -16823,13 +17157,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSSE3_LD64, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSSE3_LD64, small_kernel) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSSE3_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSSE3_LD64, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSSE3_LD64, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSSE3_LD64, strided_cm_subtile) {
     TEST_REQUIRES_X86_SSSE3;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -16844,13 +17251,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSSE3_LD64, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSSE3_LD64, a_offset) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(83)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSSE3_LD64, zero) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (uint32_t mz = 0; mz < 2; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(83)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSSE3_LD64, qmin) {
     TEST_REQUIRES_X86_SSSE3;
     GemmMicrokernelTester()
       .mr(2)
@@ -16861,10 +17305,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSSE3_LD64, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSSE3_LD64, qmax) {
     TEST_REQUIRES_X86_SSSE3;
     GemmMicrokernelTester()
       .mr(2)
@@ -16875,10 +17319,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSSE3_LD64, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSSE3_LD64, strided_cm) {
     TEST_REQUIRES_X86_SSSE3;
     GemmMicrokernelTester()
       .mr(2)
@@ -16889,13 +17333,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSSE3_LD64, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSSE3_LD64, k_eq_8) {
     TEST_REQUIRES_X86_SSSE3;
     GemmMicrokernelTester()
       .mr(3)
@@ -16905,10 +17349,10 @@
       .m(3)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSSE3_LD64, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSSE3_LD64, strided_cn) {
     TEST_REQUIRES_X86_SSSE3;
     GemmMicrokernelTester()
       .mr(3)
@@ -16919,24 +17363,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSSE3_LD64, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(3)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSSE3_LD64, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSSE3_LD64, k_eq_8_subtile) {
     TEST_REQUIRES_X86_SSSE3;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -16949,12 +17379,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSSE3_LD64, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSSE3_LD64, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_SSSE3;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -16966,11 +17396,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSSE3_LD64, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSSE3_LD64, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_SSSE3;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -16982,11 +17412,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSSE3_LD64, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSSE3_LD64, k_lt_8) {
     TEST_REQUIRES_X86_SSSE3;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -16997,27 +17427,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSSE3_LD64, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSSE3_LD64, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSSE3_LD64, k_lt_8_subtile) {
     TEST_REQUIRES_X86_SSSE3;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -17031,13 +17445,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSSE3_LD64, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSSE3_LD64, k_gt_8) {
     TEST_REQUIRES_X86_SSSE3;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -17048,27 +17462,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSSE3_LD64, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSSE3_LD64, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSSE3_LD64, k_gt_8_subtile) {
     TEST_REQUIRES_X86_SSSE3;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -17082,13 +17480,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSSE3_LD64, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSSE3_LD64, k_div_8) {
     TEST_REQUIRES_X86_SSSE3;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -17099,27 +17497,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSSE3_LD64, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSSE3_LD64, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSSE3_LD64, k_div_8_subtile) {
     TEST_REQUIRES_X86_SSSE3;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -17133,13 +17515,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSSE3_LD64, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSSE3_LD64, n_gt_4) {
     TEST_REQUIRES_X86_SSSE3;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17151,12 +17533,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSSE3_LD64, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSSE3_LD64, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_SSSE3;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17169,30 +17551,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSSE3_LD64, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSSE3_LD64, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSSE3_LD64, n_gt_4_subtile) {
     TEST_REQUIRES_X86_SSSE3;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17206,13 +17570,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSSE3_LD64, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSSE3_LD64, n_div_4) {
     TEST_REQUIRES_X86_SSSE3;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17224,12 +17588,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSSE3_LD64, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSSE3_LD64, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_SSSE3;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17242,30 +17606,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSSE3_LD64, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSSE3_LD64, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSSE3_LD64, n_div_4_subtile) {
     TEST_REQUIRES_X86_SSSE3;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17279,13 +17625,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSSE3_LD64, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSSE3_LD64, small_kernel) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSSE3_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSSE3_LD64, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSSE3_LD64, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSSE3_LD64, strided_cm_subtile) {
     TEST_REQUIRES_X86_SSSE3;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -17300,13 +17719,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSSE3_LD64, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSSE3_LD64, a_offset) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(127)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSSE3_LD64, zero) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (uint32_t mz = 0; mz < 3; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(127)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSSE3_LD64, qmin) {
     TEST_REQUIRES_X86_SSSE3;
     GemmMicrokernelTester()
       .mr(3)
@@ -17317,10 +17773,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSSE3_LD64, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSSE3_LD64, qmax) {
     TEST_REQUIRES_X86_SSSE3;
     GemmMicrokernelTester()
       .mr(3)
@@ -17331,10 +17787,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSSE3_LD64, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSSE3_LD64, strided_cm) {
     TEST_REQUIRES_X86_SSSE3;
     GemmMicrokernelTester()
       .mr(3)
@@ -17345,13 +17801,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_eq_8) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(1)
@@ -17361,10 +17817,10 @@
       .m(1)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD64, strided_cn) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(1)
@@ -17375,24 +17831,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_eq_8_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -17405,12 +17847,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -17422,11 +17864,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -17438,11 +17880,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_lt_8) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -17453,27 +17895,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_lt_8_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -17487,13 +17913,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_gt_8) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -17504,27 +17930,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_gt_8_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -17538,13 +17948,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_div_8) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -17555,27 +17965,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_div_8_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -17589,13 +17983,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD64, n_gt_4) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17607,12 +18001,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD64, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17625,30 +18019,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD64, n_gt_4_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17662,13 +18038,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD64, n_div_4) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17680,12 +18056,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD64, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17698,30 +18074,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD64, n_div_4_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17735,13 +18093,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD64, small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD64, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD64, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD64, strided_cm_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -17756,13 +18187,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD64, a_offset) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(43)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD64, zero) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t mz = 0; mz < 1; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(43)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD64, qmin) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(1)
@@ -17773,10 +18241,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD64, qmax) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(1)
@@ -17787,10 +18255,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD64, strided_cm) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(1)
@@ -17801,13 +18269,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_eq_8) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(2)
@@ -17817,10 +18285,10 @@
       .m(2)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD64, strided_cn) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(2)
@@ -17831,24 +18299,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_eq_8_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -17861,12 +18315,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -17878,11 +18332,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -17894,11 +18348,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_lt_8) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -17909,27 +18363,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_lt_8_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -17943,13 +18381,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_gt_8) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -17960,27 +18398,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_gt_8_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -17994,13 +18416,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_div_8) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -18011,27 +18433,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_div_8_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -18045,13 +18451,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD64, n_gt_4) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -18063,12 +18469,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD64, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -18081,30 +18487,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD64, n_gt_4_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -18118,13 +18506,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD64, n_div_4) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -18136,12 +18524,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD64, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -18154,30 +18542,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD64, n_div_4_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -18191,13 +18561,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD64, small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD64, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD64, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD64, strided_cm_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -18212,13 +18655,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD64, a_offset) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(83)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD64, zero) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t mz = 0; mz < 2; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(83)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD64, qmin) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(2)
@@ -18229,10 +18709,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD64, qmax) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(2)
@@ -18243,10 +18723,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD64, strided_cm) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(2)
@@ -18257,13 +18737,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_eq_8) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(3)
@@ -18273,10 +18753,10 @@
       .m(3)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD64, strided_cn) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(3)
@@ -18287,24 +18767,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(3)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_eq_8_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -18317,12 +18783,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -18334,11 +18800,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -18350,11 +18816,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_lt_8) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -18365,27 +18831,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_lt_8_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -18399,13 +18849,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_gt_8) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -18416,27 +18866,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_gt_8_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -18450,13 +18884,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_div_8) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -18467,27 +18901,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_div_8_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -18501,13 +18919,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD64, n_gt_4) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -18519,12 +18937,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD64, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -18537,30 +18955,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD64, n_gt_4_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -18574,13 +18974,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD64, n_div_4) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -18592,12 +18992,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD64, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -18610,30 +19010,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD64, n_div_4_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -18647,13 +19029,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD64, small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD64, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD64, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD64, strided_cm_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -18668,13 +19123,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD64, a_offset) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(127)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD64, zero) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t mz = 0; mz < 3; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(127)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD64, qmin) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(3)
@@ -18685,10 +19177,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD64, qmax) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(3)
@@ -18699,10 +19191,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD64, strided_cm) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(3)
@@ -18713,13 +19205,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_eq_8) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(1)
@@ -18729,10 +19221,10 @@
       .m(1)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD64, strided_cn) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(1)
@@ -18743,24 +19235,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_eq_8_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -18773,12 +19251,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -18790,11 +19268,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -18806,11 +19284,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_lt_8) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -18821,27 +19299,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_lt_8_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -18855,13 +19317,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_gt_8) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -18872,27 +19334,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_gt_8_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -18906,13 +19352,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_div_8) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -18923,27 +19369,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_div_8_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -18957,13 +19387,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD64, n_gt_4) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -18975,12 +19405,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD64, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -18993,30 +19423,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD64, n_gt_4_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -19030,13 +19442,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD64, n_div_4) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -19048,12 +19460,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD64, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -19066,30 +19478,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD64, n_div_4_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -19103,13 +19497,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD64, small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD64, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD64, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD64, strided_cm_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -19124,13 +19591,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD64, a_offset) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(43)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD64, zero) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t mz = 0; mz < 1; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(43)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD64, qmin) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(1)
@@ -19141,10 +19645,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD64, qmax) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(1)
@@ -19155,10 +19659,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD64, strided_cm) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(1)
@@ -19169,13 +19673,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_eq_8) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(2)
@@ -19185,10 +19689,10 @@
       .m(2)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD64, strided_cn) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(2)
@@ -19199,24 +19703,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_eq_8_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -19229,12 +19719,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -19246,11 +19736,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -19262,11 +19752,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_lt_8) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -19277,27 +19767,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_lt_8_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -19311,13 +19785,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_gt_8) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -19328,27 +19802,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_gt_8_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -19362,13 +19820,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_div_8) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -19379,27 +19837,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_div_8_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -19413,13 +19855,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD64, n_gt_4) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -19431,12 +19873,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD64, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -19449,30 +19891,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD64, n_gt_4_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -19486,13 +19910,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD64, n_div_4) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -19504,12 +19928,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD64, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -19522,30 +19946,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD64, n_div_4_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -19559,13 +19965,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD64, small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD64, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD64, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD64, strided_cm_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -19580,13 +20059,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD64, a_offset) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(83)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD64, zero) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t mz = 0; mz < 2; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(83)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD64, qmin) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(2)
@@ -19597,10 +20113,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD64, qmax) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(2)
@@ -19611,10 +20127,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD64, strided_cm) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(2)
@@ -19625,13 +20141,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_eq_8) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(3)
@@ -19641,10 +20157,10 @@
       .m(3)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD64, strided_cn) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(3)
@@ -19655,24 +20171,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(3)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_eq_8_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -19685,12 +20187,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -19702,11 +20204,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -19718,11 +20220,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_lt_8) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -19733,27 +20235,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_lt_8_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -19767,13 +20253,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_gt_8) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -19784,27 +20270,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_gt_8_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -19818,13 +20288,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_div_8) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -19835,27 +20305,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_div_8_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -19869,13 +20323,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD64, n_gt_4) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -19887,12 +20341,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD64, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -19905,30 +20359,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD64, n_gt_4_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -19942,13 +20378,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD64, n_div_4) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -19960,12 +20396,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD64, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -19978,30 +20414,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD64, n_div_4_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -20015,13 +20433,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD64, small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD64, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD64, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD64, strided_cm_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -20036,13 +20527,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD64, a_offset) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(127)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD64, zero) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t mz = 0; mz < 3; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(127)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD64, qmin) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(3)
@@ -20053,10 +20581,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD64, qmax) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(3)
@@ -20067,10 +20595,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD64, strided_cm) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(3)
@@ -20081,13 +20609,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_eq_8) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(1)
@@ -20097,10 +20625,10 @@
       .m(1)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD64, strided_cn) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(1)
@@ -20111,24 +20639,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_eq_8_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -20141,12 +20655,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -20158,11 +20672,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -20174,11 +20688,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_lt_8) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -20189,27 +20703,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_lt_8_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -20223,13 +20721,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_gt_8) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -20240,27 +20738,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_gt_8_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -20274,13 +20756,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_div_8) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -20291,27 +20773,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_div_8_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -20325,13 +20791,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD64, n_gt_4) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -20343,12 +20809,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD64, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -20361,30 +20827,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD64, n_gt_4_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -20398,13 +20846,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD64, n_div_4) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -20416,12 +20864,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD64, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -20434,30 +20882,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD64, n_div_4_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -20471,13 +20901,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD64, small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD64, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD64, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD64, strided_cm_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -20492,13 +20995,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD64, a_offset) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(43)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD64, zero) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t mz = 0; mz < 1; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(43)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD64, qmin) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(1)
@@ -20509,10 +21049,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD64, qmax) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(1)
@@ -20523,10 +21063,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD64, strided_cm) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(1)
@@ -20537,13 +21077,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_eq_8) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(2)
@@ -20553,10 +21093,10 @@
       .m(2)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD64, strided_cn) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(2)
@@ -20567,24 +21107,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_eq_8_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -20597,12 +21123,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -20614,11 +21140,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -20630,11 +21156,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_lt_8) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -20645,27 +21171,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_lt_8_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -20679,13 +21189,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_gt_8) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -20696,27 +21206,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_gt_8_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -20730,13 +21224,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_div_8) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -20747,27 +21241,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_div_8_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -20781,13 +21259,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD64, n_gt_4) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -20799,12 +21277,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD64, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -20817,30 +21295,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD64, n_gt_4_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -20854,13 +21314,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD64, n_div_4) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -20872,12 +21332,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD64, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -20890,30 +21350,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD64, n_div_4_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -20927,13 +21369,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD64, small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD64, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD64, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD64, strided_cm_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -20948,13 +21463,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD64, a_offset) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(83)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD64, zero) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t mz = 0; mz < 2; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(83)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD64, qmin) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(2)
@@ -20965,10 +21517,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD64, qmax) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(2)
@@ -20979,10 +21531,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD64, strided_cm) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(2)
@@ -20993,13 +21545,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_eq_8) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(3)
@@ -21009,10 +21561,10 @@
       .m(3)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD64, strided_cn) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(3)
@@ -21023,24 +21575,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(3)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_eq_8_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -21053,12 +21591,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -21070,11 +21608,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -21086,11 +21624,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_lt_8) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -21101,27 +21639,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_lt_8_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -21135,13 +21657,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_gt_8) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -21152,27 +21674,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_gt_8_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -21186,13 +21692,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_div_8) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -21203,27 +21709,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_div_8_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -21237,13 +21727,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD64, n_gt_4) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -21255,12 +21745,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD64, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -21273,30 +21763,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD64, n_gt_4_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -21310,13 +21782,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD64, n_div_4) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -21328,12 +21800,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD64, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -21346,30 +21818,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD64, n_div_4_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -21383,13 +21837,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD64, small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD64, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD64, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD64, strided_cm_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -21404,13 +21931,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD64, a_offset) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(127)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD64, zero) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t mz = 0; mz < 3; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(127)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD64, qmin) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(3)
@@ -21421,10 +21985,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD64, qmax) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(3)
@@ -21435,10 +21999,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD64, strided_cm) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(3)
@@ -21449,13 +22013,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_eq_8) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(1)
@@ -21465,10 +22029,10 @@
       .m(1)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD128, strided_cn) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(1)
@@ -21479,24 +22043,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_eq_8_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -21509,12 +22059,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -21526,11 +22076,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -21542,11 +22092,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_lt_8) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -21557,27 +22107,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_lt_8_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -21591,13 +22125,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_gt_8) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -21608,27 +22142,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_gt_8_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -21642,13 +22160,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_div_8) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -21659,27 +22177,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_div_8_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -21693,13 +22195,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD128, n_gt_4) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -21711,12 +22213,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD128, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -21729,30 +22231,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD128, n_gt_4_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -21766,13 +22250,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD128, n_div_4) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -21784,12 +22268,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD128, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -21802,30 +22286,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD128, n_div_4_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -21839,13 +22305,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD128, small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD128, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD128, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD128, strided_cm_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -21860,13 +22399,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD128, a_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(43)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD128, zero) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t mz = 0; mz < 1; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(43)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD128, qmin) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(1)
@@ -21877,10 +22453,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD128, qmax) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(1)
@@ -21891,10 +22467,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD128, strided_cm) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(1)
@@ -21905,13 +22481,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_eq_8) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(2)
@@ -21921,10 +22497,10 @@
       .m(2)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD128, strided_cn) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(2)
@@ -21935,24 +22511,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_eq_8_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -21965,12 +22527,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -21982,11 +22544,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -21998,11 +22560,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_lt_8) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -22013,27 +22575,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_lt_8_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -22047,13 +22593,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_gt_8) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -22064,27 +22610,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_gt_8_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -22098,13 +22628,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_div_8) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -22115,27 +22645,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_div_8_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -22149,13 +22663,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD128, n_gt_4) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -22167,12 +22681,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD128, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -22185,30 +22699,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD128, n_gt_4_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -22222,13 +22718,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD128, n_div_4) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -22240,12 +22736,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD128, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -22258,30 +22754,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD128, n_div_4_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -22295,13 +22773,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD128, small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD128, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD128, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD128, strided_cm_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -22316,13 +22867,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD128, a_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(83)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD128, zero) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t mz = 0; mz < 2; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(83)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD128, qmin) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(2)
@@ -22333,10 +22921,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD128, qmax) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(2)
@@ -22347,10 +22935,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD128, strided_cm) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(2)
@@ -22361,13 +22949,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_eq_8) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(3)
@@ -22377,10 +22965,10 @@
       .m(3)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD128, strided_cn) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(3)
@@ -22391,24 +22979,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(3)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_eq_8_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -22421,12 +22995,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -22438,11 +23012,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -22454,11 +23028,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_lt_8) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -22469,27 +23043,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_lt_8_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -22503,13 +23061,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_gt_8) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -22520,27 +23078,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_gt_8_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -22554,13 +23096,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_div_8) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -22571,27 +23113,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_div_8_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -22605,13 +23131,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD128, n_gt_4) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -22623,12 +23149,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD128, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -22641,30 +23167,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD128, n_gt_4_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -22678,13 +23186,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD128, n_div_4) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -22696,12 +23204,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD128, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -22714,30 +23222,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD128, n_div_4_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -22751,13 +23241,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD128, small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD128, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD128, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD128, strided_cm_subtile) {
     TEST_REQUIRES_X86_SSE2;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -22772,13 +23335,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD128, a_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(127)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD128, zero) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t mz = 0; mz < 3; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(127)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD128, qmin) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(3)
@@ -22789,10 +23389,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD128, qmax) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(3)
@@ -22803,10 +23403,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD128, strided_cm) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
       .mr(3)
@@ -22817,13 +23417,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSSE3_LD128, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSSE3_LD128, k_eq_8) {
     TEST_REQUIRES_X86_SSSE3;
     GemmMicrokernelTester()
       .mr(1)
@@ -22833,10 +23433,10 @@
       .m(1)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSSE3_LD128, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSSE3_LD128, strided_cn) {
     TEST_REQUIRES_X86_SSSE3;
     GemmMicrokernelTester()
       .mr(1)
@@ -22847,24 +23447,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSSE3_LD128, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSSE3_LD128, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSSE3_LD128, k_eq_8_subtile) {
     TEST_REQUIRES_X86_SSSE3;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -22877,12 +23463,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSSE3_LD128, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSSE3_LD128, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_SSSE3;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -22894,11 +23480,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSSE3_LD128, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSSE3_LD128, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_SSSE3;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -22910,11 +23496,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSSE3_LD128, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSSE3_LD128, k_lt_8) {
     TEST_REQUIRES_X86_SSSE3;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -22925,27 +23511,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSSE3_LD128, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSSE3_LD128, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSSE3_LD128, k_lt_8_subtile) {
     TEST_REQUIRES_X86_SSSE3;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -22959,13 +23529,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSSE3_LD128, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSSE3_LD128, k_gt_8) {
     TEST_REQUIRES_X86_SSSE3;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -22976,27 +23546,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSSE3_LD128, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSSE3_LD128, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSSE3_LD128, k_gt_8_subtile) {
     TEST_REQUIRES_X86_SSSE3;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -23010,13 +23564,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSSE3_LD128, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSSE3_LD128, k_div_8) {
     TEST_REQUIRES_X86_SSSE3;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -23027,27 +23581,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSSE3_LD128, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSSE3_LD128, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSSE3_LD128, k_div_8_subtile) {
     TEST_REQUIRES_X86_SSSE3;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -23061,13 +23599,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSSE3_LD128, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSSE3_LD128, n_gt_4) {
     TEST_REQUIRES_X86_SSSE3;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -23079,12 +23617,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSSE3_LD128, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSSE3_LD128, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_SSSE3;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -23097,30 +23635,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSSE3_LD128, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSSE3_LD128, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSSE3_LD128, n_gt_4_subtile) {
     TEST_REQUIRES_X86_SSSE3;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -23134,13 +23654,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSSE3_LD128, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSSE3_LD128, n_div_4) {
     TEST_REQUIRES_X86_SSSE3;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -23152,12 +23672,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSSE3_LD128, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSSE3_LD128, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_SSSE3;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -23170,30 +23690,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSSE3_LD128, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSSE3_LD128, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSSE3_LD128, n_div_4_subtile) {
     TEST_REQUIRES_X86_SSSE3;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -23207,13 +23709,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSSE3_LD128, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSSE3_LD128, small_kernel) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSSE3_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSSE3_LD128, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSSE3_LD128, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSSE3_LD128, strided_cm_subtile) {
     TEST_REQUIRES_X86_SSSE3;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -23228,13 +23803,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSSE3_LD128, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSSE3_LD128, a_offset) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(43)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSSE3_LD128, zero) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (uint32_t mz = 0; mz < 1; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(43)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSSE3_LD128, qmin) {
     TEST_REQUIRES_X86_SSSE3;
     GemmMicrokernelTester()
       .mr(1)
@@ -23245,10 +23857,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSSE3_LD128, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSSE3_LD128, qmax) {
     TEST_REQUIRES_X86_SSSE3;
     GemmMicrokernelTester()
       .mr(1)
@@ -23259,10 +23871,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSSE3_LD128, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSSE3_LD128, strided_cm) {
     TEST_REQUIRES_X86_SSSE3;
     GemmMicrokernelTester()
       .mr(1)
@@ -23273,13 +23885,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSSE3_LD128, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSSE3_LD128, k_eq_8) {
     TEST_REQUIRES_X86_SSSE3;
     GemmMicrokernelTester()
       .mr(2)
@@ -23289,10 +23901,10 @@
       .m(2)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSSE3_LD128, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSSE3_LD128, strided_cn) {
     TEST_REQUIRES_X86_SSSE3;
     GemmMicrokernelTester()
       .mr(2)
@@ -23303,24 +23915,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSSE3_LD128, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSSE3_LD128, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSSE3_LD128, k_eq_8_subtile) {
     TEST_REQUIRES_X86_SSSE3;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -23333,12 +23931,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSSE3_LD128, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSSE3_LD128, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_SSSE3;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -23350,11 +23948,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSSE3_LD128, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSSE3_LD128, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_SSSE3;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -23366,11 +23964,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSSE3_LD128, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSSE3_LD128, k_lt_8) {
     TEST_REQUIRES_X86_SSSE3;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -23381,27 +23979,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSSE3_LD128, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSSE3_LD128, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSSE3_LD128, k_lt_8_subtile) {
     TEST_REQUIRES_X86_SSSE3;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -23415,13 +23997,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSSE3_LD128, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSSE3_LD128, k_gt_8) {
     TEST_REQUIRES_X86_SSSE3;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -23432,27 +24014,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSSE3_LD128, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSSE3_LD128, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSSE3_LD128, k_gt_8_subtile) {
     TEST_REQUIRES_X86_SSSE3;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -23466,13 +24032,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSSE3_LD128, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSSE3_LD128, k_div_8) {
     TEST_REQUIRES_X86_SSSE3;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -23483,27 +24049,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSSE3_LD128, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSSE3_LD128, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSSE3_LD128, k_div_8_subtile) {
     TEST_REQUIRES_X86_SSSE3;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -23517,13 +24067,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSSE3_LD128, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSSE3_LD128, n_gt_4) {
     TEST_REQUIRES_X86_SSSE3;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -23535,12 +24085,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSSE3_LD128, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSSE3_LD128, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_SSSE3;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -23553,30 +24103,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSSE3_LD128, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSSE3_LD128, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSSE3_LD128, n_gt_4_subtile) {
     TEST_REQUIRES_X86_SSSE3;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -23590,13 +24122,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSSE3_LD128, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSSE3_LD128, n_div_4) {
     TEST_REQUIRES_X86_SSSE3;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -23608,12 +24140,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSSE3_LD128, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSSE3_LD128, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_SSSE3;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -23626,30 +24158,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSSE3_LD128, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSSE3_LD128, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSSE3_LD128, n_div_4_subtile) {
     TEST_REQUIRES_X86_SSSE3;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -23663,13 +24177,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSSE3_LD128, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSSE3_LD128, small_kernel) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSSE3_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSSE3_LD128, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSSE3_LD128, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSSE3_LD128, strided_cm_subtile) {
     TEST_REQUIRES_X86_SSSE3;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -23684,13 +24271,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSSE3_LD128, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSSE3_LD128, a_offset) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(83)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSSE3_LD128, zero) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (uint32_t mz = 0; mz < 2; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(83)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSSE3_LD128, qmin) {
     TEST_REQUIRES_X86_SSSE3;
     GemmMicrokernelTester()
       .mr(2)
@@ -23701,10 +24325,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSSE3_LD128, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSSE3_LD128, qmax) {
     TEST_REQUIRES_X86_SSSE3;
     GemmMicrokernelTester()
       .mr(2)
@@ -23715,10 +24339,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSSE3_LD128, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSSE3_LD128, strided_cm) {
     TEST_REQUIRES_X86_SSSE3;
     GemmMicrokernelTester()
       .mr(2)
@@ -23729,13 +24353,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSSE3_LD128, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSSE3_LD128, k_eq_8) {
     TEST_REQUIRES_X86_SSSE3;
     GemmMicrokernelTester()
       .mr(3)
@@ -23745,10 +24369,10 @@
       .m(3)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSSE3_LD128, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSSE3_LD128, strided_cn) {
     TEST_REQUIRES_X86_SSSE3;
     GemmMicrokernelTester()
       .mr(3)
@@ -23759,24 +24383,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSSE3_LD128, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(3)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSSE3_LD128, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSSE3_LD128, k_eq_8_subtile) {
     TEST_REQUIRES_X86_SSSE3;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -23789,12 +24399,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSSE3_LD128, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSSE3_LD128, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_SSSE3;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -23806,11 +24416,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSSE3_LD128, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSSE3_LD128, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_SSSE3;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -23822,11 +24432,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSSE3_LD128, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSSE3_LD128, k_lt_8) {
     TEST_REQUIRES_X86_SSSE3;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -23837,27 +24447,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSSE3_LD128, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSSE3_LD128, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSSE3_LD128, k_lt_8_subtile) {
     TEST_REQUIRES_X86_SSSE3;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -23871,13 +24465,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSSE3_LD128, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSSE3_LD128, k_gt_8) {
     TEST_REQUIRES_X86_SSSE3;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -23888,27 +24482,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSSE3_LD128, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSSE3_LD128, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSSE3_LD128, k_gt_8_subtile) {
     TEST_REQUIRES_X86_SSSE3;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -23922,13 +24500,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSSE3_LD128, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSSE3_LD128, k_div_8) {
     TEST_REQUIRES_X86_SSSE3;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -23939,27 +24517,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSSE3_LD128, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSSE3_LD128, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSSE3_LD128, k_div_8_subtile) {
     TEST_REQUIRES_X86_SSSE3;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -23973,13 +24535,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSSE3_LD128, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSSE3_LD128, n_gt_4) {
     TEST_REQUIRES_X86_SSSE3;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -23991,12 +24553,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSSE3_LD128, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSSE3_LD128, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_SSSE3;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -24009,30 +24571,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSSE3_LD128, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSSE3_LD128, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSSE3_LD128, n_gt_4_subtile) {
     TEST_REQUIRES_X86_SSSE3;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -24046,13 +24590,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSSE3_LD128, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSSE3_LD128, n_div_4) {
     TEST_REQUIRES_X86_SSSE3;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -24064,12 +24608,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSSE3_LD128, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSSE3_LD128, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_SSSE3;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -24082,30 +24626,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSSE3_LD128, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSSE3_LD128, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSSE3_LD128, n_div_4_subtile) {
     TEST_REQUIRES_X86_SSSE3;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -24119,13 +24645,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSSE3_LD128, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSSE3_LD128, small_kernel) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSSE3_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSSE3_LD128, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSSE3_LD128, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSSE3_LD128, strided_cm_subtile) {
     TEST_REQUIRES_X86_SSSE3;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -24140,13 +24739,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSSE3_LD128, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSSE3_LD128, a_offset) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(127)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSSE3_LD128, zero) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (uint32_t mz = 0; mz < 3; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(127)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSSE3_LD128, qmin) {
     TEST_REQUIRES_X86_SSSE3;
     GemmMicrokernelTester()
       .mr(3)
@@ -24157,10 +24793,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSSE3_LD128, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSSE3_LD128, qmax) {
     TEST_REQUIRES_X86_SSSE3;
     GemmMicrokernelTester()
       .mr(3)
@@ -24171,10 +24807,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSSE3_LD128, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSSE3_LD128, strided_cm) {
     TEST_REQUIRES_X86_SSSE3;
     GemmMicrokernelTester()
       .mr(3)
@@ -24185,13 +24821,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_eq_8) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(1)
@@ -24201,10 +24837,10 @@
       .m(1)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD128, strided_cn) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(1)
@@ -24215,24 +24851,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_eq_8_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -24245,12 +24867,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -24262,11 +24884,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -24278,11 +24900,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_lt_8) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -24293,27 +24915,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_lt_8_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -24327,13 +24933,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_gt_8) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -24344,27 +24950,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_gt_8_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -24378,13 +24968,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_div_8) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -24395,27 +24985,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_div_8_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -24429,13 +25003,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD128, n_gt_4) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -24447,12 +25021,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD128, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -24465,30 +25039,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD128, n_gt_4_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -24502,13 +25058,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD128, n_div_4) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -24520,12 +25076,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD128, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -24538,30 +25094,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD128, n_div_4_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -24575,13 +25113,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD128, small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD128, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD128, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD128, strided_cm_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -24596,13 +25207,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD128, a_offset) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(43)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD128, zero) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t mz = 0; mz < 1; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(43)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD128, qmin) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(1)
@@ -24613,10 +25261,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD128, qmax) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(1)
@@ -24627,10 +25275,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD128, strided_cm) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(1)
@@ -24641,13 +25289,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_eq_8) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(2)
@@ -24657,10 +25305,10 @@
       .m(2)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD128, strided_cn) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(2)
@@ -24671,24 +25319,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_eq_8_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -24701,12 +25335,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -24718,11 +25352,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -24734,11 +25368,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_lt_8) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -24749,27 +25383,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_lt_8_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -24783,13 +25401,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_gt_8) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -24800,27 +25418,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_gt_8_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -24834,13 +25436,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_div_8) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -24851,27 +25453,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_div_8_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -24885,13 +25471,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD128, n_gt_4) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -24903,12 +25489,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD128, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -24921,30 +25507,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD128, n_gt_4_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -24958,13 +25526,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD128, n_div_4) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -24976,12 +25544,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD128, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -24994,30 +25562,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD128, n_div_4_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -25031,13 +25581,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD128, small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD128, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD128, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD128, strided_cm_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -25052,13 +25675,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD128, a_offset) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(83)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD128, zero) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t mz = 0; mz < 2; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(83)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD128, qmin) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(2)
@@ -25069,10 +25729,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD128, qmax) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(2)
@@ -25083,10 +25743,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD128, strided_cm) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(2)
@@ -25097,13 +25757,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_eq_8) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(3)
@@ -25113,10 +25773,10 @@
       .m(3)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD128, strided_cn) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(3)
@@ -25127,24 +25787,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(3)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_eq_8_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -25157,12 +25803,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -25174,11 +25820,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -25190,11 +25836,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_lt_8) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -25205,27 +25851,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_lt_8_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -25239,13 +25869,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_gt_8) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -25256,27 +25886,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_gt_8_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -25290,13 +25904,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_div_8) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -25307,27 +25921,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_div_8_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -25341,13 +25939,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD128, n_gt_4) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -25359,12 +25957,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD128, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -25377,30 +25975,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD128, n_gt_4_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -25414,13 +25994,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD128, n_div_4) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -25432,12 +26012,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD128, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -25450,30 +26030,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD128, n_div_4_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -25487,13 +26049,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD128, small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD128, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD128, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD128, strided_cm_subtile) {
     TEST_REQUIRES_X86_SSE41;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -25508,13 +26143,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD128, a_offset) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(127)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD128, zero) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t mz = 0; mz < 3; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(127)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD128, qmin) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(3)
@@ -25525,10 +26197,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD128, qmax) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(3)
@@ -25539,10 +26211,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD128, strided_cm) {
     TEST_REQUIRES_X86_SSE41;
     GemmMicrokernelTester()
       .mr(3)
@@ -25553,13 +26225,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_eq_8) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(1)
@@ -25569,10 +26241,10 @@
       .m(1)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD128, strided_cn) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(1)
@@ -25583,24 +26255,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_eq_8_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -25613,12 +26271,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -25630,11 +26288,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -25646,11 +26304,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_lt_8) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -25661,27 +26319,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_lt_8_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -25695,13 +26337,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_gt_8) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -25712,27 +26354,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_gt_8_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -25746,13 +26372,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_div_8) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -25763,27 +26389,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_div_8_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -25797,13 +26407,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD128, n_gt_4) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -25815,12 +26425,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD128, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -25833,30 +26443,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD128, n_gt_4_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -25870,13 +26462,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD128, n_div_4) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -25888,12 +26480,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD128, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -25906,30 +26498,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD128, n_div_4_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -25943,13 +26517,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD128, small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD128, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD128, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD128, strided_cm_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -25964,13 +26611,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD128, a_offset) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(43)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD128, zero) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t mz = 0; mz < 1; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(43)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD128, qmin) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(1)
@@ -25981,10 +26665,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD128, qmax) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(1)
@@ -25995,10 +26679,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD128, strided_cm) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(1)
@@ -26009,13 +26693,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_eq_8) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(2)
@@ -26025,10 +26709,10 @@
       .m(2)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD128, strided_cn) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(2)
@@ -26039,24 +26723,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_eq_8_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -26069,12 +26739,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -26086,11 +26756,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -26102,11 +26772,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_lt_8) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -26117,27 +26787,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_lt_8_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -26151,13 +26805,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_gt_8) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -26168,27 +26822,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_gt_8_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -26202,13 +26840,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_div_8) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -26219,27 +26857,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_div_8_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -26253,13 +26875,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD128, n_gt_4) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -26271,12 +26893,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD128, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -26289,30 +26911,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD128, n_gt_4_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -26326,13 +26930,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD128, n_div_4) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -26344,12 +26948,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD128, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -26362,30 +26966,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD128, n_div_4_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -26399,13 +26985,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD128, small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD128, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD128, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD128, strided_cm_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -26420,13 +27079,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD128, a_offset) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(83)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD128, zero) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t mz = 0; mz < 2; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(83)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD128, qmin) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(2)
@@ -26437,10 +27133,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD128, qmax) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(2)
@@ -26451,10 +27147,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD128, strided_cm) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(2)
@@ -26465,13 +27161,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_eq_8) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(3)
@@ -26481,10 +27177,10 @@
       .m(3)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD128, strided_cn) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(3)
@@ -26495,24 +27191,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(3)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_eq_8_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -26525,12 +27207,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -26542,11 +27224,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -26558,11 +27240,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_lt_8) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -26573,27 +27255,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_lt_8_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -26607,13 +27273,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_gt_8) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -26624,27 +27290,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_gt_8_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -26658,13 +27308,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_div_8) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -26675,27 +27325,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_div_8_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -26709,13 +27343,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD128, n_gt_4) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -26727,12 +27361,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD128, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -26745,30 +27379,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD128, n_gt_4_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -26782,13 +27398,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD128, n_div_4) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -26800,12 +27416,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD128, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -26818,30 +27434,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD128, n_div_4_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -26855,13 +27453,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD128, small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD128, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD128, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD128, strided_cm_subtile) {
     TEST_REQUIRES_X86_AVX;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -26876,13 +27547,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD128, a_offset) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(127)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD128, zero) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t mz = 0; mz < 3; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(127)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD128, qmin) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(3)
@@ -26893,10 +27601,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD128, qmax) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(3)
@@ -26907,10 +27615,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD128, strided_cm) {
     TEST_REQUIRES_X86_AVX;
     GemmMicrokernelTester()
       .mr(3)
@@ -26921,13 +27629,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_eq_8) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(1)
@@ -26937,10 +27645,10 @@
       .m(1)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD128, strided_cn) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(1)
@@ -26951,24 +27659,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_eq_8_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -26981,12 +27675,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -26998,11 +27692,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -27014,11 +27708,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_lt_8) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -27029,27 +27723,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_lt_8_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -27063,13 +27741,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_gt_8) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -27080,27 +27758,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_gt_8_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -27114,13 +27776,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_div_8) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -27131,27 +27793,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_div_8_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -27165,13 +27811,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD128, n_gt_4) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -27183,12 +27829,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD128, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -27201,30 +27847,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD128, n_gt_4_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -27238,13 +27866,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD128, n_div_4) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -27256,12 +27884,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD128, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -27274,30 +27902,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD128, n_div_4_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -27311,13 +27921,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD128, small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD128, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD128, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD128, strided_cm_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -27332,13 +28015,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD128, a_offset) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(43)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD128, zero) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t mz = 0; mz < 1; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(43)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD128, qmin) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(1)
@@ -27349,10 +28069,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD128, qmax) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(1)
@@ -27363,10 +28083,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD128, strided_cm) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(1)
@@ -27377,13 +28097,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_eq_8) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(2)
@@ -27393,10 +28113,10 @@
       .m(2)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD128, strided_cn) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(2)
@@ -27407,24 +28127,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_eq_8_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -27437,12 +28143,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -27454,11 +28160,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -27470,11 +28176,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_lt_8) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -27485,27 +28191,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_lt_8_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -27519,13 +28209,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_gt_8) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -27536,27 +28226,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_gt_8_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -27570,13 +28244,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_div_8) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -27587,27 +28261,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_div_8_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -27621,13 +28279,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD128, n_gt_4) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -27639,12 +28297,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD128, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -27657,30 +28315,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD128, n_gt_4_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -27694,13 +28334,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD128, n_div_4) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -27712,12 +28352,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD128, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -27730,30 +28370,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD128, n_div_4_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -27767,13 +28389,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD128, small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD128, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD128, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD128, strided_cm_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -27788,13 +28483,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD128, a_offset) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(83)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD128, zero) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t mz = 0; mz < 2; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(83)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD128, qmin) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(2)
@@ -27805,10 +28537,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD128, qmax) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(2)
@@ -27819,10 +28551,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD128, strided_cm) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(2)
@@ -27833,13 +28565,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_eq_8) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(3)
@@ -27849,10 +28581,10 @@
       .m(3)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD128, strided_cn) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(3)
@@ -27863,24 +28595,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(3)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_eq_8_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -27893,12 +28611,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_eq_8_subtile_m) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -27910,11 +28628,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_eq_8_subtile_n) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -27926,11 +28644,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_lt_8) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -27941,27 +28659,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_lt_8_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -27975,13 +28677,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_gt_8) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -27992,27 +28694,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_gt_8_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -28026,13 +28712,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_div_8) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -28043,27 +28729,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_div_8_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -28077,13 +28747,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD128, n_gt_4) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -28095,12 +28765,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD128, n_gt_4_strided_cn) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -28113,30 +28783,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD128, n_gt_4_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -28150,13 +28802,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD128, n_div_4) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -28168,12 +28820,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD128, n_div_4_strided_cn) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -28186,30 +28838,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD128, n_div_4_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -28223,13 +28857,86 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD128, small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD128, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD128, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD128, strided_cm_subtile) {
     TEST_REQUIRES_X86_XOP;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -28244,13 +28951,50 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD128, a_offset) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(127)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD128, zero) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t mz = 0; mz < 3; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(127)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD128, qmin) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(3)
@@ -28261,10 +29005,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD128, qmax) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(3)
@@ -28275,10 +29019,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD128, strided_cm) {
     TEST_REQUIRES_X86_XOP;
     GemmMicrokernelTester()
       .mr(3)
@@ -28289,7 +29033,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
diff --git a/test/qs8-igemm-minmax-fp32.yaml b/test/qs8-igemm-minmax-fp32.yaml
index 2b6fa36..9f060e5 100644
--- a/test/qs8-igemm-minmax-fp32.yaml
+++ b/test/qs8-igemm-minmax-fp32.yaml
@@ -3,190 +3,190 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64
   init: xnn_init_qs8_conv_minmax_fp32_sse2_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64
   init: xnn_init_qs8_conv_minmax_fp32_sse2_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64
   init: xnn_init_qs8_conv_minmax_fp32_sse2_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64
   init: xnn_init_qs8_conv_minmax_fp32_sse2_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64
   init: xnn_init_qs8_conv_minmax_fp32_sse4_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64
   init: xnn_init_qs8_conv_minmax_fp32_sse4_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64
   init: xnn_init_qs8_conv_minmax_fp32_sse4_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64
   init: xnn_init_qs8_conv_minmax_fp32_sse4_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64
   init: xnn_init_qs8_conv_minmax_fp32_sse4_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64
   init: xnn_init_qs8_conv_minmax_fp32_sse4_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64
   init: xnn_init_qs8_conv_minmax_fp32_sse4_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64
   init: xnn_init_qs8_conv_minmax_fp32_sse4_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64
   init: xnn_init_qs8_conv_minmax_fp32_sse4_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64
   init: xnn_init_qs8_conv_minmax_fp32_sse4_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64
   init: xnn_init_qs8_conv_minmax_fp32_sse4_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64
   init: xnn_init_qs8_conv_minmax_fp32_sse4_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128
   init: xnn_init_qs8_conv_minmax_fp32_sse2_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128
   init: xnn_init_qs8_conv_minmax_fp32_sse2_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128
   init: xnn_init_qs8_conv_minmax_fp32_sse2_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128
   init: xnn_init_qs8_conv_minmax_fp32_sse2_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128
   init: xnn_init_qs8_conv_minmax_fp32_sse4_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128
   init: xnn_init_qs8_conv_minmax_fp32_sse4_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128
   init: xnn_init_qs8_conv_minmax_fp32_sse4_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128
   init: xnn_init_qs8_conv_minmax_fp32_sse4_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128
   init: xnn_init_qs8_conv_minmax_fp32_sse4_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128
   init: xnn_init_qs8_conv_minmax_fp32_sse4_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128
   init: xnn_init_qs8_conv_minmax_fp32_sse4_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128
   init: xnn_init_qs8_conv_minmax_fp32_sse4_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128
   init: xnn_init_qs8_conv_minmax_fp32_sse4_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128
   init: xnn_init_qs8_conv_minmax_fp32_sse4_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128
   init: xnn_init_qs8_conv_minmax_fp32_sse4_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128
   init: xnn_init_qs8_conv_minmax_fp32_sse4_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64
   init: xnn_init_qs8_conv_minmax_fp32_sse2_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64
   init: xnn_init_qs8_conv_minmax_fp32_sse2_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64
   init: xnn_init_qs8_conv_minmax_fp32_sse2_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64
   init: xnn_init_qs8_conv_minmax_fp32_sse2_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64
   init: xnn_init_qs8_conv_minmax_fp32_sse2_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64
   init: xnn_init_qs8_conv_minmax_fp32_sse2_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64
   init: xnn_init_qs8_conv_minmax_fp32_sse4_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64
   init: xnn_init_qs8_conv_minmax_fp32_sse4_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64
   init: xnn_init_qs8_conv_minmax_fp32_sse4_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64
   init: xnn_init_qs8_conv_minmax_fp32_sse4_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64
   init: xnn_init_qs8_conv_minmax_fp32_sse4_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64
   init: xnn_init_qs8_conv_minmax_fp32_sse4_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64
   init: xnn_init_qs8_conv_minmax_fp32_sse4_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64
   init: xnn_init_qs8_conv_minmax_fp32_sse4_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64
   init: xnn_init_qs8_conv_minmax_fp32_sse4_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128
   init: xnn_init_qs8_conv_minmax_fp32_sse2_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128
   init: xnn_init_qs8_conv_minmax_fp32_sse2_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128
   init: xnn_init_qs8_conv_minmax_fp32_sse2_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128
   init: xnn_init_qs8_conv_minmax_fp32_sse2_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128
   init: xnn_init_qs8_conv_minmax_fp32_sse2_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128
   init: xnn_init_qs8_conv_minmax_fp32_sse2_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128
   init: xnn_init_qs8_conv_minmax_fp32_sse4_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128
   init: xnn_init_qs8_conv_minmax_fp32_sse4_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128
   init: xnn_init_qs8_conv_minmax_fp32_sse4_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128
   init: xnn_init_qs8_conv_minmax_fp32_sse4_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128
   init: xnn_init_qs8_conv_minmax_fp32_sse4_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128
   init: xnn_init_qs8_conv_minmax_fp32_sse4_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128
   init: xnn_init_qs8_conv_minmax_fp32_sse4_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128
   init: xnn_init_qs8_conv_minmax_fp32_sse4_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128
   init: xnn_init_qs8_conv_minmax_fp32_sse4_params
   k-block: 8
 - name: xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__avx2