Support QC8 GEMM microkernels

- Add a minimal set of 8-bit fixed-point microkernels with per-channel
quantization (QC8), optimized for AVX2.
- Extend packing functions to allow extra space after the kernel data.
Per-channel quantization parameters are later packed into that space.
- Extend GemmMicrokernelTester to support unit testing of QC8 GEMM.

PiperOrigin-RevId: 377191090
diff --git a/bench/qs8-gemm.cc b/bench/qs8-gemm.cc
index 7207a86..c1db5bd 100644
--- a/bench/qs8-gemm.cc
+++ b/bench/qs8-gemm.cc
@@ -73,9 +73,9 @@
   std::fill(w.begin(), w.end(), 0);
   const xnn_qs8_packing_params packing_params = { 127 };
   if (extended_weights) {
-    xnn_pack_qs8_gemm_xw_goi_w(1 /* groups */, nc, kc, nr, kr, sr, k.data(), b.data(), w.data(), &packing_params);
+    xnn_pack_qs8_gemm_xw_goi_w(1 /* groups */, nc, kc, nr, kr, sr, k.data(), b.data(), w.data(), 0, &packing_params);
   } else {
-    xnn_pack_qs8_gemm_goi_w(1 /* groups */, nc, kc, nr, kr, sr, k.data(), b.data(), w.data(), &packing_params);
+    xnn_pack_qs8_gemm_goi_w(1 /* groups */, nc, kc, nr, kr, sr, k.data(), b.data(), w.data(), 0, &packing_params);
   }
   std::vector<int8_t> c(c_elements * num_buffers);
   std::fill(c.begin(), c.end(), 0xA5);