LD128 versions of QS8 GEMM SSE2/SSSE3/SSE4.1 microkernels

PiperOrigin-RevId: 324316646
diff --git a/bench/qs8-gemm.cc b/bench/qs8-gemm.cc
index f620dc8..efd4363 100644
--- a/bench/qs8-gemm.cc
+++ b/bench/qs8-gemm.cc
@@ -112,6 +112,16 @@
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64, 4, 4, 2, 1);
   }
 
+  static void qs8_gemm_4x4c2__sse2_ld128(benchmark::State& state, const char* net) {  // Forwards `state` to GEMMBenchmark; `net` is not used in this body.
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_4x4c2__sse2_ld128, 4, 4, 2, 1);  // 4x4c2 SSE2 LD128 kernel; 4, 4, 2, 1 are presumably mr, nr, kr, sr -- TODO confirm against GEMMBenchmark.
+  }
+  static void qs8_gemm_4x4c2__ssse3_ld128(benchmark::State& state, const char* net) {  // Forwards `state` to GEMMBenchmark; `net` is not used in this body.
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_4x4c2__ssse3_ld128, 4, 4, 2, 1);  // 4x4c2 SSSE3 LD128 kernel; 4, 4, 2, 1 are presumably mr, nr, kr, sr -- TODO confirm against GEMMBenchmark.
+  }
+  static void qs8_gemm_4x4c2__sse41_ld128(benchmark::State& state, const char* net) {  // Forwards `state` to GEMMBenchmark; `net` is not used in this body.
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128, 4, 4, 2, 1);  // 4x4c2 SSE4.1 LD128 kernel; 4, 4, 2, 1 are presumably mr, nr, kr, sr -- TODO confirm against GEMMBenchmark.
+  }
+
   static void qs8_gemm_2x4c8__sse2_ld64(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_2x4c8__sse2_ld64, 2, 4, 8, 1);
   }
@@ -122,13 +132,31 @@
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64, 2, 4, 8, 1);
   }
 
+  static void qs8_gemm_2x4c8__sse2_ld128(benchmark::State& state, const char* net) {  // Forwards `state` to GEMMBenchmark; `net` is not used in this body.
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_2x4c8__sse2_ld128, 2, 4, 8, 1);  // 2x4c8 SSE2 LD128 kernel; 2, 4, 8, 1 are presumably mr, nr, kr, sr -- TODO confirm against GEMMBenchmark.
+  }
+  static void qs8_gemm_2x4c8__ssse3_ld128(benchmark::State& state, const char* net) {  // Forwards `state` to GEMMBenchmark; `net` is not used in this body.
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_2x4c8__ssse3_ld128, 2, 4, 8, 1);  // 2x4c8 SSSE3 LD128 kernel; 2, 4, 8, 1 are presumably mr, nr, kr, sr -- TODO confirm against GEMMBenchmark.
+  }
+  static void qs8_gemm_2x4c8__sse41_ld128(benchmark::State& state, const char* net) {  // Forwards `state` to GEMMBenchmark; `net` is not used in this body.
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128, 2, 4, 8, 1);  // 2x4c8 SSE4.1 LD128 kernel; 2, 4, 8, 1 are presumably mr, nr, kr, sr -- TODO confirm against GEMMBenchmark.
+  }
+
   BENCHMARK_GEMM(qs8_gemm_4x4c2__sse2_ld64)
   BENCHMARK_GEMM(qs8_gemm_4x4c2__ssse3_ld64)
   BENCHMARK_GEMM(qs8_gemm_4x4c2__sse41_ld64)
 
+  BENCHMARK_GEMM(qs8_gemm_4x4c2__sse2_ld128)  // LD128 variants of the 4x4c2 benchmarks; BENCHMARK_GEMM is defined elsewhere (presumably registers the function -- confirm).
+  BENCHMARK_GEMM(qs8_gemm_4x4c2__ssse3_ld128)
+  BENCHMARK_GEMM(qs8_gemm_4x4c2__sse41_ld128)
+
   BENCHMARK_GEMM(qs8_gemm_2x4c8__sse2_ld64)
   BENCHMARK_GEMM(qs8_gemm_2x4c8__ssse3_ld64)
   BENCHMARK_GEMM(qs8_gemm_2x4c8__sse41_ld64)
+
+  BENCHMARK_GEMM(qs8_gemm_2x4c8__sse2_ld128)  // LD128 variants of the 2x4c8 benchmarks; BENCHMARK_GEMM is defined elsewhere (presumably registers the function -- confirm).
+  BENCHMARK_GEMM(qs8_gemm_2x4c8__ssse3_ld128)
+  BENCHMARK_GEMM(qs8_gemm_2x4c8__sse41_ld128)
 #endif
 
 #ifndef XNNPACK_BENCHMARK_NO_MAIN