FP32 requantization in QS8 GEMM/IGEMM microkernels for SSE/AVX/XOP

PiperOrigin-RevId: 376966195
diff --git a/bench/qs8-gemm.cc b/bench/qs8-gemm.cc
index c5ac7f6..48c41ab 100644
--- a/bench/qs8-gemm.cc
+++ b/bench/qs8-gemm.cc
@@ -742,27 +742,51 @@
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c2__xop_ld64, 2, 4, 2, 1,
       xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckXOP);
   }
+  static void qs8_gemm_2x4c2_fp32__xop_ld64(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, 2, 4, 2, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
+  }
   static void qs8_gemm_3x4c2_gemmlowp__xop_ld64(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c2__xop_ld64, 3, 4, 2, 1,
       xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckXOP);
   }
+  static void qs8_gemm_3x4c2_fp32__xop_ld64(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, 3, 4, 2, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
+  }
   static void qs8_gemm_4x4c2_gemmlowp__xop_ld64(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x4c2__xop_ld64, 4, 4, 2, 1,
       xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckXOP);
   }
+  static void qs8_gemm_4x4c2_fp32__xop_ld64(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, 4, 4, 2, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
+  }
 
   static void qs8_gemm_2x4c2_gemmlowp__xop_ld128(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c2__xop_ld128, 2, 4, 2, 1,
       xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckXOP);
   }
+  static void qs8_gemm_2x4c2_fp32__xop_ld128(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, 2, 4, 2, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
+  }
   static void qs8_gemm_3x4c2_gemmlowp__xop_ld128(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c2__xop_ld128, 3, 4, 2, 1,
       xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckXOP);
   }
+  static void qs8_gemm_3x4c2_fp32__xop_ld128(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, 3, 4, 2, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
+  }
   static void qs8_gemm_4x4c2_gemmlowp__xop_ld128(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x4c2__xop_ld128, 4, 4, 2, 1,
       xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckXOP);
   }
+  static void qs8_gemm_4x4c2_fp32__xop_ld128(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, 4, 4, 2, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
+  }
 
   static void qs8_gemm_xw_2x4c2_gemmlowp__xop(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c2__xop, 2, 4, 2, 1,
@@ -781,19 +805,35 @@
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c8__xop_ld64, 2, 4, 8, 1,
       xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckXOP);
   }
+  static void qs8_gemm_2x4c8_fp32__xop_ld64(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, 2, 4, 8, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
+  }
   static void qs8_gemm_3x4c8_gemmlowp__xop_ld64(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c8__xop_ld64, 3, 4, 8, 1,
       xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckXOP);
   }
+  static void qs8_gemm_3x4c8_fp32__xop_ld64(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, 3, 4, 8, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
+  }
 
   static void qs8_gemm_2x4c8_gemmlowp__xop_ld128(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c8__xop_ld128, 2, 4, 8, 1,
       xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckXOP);
   }
+  static void qs8_gemm_2x4c8_fp32__xop_ld128(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, 2, 4, 8, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
+  }
   static void qs8_gemm_3x4c8_gemmlowp__xop_ld128(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c8__xop_ld128, 3, 4, 8, 1,
       xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckXOP);
   }
+  static void qs8_gemm_3x4c8_fp32__xop_ld128(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, 3, 4, 8, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
+  }
 
   static void qs8_gemm_xw_2x4c8_gemmlowp__xop(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop, 2, 4, 8, 1,
@@ -808,27 +848,51 @@
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c2__avx_ld64, 2, 4, 2, 1,
       xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckAVX);
   }
+  static void qs8_gemm_2x4c2_fp32__avx_ld64(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, 2, 4, 2, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
+  }
   static void qs8_gemm_3x4c2_gemmlowp__avx_ld64(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c2__avx_ld64, 3, 4, 2, 1,
       xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckAVX);
   }
+  static void qs8_gemm_3x4c2_fp32__avx_ld64(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, 3, 4, 2, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
+  }
   static void qs8_gemm_4x4c2_gemmlowp__avx_ld64(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x4c2__avx_ld64, 4, 4, 2, 1,
       xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckAVX);
   }
+  static void qs8_gemm_4x4c2_fp32__avx_ld64(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, 4, 4, 2, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
+  }
 
   static void qs8_gemm_2x4c2_gemmlowp__avx_ld128(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c2__avx_ld128, 2, 4, 2, 1,
       xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckAVX);
   }
+  static void qs8_gemm_2x4c2_fp32__avx_ld128(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, 2, 4, 2, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
+  }
   static void qs8_gemm_3x4c2_gemmlowp__avx_ld128(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c2__avx_ld128, 3, 4, 2, 1,
       xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckAVX);
   }
+  static void qs8_gemm_3x4c2_fp32__avx_ld128(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, 3, 4, 2, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
+  }
   static void qs8_gemm_4x4c2_gemmlowp__avx_ld128(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x4c2__avx_ld128, 4, 4, 2, 1,
       xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckAVX);
   }
+  static void qs8_gemm_4x4c2_fp32__avx_ld128(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, 4, 4, 2, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
+  }
 
   static void qs8_gemm_xw_2x4c2_gemmlowp__avx(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c2__avx, 2, 4, 2, 1,
@@ -847,19 +911,35 @@
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c8__avx_ld64, 2, 4, 8, 1,
       xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckAVX);
   }
+  static void qs8_gemm_2x4c8_fp32__avx_ld64(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, 2, 4, 8, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
+  }
   static void qs8_gemm_3x4c8_gemmlowp__avx_ld64(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c8__avx_ld64, 3, 4, 8, 1,
       xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckAVX);
   }
+  static void qs8_gemm_3x4c8_fp32__avx_ld64(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, 3, 4, 8, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
+  }
 
   static void qs8_gemm_2x4c8_gemmlowp__avx_ld128(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c8__avx_ld128, 2, 4, 8, 1,
       xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckAVX);
   }
+  static void qs8_gemm_2x4c8_fp32__avx_ld128(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, 2, 4, 8, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
+  }
   static void qs8_gemm_3x4c8_gemmlowp__avx_ld128(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c8__avx_ld128, 3, 4, 8, 1,
       xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckAVX);
   }
+  static void qs8_gemm_3x4c8_fp32__avx_ld128(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, 3, 4, 8, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
+  }
 
   static void qs8_gemm_xw_2x4c8_gemmlowp__avx(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx, 2, 4, 8, 1,
@@ -874,27 +954,51 @@
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c2__sse41_ld64, 2, 4, 2, 1,
       xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckSSE41);
   }
+  static void qs8_gemm_2x4c2_fp32__sse41_ld64(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, 2, 4, 2, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
+  }
   static void qs8_gemm_3x4c2_gemmlowp__sse41_ld64(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c2__sse41_ld64, 3, 4, 2, 1,
       xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckSSE41);
   }
+  static void qs8_gemm_3x4c2_fp32__sse41_ld64(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, 3, 4, 2, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
+  }
   static void qs8_gemm_4x4c2_gemmlowp__sse41_ld64(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x4c2__sse41_ld64, 4, 4, 2, 1,
       xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckSSE41);
   }
+  static void qs8_gemm_4x4c2_fp32__sse41_ld64(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, 4, 4, 2, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
+  }
 
   static void qs8_gemm_2x4c2_gemmlowp__sse41_ld128(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c2__sse41_ld128, 2, 4, 2, 1,
       xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckSSE41);
   }
+  static void qs8_gemm_2x4c2_fp32__sse41_ld128(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, 2, 4, 2, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
+  }
   static void qs8_gemm_3x4c2_gemmlowp__sse41_ld128(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c2__sse41_ld128, 3, 4, 2, 1,
       xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckSSE41);
   }
+  static void qs8_gemm_3x4c2_fp32__sse41_ld128(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, 3, 4, 2, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
+  }
   static void qs8_gemm_4x4c2_gemmlowp__sse41_ld128(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x4c2__sse41_ld128, 4, 4, 2, 1,
       xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckSSE41);
   }
+  static void qs8_gemm_4x4c2_fp32__sse41_ld128(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, 4, 4, 2, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
+  }
 
   static void qs8_gemm_xw_2x4c2_gemmlowp__sse41(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c2__sse41, 2, 4, 2, 1,
@@ -913,19 +1017,35 @@
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c8__sse41_ld64, 2, 4, 8, 1,
       xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckSSE41);
   }
+  static void qs8_gemm_2x4c8_fp32__sse41_ld64(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, 2, 4, 8, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
+  }
   static void qs8_gemm_3x4c8_gemmlowp__sse41_ld64(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c8__sse41_ld64, 3, 4, 8, 1,
       xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckSSE41);
   }
+  static void qs8_gemm_3x4c8_fp32__sse41_ld64(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, 3, 4, 8, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
+  }
 
   static void qs8_gemm_2x4c8_gemmlowp__sse41_ld128(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c8__sse41_ld128, 2, 4, 8, 1,
       xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckSSE41);
   }
+  static void qs8_gemm_2x4c8_fp32__sse41_ld128(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, 2, 4, 8, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
+  }
   static void qs8_gemm_3x4c8_gemmlowp__sse41_ld128(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c8__sse41_ld128, 3, 4, 8, 1,
       xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckSSE41);
   }
+  static void qs8_gemm_3x4c8_fp32__sse41_ld128(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, 3, 4, 8, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
+  }
 
   static void qs8_gemm_xw_2x4c8_gemmlowp__sse41(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41, 2, 4, 8, 1,
@@ -940,27 +1060,51 @@
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c2__ssse3_ld64, 2, 4, 2, 1,
       xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, benchmark::utils::CheckSSSE3);
   }
+  static void qs8_gemm_2x4c2_fp32__ssse3_ld64(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__ssse3_ld64, 2, 4, 2, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse2_params, benchmark::utils::CheckSSSE3);
+  }
   static void qs8_gemm_3x4c2_gemmlowp__ssse3_ld64(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c2__ssse3_ld64, 3, 4, 2, 1,
       xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, benchmark::utils::CheckSSSE3);
   }
+  static void qs8_gemm_3x4c2_fp32__ssse3_ld64(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__ssse3_ld64, 3, 4, 2, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse2_params, benchmark::utils::CheckSSSE3);
+  }
   static void qs8_gemm_4x4c2_gemmlowp__ssse3_ld64(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x4c2__ssse3_ld64, 4, 4, 2, 1,
       xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, benchmark::utils::CheckSSSE3);
   }
+  static void qs8_gemm_4x4c2_fp32__ssse3_ld64(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__ssse3_ld64, 4, 4, 2, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse2_params, benchmark::utils::CheckSSSE3);
+  }
 
   static void qs8_gemm_2x4c2_gemmlowp__ssse3_ld128(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c2__ssse3_ld128, 2, 4, 2, 1,
       xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, benchmark::utils::CheckSSSE3);
   }
+  static void qs8_gemm_2x4c2_fp32__ssse3_ld128(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__ssse3_ld128, 2, 4, 2, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse2_params, benchmark::utils::CheckSSSE3);
+  }
   static void qs8_gemm_3x4c2_gemmlowp__ssse3_ld128(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c2__ssse3_ld128, 3, 4, 2, 1,
       xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, benchmark::utils::CheckSSSE3);
   }
+  static void qs8_gemm_3x4c2_fp32__ssse3_ld128(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__ssse3_ld128, 3, 4, 2, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse2_params, benchmark::utils::CheckSSSE3);
+  }
   static void qs8_gemm_4x4c2_gemmlowp__ssse3_ld128(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x4c2__ssse3_ld128, 4, 4, 2, 1,
       xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, benchmark::utils::CheckSSSE3);
   }
+  static void qs8_gemm_4x4c2_fp32__ssse3_ld128(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__ssse3_ld128, 4, 4, 2, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse2_params, benchmark::utils::CheckSSSE3);
+  }
 
   static void qs8_gemm_xw_2x4c2_gemmlowp__ssse3(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c2__ssse3, 2, 4, 2, 1,
@@ -979,19 +1123,35 @@
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c8__ssse3_ld64, 2, 4, 8, 1,
       xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, benchmark::utils::CheckSSSE3);
   }
+  static void qs8_gemm_2x4c8_fp32__ssse3_ld64(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64, 2, 4, 8, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse2_params, benchmark::utils::CheckSSSE3);
+  }
   static void qs8_gemm_3x4c8_gemmlowp__ssse3_ld64(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c8__ssse3_ld64, 3, 4, 8, 1,
       xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, benchmark::utils::CheckSSSE3);
   }
+  static void qs8_gemm_3x4c8_fp32__ssse3_ld64(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64, 3, 4, 8, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse2_params, benchmark::utils::CheckSSSE3);
+  }
 
   static void qs8_gemm_2x4c8_gemmlowp__ssse3_ld128(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c8__ssse3_ld128, 2, 4, 8, 1,
       xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, benchmark::utils::CheckSSSE3);
   }
+  static void qs8_gemm_2x4c8_fp32__ssse3_ld128(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128, 2, 4, 8, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse2_params, benchmark::utils::CheckSSSE3);
+  }
   static void qs8_gemm_3x4c8_gemmlowp__ssse3_ld128(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c8__ssse3_ld128, 3, 4, 8, 1,
       xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, benchmark::utils::CheckSSSE3);
   }
+  static void qs8_gemm_3x4c8_fp32__ssse3_ld128(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128, 3, 4, 8, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse2_params, benchmark::utils::CheckSSSE3);
+  }
 
   static void qs8_gemm_xw_2x4c8_gemmlowp__ssse3(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3, 2, 4, 8, 1,
@@ -1006,27 +1166,51 @@
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c2__sse2_ld64, 2, 4, 2, 1,
       xnn_init_qs8_conv_minmax_gemmlowp_sse2_params);
   }
+  static void qs8_gemm_2x4c2_fp32__sse2_ld64(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, 2, 4, 2, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse2_params);
+  }
   static void qs8_gemm_3x4c2_gemmlowp__sse2_ld64(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c2__sse2_ld64, 3, 4, 2, 1,
       xnn_init_qs8_conv_minmax_gemmlowp_sse2_params);
   }
+  static void qs8_gemm_3x4c2_fp32__sse2_ld64(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, 3, 4, 2, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse2_params);
+  }
   static void qs8_gemm_4x4c2_gemmlowp__sse2_ld64(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x4c2__sse2_ld64, 4, 4, 2, 1,
       xnn_init_qs8_conv_minmax_gemmlowp_sse2_params);
   }
+  static void qs8_gemm_4x4c2_fp32__sse2_ld64(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, 4, 4, 2, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse2_params);
+  }
 
   static void qs8_gemm_2x4c2_gemmlowp__sse2_ld128(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c2__sse2_ld128, 2, 4, 2, 1,
       xnn_init_qs8_conv_minmax_gemmlowp_sse2_params);
   }
+  static void qs8_gemm_2x4c2_fp32__sse2_ld128(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, 2, 4, 2, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse2_params);
+  }
   static void qs8_gemm_3x4c2_gemmlowp__sse2_ld128(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c2__sse2_ld128, 3, 4, 2, 1,
       xnn_init_qs8_conv_minmax_gemmlowp_sse2_params);
   }
+  static void qs8_gemm_3x4c2_fp32__sse2_ld128(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, 3, 4, 2, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse2_params);
+  }
   static void qs8_gemm_4x4c2_gemmlowp__sse2_ld128(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x4c2__sse2_ld128, 4, 4, 2, 1,
       xnn_init_qs8_conv_minmax_gemmlowp_sse2_params);
   }
+  static void qs8_gemm_4x4c2_fp32__sse2_ld128(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, 4, 4, 2, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse2_params);
+  }
 
   static void qs8_gemm_xw_2x4c2_gemmlowp__sse2(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c2__sse2, 2, 4, 2, 1,
@@ -1045,19 +1229,35 @@
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c8__sse2_ld64, 2, 4, 8, 1,
       xnn_init_qs8_conv_minmax_gemmlowp_sse2_params);
   }
+  static void qs8_gemm_2x4c8_fp32__sse2_ld64(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, 2, 4, 8, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse2_params);
+  }
   static void qs8_gemm_3x4c8_gemmlowp__sse2_ld64(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c8__sse2_ld64, 3, 4, 8, 1,
       xnn_init_qs8_conv_minmax_gemmlowp_sse2_params);
   }
+  static void qs8_gemm_3x4c8_fp32__sse2_ld64(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, 3, 4, 8, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse2_params);
+  }
 
   static void qs8_gemm_2x4c8_gemmlowp__sse2_ld128(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c8__sse2_ld128, 2, 4, 8, 1,
       xnn_init_qs8_conv_minmax_gemmlowp_sse2_params);
   }
+  static void qs8_gemm_2x4c8_fp32__sse2_ld128(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, 2, 4, 8, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse2_params);
+  }
   static void qs8_gemm_3x4c8_gemmlowp__sse2_ld128(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c8__sse2_ld128, 3, 4, 8, 1,
       xnn_init_qs8_conv_minmax_gemmlowp_sse2_params);
   }
+  static void qs8_gemm_3x4c8_fp32__sse2_ld128(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, 3, 4, 8, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse2_params);
+  }
 
   static void qs8_gemm_xw_2x4c8_gemmlowp__sse2(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2, 2, 4, 8, 1,
@@ -1083,82 +1283,132 @@
   BENCHMARK_GEMM(qs8_gemm_xw_3x8c8_gemmlowp__avx2)
 
   BENCHMARK_GEMM(qs8_gemm_2x4c2_gemmlowp__xop_ld64)
+  BENCHMARK_GEMM(qs8_gemm_2x4c2_fp32__xop_ld64)
   BENCHMARK_GEMM(qs8_gemm_3x4c2_gemmlowp__xop_ld64)
+  BENCHMARK_GEMM(qs8_gemm_3x4c2_fp32__xop_ld64)
   BENCHMARK_GEMM(qs8_gemm_4x4c2_gemmlowp__xop_ld64)
+  BENCHMARK_GEMM(qs8_gemm_4x4c2_fp32__xop_ld64)
   BENCHMARK_GEMM(qs8_gemm_2x4c2_gemmlowp__xop_ld128)
+  BENCHMARK_GEMM(qs8_gemm_2x4c2_fp32__xop_ld128)
   BENCHMARK_GEMM(qs8_gemm_3x4c2_gemmlowp__xop_ld128)
+  BENCHMARK_GEMM(qs8_gemm_3x4c2_fp32__xop_ld128)
   BENCHMARK_GEMM(qs8_gemm_4x4c2_gemmlowp__xop_ld128)
+  BENCHMARK_GEMM(qs8_gemm_4x4c2_fp32__xop_ld128)
   BENCHMARK_GEMM(qs8_gemm_xw_2x4c2_gemmlowp__xop)
   BENCHMARK_GEMM(qs8_gemm_xw_3x4c2_gemmlowp__xop)
   BENCHMARK_GEMM(qs8_gemm_xw_4x4c2_gemmlowp__xop)
   BENCHMARK_GEMM(qs8_gemm_2x4c8_gemmlowp__xop_ld64)
+  BENCHMARK_GEMM(qs8_gemm_2x4c8_fp32__xop_ld64)
   BENCHMARK_GEMM(qs8_gemm_3x4c8_gemmlowp__xop_ld64)
+  BENCHMARK_GEMM(qs8_gemm_3x4c8_fp32__xop_ld64)
   BENCHMARK_GEMM(qs8_gemm_2x4c8_gemmlowp__xop_ld128)
+  BENCHMARK_GEMM(qs8_gemm_2x4c8_fp32__xop_ld128)
   BENCHMARK_GEMM(qs8_gemm_3x4c8_gemmlowp__xop_ld128)
+  BENCHMARK_GEMM(qs8_gemm_3x4c8_fp32__xop_ld128)
   BENCHMARK_GEMM(qs8_gemm_xw_2x4c8_gemmlowp__xop)
   BENCHMARK_GEMM(qs8_gemm_xw_3x4c8_gemmlowp__xop)
 
   BENCHMARK_GEMM(qs8_gemm_2x4c2_gemmlowp__avx_ld64)
+  BENCHMARK_GEMM(qs8_gemm_2x4c2_fp32__avx_ld64)
   BENCHMARK_GEMM(qs8_gemm_3x4c2_gemmlowp__avx_ld64)
+  BENCHMARK_GEMM(qs8_gemm_3x4c2_fp32__avx_ld64)
   BENCHMARK_GEMM(qs8_gemm_4x4c2_gemmlowp__avx_ld64)
+  BENCHMARK_GEMM(qs8_gemm_4x4c2_fp32__avx_ld64)
   BENCHMARK_GEMM(qs8_gemm_2x4c2_gemmlowp__avx_ld128)
+  BENCHMARK_GEMM(qs8_gemm_2x4c2_fp32__avx_ld128)
   BENCHMARK_GEMM(qs8_gemm_3x4c2_gemmlowp__avx_ld128)
+  BENCHMARK_GEMM(qs8_gemm_3x4c2_fp32__avx_ld128)
   BENCHMARK_GEMM(qs8_gemm_4x4c2_gemmlowp__avx_ld128)
+  BENCHMARK_GEMM(qs8_gemm_4x4c2_fp32__avx_ld128)
   BENCHMARK_GEMM(qs8_gemm_xw_2x4c2_gemmlowp__avx)
   BENCHMARK_GEMM(qs8_gemm_xw_3x4c2_gemmlowp__avx)
   BENCHMARK_GEMM(qs8_gemm_xw_4x4c2_gemmlowp__avx)
   BENCHMARK_GEMM(qs8_gemm_2x4c8_gemmlowp__avx_ld64)
+  BENCHMARK_GEMM(qs8_gemm_2x4c8_fp32__avx_ld64)
   BENCHMARK_GEMM(qs8_gemm_3x4c8_gemmlowp__avx_ld64)
+  BENCHMARK_GEMM(qs8_gemm_3x4c8_fp32__avx_ld64)
   BENCHMARK_GEMM(qs8_gemm_2x4c8_gemmlowp__avx_ld128)
+  BENCHMARK_GEMM(qs8_gemm_2x4c8_fp32__avx_ld128)
   BENCHMARK_GEMM(qs8_gemm_3x4c8_gemmlowp__avx_ld128)
+  BENCHMARK_GEMM(qs8_gemm_3x4c8_fp32__avx_ld128)
   BENCHMARK_GEMM(qs8_gemm_xw_2x4c8_gemmlowp__avx)
   BENCHMARK_GEMM(qs8_gemm_xw_3x4c8_gemmlowp__avx)
 
   BENCHMARK_GEMM(qs8_gemm_2x4c2_gemmlowp__sse41_ld64)
+  BENCHMARK_GEMM(qs8_gemm_2x4c2_fp32__sse41_ld64)
   BENCHMARK_GEMM(qs8_gemm_3x4c2_gemmlowp__sse41_ld64)
+  BENCHMARK_GEMM(qs8_gemm_3x4c2_fp32__sse41_ld64)
   BENCHMARK_GEMM(qs8_gemm_4x4c2_gemmlowp__sse41_ld64)
+  BENCHMARK_GEMM(qs8_gemm_4x4c2_fp32__sse41_ld64)
   BENCHMARK_GEMM(qs8_gemm_2x4c2_gemmlowp__sse41_ld128)
+  BENCHMARK_GEMM(qs8_gemm_2x4c2_fp32__sse41_ld128)
   BENCHMARK_GEMM(qs8_gemm_3x4c2_gemmlowp__sse41_ld128)
+  BENCHMARK_GEMM(qs8_gemm_3x4c2_fp32__sse41_ld128)
   BENCHMARK_GEMM(qs8_gemm_4x4c2_gemmlowp__sse41_ld128)
+  BENCHMARK_GEMM(qs8_gemm_4x4c2_fp32__sse41_ld128)
   BENCHMARK_GEMM(qs8_gemm_xw_2x4c2_gemmlowp__sse41)
   BENCHMARK_GEMM(qs8_gemm_xw_3x4c2_gemmlowp__sse41)
   BENCHMARK_GEMM(qs8_gemm_xw_4x4c2_gemmlowp__sse41)
   BENCHMARK_GEMM(qs8_gemm_2x4c8_gemmlowp__sse41_ld64)
+  BENCHMARK_GEMM(qs8_gemm_2x4c8_fp32__sse41_ld64)
   BENCHMARK_GEMM(qs8_gemm_3x4c8_gemmlowp__sse41_ld64)
+  BENCHMARK_GEMM(qs8_gemm_3x4c8_fp32__sse41_ld64)
   BENCHMARK_GEMM(qs8_gemm_2x4c8_gemmlowp__sse41_ld128)
+  BENCHMARK_GEMM(qs8_gemm_2x4c8_fp32__sse41_ld128)
   BENCHMARK_GEMM(qs8_gemm_3x4c8_gemmlowp__sse41_ld128)
+  BENCHMARK_GEMM(qs8_gemm_3x4c8_fp32__sse41_ld128)
   BENCHMARK_GEMM(qs8_gemm_xw_2x4c8_gemmlowp__sse41)
   BENCHMARK_GEMM(qs8_gemm_xw_3x4c8_gemmlowp__sse41)
 
   BENCHMARK_GEMM(qs8_gemm_2x4c2_gemmlowp__ssse3_ld64)
+  BENCHMARK_GEMM(qs8_gemm_2x4c2_fp32__ssse3_ld64)
   BENCHMARK_GEMM(qs8_gemm_3x4c2_gemmlowp__ssse3_ld64)
+  BENCHMARK_GEMM(qs8_gemm_3x4c2_fp32__ssse3_ld64)
   BENCHMARK_GEMM(qs8_gemm_4x4c2_gemmlowp__ssse3_ld64)
+  BENCHMARK_GEMM(qs8_gemm_4x4c2_fp32__ssse3_ld64)
   BENCHMARK_GEMM(qs8_gemm_2x4c2_gemmlowp__ssse3_ld128)
+  BENCHMARK_GEMM(qs8_gemm_2x4c2_fp32__ssse3_ld128)
   BENCHMARK_GEMM(qs8_gemm_3x4c2_gemmlowp__ssse3_ld128)
+  BENCHMARK_GEMM(qs8_gemm_3x4c2_fp32__ssse3_ld128)
   BENCHMARK_GEMM(qs8_gemm_4x4c2_gemmlowp__ssse3_ld128)
+  BENCHMARK_GEMM(qs8_gemm_4x4c2_fp32__ssse3_ld128)
   BENCHMARK_GEMM(qs8_gemm_xw_2x4c2_gemmlowp__ssse3)
   BENCHMARK_GEMM(qs8_gemm_xw_3x4c2_gemmlowp__ssse3)
   BENCHMARK_GEMM(qs8_gemm_xw_4x4c2_gemmlowp__ssse3)
   BENCHMARK_GEMM(qs8_gemm_2x4c8_gemmlowp__ssse3_ld64)
+  BENCHMARK_GEMM(qs8_gemm_2x4c8_fp32__ssse3_ld64)
   BENCHMARK_GEMM(qs8_gemm_3x4c8_gemmlowp__ssse3_ld64)
+  BENCHMARK_GEMM(qs8_gemm_3x4c8_fp32__ssse3_ld64)
   BENCHMARK_GEMM(qs8_gemm_2x4c8_gemmlowp__ssse3_ld128)
+  BENCHMARK_GEMM(qs8_gemm_2x4c8_fp32__ssse3_ld128)
   BENCHMARK_GEMM(qs8_gemm_3x4c8_gemmlowp__ssse3_ld128)
+  BENCHMARK_GEMM(qs8_gemm_3x4c8_fp32__ssse3_ld128)
   BENCHMARK_GEMM(qs8_gemm_xw_2x4c8_gemmlowp__ssse3)
   BENCHMARK_GEMM(qs8_gemm_xw_3x4c8_gemmlowp__ssse3)
 
   BENCHMARK_GEMM(qs8_gemm_2x4c2_gemmlowp__sse2_ld64)
+  BENCHMARK_GEMM(qs8_gemm_2x4c2_fp32__sse2_ld64)
   BENCHMARK_GEMM(qs8_gemm_3x4c2_gemmlowp__sse2_ld64)
+  BENCHMARK_GEMM(qs8_gemm_3x4c2_fp32__sse2_ld64)
   BENCHMARK_GEMM(qs8_gemm_4x4c2_gemmlowp__sse2_ld64)
+  BENCHMARK_GEMM(qs8_gemm_4x4c2_fp32__sse2_ld64)
   BENCHMARK_GEMM(qs8_gemm_2x4c2_gemmlowp__sse2_ld128)
+  BENCHMARK_GEMM(qs8_gemm_2x4c2_fp32__sse2_ld128)
   BENCHMARK_GEMM(qs8_gemm_3x4c2_gemmlowp__sse2_ld128)
+  BENCHMARK_GEMM(qs8_gemm_3x4c2_fp32__sse2_ld128)
   BENCHMARK_GEMM(qs8_gemm_4x4c2_gemmlowp__sse2_ld128)
+  BENCHMARK_GEMM(qs8_gemm_4x4c2_fp32__sse2_ld128)
   BENCHMARK_GEMM(qs8_gemm_xw_2x4c2_gemmlowp__sse2)
   BENCHMARK_GEMM(qs8_gemm_xw_3x4c2_gemmlowp__sse2)
   BENCHMARK_GEMM(qs8_gemm_xw_4x4c2_gemmlowp__sse2)
   BENCHMARK_GEMM(qs8_gemm_2x4c8_gemmlowp__sse2_ld64)
+  BENCHMARK_GEMM(qs8_gemm_2x4c8_fp32__sse2_ld64)
   BENCHMARK_GEMM(qs8_gemm_3x4c8_gemmlowp__sse2_ld64)
+  BENCHMARK_GEMM(qs8_gemm_3x4c8_fp32__sse2_ld64)
   BENCHMARK_GEMM(qs8_gemm_2x4c8_gemmlowp__sse2_ld128)
+  BENCHMARK_GEMM(qs8_gemm_2x4c8_fp32__sse2_ld128)
   BENCHMARK_GEMM(qs8_gemm_3x4c8_gemmlowp__sse2_ld128)
+  BENCHMARK_GEMM(qs8_gemm_3x4c8_fp32__sse2_ld128)
   BENCHMARK_GEMM(qs8_gemm_xw_2x4c8_gemmlowp__sse2)
   BENCHMARK_GEMM(qs8_gemm_xw_3x4c8_gemmlowp__sse2)
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64