Rename sgemm and sppmm benchmark functions to f32_gemm and f32_ppmm

PiperOrigin-RevId: 280775266
diff --git a/bench/f32-gemm.cc b/bench/f32-gemm.cc
index cdc0606..b9095ec 100644
--- a/bench/f32-gemm.cc
+++ b/bench/f32-gemm.cc
@@ -336,284 +336,284 @@
 
 
 #if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
-  static void sgemm_1x12__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
+  static void f32_gemm_1x12__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53, 1, 12, 1, 1);
   }
-  static void sgemm_1x8__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
+  static void f32_gemm_1x8__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53, 1, 8, 1, 1);
   }
-  static void sgemm_1x8__aarch64_neonfma_cortex_a57(benchmark::State& state, const char* net) {
+  static void f32_gemm_1x8__aarch64_neonfma_cortex_a57(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57, 1, 8, 1, 1);
   }
-  static void sgemm_1x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
+  static void f32_gemm_1x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75, 1, 8, 1, 1);
   }
-  static void sgemm_4x12__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
+  static void f32_gemm_4x12__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53, 4, 12, 1, 1);
   }
-  static void sgemm_4x8__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
+  static void f32_gemm_4x8__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53, 4, 8, 1, 1);
   }
-  static void sgemm_4x8__aarch64_neonfma_cortex_a57(benchmark::State& state, const char* net) {
+  static void f32_gemm_4x8__aarch64_neonfma_cortex_a57(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57, 4, 8, 1, 1);
   }
-  static void sgemm_4x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
+  static void f32_gemm_4x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75, 4, 8, 1, 1);
   }
-  static void sgemm_4x8__aarch64_neonfma_ld64(benchmark::State& state, const char* net) {
+  static void f32_gemm_4x8__aarch64_neonfma_ld64(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64, 4, 8, 1, 1);
   }
-  static void sgemm_4x8__aarch64_neonfma_ld128(benchmark::State& state, const char* net) {
+  static void f32_gemm_4x8__aarch64_neonfma_ld128(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128, 4, 8, 1, 1);
   }
-  static void sgemm_5x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
+  static void f32_gemm_5x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75, 5, 8, 1, 1);
   }
-  static void sgemm_6x8__aarch64_neonfma_ld64(benchmark::State& state, const char* net) {
+  static void f32_gemm_6x8__aarch64_neonfma_ld64(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64, 6, 8, 1, 1);
   }
-  static void sgemm_6x8__aarch64_neonfma_ld128(benchmark::State& state, const char* net) {
+  static void f32_gemm_6x8__aarch64_neonfma_ld128(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128, 6, 8, 1, 1);
   }
-  static void sgemm_6x8__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
+  static void f32_gemm_6x8__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53, 6, 8, 1, 1);
   }
-  static void sgemm_6x8__aarch64_neonfma_cortex_a57(benchmark::State& state, const char* net) {
+  static void f32_gemm_6x8__aarch64_neonfma_cortex_a57(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57, 6, 8, 1, 1);
   }
-  static void sgemm_6x8__aarch64_neonfma_cortex_a73(benchmark::State& state, const char* net) {
+  static void f32_gemm_6x8__aarch64_neonfma_cortex_a73(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73, 6, 8, 1, 1);
   }
-  static void sgemm_6x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
+  static void f32_gemm_6x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, 6, 8, 1, 1);
   }
 
-  BENCHMARK_GEMM(sgemm_1x12__aarch64_neonfma_cortex_a53)
-  BENCHMARK_GEMM(sgemm_1x8__aarch64_neonfma_cortex_a53)
-  BENCHMARK_GEMM(sgemm_1x8__aarch64_neonfma_cortex_a57)
-  BENCHMARK_GEMM(sgemm_1x8__aarch64_neonfma_cortex_a75)
-  BENCHMARK_GEMM(sgemm_4x12__aarch64_neonfma_cortex_a53)
-  BENCHMARK_GEMM(sgemm_4x8__aarch64_neonfma_cortex_a53)
-  BENCHMARK_GEMM(sgemm_4x8__aarch64_neonfma_cortex_a57)
-  BENCHMARK_GEMM(sgemm_4x8__aarch64_neonfma_cortex_a75)
-  BENCHMARK_GEMM(sgemm_4x8__aarch64_neonfma_ld128)
-  BENCHMARK_GEMM(sgemm_4x8__aarch64_neonfma_ld64)
-  BENCHMARK_GEMM(sgemm_5x8__aarch64_neonfma_cortex_a75)
-  BENCHMARK_GEMM(sgemm_6x8__aarch64_neonfma_cortex_a53)
-  BENCHMARK_GEMM(sgemm_6x8__aarch64_neonfma_cortex_a57)
-  BENCHMARK_GEMM(sgemm_6x8__aarch64_neonfma_cortex_a73)
-  BENCHMARK_GEMM(sgemm_6x8__aarch64_neonfma_cortex_a75)
-  BENCHMARK_GEMM(sgemm_6x8__aarch64_neonfma_ld64)
-  BENCHMARK_GEMM(sgemm_6x8__aarch64_neonfma_ld128)
+  BENCHMARK_GEMM(f32_gemm_1x12__aarch64_neonfma_cortex_a53)
+  BENCHMARK_GEMM(f32_gemm_1x8__aarch64_neonfma_cortex_a53)
+  BENCHMARK_GEMM(f32_gemm_1x8__aarch64_neonfma_cortex_a57)
+  BENCHMARK_GEMM(f32_gemm_1x8__aarch64_neonfma_cortex_a75)
+  BENCHMARK_GEMM(f32_gemm_4x12__aarch64_neonfma_cortex_a53)
+  BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_cortex_a53)
+  BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_cortex_a57)
+  BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_cortex_a75)
+  BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_ld128)
+  BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_ld64)
+  BENCHMARK_GEMM(f32_gemm_5x8__aarch64_neonfma_cortex_a75)
+  BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_cortex_a53)
+  BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_cortex_a57)
+  BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_cortex_a73)
+  BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_cortex_a75)
+  BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_ld64)
+  BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_ld128)
 #endif  // XNN_ARCH_ARM64
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  static void sgemm_1x8__neon_ld64(benchmark::State& state, const char* net) {
+  static void f32_gemm_1x8__neon_ld64(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__neon_ld64, 1, 8, 1, 1);
   }
 
-  static void sgemm_1x8__neonfma_ld64(benchmark::State& state, const char* net) {
+  static void f32_gemm_1x8__neonfma_ld64(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__neonfma_ld64, 1, 8, 1, 1);
   }
 
-  static void sgemm_4x8__neon_ld64(benchmark::State& state, const char* net) {
+  static void f32_gemm_4x8__neon_ld64(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neon_ld64, 4, 8, 1, 1);
   }
 
-  static void sgemm_4x8__neon_ld128(benchmark::State& state, const char* net) {
+  static void f32_gemm_4x8__neon_ld128(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neon_ld128, 4, 8, 1, 1);
   }
 
-  static void sgemm_5x8__neon_ld64(benchmark::State& state, const char* net) {
+  static void f32_gemm_5x8__neon_ld64(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x8__neon_ld64, 5, 8, 1, 1);
   }
 
-  static void sgemm_6x8__neon_ld64(benchmark::State& state, const char* net) {
+  static void f32_gemm_6x8__neon_ld64(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__neon_ld64, 6, 8, 1, 1);
   }
 
-  static void sgemm_4x8__neonfma_ld64(benchmark::State& state, const char* net) {
+  static void f32_gemm_4x8__neonfma_ld64(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neonfma_ld64, 4, 8, 1, 1);
   }
 
-  static void sgemm_4x8__neonfma_ld128(benchmark::State& state, const char* net) {
+  static void f32_gemm_4x8__neonfma_ld128(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neonfma_ld128, 4, 8, 1, 1);
   }
 
-  static void sgemm_5x8__neonfma_ld64(benchmark::State& state, const char* net) {
+  static void f32_gemm_5x8__neonfma_ld64(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x8__neonfma_ld64, 5, 8, 1, 1);
   }
 
-  static void sgemm_6x8__neonfma_ld64(benchmark::State& state, const char* net) {
+  static void f32_gemm_6x8__neonfma_ld64(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__neonfma_ld64, 6, 8, 1, 1);
   }
 
-  static void sppmm_4x8_unipass__neonfma(benchmark::State& state, const char* net) {
+  static void f32_ppmm_4x8_unipass__neonfma(benchmark::State& state, const char* net) {
     PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__neonfma, xnn_x32_packx_ukernel_4x__neon_st4, 4, 8);
   }
 
-  static void sppmm_4x8_twopass__neonfma(benchmark::State& state, const char* net) {
+  static void f32_ppmm_4x8_twopass__neonfma(benchmark::State& state, const char* net) {
     PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__neonfma, xnn_x32_packx_ukernel_4x__neon_st4, 4, 8);
   }
 
-  BENCHMARK_GEMM(sgemm_1x8__neon_ld64)
-  BENCHMARK_GEMM(sgemm_1x8__neonfma_ld64)
-  BENCHMARK_GEMM(sgemm_4x8__neon_ld128)
-  BENCHMARK_GEMM(sgemm_4x8__neon_ld64)
-  BENCHMARK_GEMM(sgemm_4x8__neonfma_ld128)
-  BENCHMARK_GEMM(sgemm_4x8__neonfma_ld64)
-  BENCHMARK_GEMM(sgemm_5x8__neon_ld64)
-  BENCHMARK_GEMM(sgemm_5x8__neonfma_ld64)
-  BENCHMARK_GEMM(sgemm_6x8__neon_ld64)
-  BENCHMARK_GEMM(sgemm_6x8__neonfma_ld64)
+  BENCHMARK_GEMM(f32_gemm_1x8__neon_ld64)
+  BENCHMARK_GEMM(f32_gemm_1x8__neonfma_ld64)
+  BENCHMARK_GEMM(f32_gemm_4x8__neon_ld128)
+  BENCHMARK_GEMM(f32_gemm_4x8__neon_ld64)
+  BENCHMARK_GEMM(f32_gemm_4x8__neonfma_ld128)
+  BENCHMARK_GEMM(f32_gemm_4x8__neonfma_ld64)
+  BENCHMARK_GEMM(f32_gemm_5x8__neon_ld64)
+  BENCHMARK_GEMM(f32_gemm_5x8__neonfma_ld64)
+  BENCHMARK_GEMM(f32_gemm_6x8__neon_ld64)
+  BENCHMARK_GEMM(f32_gemm_6x8__neonfma_ld64)
 
-  BENCHMARK_GEMM(sppmm_4x8_unipass__neonfma)
-  BENCHMARK_GEMM(sppmm_4x8_twopass__neonfma)
+  BENCHMARK_GEMM(f32_ppmm_4x8_unipass__neonfma)
+  BENCHMARK_GEMM(f32_ppmm_4x8_twopass__neonfma)
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  static void sgemm_1x8__sse_load1(benchmark::State& state, const char* net) {
+  static void f32_gemm_1x8__sse_load1(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__sse_load1, 1, 8, 1, 1);
   }
 
-  static void sgemm_4x8__sse_load1(benchmark::State& state, const char* net) {
+  static void f32_gemm_4x8__sse_load1(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__sse_load1, 4, 8, 1, 1);
   }
 
-  static void sgemm_1x8__sse_dup(benchmark::State& state, const char* net) {
+  static void f32_gemm_1x8__sse_dup(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__sse_dup, 1, 8, 1, 1);
   }
 
-  static void sgemm_4x8__sse_dup(benchmark::State& state, const char* net) {
+  static void f32_gemm_4x8__sse_dup(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__sse_dup, 4, 8, 1, 1);
   }
 
-  static void sgemm_1x8s4__sse(benchmark::State& state, const char* net) {
+  static void f32_gemm_1x8s4__sse(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8s4__sse, 1, 8, 1, 4);
   }
 
-  static void sgemm_4x8s4__sse(benchmark::State& state, const char* net) {
+  static void f32_gemm_4x8s4__sse(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8s4__sse, 4, 8, 1, 4);
   }
 
-  static void sppmm_4x8_unipass__sse(benchmark::State& state, const char* net) {
+  static void f32_ppmm_4x8_unipass__sse(benchmark::State& state, const char* net) {
     PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__sse, xnn_x32_packx_ukernel_4x__sse, 4, 8);
   }
 
-  static void sppmm_4x8_twopass__sse(benchmark::State& state, const char* net) {
+  static void f32_ppmm_4x8_twopass__sse(benchmark::State& state, const char* net) {
     PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__sse, xnn_x32_packx_ukernel_4x__sse, 4, 8);
   }
 
-  BENCHMARK_GEMM(sgemm_1x8__sse_load1)
-  BENCHMARK_GEMM(sgemm_4x8__sse_load1)
-  BENCHMARK_GEMM(sgemm_1x8__sse_dup)
-  BENCHMARK_GEMM(sgemm_4x8__sse_dup)
-  BENCHMARK_GEMM(sgemm_1x8s4__sse)
-  BENCHMARK_GEMM(sgemm_4x8s4__sse)
-  BENCHMARK_GEMM(sppmm_4x8_unipass__sse)
-  BENCHMARK_GEMM(sppmm_4x8_twopass__sse)
+  BENCHMARK_GEMM(f32_gemm_1x8__sse_load1)
+  BENCHMARK_GEMM(f32_gemm_4x8__sse_load1)
+  BENCHMARK_GEMM(f32_gemm_1x8__sse_dup)
+  BENCHMARK_GEMM(f32_gemm_4x8__sse_dup)
+  BENCHMARK_GEMM(f32_gemm_1x8s4__sse)
+  BENCHMARK_GEMM(f32_gemm_4x8s4__sse)
+  BENCHMARK_GEMM(f32_ppmm_4x8_unipass__sse)
+  BENCHMARK_GEMM(f32_ppmm_4x8_twopass__sse)
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 #if !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
-  static void sgemm_4x8__psimd_loadsplat(benchmark::State& state, const char* net) {
+  static void f32_gemm_4x8__psimd_loadsplat(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__psimd_loadsplat, 4, 8, 1, 1);
   }
 
-  static void sgemm_6x8__psimd_loadsplat(benchmark::State& state, const char* net) {
+  static void f32_gemm_6x8__psimd_loadsplat(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__psimd_loadsplat, 6, 8, 1, 1);
   }
 
-  static void sgemm_4x8__psimd_splat(benchmark::State& state, const char* net) {
+  static void f32_gemm_4x8__psimd_splat(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__psimd_splat, 4, 8, 1, 1);
   }
 
-  static void sgemm_6x8__psimd_splat(benchmark::State& state, const char* net) {
+  static void f32_gemm_6x8__psimd_splat(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__psimd_splat, 6, 8, 1, 1);
   }
 
-  static void sgemm_4x8s4__psimd(benchmark::State& state, const char* net) {
+  static void f32_gemm_4x8s4__psimd(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8s4__psimd, 4, 8, 1, 4);
   }
 
-  static void sgemm_6x8s4__psimd(benchmark::State& state, const char* net) {
+  static void f32_gemm_6x8s4__psimd(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8s4__psimd, 6, 8, 1, 4);
   }
 
-  static void sppmm_4x8_unipass__psimd(benchmark::State& state, const char* net) {
+  static void f32_ppmm_4x8_unipass__psimd(benchmark::State& state, const char* net) {
     PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__psimd, xnn_x32_packx_ukernel_4x__psimd, 4, 8);
   }
 
-  static void sppmm_4x8_twopass__psimd(benchmark::State& state, const char* net) {
+  static void f32_ppmm_4x8_twopass__psimd(benchmark::State& state, const char* net) {
     PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__psimd, xnn_x32_packx_ukernel_4x__psimd, 4, 8);
   }
 
-  BENCHMARK_GEMM(sgemm_4x8__psimd_loadsplat)
-  BENCHMARK_GEMM(sgemm_6x8__psimd_loadsplat)
-  BENCHMARK_GEMM(sgemm_4x8__psimd_splat)
-  BENCHMARK_GEMM(sgemm_6x8__psimd_splat)
-  BENCHMARK_GEMM(sgemm_4x8s4__psimd)
-  BENCHMARK_GEMM(sgemm_6x8s4__psimd)
-  BENCHMARK_GEMM(sppmm_4x8_unipass__psimd)
-  BENCHMARK_GEMM(sppmm_4x8_twopass__psimd)
+  BENCHMARK_GEMM(f32_gemm_4x8__psimd_loadsplat)
+  BENCHMARK_GEMM(f32_gemm_6x8__psimd_loadsplat)
+  BENCHMARK_GEMM(f32_gemm_4x8__psimd_splat)
+  BENCHMARK_GEMM(f32_gemm_6x8__psimd_splat)
+  BENCHMARK_GEMM(f32_gemm_4x8s4__psimd)
+  BENCHMARK_GEMM(f32_gemm_6x8s4__psimd)
+  BENCHMARK_GEMM(f32_ppmm_4x8_unipass__psimd)
+  BENCHMARK_GEMM(f32_ppmm_4x8_twopass__psimd)
 #endif  // !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
 
-static void sgemm_1x4__scalar(benchmark::State& state, const char* net) {
+static void f32_gemm_1x4__scalar(benchmark::State& state, const char* net) {
   GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x4__scalar, 1, 4, 1, 1);
 }
 
-static void sgemm_2x4__scalar(benchmark::State& state, const char* net) {
+static void f32_gemm_2x4__scalar(benchmark::State& state, const char* net) {
   GEMMBenchmark(state, xnn_f32_gemm_ukernel_2x4__scalar, 2, 4, 1, 1);
 }
 
-static void sgemm_4x4__scalar(benchmark::State& state, const char* net) {
+static void f32_gemm_4x4__scalar(benchmark::State& state, const char* net) {
   GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x4__scalar, 4, 4, 1, 1);
 }
 
-static void sppmm_2x4_unipass__scalar(benchmark::State& state, const char* net) {
+static void f32_ppmm_2x4_unipass__scalar(benchmark::State& state, const char* net) {
   PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_2x4__scalar, xnn_x32_packx_ukernel_2x__scalar, 2, 4);
 }
 
-static void sppmm_4x2_unipass__scalar(benchmark::State& state, const char* net) {
+static void f32_ppmm_4x2_unipass__scalar(benchmark::State& state, const char* net) {
   PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_4x2__scalar, xnn_x32_packx_ukernel_4x__scalar, 4, 2);
 }
 
-static void sppmm_4x4_unipass__scalar(benchmark::State& state, const char* net) {
+static void f32_ppmm_4x4_unipass__scalar(benchmark::State& state, const char* net) {
   PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_4x4__scalar, xnn_x32_packx_ukernel_4x__scalar, 4, 4);
 }
 
-static void sppmm_3x3_unipass__scalar(benchmark::State& state, const char* net) {
+static void f32_ppmm_3x3_unipass__scalar(benchmark::State& state, const char* net) {
   PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_3x3__scalar, xnn_x32_packx_ukernel_3x__scalar, 3, 3);
 }
 
-static void sppmm_2x4_twopass__scalar(benchmark::State& state, const char* net) {
+static void f32_ppmm_2x4_twopass__scalar(benchmark::State& state, const char* net) {
   PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_2x4__scalar, xnn_x32_packx_ukernel_2x__scalar, 2, 4);
 }
 
-static void sppmm_4x2_twopass__scalar(benchmark::State& state, const char* net) {
+static void f32_ppmm_4x2_twopass__scalar(benchmark::State& state, const char* net) {
   PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_4x2__scalar, xnn_x32_packx_ukernel_4x__scalar, 4, 2);
 }
 
-static void sppmm_4x4_twopass__scalar(benchmark::State& state, const char* net) {
+static void f32_ppmm_4x4_twopass__scalar(benchmark::State& state, const char* net) {
   PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_4x4__scalar, xnn_x32_packx_ukernel_4x__scalar, 4, 4);
 }
 
-static void sppmm_3x3_twopass__scalar(benchmark::State& state, const char* net) {
+static void f32_ppmm_3x3_twopass__scalar(benchmark::State& state, const char* net) {
   PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_3x3__scalar, xnn_x32_packx_ukernel_3x__scalar, 3, 3);
 }
 
-BENCHMARK_GEMM(sgemm_1x4__scalar)
-BENCHMARK_GEMM(sgemm_2x4__scalar)
-BENCHMARK_GEMM(sgemm_4x4__scalar)
+BENCHMARK_GEMM(f32_gemm_1x4__scalar)
+BENCHMARK_GEMM(f32_gemm_2x4__scalar)
+BENCHMARK_GEMM(f32_gemm_4x4__scalar)
 
-BENCHMARK_GEMM(sppmm_2x4_unipass__scalar)
-BENCHMARK_GEMM(sppmm_4x2_unipass__scalar)
-BENCHMARK_GEMM(sppmm_4x4_unipass__scalar)
-BENCHMARK_GEMM(sppmm_3x3_unipass__scalar)
+BENCHMARK_GEMM(f32_ppmm_2x4_unipass__scalar)
+BENCHMARK_GEMM(f32_ppmm_4x2_unipass__scalar)
+BENCHMARK_GEMM(f32_ppmm_4x4_unipass__scalar)
+BENCHMARK_GEMM(f32_ppmm_3x3_unipass__scalar)
 
-BENCHMARK_GEMM(sppmm_2x4_twopass__scalar)
-BENCHMARK_GEMM(sppmm_4x2_twopass__scalar)
-BENCHMARK_GEMM(sppmm_4x4_twopass__scalar)
-BENCHMARK_GEMM(sppmm_3x3_twopass__scalar)
+BENCHMARK_GEMM(f32_ppmm_2x4_twopass__scalar)
+BENCHMARK_GEMM(f32_ppmm_4x2_twopass__scalar)
+BENCHMARK_GEMM(f32_ppmm_4x4_twopass__scalar)
+BENCHMARK_GEMM(f32_ppmm_3x3_twopass__scalar)
 
 
 #ifdef BENCHMARK_RUY
diff --git a/bench/f32-im2col-gemm.cc b/bench/f32-im2col-gemm.cc
index c258a25..11ae5ee 100644
--- a/bench/f32-im2col-gemm.cc
+++ b/bench/f32-im2col-gemm.cc
@@ -25,7 +25,7 @@
 
 
 static void Im2ColGEMMBenchmark(benchmark::State& state,
-  xnn_f32_gemm_ukernel_function sgemm,
+  xnn_f32_gemm_ukernel_function f32_gemm,
   uint32_t mr, uint32_t nr, uint32_t kr, uint32_t sr)
 {
   if (!cpuinfo_initialize()) {
@@ -114,7 +114,7 @@
       const uint32_t mb = min(output_size - m, mr);
       for (uint32_t n = 0; n < group_output_channels; n += nr) {
         const uint32_t nb = min(group_output_channels - n, nr);
-        sgemm(
+        f32_gemm(
           mb, nb, kernel_size * group_input_channels * sizeof(float),
           inputData + m * kernel_size * group_input_channels, kernel_size * group_input_channels * sizeof(float),
           w.data() + (buffer_index * nc_stride + n) * (kernel_size * kc_stride + 1),
@@ -135,31 +135,31 @@
 
 
 #if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
-  static void sgemm_4x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
+  static void f32_gemm_4x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
     Im2ColGEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75, 4, 8, 1, 1);
   }
 
-  BENCHMARK_CONV(sgemm_4x8__aarch64_neonfma_cortex_a75)
+  BENCHMARK_CONV(f32_gemm_4x8__aarch64_neonfma_cortex_a75)
 #endif  // XNN_ARCH_ARM64
 
 #if !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
-  static void sgemm_6x8__psimd_loadsplat(benchmark::State& state, const char* net) {
+  static void f32_gemm_6x8__psimd_loadsplat(benchmark::State& state, const char* net) {
     Im2ColGEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__psimd_loadsplat, 6, 8, 1, 1);
   }
 
-  BENCHMARK_CONV(sgemm_6x8__psimd_loadsplat)
+  BENCHMARK_CONV(f32_gemm_6x8__psimd_loadsplat)
 #endif  // !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
 
-static void sgemm_2x4__scalar(benchmark::State& state, const char* net) {
+static void f32_gemm_2x4__scalar(benchmark::State& state, const char* net) {
   Im2ColGEMMBenchmark(state, xnn_f32_gemm_ukernel_2x4__scalar, 2, 4, 1, 1);
 }
 
-static void sgemm_4x4__scalar(benchmark::State& state, const char* net) {
+static void f32_gemm_4x4__scalar(benchmark::State& state, const char* net) {
   Im2ColGEMMBenchmark(state, xnn_f32_gemm_ukernel_4x4__scalar, 4, 4, 1, 1);
 }
 
-BENCHMARK_CONV(sgemm_2x4__scalar)
-BENCHMARK_CONV(sgemm_4x4__scalar)
+BENCHMARK_CONV(f32_gemm_2x4__scalar)
+BENCHMARK_CONV(f32_gemm_4x4__scalar)
 
 
 #ifndef XNNPACK_BENCHMARK_NO_MAIN