FP32 requantization in QS8 GEMM/IGEMM microkernels for SSE/AVX/XOP
PiperOrigin-RevId: 376966195
diff --git a/bench/qs8-gemm.cc b/bench/qs8-gemm.cc
index c5ac7f6..48c41ab 100644
--- a/bench/qs8-gemm.cc
+++ b/bench/qs8-gemm.cc
@@ -742,27 +742,51 @@
GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c2__xop_ld64, 2, 4, 2, 1,
xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckXOP);
}
+ static void qs8_gemm_2x4c2_fp32__xop_ld64(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, 2, 4, 2, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
+ }
static void qs8_gemm_3x4c2_gemmlowp__xop_ld64(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c2__xop_ld64, 3, 4, 2, 1,
xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckXOP);
}
+ static void qs8_gemm_3x4c2_fp32__xop_ld64(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, 3, 4, 2, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
+ }
static void qs8_gemm_4x4c2_gemmlowp__xop_ld64(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x4c2__xop_ld64, 4, 4, 2, 1,
xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckXOP);
}
+ static void qs8_gemm_4x4c2_fp32__xop_ld64(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, 4, 4, 2, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
+ }
static void qs8_gemm_2x4c2_gemmlowp__xop_ld128(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c2__xop_ld128, 2, 4, 2, 1,
xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckXOP);
}
+ static void qs8_gemm_2x4c2_fp32__xop_ld128(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, 2, 4, 2, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
+ }
static void qs8_gemm_3x4c2_gemmlowp__xop_ld128(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c2__xop_ld128, 3, 4, 2, 1,
xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckXOP);
}
+ static void qs8_gemm_3x4c2_fp32__xop_ld128(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, 3, 4, 2, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
+ }
static void qs8_gemm_4x4c2_gemmlowp__xop_ld128(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x4c2__xop_ld128, 4, 4, 2, 1,
xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckXOP);
}
+ static void qs8_gemm_4x4c2_fp32__xop_ld128(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, 4, 4, 2, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
+ }
static void qs8_gemm_xw_2x4c2_gemmlowp__xop(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c2__xop, 2, 4, 2, 1,
@@ -781,19 +805,35 @@
GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c8__xop_ld64, 2, 4, 8, 1,
xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckXOP);
}
+ static void qs8_gemm_2x4c8_fp32__xop_ld64(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, 2, 4, 8, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
+ }
static void qs8_gemm_3x4c8_gemmlowp__xop_ld64(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c8__xop_ld64, 3, 4, 8, 1,
xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckXOP);
}
+ static void qs8_gemm_3x4c8_fp32__xop_ld64(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, 3, 4, 8, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
+ }
static void qs8_gemm_2x4c8_gemmlowp__xop_ld128(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c8__xop_ld128, 2, 4, 8, 1,
xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckXOP);
}
+ static void qs8_gemm_2x4c8_fp32__xop_ld128(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, 2, 4, 8, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
+ }
static void qs8_gemm_3x4c8_gemmlowp__xop_ld128(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c8__xop_ld128, 3, 4, 8, 1,
xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckXOP);
}
+ static void qs8_gemm_3x4c8_fp32__xop_ld128(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, 3, 4, 8, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
+ }
static void qs8_gemm_xw_2x4c8_gemmlowp__xop(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop, 2, 4, 8, 1,
@@ -808,27 +848,51 @@
GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c2__avx_ld64, 2, 4, 2, 1,
xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckAVX);
}
+ static void qs8_gemm_2x4c2_fp32__avx_ld64(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, 2, 4, 2, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
+ }
static void qs8_gemm_3x4c2_gemmlowp__avx_ld64(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c2__avx_ld64, 3, 4, 2, 1,
xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckAVX);
}
+ static void qs8_gemm_3x4c2_fp32__avx_ld64(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, 3, 4, 2, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
+ }
static void qs8_gemm_4x4c2_gemmlowp__avx_ld64(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x4c2__avx_ld64, 4, 4, 2, 1,
xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckAVX);
}
+ static void qs8_gemm_4x4c2_fp32__avx_ld64(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, 4, 4, 2, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
+ }
static void qs8_gemm_2x4c2_gemmlowp__avx_ld128(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c2__avx_ld128, 2, 4, 2, 1,
xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckAVX);
}
+ static void qs8_gemm_2x4c2_fp32__avx_ld128(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, 2, 4, 2, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
+ }
static void qs8_gemm_3x4c2_gemmlowp__avx_ld128(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c2__avx_ld128, 3, 4, 2, 1,
xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckAVX);
}
+ static void qs8_gemm_3x4c2_fp32__avx_ld128(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, 3, 4, 2, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
+ }
static void qs8_gemm_4x4c2_gemmlowp__avx_ld128(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x4c2__avx_ld128, 4, 4, 2, 1,
xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckAVX);
}
+ static void qs8_gemm_4x4c2_fp32__avx_ld128(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, 4, 4, 2, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
+ }
static void qs8_gemm_xw_2x4c2_gemmlowp__avx(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c2__avx, 2, 4, 2, 1,
@@ -847,19 +911,35 @@
GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c8__avx_ld64, 2, 4, 8, 1,
xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckAVX);
}
+ static void qs8_gemm_2x4c8_fp32__avx_ld64(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, 2, 4, 8, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
+ }
static void qs8_gemm_3x4c8_gemmlowp__avx_ld64(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c8__avx_ld64, 3, 4, 8, 1,
xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckAVX);
}
+ static void qs8_gemm_3x4c8_fp32__avx_ld64(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, 3, 4, 8, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
+ }
static void qs8_gemm_2x4c8_gemmlowp__avx_ld128(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c8__avx_ld128, 2, 4, 8, 1,
xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckAVX);
}
+ static void qs8_gemm_2x4c8_fp32__avx_ld128(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, 2, 4, 8, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
+ }
static void qs8_gemm_3x4c8_gemmlowp__avx_ld128(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c8__avx_ld128, 3, 4, 8, 1,
xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckAVX);
}
+ static void qs8_gemm_3x4c8_fp32__avx_ld128(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, 3, 4, 8, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
+ }
static void qs8_gemm_xw_2x4c8_gemmlowp__avx(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx, 2, 4, 8, 1,
@@ -874,27 +954,51 @@
GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c2__sse41_ld64, 2, 4, 2, 1,
xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckSSE41);
}
+ static void qs8_gemm_2x4c2_fp32__sse41_ld64(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, 2, 4, 2, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
+ }
static void qs8_gemm_3x4c2_gemmlowp__sse41_ld64(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c2__sse41_ld64, 3, 4, 2, 1,
xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckSSE41);
}
+ static void qs8_gemm_3x4c2_fp32__sse41_ld64(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, 3, 4, 2, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
+ }
static void qs8_gemm_4x4c2_gemmlowp__sse41_ld64(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x4c2__sse41_ld64, 4, 4, 2, 1,
xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckSSE41);
}
+ static void qs8_gemm_4x4c2_fp32__sse41_ld64(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, 4, 4, 2, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
+ }
static void qs8_gemm_2x4c2_gemmlowp__sse41_ld128(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c2__sse41_ld128, 2, 4, 2, 1,
xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckSSE41);
}
+ static void qs8_gemm_2x4c2_fp32__sse41_ld128(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, 2, 4, 2, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
+ }
static void qs8_gemm_3x4c2_gemmlowp__sse41_ld128(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c2__sse41_ld128, 3, 4, 2, 1,
xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckSSE41);
}
+ static void qs8_gemm_3x4c2_fp32__sse41_ld128(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, 3, 4, 2, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
+ }
static void qs8_gemm_4x4c2_gemmlowp__sse41_ld128(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x4c2__sse41_ld128, 4, 4, 2, 1,
xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckSSE41);
}
+ static void qs8_gemm_4x4c2_fp32__sse41_ld128(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, 4, 4, 2, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
+ }
static void qs8_gemm_xw_2x4c2_gemmlowp__sse41(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c2__sse41, 2, 4, 2, 1,
@@ -913,19 +1017,35 @@
GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c8__sse41_ld64, 2, 4, 8, 1,
xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckSSE41);
}
+ static void qs8_gemm_2x4c8_fp32__sse41_ld64(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, 2, 4, 8, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
+ }
static void qs8_gemm_3x4c8_gemmlowp__sse41_ld64(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c8__sse41_ld64, 3, 4, 8, 1,
xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckSSE41);
}
+ static void qs8_gemm_3x4c8_fp32__sse41_ld64(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, 3, 4, 8, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
+ }
static void qs8_gemm_2x4c8_gemmlowp__sse41_ld128(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c8__sse41_ld128, 2, 4, 8, 1,
xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckSSE41);
}
+ static void qs8_gemm_2x4c8_fp32__sse41_ld128(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, 2, 4, 8, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
+ }
static void qs8_gemm_3x4c8_gemmlowp__sse41_ld128(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c8__sse41_ld128, 3, 4, 8, 1,
xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckSSE41);
}
+ static void qs8_gemm_3x4c8_fp32__sse41_ld128(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, 3, 4, 8, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
+ }
static void qs8_gemm_xw_2x4c8_gemmlowp__sse41(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41, 2, 4, 8, 1,
@@ -940,27 +1060,51 @@
GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c2__ssse3_ld64, 2, 4, 2, 1,
xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, benchmark::utils::CheckSSSE3);
}
+ static void qs8_gemm_2x4c2_fp32__ssse3_ld64(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__ssse3_ld64, 2, 4, 2, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse2_params, benchmark::utils::CheckSSSE3);
+ }
static void qs8_gemm_3x4c2_gemmlowp__ssse3_ld64(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c2__ssse3_ld64, 3, 4, 2, 1,
xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, benchmark::utils::CheckSSSE3);
}
+ static void qs8_gemm_3x4c2_fp32__ssse3_ld64(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__ssse3_ld64, 3, 4, 2, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse2_params, benchmark::utils::CheckSSSE3);
+ }
static void qs8_gemm_4x4c2_gemmlowp__ssse3_ld64(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x4c2__ssse3_ld64, 4, 4, 2, 1,
xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, benchmark::utils::CheckSSSE3);
}
+ static void qs8_gemm_4x4c2_fp32__ssse3_ld64(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__ssse3_ld64, 4, 4, 2, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse2_params, benchmark::utils::CheckSSSE3);
+ }
static void qs8_gemm_2x4c2_gemmlowp__ssse3_ld128(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c2__ssse3_ld128, 2, 4, 2, 1,
xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, benchmark::utils::CheckSSSE3);
}
+ static void qs8_gemm_2x4c2_fp32__ssse3_ld128(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__ssse3_ld128, 2, 4, 2, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse2_params, benchmark::utils::CheckSSSE3);
+ }
static void qs8_gemm_3x4c2_gemmlowp__ssse3_ld128(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c2__ssse3_ld128, 3, 4, 2, 1,
xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, benchmark::utils::CheckSSSE3);
}
+ static void qs8_gemm_3x4c2_fp32__ssse3_ld128(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__ssse3_ld128, 3, 4, 2, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse2_params, benchmark::utils::CheckSSSE3);
+ }
static void qs8_gemm_4x4c2_gemmlowp__ssse3_ld128(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x4c2__ssse3_ld128, 4, 4, 2, 1,
xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, benchmark::utils::CheckSSSE3);
}
+ static void qs8_gemm_4x4c2_fp32__ssse3_ld128(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__ssse3_ld128, 4, 4, 2, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse2_params, benchmark::utils::CheckSSSE3);
+ }
static void qs8_gemm_xw_2x4c2_gemmlowp__ssse3(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c2__ssse3, 2, 4, 2, 1,
@@ -979,19 +1123,35 @@
GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c8__ssse3_ld64, 2, 4, 8, 1,
xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, benchmark::utils::CheckSSSE3);
}
+ static void qs8_gemm_2x4c8_fp32__ssse3_ld64(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64, 2, 4, 8, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse2_params, benchmark::utils::CheckSSSE3);
+ }
static void qs8_gemm_3x4c8_gemmlowp__ssse3_ld64(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c8__ssse3_ld64, 3, 4, 8, 1,
xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, benchmark::utils::CheckSSSE3);
}
+ static void qs8_gemm_3x4c8_fp32__ssse3_ld64(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64, 3, 4, 8, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse2_params, benchmark::utils::CheckSSSE3);
+ }
static void qs8_gemm_2x4c8_gemmlowp__ssse3_ld128(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c8__ssse3_ld128, 2, 4, 8, 1,
xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, benchmark::utils::CheckSSSE3);
}
+ static void qs8_gemm_2x4c8_fp32__ssse3_ld128(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128, 2, 4, 8, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse2_params, benchmark::utils::CheckSSSE3);
+ }
static void qs8_gemm_3x4c8_gemmlowp__ssse3_ld128(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c8__ssse3_ld128, 3, 4, 8, 1,
xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, benchmark::utils::CheckSSSE3);
}
+ static void qs8_gemm_3x4c8_fp32__ssse3_ld128(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128, 3, 4, 8, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse2_params, benchmark::utils::CheckSSSE3);
+ }
static void qs8_gemm_xw_2x4c8_gemmlowp__ssse3(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3, 2, 4, 8, 1,
@@ -1006,27 +1166,51 @@
GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c2__sse2_ld64, 2, 4, 2, 1,
xnn_init_qs8_conv_minmax_gemmlowp_sse2_params);
}
+ static void qs8_gemm_2x4c2_fp32__sse2_ld64(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, 2, 4, 2, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse2_params);
+ }
static void qs8_gemm_3x4c2_gemmlowp__sse2_ld64(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c2__sse2_ld64, 3, 4, 2, 1,
xnn_init_qs8_conv_minmax_gemmlowp_sse2_params);
}
+ static void qs8_gemm_3x4c2_fp32__sse2_ld64(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, 3, 4, 2, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse2_params);
+ }
static void qs8_gemm_4x4c2_gemmlowp__sse2_ld64(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x4c2__sse2_ld64, 4, 4, 2, 1,
xnn_init_qs8_conv_minmax_gemmlowp_sse2_params);
}
+ static void qs8_gemm_4x4c2_fp32__sse2_ld64(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, 4, 4, 2, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse2_params);
+ }
static void qs8_gemm_2x4c2_gemmlowp__sse2_ld128(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c2__sse2_ld128, 2, 4, 2, 1,
xnn_init_qs8_conv_minmax_gemmlowp_sse2_params);
}
+ static void qs8_gemm_2x4c2_fp32__sse2_ld128(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, 2, 4, 2, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse2_params);
+ }
static void qs8_gemm_3x4c2_gemmlowp__sse2_ld128(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c2__sse2_ld128, 3, 4, 2, 1,
xnn_init_qs8_conv_minmax_gemmlowp_sse2_params);
}
+ static void qs8_gemm_3x4c2_fp32__sse2_ld128(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, 3, 4, 2, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse2_params);
+ }
static void qs8_gemm_4x4c2_gemmlowp__sse2_ld128(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x4c2__sse2_ld128, 4, 4, 2, 1,
xnn_init_qs8_conv_minmax_gemmlowp_sse2_params);
}
+ static void qs8_gemm_4x4c2_fp32__sse2_ld128(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, 4, 4, 2, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse2_params);
+ }
static void qs8_gemm_xw_2x4c2_gemmlowp__sse2(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c2__sse2, 2, 4, 2, 1,
@@ -1045,19 +1229,35 @@
GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c8__sse2_ld64, 2, 4, 8, 1,
xnn_init_qs8_conv_minmax_gemmlowp_sse2_params);
}
+ static void qs8_gemm_2x4c8_fp32__sse2_ld64(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, 2, 4, 8, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse2_params);
+ }
static void qs8_gemm_3x4c8_gemmlowp__sse2_ld64(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c8__sse2_ld64, 3, 4, 8, 1,
xnn_init_qs8_conv_minmax_gemmlowp_sse2_params);
}
+ static void qs8_gemm_3x4c8_fp32__sse2_ld64(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, 3, 4, 8, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse2_params);
+ }
static void qs8_gemm_2x4c8_gemmlowp__sse2_ld128(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c8__sse2_ld128, 2, 4, 8, 1,
xnn_init_qs8_conv_minmax_gemmlowp_sse2_params);
}
+ static void qs8_gemm_2x4c8_fp32__sse2_ld128(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, 2, 4, 8, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse2_params);
+ }
static void qs8_gemm_3x4c8_gemmlowp__sse2_ld128(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c8__sse2_ld128, 3, 4, 8, 1,
xnn_init_qs8_conv_minmax_gemmlowp_sse2_params);
}
+ static void qs8_gemm_3x4c8_fp32__sse2_ld128(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, 3, 4, 8, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse2_params);
+ }
static void qs8_gemm_xw_2x4c8_gemmlowp__sse2(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2, 2, 4, 8, 1,
@@ -1083,82 +1283,132 @@
BENCHMARK_GEMM(qs8_gemm_xw_3x8c8_gemmlowp__avx2)
BENCHMARK_GEMM(qs8_gemm_2x4c2_gemmlowp__xop_ld64)
+ BENCHMARK_GEMM(qs8_gemm_2x4c2_fp32__xop_ld64)
BENCHMARK_GEMM(qs8_gemm_3x4c2_gemmlowp__xop_ld64)
+ BENCHMARK_GEMM(qs8_gemm_3x4c2_fp32__xop_ld64)
BENCHMARK_GEMM(qs8_gemm_4x4c2_gemmlowp__xop_ld64)
+ BENCHMARK_GEMM(qs8_gemm_4x4c2_fp32__xop_ld64)
BENCHMARK_GEMM(qs8_gemm_2x4c2_gemmlowp__xop_ld128)
+ BENCHMARK_GEMM(qs8_gemm_2x4c2_fp32__xop_ld128)
BENCHMARK_GEMM(qs8_gemm_3x4c2_gemmlowp__xop_ld128)
+ BENCHMARK_GEMM(qs8_gemm_3x4c2_fp32__xop_ld128)
BENCHMARK_GEMM(qs8_gemm_4x4c2_gemmlowp__xop_ld128)
+ BENCHMARK_GEMM(qs8_gemm_4x4c2_fp32__xop_ld128)
BENCHMARK_GEMM(qs8_gemm_xw_2x4c2_gemmlowp__xop)
BENCHMARK_GEMM(qs8_gemm_xw_3x4c2_gemmlowp__xop)
BENCHMARK_GEMM(qs8_gemm_xw_4x4c2_gemmlowp__xop)
BENCHMARK_GEMM(qs8_gemm_2x4c8_gemmlowp__xop_ld64)
+ BENCHMARK_GEMM(qs8_gemm_2x4c8_fp32__xop_ld64)
BENCHMARK_GEMM(qs8_gemm_3x4c8_gemmlowp__xop_ld64)
+ BENCHMARK_GEMM(qs8_gemm_3x4c8_fp32__xop_ld64)
BENCHMARK_GEMM(qs8_gemm_2x4c8_gemmlowp__xop_ld128)
+ BENCHMARK_GEMM(qs8_gemm_2x4c8_fp32__xop_ld128)
BENCHMARK_GEMM(qs8_gemm_3x4c8_gemmlowp__xop_ld128)
+ BENCHMARK_GEMM(qs8_gemm_3x4c8_fp32__xop_ld128)
BENCHMARK_GEMM(qs8_gemm_xw_2x4c8_gemmlowp__xop)
BENCHMARK_GEMM(qs8_gemm_xw_3x4c8_gemmlowp__xop)
BENCHMARK_GEMM(qs8_gemm_2x4c2_gemmlowp__avx_ld64)
+ BENCHMARK_GEMM(qs8_gemm_2x4c2_fp32__avx_ld64)
BENCHMARK_GEMM(qs8_gemm_3x4c2_gemmlowp__avx_ld64)
+ BENCHMARK_GEMM(qs8_gemm_3x4c2_fp32__avx_ld64)
BENCHMARK_GEMM(qs8_gemm_4x4c2_gemmlowp__avx_ld64)
+ BENCHMARK_GEMM(qs8_gemm_4x4c2_fp32__avx_ld64)
BENCHMARK_GEMM(qs8_gemm_2x4c2_gemmlowp__avx_ld128)
+ BENCHMARK_GEMM(qs8_gemm_2x4c2_fp32__avx_ld128)
BENCHMARK_GEMM(qs8_gemm_3x4c2_gemmlowp__avx_ld128)
+ BENCHMARK_GEMM(qs8_gemm_3x4c2_fp32__avx_ld128)
BENCHMARK_GEMM(qs8_gemm_4x4c2_gemmlowp__avx_ld128)
+ BENCHMARK_GEMM(qs8_gemm_4x4c2_fp32__avx_ld128)
BENCHMARK_GEMM(qs8_gemm_xw_2x4c2_gemmlowp__avx)
BENCHMARK_GEMM(qs8_gemm_xw_3x4c2_gemmlowp__avx)
BENCHMARK_GEMM(qs8_gemm_xw_4x4c2_gemmlowp__avx)
BENCHMARK_GEMM(qs8_gemm_2x4c8_gemmlowp__avx_ld64)
+ BENCHMARK_GEMM(qs8_gemm_2x4c8_fp32__avx_ld64)
BENCHMARK_GEMM(qs8_gemm_3x4c8_gemmlowp__avx_ld64)
+ BENCHMARK_GEMM(qs8_gemm_3x4c8_fp32__avx_ld64)
BENCHMARK_GEMM(qs8_gemm_2x4c8_gemmlowp__avx_ld128)
+ BENCHMARK_GEMM(qs8_gemm_2x4c8_fp32__avx_ld128)
BENCHMARK_GEMM(qs8_gemm_3x4c8_gemmlowp__avx_ld128)
+ BENCHMARK_GEMM(qs8_gemm_3x4c8_fp32__avx_ld128)
BENCHMARK_GEMM(qs8_gemm_xw_2x4c8_gemmlowp__avx)
BENCHMARK_GEMM(qs8_gemm_xw_3x4c8_gemmlowp__avx)
BENCHMARK_GEMM(qs8_gemm_2x4c2_gemmlowp__sse41_ld64)
+ BENCHMARK_GEMM(qs8_gemm_2x4c2_fp32__sse41_ld64)
BENCHMARK_GEMM(qs8_gemm_3x4c2_gemmlowp__sse41_ld64)
+ BENCHMARK_GEMM(qs8_gemm_3x4c2_fp32__sse41_ld64)
BENCHMARK_GEMM(qs8_gemm_4x4c2_gemmlowp__sse41_ld64)
+ BENCHMARK_GEMM(qs8_gemm_4x4c2_fp32__sse41_ld64)
BENCHMARK_GEMM(qs8_gemm_2x4c2_gemmlowp__sse41_ld128)
+ BENCHMARK_GEMM(qs8_gemm_2x4c2_fp32__sse41_ld128)
BENCHMARK_GEMM(qs8_gemm_3x4c2_gemmlowp__sse41_ld128)
+ BENCHMARK_GEMM(qs8_gemm_3x4c2_fp32__sse41_ld128)
BENCHMARK_GEMM(qs8_gemm_4x4c2_gemmlowp__sse41_ld128)
+ BENCHMARK_GEMM(qs8_gemm_4x4c2_fp32__sse41_ld128)
BENCHMARK_GEMM(qs8_gemm_xw_2x4c2_gemmlowp__sse41)
BENCHMARK_GEMM(qs8_gemm_xw_3x4c2_gemmlowp__sse41)
BENCHMARK_GEMM(qs8_gemm_xw_4x4c2_gemmlowp__sse41)
BENCHMARK_GEMM(qs8_gemm_2x4c8_gemmlowp__sse41_ld64)
+ BENCHMARK_GEMM(qs8_gemm_2x4c8_fp32__sse41_ld64)
BENCHMARK_GEMM(qs8_gemm_3x4c8_gemmlowp__sse41_ld64)
+ BENCHMARK_GEMM(qs8_gemm_3x4c8_fp32__sse41_ld64)
BENCHMARK_GEMM(qs8_gemm_2x4c8_gemmlowp__sse41_ld128)
+ BENCHMARK_GEMM(qs8_gemm_2x4c8_fp32__sse41_ld128)
BENCHMARK_GEMM(qs8_gemm_3x4c8_gemmlowp__sse41_ld128)
+ BENCHMARK_GEMM(qs8_gemm_3x4c8_fp32__sse41_ld128)
BENCHMARK_GEMM(qs8_gemm_xw_2x4c8_gemmlowp__sse41)
BENCHMARK_GEMM(qs8_gemm_xw_3x4c8_gemmlowp__sse41)
BENCHMARK_GEMM(qs8_gemm_2x4c2_gemmlowp__ssse3_ld64)
+ BENCHMARK_GEMM(qs8_gemm_2x4c2_fp32__ssse3_ld64)
BENCHMARK_GEMM(qs8_gemm_3x4c2_gemmlowp__ssse3_ld64)
+ BENCHMARK_GEMM(qs8_gemm_3x4c2_fp32__ssse3_ld64)
BENCHMARK_GEMM(qs8_gemm_4x4c2_gemmlowp__ssse3_ld64)
+ BENCHMARK_GEMM(qs8_gemm_4x4c2_fp32__ssse3_ld64)
BENCHMARK_GEMM(qs8_gemm_2x4c2_gemmlowp__ssse3_ld128)
+ BENCHMARK_GEMM(qs8_gemm_2x4c2_fp32__ssse3_ld128)
BENCHMARK_GEMM(qs8_gemm_3x4c2_gemmlowp__ssse3_ld128)
+ BENCHMARK_GEMM(qs8_gemm_3x4c2_fp32__ssse3_ld128)
BENCHMARK_GEMM(qs8_gemm_4x4c2_gemmlowp__ssse3_ld128)
+ BENCHMARK_GEMM(qs8_gemm_4x4c2_fp32__ssse3_ld128)
BENCHMARK_GEMM(qs8_gemm_xw_2x4c2_gemmlowp__ssse3)
BENCHMARK_GEMM(qs8_gemm_xw_3x4c2_gemmlowp__ssse3)
BENCHMARK_GEMM(qs8_gemm_xw_4x4c2_gemmlowp__ssse3)
BENCHMARK_GEMM(qs8_gemm_2x4c8_gemmlowp__ssse3_ld64)
+ BENCHMARK_GEMM(qs8_gemm_2x4c8_fp32__ssse3_ld64)
BENCHMARK_GEMM(qs8_gemm_3x4c8_gemmlowp__ssse3_ld64)
+ BENCHMARK_GEMM(qs8_gemm_3x4c8_fp32__ssse3_ld64)
BENCHMARK_GEMM(qs8_gemm_2x4c8_gemmlowp__ssse3_ld128)
+ BENCHMARK_GEMM(qs8_gemm_2x4c8_fp32__ssse3_ld128)
BENCHMARK_GEMM(qs8_gemm_3x4c8_gemmlowp__ssse3_ld128)
+ BENCHMARK_GEMM(qs8_gemm_3x4c8_fp32__ssse3_ld128)
BENCHMARK_GEMM(qs8_gemm_xw_2x4c8_gemmlowp__ssse3)
BENCHMARK_GEMM(qs8_gemm_xw_3x4c8_gemmlowp__ssse3)
BENCHMARK_GEMM(qs8_gemm_2x4c2_gemmlowp__sse2_ld64)
+ BENCHMARK_GEMM(qs8_gemm_2x4c2_fp32__sse2_ld64)
BENCHMARK_GEMM(qs8_gemm_3x4c2_gemmlowp__sse2_ld64)
+ BENCHMARK_GEMM(qs8_gemm_3x4c2_fp32__sse2_ld64)
BENCHMARK_GEMM(qs8_gemm_4x4c2_gemmlowp__sse2_ld64)
+ BENCHMARK_GEMM(qs8_gemm_4x4c2_fp32__sse2_ld64)
BENCHMARK_GEMM(qs8_gemm_2x4c2_gemmlowp__sse2_ld128)
+ BENCHMARK_GEMM(qs8_gemm_2x4c2_fp32__sse2_ld128)
BENCHMARK_GEMM(qs8_gemm_3x4c2_gemmlowp__sse2_ld128)
+ BENCHMARK_GEMM(qs8_gemm_3x4c2_fp32__sse2_ld128)
BENCHMARK_GEMM(qs8_gemm_4x4c2_gemmlowp__sse2_ld128)
+ BENCHMARK_GEMM(qs8_gemm_4x4c2_fp32__sse2_ld128)
BENCHMARK_GEMM(qs8_gemm_xw_2x4c2_gemmlowp__sse2)
BENCHMARK_GEMM(qs8_gemm_xw_3x4c2_gemmlowp__sse2)
BENCHMARK_GEMM(qs8_gemm_xw_4x4c2_gemmlowp__sse2)
BENCHMARK_GEMM(qs8_gemm_2x4c8_gemmlowp__sse2_ld64)
+ BENCHMARK_GEMM(qs8_gemm_2x4c8_fp32__sse2_ld64)
BENCHMARK_GEMM(qs8_gemm_3x4c8_gemmlowp__sse2_ld64)
+ BENCHMARK_GEMM(qs8_gemm_3x4c8_fp32__sse2_ld64)
BENCHMARK_GEMM(qs8_gemm_2x4c8_gemmlowp__sse2_ld128)
+ BENCHMARK_GEMM(qs8_gemm_2x4c8_fp32__sse2_ld128)
BENCHMARK_GEMM(qs8_gemm_3x4c8_gemmlowp__sse2_ld128)
+ BENCHMARK_GEMM(qs8_gemm_3x4c8_fp32__sse2_ld128)
BENCHMARK_GEMM(qs8_gemm_xw_2x4c8_gemmlowp__sse2)
BENCHMARK_GEMM(qs8_gemm_xw_3x4c8_gemmlowp__sse2)
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64