QS8 C4 Neon GEMM and E2E benchmarks
PiperOrigin-RevId: 407932393
diff --git a/bench/qs8-gemm.cc b/bench/qs8-gemm.cc
index 430832e..a125e50 100644
--- a/bench/qs8-gemm.cc
+++ b/bench/qs8-gemm.cc
@@ -427,6 +427,70 @@
GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_padal_dup, 4, 16, 2, 1,
xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
}
+ static void qs8_gemm_1x8c4__neon_mull_padal_dup(benchmark::State& state, const char* net) {  // Benchmark entry for the 1x8c4 NEON mull_padal_dup rndnu GEMM microkernel; `net` is not used in this body.
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup, 1, 8, 4, 1,  // 1, 8, 4 match the "1x8c4" tile in the kernel name (presumably mr, nr, kr); trailing 1 is presumably sr -- TODO confirm against GEMMBenchmark.
+ xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);  // rndnu NEON params initializer; CheckNEON is presumably an ISA guard -- confirm.
+ }
+ static void qs8_gemm_2x8c4__neon_mull_padal_dup(benchmark::State& state, const char* net) {  // Benchmark entry for the 2x8c4 NEON mull_padal_dup rndnu GEMM microkernel; `net` is not used in this body.
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup, 2, 8, 4, 1,  // 2, 8, 4 match the "2x8c4" tile in the kernel name (presumably mr, nr, kr); trailing 1 is presumably sr -- TODO confirm against GEMMBenchmark.
+ xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);  // rndnu NEON params initializer; CheckNEON is presumably an ISA guard -- confirm.
+ }
+ static void qs8_gemm_3x8c4__neon_mull_padal_dup(benchmark::State& state, const char* net) {  // Benchmark entry for the 3x8c4 NEON mull_padal_dup rndnu GEMM microkernel; `net` is not used in this body.
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup, 3, 8, 4, 1,  // 3, 8, 4 match the "3x8c4" tile in the kernel name (presumably mr, nr, kr); trailing 1 is presumably sr -- TODO confirm against GEMMBenchmark.
+ xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);  // rndnu NEON params initializer; CheckNEON is presumably an ISA guard -- confirm.
+ }
+ static void qs8_gemm_4x8c4__neon_mull_padal_dup(benchmark::State& state, const char* net) {  // Benchmark entry for the 4x8c4 NEON mull_padal_dup rndnu GEMM microkernel; `net` is not used in this body.
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup, 4, 8, 4, 1,  // 4, 8, 4 match the "4x8c4" tile in the kernel name (presumably mr, nr, kr); trailing 1 is presumably sr -- TODO confirm against GEMMBenchmark.
+ xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);  // rndnu NEON params initializer; CheckNEON is presumably an ISA guard -- confirm.
+ }
+ static void qs8_gemm_1x16c4__neon_mull_padal_dup(benchmark::State& state, const char* net) {  // Benchmark entry for the 1x16c4 NEON mull_padal_dup rndnu GEMM microkernel; `net` is not used in this body.
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup, 1, 16, 4, 1,  // 1, 16, 4 match the "1x16c4" tile in the kernel name (presumably mr, nr, kr); trailing 1 is presumably sr -- TODO confirm against GEMMBenchmark.
+ xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);  // rndnu NEON params initializer; CheckNEON is presumably an ISA guard -- confirm.
+ }
+ static void qs8_gemm_2x16c4__neon_mull_padal_dup(benchmark::State& state, const char* net) {  // Benchmark entry for the 2x16c4 NEON mull_padal_dup rndnu GEMM microkernel; `net` is not used in this body.
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup, 2, 16, 4, 1,  // 2, 16, 4 match the "2x16c4" tile in the kernel name (presumably mr, nr, kr); trailing 1 is presumably sr -- TODO confirm against GEMMBenchmark.
+ xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);  // rndnu NEON params initializer; CheckNEON is presumably an ISA guard -- confirm.
+ }
+ static void qs8_gemm_3x16c4__neon_mull_padal_dup(benchmark::State& state, const char* net) {  // Benchmark entry for the 3x16c4 NEON mull_padal_dup rndnu GEMM microkernel; `net` is not used in this body.
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup, 3, 16, 4, 1,  // 3, 16, 4 match the "3x16c4" tile in the kernel name (presumably mr, nr, kr); trailing 1 is presumably sr -- TODO confirm against GEMMBenchmark.
+ xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);  // rndnu NEON params initializer; CheckNEON is presumably an ISA guard -- confirm.
+ }
+ static void qs8_gemm_4x16c4__neon_mull_padal_dup(benchmark::State& state, const char* net) {  // Benchmark entry for the 4x16c4 NEON mull_padal_dup rndnu GEMM microkernel; `net` is not used in this body.
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup, 4, 16, 4, 1,  // 4, 16, 4 match the "4x16c4" tile in the kernel name (presumably mr, nr, kr); trailing 1 is presumably sr -- TODO confirm against GEMMBenchmark.
+ xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);  // rndnu NEON params initializer; CheckNEON is presumably an ISA guard -- confirm.
+ }
+ static void qs8_gemm_1x8c4__neon_mlal_padal_dup(benchmark::State& state, const char* net) {  // Benchmark entry for the 1x8c4 NEON mlal_padal_dup rndnu GEMM microkernel; `net` is not used in this body.
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup, 1, 8, 4, 1,  // 1, 8, 4 match the "1x8c4" tile in the kernel name (presumably mr, nr, kr); trailing 1 is presumably sr -- TODO confirm against GEMMBenchmark.
+ xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);  // rndnu NEON params initializer; CheckNEON is presumably an ISA guard -- confirm.
+ }
+ static void qs8_gemm_2x8c4__neon_mlal_padal_dup(benchmark::State& state, const char* net) {  // Benchmark entry for the 2x8c4 NEON mlal_padal_dup rndnu GEMM microkernel; `net` is not used in this body.
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup, 2, 8, 4, 1,  // 2, 8, 4 match the "2x8c4" tile in the kernel name (presumably mr, nr, kr); trailing 1 is presumably sr -- TODO confirm against GEMMBenchmark.
+ xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);  // rndnu NEON params initializer; CheckNEON is presumably an ISA guard -- confirm.
+ }
+ static void qs8_gemm_3x8c4__neon_mlal_padal_dup(benchmark::State& state, const char* net) {  // Benchmark entry for the 3x8c4 NEON mlal_padal_dup rndnu GEMM microkernel; `net` is not used in this body.
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup, 3, 8, 4, 1,  // 3, 8, 4 match the "3x8c4" tile in the kernel name (presumably mr, nr, kr); trailing 1 is presumably sr -- TODO confirm against GEMMBenchmark.
+ xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);  // rndnu NEON params initializer; CheckNEON is presumably an ISA guard -- confirm.
+ }
+ static void qs8_gemm_4x8c4__neon_mlal_padal_dup(benchmark::State& state, const char* net) {  // Benchmark entry for the 4x8c4 NEON mlal_padal_dup rndnu GEMM microkernel; `net` is not used in this body.
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup, 4, 8, 4, 1,  // 4, 8, 4 match the "4x8c4" tile in the kernel name (presumably mr, nr, kr); trailing 1 is presumably sr -- TODO confirm against GEMMBenchmark.
+ xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);  // rndnu NEON params initializer; CheckNEON is presumably an ISA guard -- confirm.
+ }
+ static void qs8_gemm_1x16c4__neon_mlal_padal_dup(benchmark::State& state, const char* net) {  // Benchmark entry for the 1x16c4 NEON mlal_padal_dup rndnu GEMM microkernel; `net` is not used in this body.
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup, 1, 16, 4, 1,  // 1, 16, 4 match the "1x16c4" tile in the kernel name (presumably mr, nr, kr); trailing 1 is presumably sr -- TODO confirm against GEMMBenchmark.
+ xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);  // rndnu NEON params initializer; CheckNEON is presumably an ISA guard -- confirm.
+ }
+ static void qs8_gemm_2x16c4__neon_mlal_padal_dup(benchmark::State& state, const char* net) {  // Benchmark entry for the 2x16c4 NEON mlal_padal_dup rndnu GEMM microkernel; `net` is not used in this body.
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup, 2, 16, 4, 1,  // 2, 16, 4 match the "2x16c4" tile in the kernel name (presumably mr, nr, kr); trailing 1 is presumably sr -- TODO confirm against GEMMBenchmark.
+ xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);  // rndnu NEON params initializer; CheckNEON is presumably an ISA guard -- confirm.
+ }
+ static void qs8_gemm_3x16c4__neon_mlal_padal_dup(benchmark::State& state, const char* net) {  // Benchmark entry for the 3x16c4 NEON mlal_padal_dup rndnu GEMM microkernel; `net` is not used in this body.
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup, 3, 16, 4, 1,  // 3, 16, 4 match the "3x16c4" tile in the kernel name (presumably mr, nr, kr); trailing 1 is presumably sr -- TODO confirm against GEMMBenchmark.
+ xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);  // rndnu NEON params initializer; CheckNEON is presumably an ISA guard -- confirm.
+ }
+ static void qs8_gemm_4x16c4__neon_mlal_padal_dup(benchmark::State& state, const char* net) {  // Benchmark entry for the 4x16c4 NEON mlal_padal_dup rndnu GEMM microkernel; `net` is not used in this body.
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup, 4, 16, 4, 1,  // 4, 16, 4 match the "4x16c4" tile in the kernel name (presumably mr, nr, kr); trailing 1 is presumably sr -- TODO confirm against GEMMBenchmark.
+ xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);  // rndnu NEON params initializer; CheckNEON is presumably an ISA guard -- confirm.
+ }
static void qs8_gemm_1x8c8_gemmlowp__neon_mull_padal(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal, 1, 8, 8, 1,
xnn_init_qs8_conv_minmax_gemmlowp_neon_params, benchmark::utils::CheckNEON);
@@ -555,21 +619,26 @@
GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_8x16c4__neondot, 8, 16, 4, 1,
xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
}
-
- BENCHMARK_GEMM(qs8_gemm_1x8_gemmlowp__neon_mlal_lane)
- BENCHMARK_GEMM(qs8_gemm_2x8_gemmlowp__neon_mlal_lane)
- BENCHMARK_GEMM(qs8_gemm_3x8_gemmlowp__neon_mlal_lane)
- BENCHMARK_GEMM(qs8_gemm_4x8_gemmlowp__neon_mlal_lane)
- BENCHMARK_GEMM(qs8_gemm_6x8_gemmlowp__neon_mlal_lane)
- BENCHMARK_GEMM(qs8_gemm_1x16_gemmlowp__neon_mlal_lane)
- BENCHMARK_GEMM(qs8_gemm_2x16_gemmlowp__neon_mlal_lane)
- BENCHMARK_GEMM(qs8_gemm_3x16_gemmlowp__neon_mlal_lane)
- BENCHMARK_GEMM(qs8_gemm_4x16_gemmlowp__neon_mlal_lane)
- BENCHMARK_GEMM(qs8_gemm_6x16_gemmlowp__neon_mlal_lane)
BENCHMARK_GEMM(qs8_gemm_1x8s4c2__neon_mull_padal)
BENCHMARK_GEMM(qs8_gemm_2x8s4c2__neon_mull_padal)
BENCHMARK_GEMM(qs8_gemm_1x8s4c2__neon_mlal_padal)
BENCHMARK_GEMM(qs8_gemm_2x8s4c2__neon_mlal_padal)
+ BENCHMARK_GEMM(qs8_gemm_1x8c4__neon_mull_padal_dup)  // c4 mull_padal_dup wrappers: 1-4 rows x 8 or 16 columns (BENCHMARK_GEMM presumably registers each with the benchmark framework -- confirm at the macro definition).
+ BENCHMARK_GEMM(qs8_gemm_2x8c4__neon_mull_padal_dup)
+ BENCHMARK_GEMM(qs8_gemm_3x8c4__neon_mull_padal_dup)
+ BENCHMARK_GEMM(qs8_gemm_4x8c4__neon_mull_padal_dup)
+ BENCHMARK_GEMM(qs8_gemm_1x16c4__neon_mull_padal_dup)
+ BENCHMARK_GEMM(qs8_gemm_2x16c4__neon_mull_padal_dup)
+ BENCHMARK_GEMM(qs8_gemm_3x16c4__neon_mull_padal_dup)
+ BENCHMARK_GEMM(qs8_gemm_4x16c4__neon_mull_padal_dup)
+ BENCHMARK_GEMM(qs8_gemm_1x8c4__neon_mlal_padal_dup)  // c4 mlal_padal_dup wrappers: same 1-4 rows x 8 or 16 columns grid as the mull group.
+ BENCHMARK_GEMM(qs8_gemm_2x8c4__neon_mlal_padal_dup)
+ BENCHMARK_GEMM(qs8_gemm_3x8c4__neon_mlal_padal_dup)
+ BENCHMARK_GEMM(qs8_gemm_4x8c4__neon_mlal_padal_dup)
+ BENCHMARK_GEMM(qs8_gemm_1x16c4__neon_mlal_padal_dup)
+ BENCHMARK_GEMM(qs8_gemm_2x16c4__neon_mlal_padal_dup)
+ BENCHMARK_GEMM(qs8_gemm_3x16c4__neon_mlal_padal_dup)
+ BENCHMARK_GEMM(qs8_gemm_4x16c4__neon_mlal_padal_dup)
BENCHMARK_GEMM(qs8_gemm_1x8c2__neon_mull_padal_dup)
BENCHMARK_GEMM(qs8_gemm_2x8c2__neon_mull_padal_dup)
BENCHMARK_GEMM(qs8_gemm_3x8c2__neon_mull_padal_dup)
@@ -586,6 +655,16 @@
BENCHMARK_GEMM(qs8_gemm_2x16c2__neon_mlal_padal_dup)
BENCHMARK_GEMM(qs8_gemm_3x16c2__neon_mlal_padal_dup)
BENCHMARK_GEMM(qs8_gemm_4x16c2__neon_mlal_padal_dup)
+ BENCHMARK_GEMM(qs8_gemm_1x8_gemmlowp__neon_mlal_lane)  // gemmlowp mlal_lane wrappers, 8-column tiles: 1, 2, 3, 4 and 6 rows.
+ BENCHMARK_GEMM(qs8_gemm_2x8_gemmlowp__neon_mlal_lane)
+ BENCHMARK_GEMM(qs8_gemm_3x8_gemmlowp__neon_mlal_lane)
+ BENCHMARK_GEMM(qs8_gemm_4x8_gemmlowp__neon_mlal_lane)
+ BENCHMARK_GEMM(qs8_gemm_6x8_gemmlowp__neon_mlal_lane)
+ BENCHMARK_GEMM(qs8_gemm_1x16_gemmlowp__neon_mlal_lane)  // gemmlowp mlal_lane wrappers, 16-column tiles: 1, 2, 3, 4 and 6 rows.
+ BENCHMARK_GEMM(qs8_gemm_2x16_gemmlowp__neon_mlal_lane)
+ BENCHMARK_GEMM(qs8_gemm_3x16_gemmlowp__neon_mlal_lane)
+ BENCHMARK_GEMM(qs8_gemm_4x16_gemmlowp__neon_mlal_lane)
+ BENCHMARK_GEMM(qs8_gemm_6x16_gemmlowp__neon_mlal_lane)
BENCHMARK_GEMM(qs8_gemm_1x8c8_gemmlowp__neon_mull_padal)
BENCHMARK_GEMM(qs8_gemm_2x8c8_gemmlowp__neon_mull_padal)
BENCHMARK_GEMM(qs8_gemm_3x8c8_gemmlowp__neon_mull_padal)