Add 4x8 A53 GEMM and GEMMINC unpipelined microkernels.

PiperOrigin-RevId: 276743130
diff --git a/bench/f32-gemm.cc b/bench/f32-gemm.cc
index 15eacbb..f2a3973 100644
--- a/bench/f32-gemm.cc
+++ b/bench/f32-gemm.cc
@@ -351,6 +351,9 @@
   static void sgemm_4x12__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53, 4, 12, 1, 1);
   }
+  static void sgemm_4x8__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53, 4, 8, 1, 1);
+  }
   static void sgemm_4x8__aarch64_neonfma_cortex_a57(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57, 4, 8, 1, 1);
   }
@@ -390,6 +393,7 @@
   BENCHMARK_GEMM(sgemm_1x8__aarch64_neonfma_cortex_a57)
   BENCHMARK_GEMM(sgemm_1x8__aarch64_neonfma_cortex_a75)
   BENCHMARK_GEMM(sgemm_4x12__aarch64_neonfma_cortex_a53)
+  BENCHMARK_GEMM(sgemm_4x8__aarch64_neonfma_cortex_a53)
   BENCHMARK_GEMM(sgemm_4x8__aarch64_neonfma_cortex_a57)
   BENCHMARK_GEMM(sgemm_4x8__aarch64_neonfma_cortex_a75)
   BENCHMARK_GEMM(sgemm_4x8__aarch64_neonfma_ld128)