Add DUP Neon GEMM kernels for Exynos

PiperOrigin-RevId: 282066945
diff --git a/bench/f32-igemm.cc b/bench/f32-igemm.cc
index ac0a677..fc82032 100644
--- a/bench/f32-igemm.cc
+++ b/bench/f32-igemm.cc
@@ -154,55 +154,67 @@
   static void f32_igemm_1x8__neon_lane_ld64(benchmark::State& state, const char* net) {
     IGEMMBenchmark(state, xnn_f32_igemm_ukernel_1x8__neon_lane_ld64, 1, 8, 1, 1);
   }
-
   static void f32_igemm_4x2__neon_lane_ld64(benchmark::State& state, const char* net) {
     IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x2__neon_lane_ld64, 4, 2, 1, 1);
   }
-
   static void f32_igemm_4x4__neon_lane_ld64(benchmark::State& state, const char* net) {
     IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x4__neon_lane_ld64, 4, 4, 1, 1);
   }
-
   static void f32_igemm_4x8__neon_lane_ld128(benchmark::State& state, const char* net) {
     IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__neon_lane_ld128, 4, 8, 1, 1);
   }
-
   static void f32_igemm_4x8__neon_lane_ld64(benchmark::State& state, const char* net) {
     IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__neon_lane_ld64, 4, 8, 1, 1);
   }
 
+  static void f32_igemm_1x8__neon_dup_ld64(benchmark::State& state, const char* net) {
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_1x8__neon_dup_ld64, 1, 8, 1, 1);
+  }
+  static void f32_igemm_4x8__neon_dup_ld128(benchmark::State& state, const char* net) {
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__neon_dup_ld128, 4, 8, 1, 1);
+  }
+  static void f32_igemm_4x8__neon_dup_ld64(benchmark::State& state, const char* net) {
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__neon_dup_ld64, 4, 8, 1, 1);
+  }
+  static void f32_igemm_6x8__neon_dup_ld64(benchmark::State& state, const char* net) {
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_6x8__neon_dup_ld64, 6, 8, 1, 1);
+  }
+  static void f32_igemm_1x8__neonfma_dup_ld64(benchmark::State& state, const char* net) {
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64, 1, 8, 1, 1);
+  }
+  static void f32_igemm_4x8__neonfma_dup_ld128(benchmark::State& state, const char* net) {
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128, 4, 8, 1, 1);
+  }
+  static void f32_igemm_4x8__neonfma_dup_ld64(benchmark::State& state, const char* net) {
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64, 4, 8, 1, 1);
+  }
+  static void f32_igemm_6x8__neonfma_dup_ld64(benchmark::State& state, const char* net) {
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64, 6, 8, 1, 1);
+  }
   static void f32_igemm_6x8__neon_lane_ld64(benchmark::State& state, const char* net) {
     IGEMMBenchmark(state, xnn_f32_igemm_ukernel_6x8__neon_lane_ld64, 6, 8, 1, 1);
   }
-
   static void f32_igemm_1x8s4__neon(benchmark::State& state, const char* net) {
     IGEMMBenchmark(state, xnn_f32_igemm_ukernel_1x8s4__neon, 1, 8, 1, 4);
   }
-
   static void f32_igemm_1x8s4__neonfma(benchmark::State& state, const char* net) {
     IGEMMBenchmark(state, xnn_f32_igemm_ukernel_1x8s4__neonfma, 1, 8, 1, 4);
   }
-
   static void f32_igemm_4x8s4__neon(benchmark::State& state, const char* net) {
     IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8s4__neon, 4, 8, 1, 4);
   }
-
   static void f32_igemm_4x8s4__neonfma(benchmark::State& state, const char* net) {
     IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8s4__neonfma, 4, 8, 1, 4);
   }
-
   static void f32_igemm_6x8s4__neon(benchmark::State& state, const char* net) {
     IGEMMBenchmark(state, xnn_f32_igemm_ukernel_6x8s4__neon, 6, 8, 1, 4);
   }
-
   static void f32_igemm_6x8s4__neonfma(benchmark::State& state, const char* net) {
     IGEMMBenchmark(state, xnn_f32_igemm_ukernel_6x8s4__neonfma, 6, 8, 1, 4);
   }
-
   static void f32_igemm_8x8s4__neon(benchmark::State& state, const char* net) {
     IGEMMBenchmark(state, xnn_f32_igemm_ukernel_8x8s4__neon, 8, 8, 1, 4);
   }
-
   static void f32_igemm_8x8s4__neonfma(benchmark::State& state, const char* net) {
     IGEMMBenchmark(state, xnn_f32_igemm_ukernel_8x8s4__neonfma, 8, 8, 1, 4);
   }
@@ -213,6 +225,14 @@
   BENCHMARK_CONV(f32_igemm_4x8__neon_lane_ld128)
   BENCHMARK_CONV(f32_igemm_4x8__neon_lane_ld64)
   BENCHMARK_CONV(f32_igemm_6x8__neon_lane_ld64)
+  BENCHMARK_CONV(f32_igemm_1x8__neon_dup_ld64)
+  BENCHMARK_CONV(f32_igemm_4x8__neon_dup_ld128)
+  BENCHMARK_CONV(f32_igemm_4x8__neon_dup_ld64)
+  BENCHMARK_CONV(f32_igemm_6x8__neon_dup_ld64)
+  BENCHMARK_CONV(f32_igemm_1x8__neonfma_dup_ld64)
+  BENCHMARK_CONV(f32_igemm_4x8__neonfma_dup_ld128)
+  BENCHMARK_CONV(f32_igemm_4x8__neonfma_dup_ld64)
+  BENCHMARK_CONV(f32_igemm_6x8__neonfma_dup_ld64)
   BENCHMARK_CONV(f32_igemm_1x8s4__neon)
   BENCHMARK_CONV(f32_igemm_1x8s4__neonfma)
   BENCHMARK_CONV(f32_igemm_4x8s4__neon)