Enable AARCH32 4x8 GEMM kernel
Improves performance on Exynos by using VLDM.
Performance on other CPUs is comparable, but is now consistent
regardless of the compiler.
PiperOrigin-RevId: 283189038
diff --git a/src/init.c b/src/init.c
index e7ce5b3..60190bc 100644
--- a/src/init.c
+++ b/src/init.c
@@ -131,14 +131,25 @@
/**************************** F32 micro-kernels ****************************/
#ifndef XNN_NO_F32_OPERATORS
- xnn_params.f32.gemm = (struct gemm_parameters) {
- .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x8__neon_lane_ld128,
- .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x8__neon_lane_ld128,
- .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__neon_lane_ld64,
- .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__neon_lane_ld64,
- .mr = 4,
- .nr = 8,
- };
+ #if XNN_ENABLE_ASSEMBLY
+ xnn_params.f32.gemm = (struct gemm_parameters) {
+ .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x8__aarch32_neon_ld64,
+ .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x8__neon_lane_ld128,
+ .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__neon_lane_ld64,
+ .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__neon_lane_ld64,
+ .mr = 4,
+ .nr = 8,
+ };
+ #else // XNN_ENABLE_ASSEMBLY
+ xnn_params.f32.gemm = (struct gemm_parameters) {
+ .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x8__neon_lane_ld128,
+ .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x8__neon_lane_ld128,
+ .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__neon_lane_ld64,
+ .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__neon_lane_ld64,
+ .mr = 4,
+ .nr = 8,
+ };
+ #endif // XNN_ENABLE_ASSEMBLY
xnn_params.f32.gemm2 = (struct gemm_parameters) {
.gemm = NULL,
.igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2__neon_lane_ld64,
@@ -390,7 +401,7 @@
.mr = 6,
.nr = 8,
};
- #endif
+ #endif // XNN_ENABLE_ASSEMBLY
xnn_params.f32.gemm2 = (struct gemm_parameters) {
.gemm = NULL,