Port F32 GEMM A75 1x8 microkernel to JIT and specialize for min/max, add tests and benchmarks

Implement ld1r for aarch64 assembler

PiperOrigin-RevId: 426260122
diff --git a/BUILD.bazel b/BUILD.bazel
index ff2af12..28ed44c 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -7365,6 +7365,7 @@
 ]
 
 JIT_AARCH64_SRCS = [
+    "src/f32-gemm/1x8-aarch64-neonfma-cortex-a75.cc",
     "src/f32-gemm/6x8-aarch64-neonfma-cortex-a75.cc",
     "src/f32-igemm/6x8-aarch64-neonfma-cortex-a75.cc",
 ]