Use the 6x8 micro kernel on Cortex A76

On the big core:
The A75 kernel is 0.3% faster with 6x8 than with 4x8.
The A53 kernel is 11.2% faster with 6x8 than with 4x8.

On the little core:
The A75 kernel is 6.4% faster with 6x8 than with 4x8.
The A53 kernel is 9.2% faster with 6x8 than with 4x8.

PiperOrigin-RevId: 277156128
diff --git a/src/init.c b/src/init.c
index 0fbc33b..8f6975c 100644
--- a/src/init.c
+++ b/src/init.c
@@ -296,7 +296,6 @@
           };
           break;
         case cpuinfo_uarch_cortex_a72:
-        case cpuinfo_uarch_cortex_a76:
           xnn_params.f32.gemm = (struct gemm_parameters) {
             .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75,
             .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75,
@@ -307,6 +306,7 @@
           };
           break;
         case cpuinfo_uarch_cortex_a75:
+        case cpuinfo_uarch_cortex_a76:
         case cpuinfo_uarch_mongoose_m1:
         case cpuinfo_uarch_mongoose_m2:
         case cpuinfo_uarch_meerkat_m3: