Re-optimize microkernel selection for WebAssembly (WAsm) 1.0

Performance on a Skylake-X workstation (time before -> after; percentage is the speedup, i.e. before/after - 1):
- QC8MobileNetV1: 249448 us -> 226005 us (+10%)
- QC8MobileNetV2: 161857 us -> 131251 us (+23%)
- QS8MobileNetV1: 249005 us -> 227868 us (+9%)
- QS8MobileNetV2: 161510 us -> 130567 us (+24%)
- QU8MobileNetV1: 282988 us -> 264586 us (+7%)
- QU8MobileNetV2: 178407 us -> 151082 us (+18%)

PiperOrigin-RevId: 419711994
diff --git a/BUILD.bazel b/BUILD.bazel
index 3e144d2..39a65b2 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -349,48 +349,36 @@
     "src/f32-vunary/gen/vabs-scalar-x4.c",
     "src/f32-vunary/gen/vneg-scalar-x4.c",
     "src/f32-vunary/gen/vsqr-scalar-x4.c",
-    "src/qc8-dwconv/gen/up2x9-minmax-fp32-scalar-fmagic.c",
-    "src/qc8-dwconv/gen/up2x25-minmax-fp32-scalar-fmagic.c",
-    "src/qc8-gemm/gen/1x2-minmax-fp32-scalar-fmagic.c",
-    "src/qc8-gemm/gen/1x4-minmax-fp32-scalar-fmagic.c",
-    "src/qc8-gemm/gen/2x2-minmax-fp32-scalar-fmagic.c",
-    "src/qc8-gemm/gen/4x4-minmax-fp32-scalar-fmagic.c",
-    "src/qc8-igemm/gen/1x2-minmax-fp32-scalar-fmagic.c",
-    "src/qc8-igemm/gen/1x4-minmax-fp32-scalar-fmagic.c",
-    "src/qc8-igemm/gen/2x2-minmax-fp32-scalar-fmagic.c",
-    "src/qc8-igemm/gen/4x4-minmax-fp32-scalar-fmagic.c",
-    "src/qs8-dwconv/gen/up2x9-minmax-fp32-scalar-fmagic.c",
-    "src/qs8-dwconv/gen/up2x25-minmax-fp32-scalar-fmagic.c",
+    "src/qc8-dwconv/gen/up2x9-minmax-fp32-scalar-imagic.c",
+    "src/qc8-dwconv/gen/up1x25-minmax-fp32-scalar-imagic.c",
+    "src/qc8-gemm/gen/1x2-minmax-fp32-scalar-imagic.c",
+    "src/qc8-gemm/gen/2x2-minmax-fp32-scalar-imagic.c",
+    "src/qc8-igemm/gen/1x2-minmax-fp32-scalar-imagic.c",
+    "src/qc8-igemm/gen/2x2-minmax-fp32-scalar-imagic.c",
+    "src/qs8-dwconv/gen/up2x9-minmax-fp32-scalar-imagic.c",
+    "src/qs8-dwconv/gen/up1x25-minmax-fp32-scalar-imagic.c",
     "src/qs8-f32-vcvt/gen/vcvt-scalar-x1.c",
     "src/qs8-gavgpool/gen/7p7x-minmax-scalar-c4.c",
     "src/qs8-gavgpool/gen/7x-minmax-scalar-c4.c",
-    "src/qs8-gemm/gen/1x2-minmax-fp32-scalar-fmagic.c",
-    "src/qs8-gemm/gen/1x4-minmax-fp32-scalar-fmagic.c",
-    "src/qs8-gemm/gen/2x2-minmax-fp32-scalar-fmagic.c",
-    "src/qs8-gemm/gen/4x4-minmax-fp32-scalar-fmagic.c",
-    "src/qs8-igemm/gen/1x2-minmax-fp32-scalar-fmagic.c",
-    "src/qs8-igemm/gen/1x4-minmax-fp32-scalar-fmagic.c",
-    "src/qs8-igemm/gen/2x2-minmax-fp32-scalar-fmagic.c",
-    "src/qs8-igemm/gen/4x4-minmax-fp32-scalar-fmagic.c",
+    "src/qs8-gemm/gen/1x2-minmax-fp32-scalar-imagic.c",
+    "src/qs8-gemm/gen/2x2-minmax-fp32-scalar-imagic.c",
+    "src/qs8-igemm/gen/1x2-minmax-fp32-scalar-imagic.c",
+    "src/qs8-igemm/gen/2x2-minmax-fp32-scalar-imagic.c",
     "src/qs8-vadd/gen/minmax-scalar-x4.c",
     "src/qs8-vaddc/gen/minmax-scalar-x4.c",
     "src/qs8-vmul/gen/minmax-fp32-scalar-x4.c",
     "src/qs8-vmulc/gen/minmax-fp32-scalar-x4.c",
     "src/qu8-avgpool/9p8x-minmax-scalar-c1.c",
     "src/qu8-avgpool/9x-minmax-scalar-c1.c",
-    "src/qu8-dwconv/gen/up2x9-minmax-fp32-scalar-fmagic.c",
-    "src/qu8-dwconv/gen/up2x25-minmax-fp32-scalar-fmagic.c",
+    "src/qu8-dwconv/gen/up2x9-minmax-fp32-scalar-imagic.c",
+    "src/qu8-dwconv/gen/up1x25-minmax-fp32-scalar-imagic.c",
     "src/qu8-f32-vcvt/gen/vcvt-scalar-x1.c",
     "src/qu8-gavgpool/7p7x-minmax-scalar-c1.c",
     "src/qu8-gavgpool/7x-minmax-scalar-c1.c",
-    "src/qu8-gemm/gen/1x2-minmax-fp32-scalar-fmagic.c",
-    "src/qu8-gemm/gen/1x4-minmax-fp32-scalar-fmagic.c",
-    "src/qu8-gemm/gen/2x2-minmax-fp32-scalar-fmagic.c",
-    "src/qu8-gemm/gen/4x4-minmax-fp32-scalar-fmagic.c",
-    "src/qu8-igemm/gen/1x2-minmax-fp32-scalar-fmagic.c",
-    "src/qu8-igemm/gen/1x4-minmax-fp32-scalar-fmagic.c",
-    "src/qu8-igemm/gen/2x2-minmax-fp32-scalar-fmagic.c",
-    "src/qu8-igemm/gen/4x4-minmax-fp32-scalar-fmagic.c",
+    "src/qu8-gemm/gen/1x2-minmax-fp32-scalar-imagic.c",
+    "src/qu8-gemm/gen/2x2-minmax-fp32-scalar-imagic.c",
+    "src/qu8-igemm/gen/1x2-minmax-fp32-scalar-imagic.c",
+    "src/qu8-igemm/gen/2x2-minmax-fp32-scalar-imagic.c",
     "src/qu8-vadd/gen/minmax-scalar-x4.c",
     "src/qu8-vaddc/gen/minmax-scalar-x4.c",
     "src/qu8-vmul/gen/minmax-fp32-scalar-x4.c",
@@ -1323,6 +1311,72 @@
     "src/f32-vrelu/gen/vrelu-wasm-x2.c",
     "src/f32-vrelu/gen/vrelu-wasm-x4.c",
     "src/f32-vrelu/gen/vrelu-wasm-x8.c",
+    "src/qc8-dwconv/gen/up1x9-minmax-fp32-wasm-fmagic.c",
+    "src/qc8-dwconv/gen/up1x25-minmax-fp32-wasm-fmagic.c",
+    "src/qc8-dwconv/gen/up2x9-minmax-fp32-wasm-fmagic.c",
+    "src/qc8-dwconv/gen/up2x25-minmax-fp32-wasm-fmagic.c",
+    "src/qc8-dwconv/gen/up4x9-minmax-fp32-wasm-fmagic.c",
+    "src/qc8-dwconv/gen/up4x25-minmax-fp32-wasm-fmagic.c",
+    "src/qc8-gemm/gen/1x2-minmax-fp32-wasm-fmagic.c",
+    "src/qc8-gemm/gen/1x4-minmax-fp32-wasm-fmagic.c",
+    "src/qc8-gemm/gen/2x2-minmax-fp32-wasm-fmagic.c",
+    "src/qc8-gemm/gen/2x4-minmax-fp32-wasm-fmagic.c",
+    "src/qc8-gemm/gen/3x2-minmax-fp32-wasm-fmagic.c",
+    "src/qc8-gemm/gen/3x4-minmax-fp32-wasm-fmagic.c",
+    "src/qc8-gemm/gen/4x2-minmax-fp32-wasm-fmagic.c",
+    "src/qc8-gemm/gen/4x4-minmax-fp32-wasm-fmagic.c",
+    "src/qc8-igemm/gen/1x2-minmax-fp32-wasm-fmagic.c",
+    "src/qc8-igemm/gen/1x4-minmax-fp32-wasm-fmagic.c",
+    "src/qc8-igemm/gen/2x2-minmax-fp32-wasm-fmagic.c",
+    "src/qc8-igemm/gen/2x4-minmax-fp32-wasm-fmagic.c",
+    "src/qc8-igemm/gen/3x2-minmax-fp32-wasm-fmagic.c",
+    "src/qc8-igemm/gen/3x4-minmax-fp32-wasm-fmagic.c",
+    "src/qc8-igemm/gen/4x2-minmax-fp32-wasm-fmagic.c",
+    "src/qc8-igemm/gen/4x4-minmax-fp32-wasm-fmagic.c",
+    "src/qs8-dwconv/gen/up1x9-minmax-fp32-wasm-fmagic.c",
+    "src/qs8-dwconv/gen/up1x25-minmax-fp32-wasm-fmagic.c",
+    "src/qs8-dwconv/gen/up2x9-minmax-fp32-wasm-fmagic.c",
+    "src/qs8-dwconv/gen/up2x25-minmax-fp32-wasm-fmagic.c",
+    "src/qs8-dwconv/gen/up4x9-minmax-fp32-wasm-fmagic.c",
+    "src/qs8-dwconv/gen/up4x25-minmax-fp32-wasm-fmagic.c",
+    "src/qs8-gemm/gen/1x2-minmax-fp32-wasm-fmagic.c",
+    "src/qs8-gemm/gen/1x4-minmax-fp32-wasm-fmagic.c",
+    "src/qs8-gemm/gen/2x2-minmax-fp32-wasm-fmagic.c",
+    "src/qs8-gemm/gen/2x4-minmax-fp32-wasm-fmagic.c",
+    "src/qs8-gemm/gen/3x2-minmax-fp32-wasm-fmagic.c",
+    "src/qs8-gemm/gen/3x4-minmax-fp32-wasm-fmagic.c",
+    "src/qs8-gemm/gen/4x2-minmax-fp32-wasm-fmagic.c",
+    "src/qs8-gemm/gen/4x4-minmax-fp32-wasm-fmagic.c",
+    "src/qs8-igemm/gen/1x2-minmax-fp32-wasm-fmagic.c",
+    "src/qs8-igemm/gen/1x4-minmax-fp32-wasm-fmagic.c",
+    "src/qs8-igemm/gen/2x2-minmax-fp32-wasm-fmagic.c",
+    "src/qs8-igemm/gen/2x4-minmax-fp32-wasm-fmagic.c",
+    "src/qs8-igemm/gen/3x2-minmax-fp32-wasm-fmagic.c",
+    "src/qs8-igemm/gen/3x4-minmax-fp32-wasm-fmagic.c",
+    "src/qs8-igemm/gen/4x2-minmax-fp32-wasm-fmagic.c",
+    "src/qs8-igemm/gen/4x4-minmax-fp32-wasm-fmagic.c",
+    "src/qu8-dwconv/gen/up1x9-minmax-fp32-wasm-fmagic.c",
+    "src/qu8-dwconv/gen/up1x25-minmax-fp32-wasm-fmagic.c",
+    "src/qu8-dwconv/gen/up2x9-minmax-fp32-wasm-fmagic.c",
+    "src/qu8-dwconv/gen/up2x25-minmax-fp32-wasm-fmagic.c",
+    "src/qu8-dwconv/gen/up4x9-minmax-fp32-wasm-fmagic.c",
+    "src/qu8-dwconv/gen/up4x25-minmax-fp32-wasm-fmagic.c",
+    "src/qu8-gemm/gen/1x2-minmax-fp32-wasm-fmagic.c",
+    "src/qu8-gemm/gen/1x4-minmax-fp32-wasm-fmagic.c",
+    "src/qu8-gemm/gen/2x2-minmax-fp32-wasm-fmagic.c",
+    "src/qu8-gemm/gen/2x4-minmax-fp32-wasm-fmagic.c",
+    "src/qu8-gemm/gen/3x2-minmax-fp32-wasm-fmagic.c",
+    "src/qu8-gemm/gen/3x4-minmax-fp32-wasm-fmagic.c",
+    "src/qu8-gemm/gen/4x2-minmax-fp32-wasm-fmagic.c",
+    "src/qu8-gemm/gen/4x4-minmax-fp32-wasm-fmagic.c",
+    "src/qu8-igemm/gen/1x2-minmax-fp32-wasm-fmagic.c",
+    "src/qu8-igemm/gen/1x4-minmax-fp32-wasm-fmagic.c",
+    "src/qu8-igemm/gen/2x2-minmax-fp32-wasm-fmagic.c",
+    "src/qu8-igemm/gen/2x4-minmax-fp32-wasm-fmagic.c",
+    "src/qu8-igemm/gen/3x2-minmax-fp32-wasm-fmagic.c",
+    "src/qu8-igemm/gen/3x4-minmax-fp32-wasm-fmagic.c",
+    "src/qu8-igemm/gen/4x2-minmax-fp32-wasm-fmagic.c",
+    "src/qu8-igemm/gen/4x4-minmax-fp32-wasm-fmagic.c",
 ]
 
 ALL_WASMSIMD_MICROKERNEL_SRCS = [