Refactor HardSwish micro-kernels

- Code-generate HSWISH micro-kernels
- Support unrolling in HSWISH implementation
- Add HSWISH micro-kernels for AVX, FMA3, and AVX512F
- Code-generate HSWISH unit tests
- Switch all platforms to the new code-generated micro-kernels

PiperOrigin-RevId: 284705773
diff --git a/BUILD.bazel b/BUILD.bazel
index 8e2b8c0..aba475e 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -112,7 +112,9 @@
     "src/f32-gemm/gen/2x4-scalar.c",
     "src/f32-gemm/gen/4x2-scalar.c",
     "src/f32-gemm/gen/4x4-scalar.c",
-    "src/f32-hswish/scalar.c",
+    "src/f32-hswish/gen/scalar-x1.c",
+    "src/f32-hswish/gen/scalar-x2.c",
+    "src/f32-hswish/gen/scalar-x4.c",
     "src/f32-igemm/gen/1x4-scalar.c",
     "src/f32-igemm/gen/2x4-scalar.c",
     "src/f32-igemm/gen/4x2-scalar.c",
@@ -235,7 +237,9 @@
     "src/f32-gemm/gen/2x4-wasm.c",
     "src/f32-gemm/gen/4x2-wasm.c",
     "src/f32-gemm/gen/4x4-wasm.c",
-    "src/f32-hswish/wasm.c",
+    "src/f32-hswish/gen/wasm-x1.c",
+    "src/f32-hswish/gen/wasm-x2.c",
+    "src/f32-hswish/gen/wasm-x4.c",
     "src/f32-igemm/gen/1x4-wasm.c",
     "src/f32-igemm/gen/2x4-wasm.c",
     "src/f32-igemm/gen/4x2-wasm.c",
@@ -333,7 +337,8 @@
     "src/f32-gemm/gen-inc/6x8-psimd-loadsplat.c",
     "src/f32-gemm/gen-inc/6x8-psimd-splat.c",
     "src/f32-gemm/gen-inc/6x8s4-psimd.c",
-    "src/f32-hswish/psimd.c",
+    "src/f32-hswish/gen/psimd-x4.c",
+    "src/f32-hswish/gen/psimd-x8.c",
     "src/f32-igemm/gen/1x8-psimd-loadsplat.c",
     "src/f32-igemm/gen/1x8-psimd-splat.c",
     "src/f32-igemm/gen/1x8s4-psimd.c",
@@ -434,7 +439,8 @@
     "src/f32-gemm/gen-inc/4x8s4-neon.c",
     "src/f32-gemm/gen-inc/6x8s4-neon.c",
     "src/f32-gemm/gen-inc/8x8s4-neon.c",
-    "src/f32-hswish/neon.c",
+    "src/f32-hswish/gen/neon-x4.c",
+    "src/f32-hswish/gen/neon-x8.c",
     "src/f32-igemm/gen/1x8-neon-lane-ld64.c",
     "src/f32-igemm/gen/4x2-neon-lane-ld64.c",
     "src/f32-igemm/gen/4x4-neon-lane-ld64.c",
@@ -542,7 +548,8 @@
     "src/f32-gemm/gen-inc/4x8s4-neonfma.c",
     "src/f32-gemm/gen-inc/6x8s4-neonfma.c",
     "src/f32-gemm/gen-inc/8x8s4-neonfma.c",
-    "src/f32-hswish/neonfma.c",
+    "src/f32-hswish/gen/neonfma-x4.c",
+    "src/f32-hswish/gen/neonfma-x8.c",
     "src/f32-ppmm/gen/4x8-neonfma.c",
     "src/f32-ppmm/gen/8x8-neonfma.c",
     "src/f32-sigmoid/gen/neonfma-p5-nr2fma-x16.c",
@@ -658,7 +665,8 @@
     "src/f32-gemm/gen-inc/4x8-sse-dup.c",
     "src/f32-gemm/gen-inc/4x8-sse-load1.c",
     "src/f32-gemm/gen-inc/4x8s4-sse.c",
-    "src/f32-hswish/sse.c",
+    "src/f32-hswish/gen/sse-x4.c",
+    "src/f32-hswish/gen/sse-x8.c",
     "src/f32-igemm/gen/1x8-sse-dup.c",
     "src/f32-igemm/gen/1x8-sse-load1.c",
     "src/f32-igemm/gen/1x8s4-sse.c",
@@ -779,6 +787,8 @@
     "src/f32-gemm/gen-inc/3x16-avx-broadcast.c",
     "src/f32-gemm/gen-inc/4x16-avx-broadcast.c",
     "src/f32-gemm/gen-inc/5x16-avx-broadcast.c",
+    "src/f32-hswish/gen/avx-x8.c",
+    "src/f32-hswish/gen/avx-x16.c",
     "src/f32-igemm/gen/1x8-avx-broadcast.c",
     "src/f32-igemm/gen/4x8-avx-broadcast.c",
     "src/f32-igemm/gen/5x8-avx-broadcast.c",
@@ -833,6 +843,8 @@
     "src/f32-gemm/gen-inc/3x16s4-fma3.c",
     "src/f32-gemm/gen-inc/4x16s4-fma3.c",
     "src/f32-gemm/gen-inc/5x16s4-fma3.c",
+    "src/f32-hswish/gen/fma3-x8.c",
+    "src/f32-hswish/gen/fma3-x16.c",
     "src/f32-igemm/gen/1x8-fma3-broadcast.c",
     "src/f32-igemm/gen/4x8-fma3-broadcast.c",
     "src/f32-igemm/gen/5x8-fma3-broadcast.c",
@@ -943,6 +955,8 @@
     "src/f32-gemm/gen-inc/6x16-avx512f-broadcast.c",
     "src/f32-gemm/gen-inc/7x16-avx512f-broadcast.c",
     "src/f32-gemm/gen-inc/8x16-avx512f-broadcast.c",
+    "src/f32-hswish/gen/avx512f-x16.c",
+    "src/f32-hswish/gen/avx512f-x32.c",
     "src/f32-igemm/gen/1x16-avx512f-broadcast.c",
     "src/f32-igemm/gen/4x16-avx512f-broadcast.c",
     "src/f32-igemm/gen/5x16-avx512f-broadcast.c",