Generalize FILL microkernels to all 8-/16-/32-bit data types

PiperOrigin-RevId: 389415595
diff --git a/BUILD.bazel b/BUILD.bazel
index 16ff5e6..fb43f89 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -291,8 +291,6 @@
     "src/x8-zip/x4-scalar.c",
     "src/x8-zip/xm-scalar.c",
     "src/x32-depthtospace2d-chw2hwc/scalar.c",
-    "src/x32-fill/scalar-float.c",
-    "src/x32-fill/scalar-int.c",
     "src/x32-packx/x2-scalar.c",
     "src/x32-packx/x3-scalar.c",
     "src/x32-packx/x4-scalar.c",
@@ -304,6 +302,7 @@
     "src/x32-zip/x4-scalar.c",
     "src/x32-zip/xm-scalar.c",
     "src/xx-copy/memcpy.c",
+    "src/xx-fill/scalar-x16.c",
 ]
 
 ALL_SCALAR_MICROKERNEL_SRCS = [
@@ -903,8 +902,6 @@
     "src/x8-zip/x4-scalar.c",
     "src/x8-zip/xm-scalar.c",
     "src/x32-depthtospace2d-chw2hwc/scalar.c",
-    "src/x32-fill/scalar-float.c",
-    "src/x32-fill/scalar-int.c",
     "src/x32-packx/x2-scalar.c",
     "src/x32-packx/x3-scalar.c",
     "src/x32-packx/x4-scalar.c",
@@ -916,6 +913,7 @@
     "src/x32-zip/x4-scalar.c",
     "src/x32-zip/xm-scalar.c",
     "src/xx-copy/memcpy.c",
+    "src/xx-fill/scalar-x16.c",
 ]
 
 ALL_WASM_MICROKERNEL_SRCS = [
@@ -1842,7 +1840,6 @@
     "src/qu8-vmul/gen/minmax-fp32-wasmsimd-mul32-ld64-x16.c",
     "src/qu8-vmulc/gen/minmax-fp32-wasmsimd-mul32-ld64-x8.c",
     "src/qu8-vmulc/gen/minmax-fp32-wasmsimd-mul32-ld64-x16.c",
-    "src/x32-fill/wasmsimd.c",
     "src/x32-packx/x4-wasmsimd.c",
     "src/x32-pad/wasmsimd.c",
     "src/x32-unpool/wasmsimd.c",
@@ -1850,6 +1847,7 @@
     "src/x32-zip/x3-wasmsimd.c",
     "src/x32-zip/x4-wasmsimd.c",
     "src/x32-zip/xm-wasmsimd.c",
+    "src/xx-fill/wasmsimd-x64.c",
 ]
 
 # ISA-specific micro-kernels
@@ -1958,7 +1956,6 @@
     "src/x8-zip/x3-neon.c",
     "src/x8-zip/x4-neon.c",
     "src/x8-zip/xm-neon.c",
-    "src/x32-fill/neon.c",
     "src/x32-packx/x4-neon-st4.c",
     "src/x32-pad/neon.c",
     "src/x32-unpool/neon.c",
@@ -1966,6 +1963,7 @@
     "src/x32-zip/x3-neon.c",
     "src/x32-zip/x4-neon.c",
     "src/x32-zip/xm-neon.c",
+    "src/xx-fill/neon-x64.c",
 ]
 
 ALL_NEON_MICROKERNEL_SRCS = [
@@ -2529,7 +2527,6 @@
     "src/x8-zip/x3-neon.c",
     "src/x8-zip/x4-neon.c",
     "src/x8-zip/xm-neon.c",
-    "src/x32-fill/neon.c",
     "src/x32-packx/x4-neon-st4.c",
     "src/x32-pad/neon.c",
     "src/x32-unpool/neon.c",
@@ -2537,6 +2534,7 @@
     "src/x32-zip/x3-neon.c",
     "src/x32-zip/x4-neon.c",
     "src/x32-zip/xm-neon.c",
+    "src/xx-fill/neon-x64.c",
 ]
 
 PROD_NEONFMA_MICROKERNEL_SRCS = [
@@ -3226,7 +3224,6 @@
     "src/f32-vunary/gen/vabs-sse-x8.c",
     "src/f32-vunary/gen/vneg-sse-x8.c",
     "src/f32-vunary/gen/vsqr-sse-x8.c",
-    "src/x32-fill/sse.c",
     "src/x32-packx/x4-sse.c",
     "src/x32-pad/sse.c",
 ]
@@ -3402,7 +3399,6 @@
     "src/math/sqrt-sse-hh1mac.c",
     "src/math/sqrt-sse-nr1mac.c",
     "src/math/sqrt-sse-nr2mac.c",
-    "src/x32-fill/sse.c",
     "src/x32-packx/x4-sse.c",
     "src/x32-pad/sse.c",
 ]
@@ -3464,6 +3460,7 @@
     "src/x32-zip/x3-sse2.c",
     "src/x32-zip/x4-sse2.c",
     "src/x32-zip/xm-sse2.c",
+    "src/xx-fill/sse2-x64.c",
 ]
 
 ALL_SSE2_MICROKERNEL_SRCS = [
@@ -3722,6 +3719,7 @@
     "src/x32-zip/x3-sse2.c",
     "src/x32-zip/x4-sse2.c",
     "src/x32-zip/xm-sse2.c",
+    "src/xx-fill/sse2-x64.c",
 ]
 
 PROD_SSSE3_MICROKERNEL_SRCS = [
@@ -9546,19 +9544,28 @@
 )
 
 xnnpack_unit_test(
-    name = "x32_depthtospace2d_chw2hwc_test",
+    name = "x8_lut_test",
     srcs = [
-        "test/x32-depthtospace2d-chw2hwc.cc",
-        "test/depthtospace-microkernel-tester.h",
+        "test/x8-lut.cc",
+        "test/lut-microkernel-tester.h",
     ] + MICROKERNEL_TEST_HDRS,
     deps = MICROKERNEL_TEST_DEPS,
 )
 
 xnnpack_unit_test(
-    name = "x32_fill_test",
+    name = "x8_zip_test",
     srcs = [
-        "test/x32-fill.cc",
-        "test/fill-microkernel-tester.h",
+        "test/x8-zip.cc",
+        "test/zip-microkernel-tester.h",
+    ] + MICROKERNEL_TEST_HDRS,
+    deps = MICROKERNEL_TEST_DEPS,
+)
+
+xnnpack_unit_test(
+    name = "x32_depthtospace2d_chw2hwc_test",
+    srcs = [
+        "test/x32-depthtospace2d-chw2hwc.cc",
+        "test/depthtospace-microkernel-tester.h",
     ] + MICROKERNEL_TEST_HDRS,
     deps = MICROKERNEL_TEST_DEPS,
 )
@@ -9601,19 +9608,10 @@
 )
 
 xnnpack_unit_test(
-    name = "x8_lut_test",
+    name = "xx_fill_test",
     srcs = [
-        "test/x8-lut.cc",
-        "test/lut-microkernel-tester.h",
-    ] + MICROKERNEL_TEST_HDRS,
-    deps = MICROKERNEL_TEST_DEPS,
-)
-
-xnnpack_unit_test(
-    name = "x8_zip_test",
-    srcs = [
-        "test/x8-zip.cc",
-        "test/zip-microkernel-tester.h",
+        "test/xx-fill.cc",
+        "test/fill-microkernel-tester.h",
     ] + MICROKERNEL_TEST_HDRS,
     deps = MICROKERNEL_TEST_DEPS,
 )