Generalize FILL microkernels to all 8-/16-/32-bit data types

PiperOrigin-RevId: 389415595
diff --git a/CMakeLists.txt b/CMakeLists.txt
index df0e48c..bd7accc 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -418,8 +418,6 @@
   src/x8-zip/x4-scalar.c
   src/x8-zip/xm-scalar.c
   src/x32-depthtospace2d-chw2hwc/scalar.c
-  src/x32-fill/scalar-float.c
-  src/x32-fill/scalar-int.c
   src/x32-packx/x2-scalar.c
   src/x32-packx/x3-scalar.c
   src/x32-packx/x4-scalar.c
@@ -430,7 +428,8 @@
   src/x32-zip/x3-scalar.c
   src/x32-zip/x4-scalar.c
   src/x32-zip/xm-scalar.c
-  src/xx-copy/memcpy.c)
+  src/xx-copy/memcpy.c
+  src/xx-fill/scalar-x16.c)
 
 SET(ALL_SCALAR_MICROKERNEL_SRCS
   src/f32-argmaxpool/4x-scalar-c1.c
@@ -1029,8 +1028,6 @@
   src/x8-zip/x4-scalar.c
   src/x8-zip/xm-scalar.c
   src/x32-depthtospace2d-chw2hwc/scalar.c
-  src/x32-fill/scalar-float.c
-  src/x32-fill/scalar-int.c
   src/x32-packx/x2-scalar.c
   src/x32-packx/x3-scalar.c
   src/x32-packx/x4-scalar.c
@@ -1041,7 +1038,8 @@
   src/x32-zip/x3-scalar.c
   src/x32-zip/x4-scalar.c
   src/x32-zip/xm-scalar.c
-  src/xx-copy/memcpy.c)
+  src/xx-copy/memcpy.c
+  src/xx-fill/scalar-x16.c)
 
 SET(PROD_NEON_MICROKERNEL_SRCS
   src/f32-argmaxpool/4x-neon-c4.c
@@ -1148,14 +1146,14 @@
   src/x8-zip/x3-neon.c
   src/x8-zip/x4-neon.c
   src/x8-zip/xm-neon.c
-  src/x32-fill/neon.c
   src/x32-packx/x4-neon-st4.c
   src/x32-pad/neon.c
   src/x32-unpool/neon.c
   src/x32-zip/x2-neon.c
   src/x32-zip/x3-neon.c
   src/x32-zip/x4-neon.c
-  src/x32-zip/xm-neon.c)
+  src/x32-zip/xm-neon.c
+  src/xx-fill/neon-x64.c)
 
 SET(ALL_NEON_MICROKERNEL_SRCS
   src/f32-argmaxpool/4x-neon-c4.c
@@ -1718,14 +1716,14 @@
   src/x8-zip/x3-neon.c
   src/x8-zip/x4-neon.c
   src/x8-zip/xm-neon.c
-  src/x32-fill/neon.c
   src/x32-packx/x4-neon-st4.c
   src/x32-pad/neon.c
   src/x32-unpool/neon.c
   src/x32-zip/x2-neon.c
   src/x32-zip/x3-neon.c
   src/x32-zip/x4-neon.c
-  src/x32-zip/xm-neon.c)
+  src/x32-zip/xm-neon.c
+  src/xx-fill/neon-x64.c)
 
 SET(PROD_NEONFMA_MICROKERNEL_SRCS
   src/f32-dwconv/gen/up4x9-minmax-neonfma.c
@@ -2404,7 +2402,6 @@
   src/f32-vunary/gen/vabs-sse-x8.c
   src/f32-vunary/gen/vneg-sse-x8.c
   src/f32-vunary/gen/vsqr-sse-x8.c
-  src/x32-fill/sse.c
   src/x32-packx/x4-sse.c
   src/x32-pad/sse.c)
 
@@ -2579,7 +2576,6 @@
   src/math/sqrt-sse-hh1mac.c
   src/math/sqrt-sse-nr1mac.c
   src/math/sqrt-sse-nr2mac.c
-  src/x32-fill/sse.c
   src/x32-packx/x4-sse.c
   src/x32-pad/sse.c)
 
@@ -2639,7 +2635,8 @@
   src/x32-zip/x2-sse2.c
   src/x32-zip/x3-sse2.c
   src/x32-zip/x4-sse2.c
-  src/x32-zip/xm-sse2.c)
+  src/x32-zip/xm-sse2.c
+  src/xx-fill/sse2-x64.c)
 
 SET(ALL_SSE2_MICROKERNEL_SRCS
   src/f32-argmaxpool/4x-sse2-c4.c
@@ -2896,7 +2893,8 @@
   src/x32-zip/x2-sse2.c
   src/x32-zip/x3-sse2.c
   src/x32-zip/x4-sse2.c
-  src/x32-zip/xm-sse2.c)
+  src/x32-zip/xm-sse2.c
+  src/xx-fill/sse2-x64.c)
 
 SET(PROD_SSSE3_MICROKERNEL_SRCS
   src/f32-dwconv2d-chw/gen/3x3p1-minmax-ssse3-2x4-acc2.c
@@ -6711,15 +6709,6 @@
   TARGET_LINK_LIBRARIES(u8-vclamp-test PRIVATE cpuinfo fp16 pthreadpool gtest gtest_main)
   ADD_TEST(u8-vclamp-test u8-vclamp-test)
 
-  ADD_EXECUTABLE(x32-fill-test test/x32-fill.cc $<TARGET_OBJECTS:all_microkernels>)
-  SET_TARGET_PROPERTIES(x32-fill-test PROPERTIES
-    CXX_STANDARD 11
-    CXX_STANDARD_REQUIRED YES
-    CXX_EXTENSIONS YES)
-  TARGET_INCLUDE_DIRECTORIES(x32-fill-test PRIVATE include src test)
-  TARGET_LINK_LIBRARIES(x32-fill-test PRIVATE cpuinfo fp16 pthreadpool gtest gtest_main)
-  ADD_TEST(x32-fill-test x32-fill-test)
-
   ADD_EXECUTABLE(x32-packx-test test/x32-packx.cc $<TARGET_OBJECTS:all_microkernels>)
   SET_TARGET_PROPERTIES(x32-packx-test PROPERTIES
     CXX_STANDARD 11
@@ -6782,6 +6771,15 @@
   TARGET_INCLUDE_DIRECTORIES(x8-zip-test PRIVATE include src test)
   TARGET_LINK_LIBRARIES(x8-zip-test PRIVATE cpuinfo fp16 pthreadpool gtest gtest_main)
   ADD_TEST(x8-zip-test x8-zip-test)
+
+  ADD_EXECUTABLE(xx-fill-test test/xx-fill.cc $<TARGET_OBJECTS:all_microkernels>)
+  SET_TARGET_PROPERTIES(xx-fill-test PROPERTIES
+    CXX_STANDARD 11
+    CXX_STANDARD_REQUIRED YES
+    CXX_EXTENSIONS YES)
+  TARGET_INCLUDE_DIRECTORIES(xx-fill-test PRIVATE include src test)
+  TARGET_LINK_LIBRARIES(xx-fill-test PRIVATE cpuinfo fp16 pthreadpool gtest gtest_main)
+  ADD_TEST(xx-fill-test xx-fill-test)
 ENDIF()
 
 # ---[ XNNPACK microbenchmarks