Generalize FILL microkernels to all 8-/16-/32-bit data types

PiperOrigin-RevId: 389415595
diff --git a/BUILD.bazel b/BUILD.bazel
index 16ff5e6..fb43f89 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -291,8 +291,6 @@
     "src/x8-zip/x4-scalar.c",
     "src/x8-zip/xm-scalar.c",
     "src/x32-depthtospace2d-chw2hwc/scalar.c",
-    "src/x32-fill/scalar-float.c",
-    "src/x32-fill/scalar-int.c",
     "src/x32-packx/x2-scalar.c",
     "src/x32-packx/x3-scalar.c",
     "src/x32-packx/x4-scalar.c",
@@ -304,6 +302,7 @@
     "src/x32-zip/x4-scalar.c",
     "src/x32-zip/xm-scalar.c",
     "src/xx-copy/memcpy.c",
+    "src/xx-fill/scalar-x16.c",
 ]
 
 ALL_SCALAR_MICROKERNEL_SRCS = [
@@ -903,8 +902,6 @@
     "src/x8-zip/x4-scalar.c",
     "src/x8-zip/xm-scalar.c",
     "src/x32-depthtospace2d-chw2hwc/scalar.c",
-    "src/x32-fill/scalar-float.c",
-    "src/x32-fill/scalar-int.c",
     "src/x32-packx/x2-scalar.c",
     "src/x32-packx/x3-scalar.c",
     "src/x32-packx/x4-scalar.c",
@@ -916,6 +913,7 @@
     "src/x32-zip/x4-scalar.c",
     "src/x32-zip/xm-scalar.c",
     "src/xx-copy/memcpy.c",
+    "src/xx-fill/scalar-x16.c",
 ]
 
 ALL_WASM_MICROKERNEL_SRCS = [
@@ -1842,7 +1840,6 @@
     "src/qu8-vmul/gen/minmax-fp32-wasmsimd-mul32-ld64-x16.c",
     "src/qu8-vmulc/gen/minmax-fp32-wasmsimd-mul32-ld64-x8.c",
     "src/qu8-vmulc/gen/minmax-fp32-wasmsimd-mul32-ld64-x16.c",
-    "src/x32-fill/wasmsimd.c",
     "src/x32-packx/x4-wasmsimd.c",
     "src/x32-pad/wasmsimd.c",
     "src/x32-unpool/wasmsimd.c",
@@ -1850,6 +1847,7 @@
     "src/x32-zip/x3-wasmsimd.c",
     "src/x32-zip/x4-wasmsimd.c",
     "src/x32-zip/xm-wasmsimd.c",
+    "src/xx-fill/wasmsimd-x64.c",
 ]
 
 # ISA-specific micro-kernels
@@ -1958,7 +1956,6 @@
     "src/x8-zip/x3-neon.c",
     "src/x8-zip/x4-neon.c",
     "src/x8-zip/xm-neon.c",
-    "src/x32-fill/neon.c",
     "src/x32-packx/x4-neon-st4.c",
     "src/x32-pad/neon.c",
     "src/x32-unpool/neon.c",
@@ -1966,6 +1963,7 @@
     "src/x32-zip/x3-neon.c",
     "src/x32-zip/x4-neon.c",
     "src/x32-zip/xm-neon.c",
+    "src/xx-fill/neon-x64.c",
 ]
 
 ALL_NEON_MICROKERNEL_SRCS = [
@@ -2529,7 +2527,6 @@
     "src/x8-zip/x3-neon.c",
     "src/x8-zip/x4-neon.c",
     "src/x8-zip/xm-neon.c",
-    "src/x32-fill/neon.c",
     "src/x32-packx/x4-neon-st4.c",
     "src/x32-pad/neon.c",
     "src/x32-unpool/neon.c",
@@ -2537,6 +2534,7 @@
     "src/x32-zip/x3-neon.c",
     "src/x32-zip/x4-neon.c",
     "src/x32-zip/xm-neon.c",
+    "src/xx-fill/neon-x64.c",
 ]
 
 PROD_NEONFMA_MICROKERNEL_SRCS = [
@@ -3226,7 +3224,6 @@
     "src/f32-vunary/gen/vabs-sse-x8.c",
     "src/f32-vunary/gen/vneg-sse-x8.c",
     "src/f32-vunary/gen/vsqr-sse-x8.c",
-    "src/x32-fill/sse.c",
     "src/x32-packx/x4-sse.c",
     "src/x32-pad/sse.c",
 ]
@@ -3402,7 +3399,6 @@
     "src/math/sqrt-sse-hh1mac.c",
     "src/math/sqrt-sse-nr1mac.c",
     "src/math/sqrt-sse-nr2mac.c",
-    "src/x32-fill/sse.c",
     "src/x32-packx/x4-sse.c",
     "src/x32-pad/sse.c",
 ]
@@ -3464,6 +3460,7 @@
     "src/x32-zip/x3-sse2.c",
     "src/x32-zip/x4-sse2.c",
     "src/x32-zip/xm-sse2.c",
+    "src/xx-fill/sse2-x64.c",
 ]
 
 ALL_SSE2_MICROKERNEL_SRCS = [
@@ -3722,6 +3719,7 @@
     "src/x32-zip/x3-sse2.c",
     "src/x32-zip/x4-sse2.c",
     "src/x32-zip/xm-sse2.c",
+    "src/xx-fill/sse2-x64.c",
 ]
 
 PROD_SSSE3_MICROKERNEL_SRCS = [
@@ -9546,19 +9544,28 @@
 )
 
 xnnpack_unit_test(
-    name = "x32_depthtospace2d_chw2hwc_test",
+    name = "x8_lut_test",
     srcs = [
-        "test/x32-depthtospace2d-chw2hwc.cc",
-        "test/depthtospace-microkernel-tester.h",
+        "test/x8-lut.cc",
+        "test/lut-microkernel-tester.h",
     ] + MICROKERNEL_TEST_HDRS,
     deps = MICROKERNEL_TEST_DEPS,
 )
 
 xnnpack_unit_test(
-    name = "x32_fill_test",
+    name = "x8_zip_test",
     srcs = [
-        "test/x32-fill.cc",
-        "test/fill-microkernel-tester.h",
+        "test/x8-zip.cc",
+        "test/zip-microkernel-tester.h",
+    ] + MICROKERNEL_TEST_HDRS,
+    deps = MICROKERNEL_TEST_DEPS,
+)
+
+xnnpack_unit_test(
+    name = "x32_depthtospace2d_chw2hwc_test",
+    srcs = [
+        "test/x32-depthtospace2d-chw2hwc.cc",
+        "test/depthtospace-microkernel-tester.h",
     ] + MICROKERNEL_TEST_HDRS,
     deps = MICROKERNEL_TEST_DEPS,
 )
@@ -9601,19 +9608,10 @@
 )
 
 xnnpack_unit_test(
-    name = "x8_lut_test",
+    name = "xx_fill_test",
     srcs = [
-        "test/x8-lut.cc",
-        "test/lut-microkernel-tester.h",
-    ] + MICROKERNEL_TEST_HDRS,
-    deps = MICROKERNEL_TEST_DEPS,
-)
-
-xnnpack_unit_test(
-    name = "x8_zip_test",
-    srcs = [
-        "test/x8-zip.cc",
-        "test/zip-microkernel-tester.h",
+        "test/xx-fill.cc",
+        "test/fill-microkernel-tester.h",
     ] + MICROKERNEL_TEST_HDRS,
     deps = MICROKERNEL_TEST_DEPS,
 )
diff --git a/CMakeLists.txt b/CMakeLists.txt
index df0e48c..bd7accc 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -418,8 +418,6 @@
   src/x8-zip/x4-scalar.c
   src/x8-zip/xm-scalar.c
   src/x32-depthtospace2d-chw2hwc/scalar.c
-  src/x32-fill/scalar-float.c
-  src/x32-fill/scalar-int.c
   src/x32-packx/x2-scalar.c
   src/x32-packx/x3-scalar.c
   src/x32-packx/x4-scalar.c
@@ -430,7 +428,8 @@
   src/x32-zip/x3-scalar.c
   src/x32-zip/x4-scalar.c
   src/x32-zip/xm-scalar.c
-  src/xx-copy/memcpy.c)
+  src/xx-copy/memcpy.c
+  src/xx-fill/scalar-x16.c)
 
 SET(ALL_SCALAR_MICROKERNEL_SRCS
   src/f32-argmaxpool/4x-scalar-c1.c
@@ -1029,8 +1028,6 @@
   src/x8-zip/x4-scalar.c
   src/x8-zip/xm-scalar.c
   src/x32-depthtospace2d-chw2hwc/scalar.c
-  src/x32-fill/scalar-float.c
-  src/x32-fill/scalar-int.c
   src/x32-packx/x2-scalar.c
   src/x32-packx/x3-scalar.c
   src/x32-packx/x4-scalar.c
@@ -1041,7 +1038,8 @@
   src/x32-zip/x3-scalar.c
   src/x32-zip/x4-scalar.c
   src/x32-zip/xm-scalar.c
-  src/xx-copy/memcpy.c)
+  src/xx-copy/memcpy.c
+  src/xx-fill/scalar-x16.c)
 
 SET(PROD_NEON_MICROKERNEL_SRCS
   src/f32-argmaxpool/4x-neon-c4.c
@@ -1148,14 +1146,14 @@
   src/x8-zip/x3-neon.c
   src/x8-zip/x4-neon.c
   src/x8-zip/xm-neon.c
-  src/x32-fill/neon.c
   src/x32-packx/x4-neon-st4.c
   src/x32-pad/neon.c
   src/x32-unpool/neon.c
   src/x32-zip/x2-neon.c
   src/x32-zip/x3-neon.c
   src/x32-zip/x4-neon.c
-  src/x32-zip/xm-neon.c)
+  src/x32-zip/xm-neon.c
+  src/xx-fill/neon-x64.c)
 
 SET(ALL_NEON_MICROKERNEL_SRCS
   src/f32-argmaxpool/4x-neon-c4.c
@@ -1718,14 +1716,14 @@
   src/x8-zip/x3-neon.c
   src/x8-zip/x4-neon.c
   src/x8-zip/xm-neon.c
-  src/x32-fill/neon.c
   src/x32-packx/x4-neon-st4.c
   src/x32-pad/neon.c
   src/x32-unpool/neon.c
   src/x32-zip/x2-neon.c
   src/x32-zip/x3-neon.c
   src/x32-zip/x4-neon.c
-  src/x32-zip/xm-neon.c)
+  src/x32-zip/xm-neon.c
+  src/xx-fill/neon-x64.c)
 
 SET(PROD_NEONFMA_MICROKERNEL_SRCS
   src/f32-dwconv/gen/up4x9-minmax-neonfma.c
@@ -2404,7 +2402,6 @@
   src/f32-vunary/gen/vabs-sse-x8.c
   src/f32-vunary/gen/vneg-sse-x8.c
   src/f32-vunary/gen/vsqr-sse-x8.c
-  src/x32-fill/sse.c
   src/x32-packx/x4-sse.c
   src/x32-pad/sse.c)
 
@@ -2579,7 +2576,6 @@
   src/math/sqrt-sse-hh1mac.c
   src/math/sqrt-sse-nr1mac.c
   src/math/sqrt-sse-nr2mac.c
-  src/x32-fill/sse.c
   src/x32-packx/x4-sse.c
   src/x32-pad/sse.c)
 
@@ -2639,7 +2635,8 @@
   src/x32-zip/x2-sse2.c
   src/x32-zip/x3-sse2.c
   src/x32-zip/x4-sse2.c
-  src/x32-zip/xm-sse2.c)
+  src/x32-zip/xm-sse2.c
+  src/xx-fill/sse2-x64.c)
 
 SET(ALL_SSE2_MICROKERNEL_SRCS
   src/f32-argmaxpool/4x-sse2-c4.c
@@ -2896,7 +2893,8 @@
   src/x32-zip/x2-sse2.c
   src/x32-zip/x3-sse2.c
   src/x32-zip/x4-sse2.c
-  src/x32-zip/xm-sse2.c)
+  src/x32-zip/xm-sse2.c
+  src/xx-fill/sse2-x64.c)
 
 SET(PROD_SSSE3_MICROKERNEL_SRCS
   src/f32-dwconv2d-chw/gen/3x3p1-minmax-ssse3-2x4-acc2.c
@@ -6711,15 +6709,6 @@
   TARGET_LINK_LIBRARIES(u8-vclamp-test PRIVATE cpuinfo fp16 pthreadpool gtest gtest_main)
   ADD_TEST(u8-vclamp-test u8-vclamp-test)
 
-  ADD_EXECUTABLE(x32-fill-test test/x32-fill.cc $<TARGET_OBJECTS:all_microkernels>)
-  SET_TARGET_PROPERTIES(x32-fill-test PROPERTIES
-    CXX_STANDARD 11
-    CXX_STANDARD_REQUIRED YES
-    CXX_EXTENSIONS YES)
-  TARGET_INCLUDE_DIRECTORIES(x32-fill-test PRIVATE include src test)
-  TARGET_LINK_LIBRARIES(x32-fill-test PRIVATE cpuinfo fp16 pthreadpool gtest gtest_main)
-  ADD_TEST(x32-fill-test x32-fill-test)
-
   ADD_EXECUTABLE(x32-packx-test test/x32-packx.cc $<TARGET_OBJECTS:all_microkernels>)
   SET_TARGET_PROPERTIES(x32-packx-test PROPERTIES
     CXX_STANDARD 11
@@ -6782,6 +6771,15 @@
   TARGET_INCLUDE_DIRECTORIES(x8-zip-test PRIVATE include src test)
   TARGET_LINK_LIBRARIES(x8-zip-test PRIVATE cpuinfo fp16 pthreadpool gtest gtest_main)
   ADD_TEST(x8-zip-test x8-zip-test)
+
+  ADD_EXECUTABLE(xx-fill-test test/xx-fill.cc $<TARGET_OBJECTS:all_microkernels>)
+  SET_TARGET_PROPERTIES(xx-fill-test PROPERTIES
+    CXX_STANDARD 11
+    CXX_STANDARD_REQUIRED YES
+    CXX_EXTENSIONS YES)
+  TARGET_INCLUDE_DIRECTORIES(xx-fill-test PRIVATE include src test)
+  TARGET_LINK_LIBRARIES(xx-fill-test PRIVATE cpuinfo fp16 pthreadpool gtest gtest_main)
+  ADD_TEST(xx-fill-test xx-fill-test)
 ENDIF()
 
 # ---[ XNNPACK microbenchmarks
diff --git a/src/init.c b/src/init.c
index 9244ca4..d4e7ec1 100644
--- a/src/init.c
+++ b/src/init.c
@@ -98,13 +98,6 @@
     }
   #endif
 
-  /**************************** XX micro-kernels ****************************/
-  #ifndef XNN_NO_XX_OPERATORS
-    init_flags |= XNN_INIT_FLAG_XX;
-
-    xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
-  #endif
-
   if (cpuinfo_has_arm_neon()) {
     /**************************** QC8 micro-kernels ****************************/
     #ifndef XNN_NO_QC8_OPERATORS
@@ -588,10 +581,6 @@
     #ifndef XNN_NO_X32_OPERATORS
       init_flags |= XNN_INIT_FLAG_X32;
 
-      xnn_params.x32.fill = (struct fill_parameters) {
-        .ukernel = (xnn_fill_ukernel_function) xnn_x32_fill_ukernel__neon,
-        .row_tile = 1,
-      };
       xnn_params.x32.pad = (struct pad_parameters) {
         .ukernel = (xnn_pad_ukernel_function) xnn_x32_pad_ukernel__neon,
         .row_tile = 1,
@@ -611,7 +600,20 @@
         };
       #endif  // XNN_NO_NCHW_OPERATORS
     #endif  // XNN_NO_X32_OPERATORS
+
+    /**************************** XX micro-kernels ****************************/
+    #ifndef XNN_NO_XX_OPERATORS
+      init_flags |= XNN_INIT_FLAG_XX;
+
+      xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
+      xnn_params.xx.fill = (struct fill_parameters) {
+        .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__neon_x64,
+        .row_tile = 1,
+      };
+    #endif  // XNN_NO_XX_OPERATORS
+
   } else if (!XNN_PLATFORM_MOBILE) {
+
     /*************************** QU8 micro-kernels ***************************/
     #ifndef XNN_NO_QS8_OPERATORS
       init_flags |= XNN_INIT_FLAG_QS8;
@@ -944,10 +946,6 @@
     #ifndef XNN_NO_X32_OPERATORS
       init_flags |= XNN_INIT_FLAG_X32;
 
-      xnn_params.x32.fill = (struct fill_parameters) {
-        .ukernel = (xnn_fill_ukernel_function) xnn_x32_fill_ukernel__scalar_int,
-        .row_tile = 1,
-      };
       xnn_params.x32.pad = (struct pad_parameters) {
         .ukernel = (xnn_pad_ukernel_function) xnn_x32_pad_ukernel__scalar_int,
         .row_tile = 1,
@@ -967,17 +965,21 @@
         };
       #endif  // XNN_NO_NCHW_OPERATORS
     #endif  // XNN_NO_X32_OPERATORS
+
+    /**************************** XX micro-kernels ****************************/
+    #ifndef XNN_NO_XX_OPERATORS
+      init_flags |= XNN_INIT_FLAG_XX;
+
+      xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
+      xnn_params.xx.fill = (struct fill_parameters) {
+        .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__scalar_x16,
+        .row_tile = 1,
+      };
+    #endif  // XNN_NO_XX_OPERATORS
   }
 
 #elif XNN_ARCH_ARM64
 
-  /**************************** XX micro-kernels ****************************/
-  #ifndef XNN_NO_XX_OPERATORS
-    init_flags |= XNN_INIT_FLAG_XX;
-
-    xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
-  #endif
-
   /**************************** QC8 micro-kernels ****************************/
   #ifndef XNN_NO_QC8_OPERATORS
     init_flags |= XNN_INIT_FLAG_QC8;
@@ -1968,10 +1970,6 @@
   #ifndef XNN_NO_X32_OPERATORS
     init_flags |= XNN_INIT_FLAG_X32;
 
-    xnn_params.x32.fill = (struct fill_parameters) {
-      .ukernel = (xnn_fill_ukernel_function) xnn_x32_fill_ukernel__neon,
-      .row_tile = 1,
-    };
     xnn_params.x32.pad = (struct pad_parameters) {
       .ukernel = (xnn_pad_ukernel_function) xnn_x32_pad_ukernel__neon,
       .row_tile = 1,
@@ -1992,19 +1990,23 @@
     #endif  // XNN_NO_NCHW_OPERATORS
   #endif  // XNN_NO_X32_OPERATORS
 
-#elif XNN_ARCH_X86 || XNN_ARCH_X86_64
-  if (!cpuinfo_has_x86_sse2()) {
-    xnn_log_error("XNNPACK initialization failed: SSE2 is not supported");
-    return;
-  }
-
   /**************************** XX micro-kernels ****************************/
   #ifndef XNN_NO_XX_OPERATORS
     init_flags |= XNN_INIT_FLAG_XX;
 
     xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
+    xnn_params.xx.fill = (struct fill_parameters) {
+      .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__neon_x64,
+      .row_tile = 1,
+    };
   #endif
 
+#elif XNN_ARCH_X86 || XNN_ARCH_X86_64
+  if (!cpuinfo_has_x86_sse2()) {
+    xnn_log_error("XNNPACK initialization failed: SSE2 is not supported");
+    return;
+  }
+
   /**************************** QC8 micro-kernels ****************************/
   #ifndef XNN_NO_QC8_OPERATORS
     init_flags |= XNN_INIT_FLAG_QC8;
@@ -3019,10 +3021,6 @@
   #ifndef XNN_NO_X32_OPERATORS
     init_flags |= XNN_INIT_FLAG_X32;
 
-    xnn_params.x32.fill = (struct fill_parameters) {
-      .ukernel = (xnn_fill_ukernel_function) xnn_x32_fill_ukernel__sse,
-      .row_tile = 1,
-    };
     xnn_params.x32.pad = (struct pad_parameters) {
       .ukernel = (xnn_pad_ukernel_function) xnn_x32_pad_ukernel__sse,
       .row_tile = 1,
@@ -3043,15 +3041,19 @@
     #endif  // XNN_NO_NCHW_OPERATORS
   #endif  // XNN_NO_X32_OPERATORS
 
-#elif XNN_ARCH_WASMSIMD
-
   /**************************** XX micro-kernels ****************************/
   #ifndef XNN_NO_XX_OPERATORS
     init_flags |= XNN_INIT_FLAG_XX;
 
     xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
+    xnn_params.xx.fill = (struct fill_parameters) {
+      .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__sse2_x64,
+      .row_tile = 1,
+    };
   #endif
 
+#elif XNN_ARCH_WASMSIMD
+
   /**************************** QC8 micro-kernels ****************************/
   #ifndef XNN_NO_QS8_OPERATORS
     init_flags |= XNN_INIT_FLAG_QC8;
@@ -3607,10 +3609,6 @@
   #ifndef XNN_NO_X32_OPERATORS
     init_flags |= XNN_INIT_FLAG_X32;
 
-    xnn_params.x32.fill = (struct fill_parameters) {
-      .ukernel = (xnn_fill_ukernel_function) xnn_x32_fill_ukernel__wasmsimd,
-      .row_tile = 1,
-    };
     xnn_params.x32.pad = (struct pad_parameters) {
       .ukernel = (xnn_pad_ukernel_function) xnn_x32_pad_ukernel__wasmsimd,
       .row_tile = 1,
@@ -3631,15 +3629,19 @@
     #endif  // XNN_NO_NCHW_OPERATORS
   #endif  // XNN_NO_X32_OPERATORS
 
-#elif XNN_ARCH_WASM
-
   /**************************** XX micro-kernels ****************************/
   #ifndef XNN_NO_XX_OPERATORS
     init_flags |= XNN_INIT_FLAG_XX;
 
     xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
+    xnn_params.xx.fill = (struct fill_parameters) {
+      .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__wasmsimd_x64,
+      .row_tile = 1,
+    };
   #endif
 
+#elif XNN_ARCH_WASM
+
   /**************************** QC8 micro-kernels ****************************/
   #ifndef XNN_NO_QC8_OPERATORS
     init_flags |= XNN_INIT_FLAG_QC8;
@@ -4064,10 +4066,6 @@
   #ifndef XNN_NO_X32_OPERATORS
     init_flags |= XNN_INIT_FLAG_X32;
 
-    xnn_params.x32.fill = (struct fill_parameters) {
-      .ukernel = (xnn_fill_ukernel_function) xnn_x32_fill_ukernel__scalar_float,
-      .row_tile = 1,
-    };
     xnn_params.x32.pad = (struct pad_parameters) {
       .ukernel = (xnn_pad_ukernel_function) xnn_x32_pad_ukernel__scalar_float,
       .row_tile = 1,
@@ -4088,6 +4086,17 @@
     #endif  // XNN_NO_NCHW_OPERATORS
   #endif  // XNN_NO_X32_OPERATORS
 
+  /**************************** XX micro-kernels ****************************/
+  #ifndef XNN_NO_XX_OPERATORS
+    init_flags |= XNN_INIT_FLAG_XX;
+
+    xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
+    xnn_params.xx.fill = (struct fill_parameters) {
+      .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__scalar_x16,
+      .row_tile = 1,
+    };
+  #endif
+
 #elif XNN_ARCH_RISCV
 
   /**************************** XX micro-kernels ****************************/
@@ -4413,7 +4422,7 @@
     init_flags |= XNN_INIT_FLAG_X32;
 
-    xnn_params.x32.fill = (struct fill_parameters) {
-      .ukernel = (xnn_fill_ukernel_function) xnn_x32_fill_ukernel__scalar_float,
+    xnn_params.xx.fill = (struct fill_parameters) {
+      .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__scalar_x16,
       .row_tile = 1,
     };
     xnn_params.x32.pad = (struct pad_parameters) {
diff --git a/src/operator-run.c b/src/operator-run.c
index d2133e4..5a3bda7 100644
--- a/src/operator-run.c
+++ b/src/operator-run.c
@@ -759,7 +759,7 @@
       &context->padding_value,
       input, 0 /* input stride */, output, 0 /* output stride */);
   } else {
-    context->fill_ukernel(1 /* rows */, context->output_size[0], output, 0 /* output stride */, &context->padding_value);
+    context->fill_ukernel(1 /* rows */, context->output_size[0], output, 0 /* output stride */, context->padding_value);
   }
 }
 
diff --git a/src/operators/constant-pad-nd.c b/src/operators/constant-pad-nd.c
index d74df1a..ef1475d 100644
--- a/src/operators/constant-pad-nd.c
+++ b/src/operators/constant-pad-nd.c
@@ -148,7 +148,7 @@
     .input = input,
     .output = output,
     .padding_value = constant_pad_op->pad_value,
-    .fill_ukernel = xnn_params.x32.fill.ukernel,
+    .fill_ukernel = xnn_params.xx.fill.ukernel,
     .pad_ukernel = xnn_params.x32.pad.ukernel,
   };
 
diff --git a/src/x32-fill/neon.c b/src/x32-fill/neon.c
deleted file mode 100644
index fb5e3a0..0000000
--- a/src/x32-fill/neon.c
+++ /dev/null
@@ -1,49 +0,0 @@
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include <xnnpack/fill.h>
-
-
-void xnn_x32_fill_ukernel__neon(
-    size_t rows,
-    size_t channels,
-    uint32_t* output,
-    size_t output_stride,
-    const uint32_t* fill_value)
-{
-  assert(rows != 0);
-  assert(channels != 0);
-  assert(channels % sizeof(uint32_t) == 0);
-  assert(fill_value != NULL);
-
-  const size_t output_increment = output_stride - channels;
-
-  const uint32x4_t vfill = vld1q_dup_u32(fill_value);
-  do {
-    size_t c = channels;
-    for (; c >= 16 * sizeof(uint32_t); c -= 16 * sizeof(uint32_t)) {
-      vst1q_u32(output, vfill); output += 4;
-      vst1q_u32(output, vfill); output += 4;
-      vst1q_u32(output, vfill); output += 4;
-      vst1q_u32(output, vfill); output += 4;
-    }
-    for (; c >= 4 * sizeof(uint32_t); c -= 4 * sizeof(uint32_t)) {
-      vst1q_u32(output, vfill); output += 4;
-    }
-    if XNN_UNLIKELY(c != 0) {
-      if XNN_LIKELY(c & (2 * sizeof(uint32_t))) {
-        vst1_u32(output, vget_low_u32(vfill)); output += 2;
-      }
-      if XNN_LIKELY(c & (1 * sizeof(uint32_t))) {
-        vst1q_lane_u32(output, vfill, 0); output += 1;
-      }
-    }
-    output = (void*) ((uintptr_t) output + output_increment);
-  } while (--rows != 0);
-}
diff --git a/src/x32-fill/scalar-float.c b/src/x32-fill/scalar-float.c
deleted file mode 100644
index 360e71c..0000000
--- a/src/x32-fill/scalar-float.c
+++ /dev/null
@@ -1,44 +0,0 @@
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <xnnpack/fill.h>
-
-
-void xnn_x32_fill_ukernel__scalar_float(
-    size_t rows,
-    size_t channels,
-    uint32_t* output,
-    size_t output_stride,
-    const uint32_t* fill_value)
-{
-  assert(rows != 0);
-  assert(channels != 0);
-  assert(channels % sizeof(float) == 0);
-  assert(fill_value != NULL);
-
-  const size_t output_increment = output_stride - channels;
-
-  const float vfill = *((const float*) fill_value);
-  float* o = (float*) output;
-  do {
-    size_t c = channels;
-    for (; c >= 4 * sizeof(uint32_t); c -= 4 * sizeof(uint32_t)) {
-      o[0] = vfill;
-      o[1] = vfill;
-      o[2] = vfill;
-      o[3] = vfill;
-      o += 4;
-    }
-    if XNN_UNLIKELY(c != 0) {
-      do {
-        *o++ = vfill;
-        c -= sizeof(uint32_t);
-      } while (c != 0);
-    }
-    o = (void*) ((uintptr_t) o + output_increment);
-  } while (--rows != 0);
-}
diff --git a/src/x32-fill/scalar-int.c b/src/x32-fill/scalar-int.c
deleted file mode 100644
index 8648576..0000000
--- a/src/x32-fill/scalar-int.c
+++ /dev/null
@@ -1,43 +0,0 @@
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <xnnpack/fill.h>
-
-
-void xnn_x32_fill_ukernel__scalar_int(
-    size_t rows,
-    size_t channels,
-    uint32_t* output,
-    size_t output_stride,
-    const uint32_t* fill_value)
-{
-  assert(rows != 0);
-  assert(channels != 0);
-  assert(channels % sizeof(uint32_t) == 0);
-  assert(fill_value != NULL);
-
-  const size_t output_increment = output_stride - channels;
-
-  const uint32_t vfill = *fill_value;
-  do {
-    size_t c = channels;
-    for (; c >= 4 * sizeof(uint32_t); c -= 4 * sizeof(uint32_t)) {
-      output[0] = vfill;
-      output[1] = vfill;
-      output[2] = vfill;
-      output[3] = vfill;
-      output += 4;
-    }
-    if XNN_UNLIKELY(c != 0) {
-      do {
-        *output++ = vfill;
-        c -= sizeof(uint32_t);
-      } while (c != 0);
-    }
-    output = (void*) ((uintptr_t) output + output_increment);
-  } while (--rows != 0);
-}
diff --git a/src/x32-fill/sse.c b/src/x32-fill/sse.c
deleted file mode 100644
index 3569c93..0000000
--- a/src/x32-fill/sse.c
+++ /dev/null
@@ -1,54 +0,0 @@
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <xmmintrin.h>
-
-#include <xnnpack/fill.h>
-
-
-void xnn_x32_fill_ukernel__sse(
-    size_t rows,
-    size_t channels,
-    uint32_t* output,
-    size_t output_stride,
-    const uint32_t* fill_value)
-{
-  assert(rows != 0);
-  assert(channels != 0);
-  assert(channels % sizeof(uint32_t) == 0);
-  assert(fill_value != NULL);
-
-  const size_t output_increment = output_stride - channels;
-
-  const __m128 vfill = _mm_load1_ps((const float*) fill_value);
-  float* o = (float*) output;
-  do {
-    size_t c = channels;
-    for (; c >= 16 * sizeof(uint32_t); c -= 16 * sizeof(uint32_t)) {
-      _mm_storeu_ps(o, vfill);
-      _mm_storeu_ps(o + 4, vfill);
-      _mm_storeu_ps(o + 8, vfill);
-      _mm_storeu_ps(o + 12, vfill);
-      o += 16;
-    }
-    for (; c >= 4 * sizeof(uint32_t); c -= 4 * sizeof(uint32_t)) {
-      _mm_storeu_ps(o, vfill);
-      o += 4;
-    }
-    if XNN_UNLIKELY(c != 0) {
-      if XNN_LIKELY(c & (2 * sizeof(uint32_t))) {
-        _mm_storel_pi((__m64*) o, vfill);
-        o += 2;
-      }
-      if XNN_LIKELY(c & (1 * sizeof(uint32_t))) {
-        _mm_store_ss(o, vfill);
-        o += 1;
-      }
-    }
-    o = (void*) ((uintptr_t) o + output_increment);
-  } while (--rows != 0);
-}
diff --git a/src/x32-fill/wasmsimd.c b/src/x32-fill/wasmsimd.c
deleted file mode 100644
index 7ff2242..0000000
--- a/src/x32-fill/wasmsimd.c
+++ /dev/null
@@ -1,53 +0,0 @@
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <wasm_simd128.h>
-
-#include <xnnpack/fill.h>
-
-
-void xnn_x32_fill_ukernel__wasmsimd(
-    size_t rows,
-    size_t channels,
-    uint32_t* output,
-    size_t output_stride,
-    const uint32_t* fill_value)
-{
-  assert(rows != 0);
-  assert(channels != 0);
-  assert(channels % sizeof(uint32_t) == 0);
-  assert(fill_value != NULL);
-
-  const size_t output_increment = output_stride - channels;
-
-  const v128_t vfill = wasm_v128_load32_splat(fill_value);
-  do {
-    size_t c = channels;
-    for (; c >= 16 * sizeof(uint32_t); c -= 16 * sizeof(uint32_t)) {
-      wasm_v128_store(output, vfill);
-      wasm_v128_store(output + 4, vfill);
-      wasm_v128_store(output + 8, vfill);
-      wasm_v128_store(output + 12, vfill);
-      output += 16;
-    }
-    for (; c >= 4 * sizeof(uint32_t); c -= 4 * sizeof(uint32_t)) {
-      wasm_v128_store(output, vfill);
-      output += 4;
-    }
-    if XNN_UNLIKELY(c != 0) {
-      if XNN_LIKELY(c & (2 * sizeof(uint32_t))) {
-        *((double*) output) = wasm_f64x2_extract_lane(vfill, 0);
-        output += 2;
-      }
-      if XNN_LIKELY(c & (1 * sizeof(uint32_t))) {
-        *((float*) output) = wasm_f32x4_extract_lane(vfill, 0);
-        output += 1;
-      }
-    }
-    output = (void*) ((uintptr_t) output + output_increment);
-  } while (--rows != 0);
-}
diff --git a/src/xnnpack/fill.h b/src/xnnpack/fill.h
index 43fc96e..4c74388 100644
--- a/src/xnnpack/fill.h
+++ b/src/xnnpack/fill.h
@@ -20,15 +20,14 @@
   XNN_INTERNAL void fn_name(                   \
     size_t kernel_elements,                    \
     size_t channels,                           \
-    uint32_t* output,                          \
+    void* output,                              \
     size_t output_stride,                      \
-    const uint32_t* fill_value);
+    const uint32_t fill_pattern);
 
-DECLARE_FILL_UKERNEL_FUNCTION(xnn_x32_fill_ukernel__sse)
-DECLARE_FILL_UKERNEL_FUNCTION(xnn_x32_fill_ukernel__neon)
-DECLARE_FILL_UKERNEL_FUNCTION(xnn_x32_fill_ukernel__wasmsimd)
-DECLARE_FILL_UKERNEL_FUNCTION(xnn_x32_fill_ukernel__scalar_float)
-DECLARE_FILL_UKERNEL_FUNCTION(xnn_x32_fill_ukernel__scalar_int)
+DECLARE_FILL_UKERNEL_FUNCTION(xnn_xx_fill_ukernel__sse2_x64)
+DECLARE_FILL_UKERNEL_FUNCTION(xnn_xx_fill_ukernel__neon_x64)
+DECLARE_FILL_UKERNEL_FUNCTION(xnn_xx_fill_ukernel__wasmsimd_x64)
+DECLARE_FILL_UKERNEL_FUNCTION(xnn_xx_fill_ukernel__scalar_x16)
 
 
 #ifdef __cplusplus
diff --git a/src/xnnpack/params.h b/src/xnnpack/params.h
index 0314b7f..57a880a 100644
--- a/src/xnnpack/params.h
+++ b/src/xnnpack/params.h
@@ -1284,14 +1284,7 @@
     size_t channels,
     void* output,
     size_t output_stride,
-    const void* fill_value);
-
-typedef void (*xnn_x32_fill_ukernel_function)(
-    size_t rows,
-    size_t channels,
-    uint32_t* output,
-    size_t output_stride,
-    const uint32_t* fill_value);
+    const uint32_t fill_pattern);
 
 typedef void (*xnn_depthtospace2d_chw2hwc_ukernel_function)(
     size_t output_channels,
@@ -2538,9 +2531,6 @@
   uint32_t init_flags;
   struct xnn_allocator allocator;
   struct {
-    xnn_univector_ukernel_function copy;
-  } xx;
-  struct {
     struct gemm_parameters gemm;
     struct dwconv_parameters dwconv[XNN_MAX_QC8_DWCONV_UKERNELS];
   } qc8;
@@ -2638,12 +2628,15 @@
   } f32;
   struct {
     struct pad_parameters pad;
-    struct fill_parameters fill;
     xnn_unpool_ukernel_function unpool;
     struct zip_parameters zip;
     // Depth To Space 2D with CHW->HWC layout conversion.
     struct depthtospace2d_chw2hwc_parameters depthtospace2d_chw2hwc;
   } x32;
+  struct {
+    xnn_univector_ukernel_function copy;
+    struct fill_parameters fill;
+  } xx;
 };
 
 #ifdef __cplusplus
diff --git a/src/xx-fill/neon-x64.c b/src/xx-fill/neon-x64.c
new file mode 100644
index 0000000..57188f1
--- /dev/null
+++ b/src/xx-fill/neon-x64.c
@@ -0,0 +1,55 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/fill.h>
+
+
+void xnn_xx_fill_ukernel__neon_x64(
+    size_t rows,
+    size_t channels,
+    void* output,
+    size_t output_stride,
+    const uint32_t fill_pattern)
+{
+  assert(rows != 0);
+  assert(channels != 0);
+
+  const size_t output_increment = output_stride - channels;
+
+  const uint8x16_t vfill_pattern = vreinterpretq_u8_u32(vdupq_n_u32(fill_pattern));
+  do {
+    size_t c = channels;
+    for (; c >= 64 * sizeof(uint8_t); c -= 64 * sizeof(uint8_t)) {
+      vst1q_u8(output, vfill_pattern); output = ((uint8_t*) output + 16);
+      vst1q_u8(output, vfill_pattern); output = ((uint8_t*) output + 16);
+      vst1q_u8(output, vfill_pattern); output = ((uint8_t*) output + 16);
+      vst1q_u8(output, vfill_pattern); output = ((uint8_t*) output + 16);
+    }
+    for (; c >= 16 * sizeof(uint8_t); c -= 16 * sizeof(uint8_t)) {
+      vst1q_u8(output, vfill_pattern); output = ((uint8_t*) output + 16);
+    }
+    if XNN_UNLIKELY(c != 0) {
+      if XNN_LIKELY(c & (8 * sizeof(uint8_t))) {
+        vst1_u8(output, vget_low_u8(vfill_pattern)); output = ((uint8_t*) output + 8);
+      }
+      if XNN_LIKELY(c & (4 * sizeof(uint8_t))) {
+        vst1q_lane_u32(output, vreinterpretq_u32_u8(vfill_pattern), 0); output = ((uint8_t*) output + 4);
+      }
+      uint8x8_t vfill_subpattern = vget_low_u8(vfill_pattern);
+      if XNN_LIKELY(c & (2 * sizeof(uint8_t))) {
+        vst1_lane_u16(output, vreinterpret_u16_u8(vfill_subpattern), 0); output = ((uint8_t*) output + 2);
+        vfill_subpattern = vext_u8(vfill_subpattern, vfill_subpattern, 2);
+      }
+      if XNN_LIKELY(c & (1 * sizeof(uint8_t))) {
+        vst1_lane_u8(output, vfill_subpattern, 0); output = ((uint8_t*) output + 1);
+      }
+    }
+    output = (void*) ((uintptr_t) output + output_increment);
+  } while (--rows != 0);
+}
diff --git a/src/xx-fill/scalar-x16.c b/src/xx-fill/scalar-x16.c
new file mode 100644
index 0000000..a05eb0a
--- /dev/null
+++ b/src/xx-fill/scalar-x16.c
@@ -0,0 +1,55 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/fill.h>
+
+
+void xnn_xx_fill_ukernel__scalar_x16(
+    size_t rows,
+    size_t channels,
+    void* output,
+    size_t output_stride,
+    const uint32_t fill_pattern)
+{
+  assert(rows != 0);
+  assert(channels != 0);
+
+  const size_t output_increment = output_stride - channels;
+
+  do {
+    uint32_t vfill_pattern = fill_pattern;
+    size_t c = channels;
+    for (; c >= 16 * sizeof(uint8_t); c -= 16 * sizeof(uint8_t)) {
+      ((uint32_t*) output)[0] = vfill_pattern;
+      ((uint32_t*) output)[1] = vfill_pattern;
+      ((uint32_t*) output)[2] = vfill_pattern;
+      ((uint32_t*) output)[3] = vfill_pattern;
+      output = ((uint8_t*) output + 16);
+    }
+    if XNN_UNLIKELY(c != 0) {
+      if XNN_LIKELY(c & (8 * sizeof(uint8_t))) {
+        ((uint32_t*) output)[0] = vfill_pattern;
+        ((uint32_t*) output)[1] = vfill_pattern;
+        output = ((uint8_t*) output + 8);
+      }
+      if XNN_LIKELY(c & (4 * sizeof(uint8_t))) {
+        *((uint32_t*) output) = vfill_pattern;
+        output = ((uint8_t*) output + 4);
+      }
+      if XNN_LIKELY(c & (2 * sizeof(uint8_t))) {
+        *((uint16_t*) output) = (uint16_t) vfill_pattern;
+        vfill_pattern >>= 16;
+        output = ((uint8_t*) output + 2);
+      }
+      if XNN_LIKELY(c & (1 * sizeof(uint8_t))) {
+        *((uint8_t*) output) = (uint8_t) vfill_pattern;
+        output = ((uint8_t*) output + 1);
+      }
+    }
+    output = (void*) ((uintptr_t) output + output_increment);
+  } while (--rows != 0);
+}
diff --git a/src/xx-fill/sse2-x64.c b/src/xx-fill/sse2-x64.c
new file mode 100644
index 0000000..08a55cd
--- /dev/null
+++ b/src/xx-fill/sse2-x64.c
@@ -0,0 +1,61 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/fill.h>
+
+
+// Fills a rows x channels byte matrix with the 4-byte fill_pattern repeated.
+// `channels` and `output_stride` are both measured in bytes; consecutive rows
+// start output_stride bytes apart.
+void xnn_xx_fill_ukernel__sse2_x64(
+    size_t rows,
+    size_t channels,
+    void* output,
+    size_t output_stride,
+    const uint32_t fill_pattern)
+{
+  assert(rows != 0);
+  assert(channels != 0);
+
+  // Bytes to skip after writing `channels` bytes to reach the next row.
+  const size_t output_increment = output_stride - channels;
+
+  // Broadcast the 32-bit pattern into all four lanes of an XMM register.
+  const __m128i vfill = _mm_shuffle_epi32(_mm_cvtsi32_si128(fill_pattern), _MM_SHUFFLE(0, 0, 0, 0));
+  do {
+    size_t c = channels;
+    // Main loop: 64 bytes per iteration via four unaligned 128-bit stores.
+    for (; c >= 64 * sizeof(uint8_t); c -= 64 * sizeof(uint8_t)) {
+      _mm_storeu_si128((__m128i*) output, vfill);
+      _mm_storeu_si128((__m128i*) output + 1, vfill);
+      _mm_storeu_si128((__m128i*) output + 2, vfill);
+      _mm_storeu_si128((__m128i*) output + 3, vfill);
+      output = ((uint8_t*) output + 64);
+    }
+    // Secondary loop: one 16-byte store at a time.
+    for (; c >= 16 * sizeof(uint8_t); c -= 16 * sizeof(uint8_t)) {
+      _mm_storeu_si128((__m128i*) output, vfill);
+      output = ((uint8_t*) output + 16);
+    }
+    if XNN_UNLIKELY(c != 0) {
+      // Tail: store 8, 4, 2, and 1 remaining bytes as the bits of c dictate.
+      if XNN_LIKELY(c & (8 * sizeof(uint8_t))) {
+        // Stores the low 64 bits of vfill (two pattern copies).
+        _mm_storel_epi64(output, vfill);
+        output = ((uint8_t*) output + 8);
+      }
+      // NOTE(review): this 32-bit store assumes `output` is 4-byte aligned
+      // at this point -- confirm callers guarantee it.
+      if XNN_LIKELY(c & (4 * sizeof(uint8_t))) {
+        *((uint32_t*) output) = fill_pattern;
+        output = ((uint8_t*) output + 4);
+      }
+      uint32_t vfill_subpattern = fill_pattern;
+      if XNN_LIKELY(c & (2 * sizeof(uint8_t))) {
+        *((uint16_t*) output) = (uint16_t) vfill_subpattern;
+        // Shift so a trailing single byte continues the pattern sequence.
+        vfill_subpattern >>= 16;
+        output = ((uint8_t*) output + 2);
+      }
+      if XNN_LIKELY(c & (1 * sizeof(uint8_t))) {
+        *((uint8_t*) output) = (uint8_t) vfill_subpattern;
+        output = ((uint8_t*) output + 1);
+      }
+    }
+    output = (void*) ((uintptr_t) output + output_increment);
+  } while (--rows != 0);
+}
diff --git a/src/xx-fill/wasmsimd-x64.c b/src/xx-fill/wasmsimd-x64.c
new file mode 100644
index 0000000..f629f2b
--- /dev/null
+++ b/src/xx-fill/wasmsimd-x64.c
@@ -0,0 +1,61 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/fill.h>
+
+
+// Fills a rows x channels byte matrix with the 4-byte fill_pattern repeated.
+// `channels` and `output_stride` are both measured in bytes; consecutive rows
+// start output_stride bytes apart.
+void xnn_xx_fill_ukernel__wasmsimd_x64(
+    size_t rows,
+    size_t channels,
+    void* output,
+    size_t output_stride,
+    const uint32_t fill_pattern)
+{
+  assert(rows != 0);
+  assert(channels != 0);
+
+  // Bytes to skip after writing `channels` bytes to reach the next row.
+  const size_t output_increment = output_stride - channels;
+
+  // Broadcast the 32-bit pattern into all four lanes of a v128 register.
+  const v128_t vfill_pattern = wasm_i32x4_splat(fill_pattern);
+  do {
+    size_t c = channels;
+    // Main loop: 64 bytes per iteration via four 128-bit stores.
+    for (; c >= 64 * sizeof(uint8_t); c -= 64 * sizeof(uint8_t)) {
+      wasm_v128_store(output, vfill_pattern);
+      wasm_v128_store((uint8_t*) output + 16, vfill_pattern);
+      wasm_v128_store((uint8_t*) output + 32, vfill_pattern);
+      wasm_v128_store((uint8_t*) output + 48, vfill_pattern);
+      output = ((uint8_t*) output + 64);
+    }
+    // Secondary loop: one 16-byte store at a time.
+    for (; c >= 16 * sizeof(uint8_t); c -= 16 * sizeof(uint8_t)) {
+      wasm_v128_store(output, vfill_pattern);
+      output = ((uint8_t*) output + 16);
+    }
+    if XNN_UNLIKELY(c != 0) {
+      // Tail: store 8, 4, 2, and 1 remaining bytes as the bits of c dictate.
+      if XNN_LIKELY(c & (8 * sizeof(uint8_t))) {
+        // Stores the low 8 bytes of the vector (two pattern copies).
+        // NOTE(review): this double store assumes 8-byte alignment of
+        // `output` here -- confirm callers guarantee it.
+        *((double*) output) = wasm_f64x2_extract_lane(vfill_pattern, 0);
+        output = ((uint8_t*) output + 8);
+      }
+      uint32_t vfill_subpattern = fill_pattern;
+      if XNN_LIKELY(c & (4 * sizeof(uint8_t))) {
+        *((uint32_t*) output) = vfill_subpattern;
+        output = ((uint8_t*) output + 4);
+      }
+      if XNN_LIKELY(c & (2 * sizeof(uint8_t))) {
+        *((uint16_t*) output) = (uint16_t) vfill_subpattern;
+        // Shift so a trailing single byte continues the pattern sequence.
+        vfill_subpattern >>= 16;
+        output = ((uint8_t*) output + 2);
+      }
+      if XNN_LIKELY(c & (1 * sizeof(uint8_t))) {
+        *((uint8_t*) output) = (uint8_t) vfill_subpattern;
+        output = ((uint8_t*) output + 1);
+      }
+    }
+    output = (void*) ((uintptr_t) output + output_increment);
+  } while (--rows != 0);
+}
diff --git a/test/fill-microkernel-tester.h b/test/fill-microkernel-tester.h
index e40df66..47466b6 100644
--- a/test/fill-microkernel-tester.h
+++ b/test/fill-microkernel-tester.h
@@ -64,32 +64,35 @@
     return this->iterations_;
   }
 
-  void Test(xnn_x32_fill_ukernel_function fill) const {
+  void Test(xnn_fill_ukernel_function fill) const {
     ASSERT_GE(output_stride(), channels());
 
     std::random_device random_device;
     auto rng = std::mt19937(random_device());
-    auto u32rng = std::bind(std::uniform_int_distribution<uint32_t>(), rng);
+    auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), rng);
 
-    std::vector<uint32_t> output((rows() - 1) * output_stride() + channels());
-    std::vector<uint32_t> output_copy(output.size());
+    std::vector<uint8_t> output((rows() - 1) * output_stride() + channels());
+    std::vector<uint8_t> output_copy(output.size());
     for (size_t iteration = 0; iteration < iterations(); iteration++) {
-      std::generate(output.begin(), output.end(), std::ref(u32rng));
+      std::generate(output.begin(), output.end(), std::ref(u8rng));
       std::copy(output.cbegin(), output.cend(), output_copy.begin());
-      const uint32_t fill_value = u32rng();
+      std::array<uint8_t, 4> fill_pattern;
+      std::generate(fill_pattern.begin(), fill_pattern.end(), std::ref(u8rng));
+      uint32_t fill_value = 0;
+      memcpy(&fill_value, fill_pattern.data(), sizeof(fill_value));
 
       // Call optimized micro-kernel.
       fill(
         rows(),
-        channels() * sizeof(uint32_t),
+        channels() * sizeof(uint8_t),
         output.data(),
-        output_stride() * sizeof(uint32_t),
-        &fill_value);
+        output_stride() * sizeof(uint8_t),
+        fill_value);
 
       // Verify results.
       for (size_t i = 0; i < rows(); i++) {
         for (size_t c = 0; c < channels(); c++) {
-          ASSERT_EQ(output[i * output_stride() + c], fill_value)
+          ASSERT_EQ(uint32_t(output[i * output_stride() + c]), uint32_t(fill_pattern[c % fill_pattern.size()]))
             << "at row " << i << " / " << rows()
             << ", channel " << c << " / " << channels()
             << ", fill value 0x" << std::hex << std::setw(8) << std::setfill('0') << fill_value
@@ -98,7 +101,7 @@
       }
       for (size_t i = 0; i + 1 < rows(); i++) {
         for (size_t c = channels(); c < output_stride(); c++) {
-          ASSERT_EQ(output[i * output_stride() + c], output_copy[i * output_stride() + c])
+          ASSERT_EQ(uint32_t(output[i * output_stride() + c]), uint32_t(output_copy[i * output_stride() + c]))
             << "at row " << i << " / " << rows()
             << ", channel " << c << " / " << channels()
             << ", original value 0x" << std::hex << std::setw(8) << std::setfill('0')
diff --git a/test/x32-fill.cc b/test/x32-fill.cc
deleted file mode 100644
index 9035e42..0000000
--- a/test/x32-fill.cc
+++ /dev/null
@@ -1,268 +0,0 @@
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <gtest/gtest.h>
-
-#include <xnnpack/common.h>
-#include <xnnpack/isa-checks.h>
-
-#include <xnnpack/fill.h>
-#include "fill-microkernel-tester.h"
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(X32_FILL__NEON, channels_eq_4) {
-    TEST_REQUIRES_ARM_NEON;
-    FillMicrokernelTester()
-      .channels(4)
-      .Test(xnn_x32_fill_ukernel__neon);
-  }
-
-  TEST(X32_FILL__NEON, channels_div_4) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t channels = 8; channels < 32; channels += 4) {
-      FillMicrokernelTester()
-        .channels(channels)
-        .Test(xnn_x32_fill_ukernel__neon);
-    }
-  }
-
-  TEST(X32_FILL__NEON, channels_lt_4) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t channels = 1; channels < 4; channels++) {
-      FillMicrokernelTester()
-        .channels(channels)
-        .Test(xnn_x32_fill_ukernel__neon);
-    }
-  }
-
-  TEST(X32_FILL__NEON, channels_gt_4) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t channels = 5; channels < 8; channels++) {
-      FillMicrokernelTester()
-        .channels(channels)
-        .Test(xnn_x32_fill_ukernel__neon);
-    }
-  }
-
-  TEST(X32_FILL__NEON, multiple_rows) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t rows = 2; rows < 5; rows++) {
-      for (size_t channels = 1; channels < 16; channels += 3) {
-        FillMicrokernelTester()
-          .channels(channels)
-          .rows(rows)
-          .Test(xnn_x32_fill_ukernel__neon);
-      }
-    }
-  }
-
-  TEST(X32_FILL__NEON, multiple_rows_with_output_stride) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t rows = 2; rows < 5; rows++) {
-      for (size_t channels = 1; channels < 16; channels += 3) {
-        FillMicrokernelTester()
-          .channels(channels)
-          .rows(rows)
-          .output_stride(17)
-          .Test(xnn_x32_fill_ukernel__neon);
-      }
-    }
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(X32_FILL__SSE, channels_eq_4) {
-    TEST_REQUIRES_X86_SSE;
-    FillMicrokernelTester()
-      .channels(4)
-      .Test(xnn_x32_fill_ukernel__sse);
-  }
-
-  TEST(X32_FILL__SSE, channels_div_4) {
-    TEST_REQUIRES_X86_SSE;
-    for (size_t channels = 8; channels < 32; channels += 4) {
-      FillMicrokernelTester()
-        .channels(channels)
-        .Test(xnn_x32_fill_ukernel__sse);
-    }
-  }
-
-  TEST(X32_FILL__SSE, channels_lt_4) {
-    TEST_REQUIRES_X86_SSE;
-    for (size_t channels = 1; channels < 4; channels++) {
-      FillMicrokernelTester()
-        .channels(channels)
-        .Test(xnn_x32_fill_ukernel__sse);
-    }
-  }
-
-  TEST(X32_FILL__SSE, channels_gt_4) {
-    TEST_REQUIRES_X86_SSE;
-    for (size_t channels = 5; channels < 8; channels++) {
-      FillMicrokernelTester()
-        .channels(channels)
-        .Test(xnn_x32_fill_ukernel__sse);
-    }
-  }
-
-  TEST(X32_FILL__SSE, multiple_rows) {
-    TEST_REQUIRES_X86_SSE;
-    for (size_t rows = 2; rows < 5; rows++) {
-      for (size_t channels = 1; channels < 16; channels += 3) {
-        FillMicrokernelTester()
-          .channels(channels)
-          .rows(rows)
-          .Test(xnn_x32_fill_ukernel__sse);
-      }
-    }
-  }
-
-  TEST(X32_FILL__SSE, multiple_rows_with_output_stride) {
-    TEST_REQUIRES_X86_SSE;
-    for (size_t rows = 2; rows < 5; rows++) {
-      for (size_t channels = 1; channels < 16; channels += 3) {
-        FillMicrokernelTester()
-          .channels(channels)
-          .rows(rows)
-          .output_stride(17)
-          .Test(xnn_x32_fill_ukernel__sse);
-      }
-    }
-  }
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
-#if XNN_ARCH_WASMSIMD
-  TEST(X32_FILL__WASMSIMD, channels_eq_4) {
-    FillMicrokernelTester()
-      .channels(4)
-      .Test(xnn_x32_fill_ukernel__wasmsimd);
-  }
-
-  TEST(X32_FILL__WASMSIMD, channels_div_4) {
-    for (size_t channels = 8; channels < 32; channels += 4) {
-      FillMicrokernelTester()
-        .channels(channels)
-        .Test(xnn_x32_fill_ukernel__wasmsimd);
-    }
-  }
-
-  TEST(X32_FILL__WASMSIMD, channels_lt_4) {
-    for (size_t channels = 1; channels < 4; channels++) {
-      FillMicrokernelTester()
-        .channels(channels)
-        .Test(xnn_x32_fill_ukernel__wasmsimd);
-    }
-  }
-
-  TEST(X32_FILL__WASMSIMD, channels_gt_4) {
-    for (size_t channels = 5; channels < 8; channels++) {
-      FillMicrokernelTester()
-        .channels(channels)
-        .Test(xnn_x32_fill_ukernel__wasmsimd);
-    }
-  }
-
-  TEST(X32_FILL__WASMSIMD, multiple_rows) {
-    for (size_t rows = 2; rows < 5; rows++) {
-      for (size_t channels = 1; channels < 16; channels += 3) {
-        FillMicrokernelTester()
-          .channels(channels)
-          .rows(rows)
-          .Test(xnn_x32_fill_ukernel__wasmsimd);
-      }
-    }
-  }
-
-  TEST(X32_FILL__WASMSIMD, multiple_rows_with_output_stride) {
-    for (size_t rows = 2; rows < 5; rows++) {
-      for (size_t channels = 1; channels < 16; channels += 3) {
-        FillMicrokernelTester()
-          .channels(channels)
-          .rows(rows)
-          .output_stride(17)
-          .Test(xnn_x32_fill_ukernel__wasmsimd);
-      }
-    }
-  }
-#endif  // XNN_ARCH_WASMSIMD
-
-
-TEST(X32_FILL__SCALAR_FLOAT, eq_1) {
-  FillMicrokernelTester()
-    .channels(1)
-    .Test(xnn_x32_fill_ukernel__scalar_float);
-}
-
-TEST(X32_FILL__SCALAR_FLOAT, channels_gt_1) {
-  for (size_t channels = 2; channels < 10; channels++) {
-    FillMicrokernelTester()
-      .channels(channels)
-      .Test(xnn_x32_fill_ukernel__scalar_float);
-  }
-}
-
-TEST(X32_FILL__SCALAR_FLOAT, multiple_rows) {
-  for (size_t rows = 2; rows < 5; rows++) {
-    for (size_t channels = 1; channels < 16; channels += 3) {
-      FillMicrokernelTester()
-        .channels(channels)
-        .rows(rows)
-        .Test(xnn_x32_fill_ukernel__scalar_float);
-    }
-  }
-}
-
-TEST(X32_FILL__SCALAR_FLOAT, multiple_rows_with_output_stride) {
-  for (size_t rows = 2; rows < 5; rows++) {
-    for (size_t channels = 1; channels < 16; channels += 3) {
-      FillMicrokernelTester()
-        .channels(channels)
-        .rows(rows)
-        .output_stride(17)
-        .Test(xnn_x32_fill_ukernel__scalar_float);
-    }
-  }
-}
-
-
-TEST(X32_FILL__SCALAR_INT, eq_1) {
-  FillMicrokernelTester()
-    .channels(1)
-    .Test(xnn_x32_fill_ukernel__scalar_int);
-}
-
-TEST(X32_FILL__SCALAR_INT, channels_gt_1) {
-  for (size_t channels = 2; channels < 10; channels++) {
-    FillMicrokernelTester()
-      .channels(channels)
-      .Test(xnn_x32_fill_ukernel__scalar_int);
-  }
-}
-
-TEST(X32_FILL__SCALAR_INT, multiple_rows) {
-  for (size_t rows = 2; rows < 5; rows++) {
-    for (size_t channels = 1; channels < 16; channels += 3) {
-      FillMicrokernelTester()
-        .channels(channels)
-        .rows(rows)
-        .Test(xnn_x32_fill_ukernel__scalar_int);
-    }
-  }
-}
-
-TEST(X32_FILL__SCALAR_INT, multiple_rows_with_output_stride) {
-  for (size_t rows = 2; rows < 5; rows++) {
-    for (size_t channels = 1; channels < 16; channels += 3) {
-      FillMicrokernelTester()
-        .channels(channels)
-        .rows(rows)
-        .output_stride(17)
-        .Test(xnn_x32_fill_ukernel__scalar_int);
-    }
-  }
-}
diff --git a/test/xx-fill.cc b/test/xx-fill.cc
new file mode 100644
index 0000000..a8e4305
--- /dev/null
+++ b/test/xx-fill.cc
@@ -0,0 +1,324 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <gtest/gtest.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/isa-checks.h>
+
+#include <xnnpack/fill.h>
+#include "fill-microkernel-tester.h"
+
+
+// Tests for the NEON byte-fill microkernel (primary tile: 64 bytes per
+// iteration), covering exact tiles, multiples, partial tails, and strides.
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(XX_FILL__NEON_X64, channels_eq_1) {
+    TEST_REQUIRES_ARM_NEON;
+    FillMicrokernelTester()
+      .channels(1)
+      .Test(xnn_xx_fill_ukernel__neon_x64);
+  }
+
+  TEST(XX_FILL__NEON_X64, channels_eq_2) {
+    TEST_REQUIRES_ARM_NEON;
+    FillMicrokernelTester()
+      .channels(2)
+      .Test(xnn_xx_fill_ukernel__neon_x64);
+  }
+
+  TEST(XX_FILL__NEON_X64, channels_eq_4) {
+    TEST_REQUIRES_ARM_NEON;
+    FillMicrokernelTester()
+      .channels(4)
+      .Test(xnn_xx_fill_ukernel__neon_x64);
+  }
+
+  TEST(XX_FILL__NEON_X64, channels_eq_64) {
+    TEST_REQUIRES_ARM_NEON;
+    FillMicrokernelTester()
+      .channels(64)
+      .Test(xnn_xx_fill_ukernel__neon_x64);
+  }
+
+  TEST(XX_FILL__NEON_X64, channels_div_64) {
+    TEST_REQUIRES_ARM_NEON;
+    // Step by the 64-byte tile to cover exact multiples of the tile.
+    for (size_t channels = 128; channels <= 192; channels += 64) {
+      FillMicrokernelTester()
+        .channels(channels)
+        .Test(xnn_xx_fill_ukernel__neon_x64);
+    }
+  }
+
+  TEST(XX_FILL__NEON_X64, channels_lt_64) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t channels = 1; channels < 64; channels++) {
+      FillMicrokernelTester()
+        .channels(channels)
+        .Test(xnn_xx_fill_ukernel__neon_x64);
+    }
+  }
+
+  TEST(XX_FILL__NEON_X64, channels_gt_64) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t channels = 65; channels < 128; channels++) {
+      FillMicrokernelTester()
+        .channels(channels)
+        .Test(xnn_xx_fill_ukernel__neon_x64);
+    }
+  }
+
+  TEST(XX_FILL__NEON_X64, multiple_rows) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t rows = 2; rows < 5; rows++) {
+      for (size_t channels = 1; channels < 192; channels += 15) {
+        FillMicrokernelTester()
+          .channels(channels)
+          .rows(rows)
+          .Test(xnn_xx_fill_ukernel__neon_x64);
+      }
+    }
+  }
+
+  TEST(XX_FILL__NEON_X64, multiple_rows_with_output_stride) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t rows = 2; rows < 5; rows++) {
+      for (size_t channels = 1; channels < 192; channels += 15) {
+        FillMicrokernelTester()
+          .channels(channels)
+          .rows(rows)
+          .output_stride(193)
+          .Test(xnn_xx_fill_ukernel__neon_x64);
+      }
+    }
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+// Tests for the SSE2 byte-fill microkernel (primary tile: 64 bytes per
+// iteration), covering exact tiles, multiples, partial tails, and strides.
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(XX_FILL__SSE2_X64, channels_eq_1) {
+    TEST_REQUIRES_X86_SSE2;
+    FillMicrokernelTester()
+      .channels(1)
+      .Test(xnn_xx_fill_ukernel__sse2_x64);
+  }
+
+  TEST(XX_FILL__SSE2_X64, channels_eq_2) {
+    TEST_REQUIRES_X86_SSE2;
+    FillMicrokernelTester()
+      .channels(2)
+      .Test(xnn_xx_fill_ukernel__sse2_x64);
+  }
+
+  TEST(XX_FILL__SSE2_X64, channels_eq_4) {
+    TEST_REQUIRES_X86_SSE2;
+    FillMicrokernelTester()
+      .channels(4)
+      .Test(xnn_xx_fill_ukernel__sse2_x64);
+  }
+
+  TEST(XX_FILL__SSE2_X64, channels_eq_64) {
+    TEST_REQUIRES_X86_SSE2;
+    FillMicrokernelTester()
+      .channels(64)
+      .Test(xnn_xx_fill_ukernel__sse2_x64);
+  }
+
+  TEST(XX_FILL__SSE2_X64, channels_div_64) {
+    TEST_REQUIRES_X86_SSE2;
+    // Step by the 64-byte tile to cover exact multiples of the tile.
+    for (size_t channels = 128; channels <= 192; channels += 64) {
+      FillMicrokernelTester()
+        .channels(channels)
+        .Test(xnn_xx_fill_ukernel__sse2_x64);
+    }
+  }
+
+  TEST(XX_FILL__SSE2_X64, channels_lt_64) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t channels = 1; channels < 64; channels++) {
+      FillMicrokernelTester()
+        .channels(channels)
+        .Test(xnn_xx_fill_ukernel__sse2_x64);
+    }
+  }
+
+  TEST(XX_FILL__SSE2_X64, channels_gt_64) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t channels = 65; channels < 128; channels++) {
+      FillMicrokernelTester()
+        .channels(channels)
+        .Test(xnn_xx_fill_ukernel__sse2_x64);
+    }
+  }
+
+  TEST(XX_FILL__SSE2_X64, multiple_rows) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t rows = 2; rows < 5; rows++) {
+      for (size_t channels = 1; channels < 192; channels += 15) {
+        FillMicrokernelTester()
+          .channels(channels)
+          .rows(rows)
+          .Test(xnn_xx_fill_ukernel__sse2_x64);
+      }
+    }
+  }
+
+  TEST(XX_FILL__SSE2_X64, multiple_rows_with_output_stride) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t rows = 2; rows < 5; rows++) {
+      for (size_t channels = 1; channels < 192; channels += 15) {
+        FillMicrokernelTester()
+          .channels(channels)
+          .rows(rows)
+          .output_stride(193)
+          .Test(xnn_xx_fill_ukernel__sse2_x64);
+      }
+    }
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+// Tests for the WAsm SIMD byte-fill microkernel (primary tile: 64 bytes per
+// iteration), covering exact tiles, multiples, partial tails, and strides.
+#if XNN_ARCH_WASMSIMD
+  TEST(XX_FILL__WASMSIMD_X64, channels_eq_1) {
+    FillMicrokernelTester()
+      .channels(1)
+      .Test(xnn_xx_fill_ukernel__wasmsimd_x64);
+  }
+
+  TEST(XX_FILL__WASMSIMD_X64, channels_eq_2) {
+    FillMicrokernelTester()
+      .channels(2)
+      .Test(xnn_xx_fill_ukernel__wasmsimd_x64);
+  }
+
+  TEST(XX_FILL__WASMSIMD_X64, channels_eq_4) {
+    FillMicrokernelTester()
+      .channels(4)
+      .Test(xnn_xx_fill_ukernel__wasmsimd_x64);
+  }
+
+  TEST(XX_FILL__WASMSIMD_X64, channels_eq_64) {
+    FillMicrokernelTester()
+      .channels(64)
+      .Test(xnn_xx_fill_ukernel__wasmsimd_x64);
+  }
+
+  TEST(XX_FILL__WASMSIMD_X64, channels_div_64) {
+    // Step by the 64-byte tile to cover exact multiples of the tile.
+    for (size_t channels = 128; channels <= 192; channels += 64) {
+      FillMicrokernelTester()
+        .channels(channels)
+        .Test(xnn_xx_fill_ukernel__wasmsimd_x64);
+    }
+  }
+
+  TEST(XX_FILL__WASMSIMD_X64, channels_lt_64) {
+    for (size_t channels = 1; channels < 64; channels++) {
+      FillMicrokernelTester()
+        .channels(channels)
+        .Test(xnn_xx_fill_ukernel__wasmsimd_x64);
+    }
+  }
+
+  TEST(XX_FILL__WASMSIMD_X64, channels_gt_64) {
+    for (size_t channels = 65; channels < 128; channels++) {
+      FillMicrokernelTester()
+        .channels(channels)
+        .Test(xnn_xx_fill_ukernel__wasmsimd_x64);
+    }
+  }
+
+  TEST(XX_FILL__WASMSIMD_X64, multiple_rows) {
+    for (size_t rows = 2; rows < 5; rows++) {
+      for (size_t channels = 1; channels < 192; channels += 15) {
+        FillMicrokernelTester()
+          .channels(channels)
+          .rows(rows)
+          .Test(xnn_xx_fill_ukernel__wasmsimd_x64);
+      }
+    }
+  }
+
+  TEST(XX_FILL__WASMSIMD_X64, multiple_rows_with_output_stride) {
+    for (size_t rows = 2; rows < 5; rows++) {
+      for (size_t channels = 1; channels < 192; channels += 15) {
+        FillMicrokernelTester()
+          .channels(channels)
+          .rows(rows)
+          .output_stride(193)
+          .Test(xnn_xx_fill_ukernel__wasmsimd_x64);
+      }
+    }
+  }
+#endif  // XNN_ARCH_WASMSIMD
+
+
+// Tests for the scalar byte-fill microkernel (primary tile: 16 bytes per
+// iteration), covering exact tiles, multiples, partial tails, and strides.
+TEST(XX_FILL__SCALAR_X16, channels_eq_1) {
+  FillMicrokernelTester()
+    .channels(1)
+    .Test(xnn_xx_fill_ukernel__scalar_x16);
+}
+
+TEST(XX_FILL__SCALAR_X16, channels_eq_2) {
+  FillMicrokernelTester()
+    .channels(2)
+    .Test(xnn_xx_fill_ukernel__scalar_x16);
+}
+
+TEST(XX_FILL__SCALAR_X16, channels_eq_4) {
+  FillMicrokernelTester()
+    .channels(4)
+    .Test(xnn_xx_fill_ukernel__scalar_x16);
+}
+
+TEST(XX_FILL__SCALAR_X16, channels_eq_16) {
+  FillMicrokernelTester()
+    .channels(16)
+    .Test(xnn_xx_fill_ukernel__scalar_x16);
+}
+
+TEST(XX_FILL__SCALAR_X16, channels_div_16) {
+  // Step by the 16-byte tile so both multiples in [32, 48] are exercised;
+  // stepping by 48 would have tested channels == 32 only (and matches the
+  // div_64 tests, which step by their 64-byte tile).
+  for (size_t channels = 32; channels <= 48; channels += 16) {
+    FillMicrokernelTester()
+      .channels(channels)
+      .Test(xnn_xx_fill_ukernel__scalar_x16);
+  }
+}
+
+TEST(XX_FILL__SCALAR_X16, channels_lt_16) {
+  for (size_t channels = 1; channels < 16; channels++) {
+    FillMicrokernelTester()
+      .channels(channels)
+      .Test(xnn_xx_fill_ukernel__scalar_x16);
+  }
+}
+
+TEST(XX_FILL__SCALAR_X16, channels_gt_16) {
+  for (size_t channels = 17; channels < 32; channels++) {
+    FillMicrokernelTester()
+      .channels(channels)
+      .Test(xnn_xx_fill_ukernel__scalar_x16);
+  }
+}
+
+TEST(XX_FILL__SCALAR_X16, multiple_rows) {
+  for (size_t rows = 2; rows < 5; rows++) {
+    for (size_t channels = 1; channels < 48; channels += 3) {
+      FillMicrokernelTester()
+        .channels(channels)
+        .rows(rows)
+        .Test(xnn_xx_fill_ukernel__scalar_x16);
+    }
+  }
+}
+
+TEST(XX_FILL__SCALAR_X16, multiple_rows_with_output_stride) {
+  for (size_t rows = 2; rows < 5; rows++) {
+    for (size_t channels = 1; channels < 48; channels += 3) {
+      FillMicrokernelTester()
+        .channels(channels)
+        .rows(rows)
+        .output_stride(53)
+        .Test(xnn_xx_fill_ukernel__scalar_x16);
+    }
+  }
+}