Generalize FILL microkernels to all 8-/16-/32-bit data types
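
The x32-specific FILL microkernels (scalar-int, scalar-float, SSE, NEON,
WASMSIMD) are replaced with datatype-agnostic xx-fill variants that operate
on raw bytes: channels is now measured in bytes rather than 32-bit elements,
output is a void*, and the fill value is passed by value as a 32-bit
fill_pattern that the kernels replicate across each row. The fill entry
point accordingly moves from xnn_params.x32.fill to xnn_params.xx.fill,
next to the existing xx.copy kernel.

Minimal usage sketch of the new calling convention (illustrative only: the
example() wrapper and buffer sizes are hypothetical, but the kernel name and
signature match this change). An 8-bit value is replicated into all four
bytes of fill_pattern; a 16-bit value would be replicated twice, and a
32-bit value is passed as-is, as constant-pad-nd.c does with pad_value:

    #include <stdint.h>

    #include <xnnpack/fill.h>

    void example(void) {
      // 2 rows with a 7-byte output stride; fill the first 5 bytes of each.
      uint8_t buffer[2 * 7];
      // Fill byte 0x7F, replicated into a 32-bit pattern.
      const uint32_t fill_pattern = UINT32_C(0x7F7F7F7F);
      xnn_xx_fill_ukernel__scalar_x16(
          /*rows=*/2,
          /*channels=*/5 * sizeof(uint8_t),       // in bytes, not elements
          buffer,
          /*output_stride=*/7 * sizeof(uint8_t),  // in bytes
          fill_pattern);
    }
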
PiperOrigin-RevId: 389415595
diff --git a/BUILD.bazel b/BUILD.bazel
index 16ff5e6..fb43f89 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -291,8 +291,6 @@
"src/x8-zip/x4-scalar.c",
"src/x8-zip/xm-scalar.c",
"src/x32-depthtospace2d-chw2hwc/scalar.c",
- "src/x32-fill/scalar-float.c",
- "src/x32-fill/scalar-int.c",
"src/x32-packx/x2-scalar.c",
"src/x32-packx/x3-scalar.c",
"src/x32-packx/x4-scalar.c",
@@ -304,6 +302,7 @@
"src/x32-zip/x4-scalar.c",
"src/x32-zip/xm-scalar.c",
"src/xx-copy/memcpy.c",
+ "src/xx-fill/scalar-x16.c",
]
ALL_SCALAR_MICROKERNEL_SRCS = [
@@ -903,8 +902,6 @@
"src/x8-zip/x4-scalar.c",
"src/x8-zip/xm-scalar.c",
"src/x32-depthtospace2d-chw2hwc/scalar.c",
- "src/x32-fill/scalar-float.c",
- "src/x32-fill/scalar-int.c",
"src/x32-packx/x2-scalar.c",
"src/x32-packx/x3-scalar.c",
"src/x32-packx/x4-scalar.c",
@@ -916,6 +913,7 @@
"src/x32-zip/x4-scalar.c",
"src/x32-zip/xm-scalar.c",
"src/xx-copy/memcpy.c",
+ "src/xx-fill/scalar-x16.c",
]
ALL_WASM_MICROKERNEL_SRCS = [
@@ -1842,7 +1840,6 @@
"src/qu8-vmul/gen/minmax-fp32-wasmsimd-mul32-ld64-x16.c",
"src/qu8-vmulc/gen/minmax-fp32-wasmsimd-mul32-ld64-x8.c",
"src/qu8-vmulc/gen/minmax-fp32-wasmsimd-mul32-ld64-x16.c",
- "src/x32-fill/wasmsimd.c",
"src/x32-packx/x4-wasmsimd.c",
"src/x32-pad/wasmsimd.c",
"src/x32-unpool/wasmsimd.c",
@@ -1850,6 +1847,7 @@
"src/x32-zip/x3-wasmsimd.c",
"src/x32-zip/x4-wasmsimd.c",
"src/x32-zip/xm-wasmsimd.c",
+ "src/xx-fill/wasmsimd-x64.c",
]
# ISA-specific micro-kernels
@@ -1958,7 +1956,6 @@
"src/x8-zip/x3-neon.c",
"src/x8-zip/x4-neon.c",
"src/x8-zip/xm-neon.c",
- "src/x32-fill/neon.c",
"src/x32-packx/x4-neon-st4.c",
"src/x32-pad/neon.c",
"src/x32-unpool/neon.c",
@@ -1966,6 +1963,7 @@
"src/x32-zip/x3-neon.c",
"src/x32-zip/x4-neon.c",
"src/x32-zip/xm-neon.c",
+ "src/xx-fill/neon-x64.c",
]
ALL_NEON_MICROKERNEL_SRCS = [
@@ -2529,7 +2527,6 @@
"src/x8-zip/x3-neon.c",
"src/x8-zip/x4-neon.c",
"src/x8-zip/xm-neon.c",
- "src/x32-fill/neon.c",
"src/x32-packx/x4-neon-st4.c",
"src/x32-pad/neon.c",
"src/x32-unpool/neon.c",
@@ -2537,6 +2534,7 @@
"src/x32-zip/x3-neon.c",
"src/x32-zip/x4-neon.c",
"src/x32-zip/xm-neon.c",
+ "src/xx-fill/neon-x64.c",
]
PROD_NEONFMA_MICROKERNEL_SRCS = [
@@ -3226,7 +3224,6 @@
"src/f32-vunary/gen/vabs-sse-x8.c",
"src/f32-vunary/gen/vneg-sse-x8.c",
"src/f32-vunary/gen/vsqr-sse-x8.c",
- "src/x32-fill/sse.c",
"src/x32-packx/x4-sse.c",
"src/x32-pad/sse.c",
]
@@ -3402,7 +3399,6 @@
"src/math/sqrt-sse-hh1mac.c",
"src/math/sqrt-sse-nr1mac.c",
"src/math/sqrt-sse-nr2mac.c",
- "src/x32-fill/sse.c",
"src/x32-packx/x4-sse.c",
"src/x32-pad/sse.c",
]
@@ -3464,6 +3460,7 @@
"src/x32-zip/x3-sse2.c",
"src/x32-zip/x4-sse2.c",
"src/x32-zip/xm-sse2.c",
+ "src/xx-fill/sse2-x64.c",
]
ALL_SSE2_MICROKERNEL_SRCS = [
@@ -3722,6 +3719,7 @@
"src/x32-zip/x3-sse2.c",
"src/x32-zip/x4-sse2.c",
"src/x32-zip/xm-sse2.c",
+ "src/xx-fill/sse2-x64.c",
]
PROD_SSSE3_MICROKERNEL_SRCS = [
@@ -9546,19 +9544,28 @@
)
xnnpack_unit_test(
- name = "x32_depthtospace2d_chw2hwc_test",
+ name = "x8_lut_test",
srcs = [
- "test/x32-depthtospace2d-chw2hwc.cc",
- "test/depthtospace-microkernel-tester.h",
+ "test/x8-lut.cc",
+ "test/lut-microkernel-tester.h",
] + MICROKERNEL_TEST_HDRS,
deps = MICROKERNEL_TEST_DEPS,
)
xnnpack_unit_test(
- name = "x32_fill_test",
+ name = "x8_zip_test",
srcs = [
- "test/x32-fill.cc",
- "test/fill-microkernel-tester.h",
+ "test/x8-zip.cc",
+ "test/zip-microkernel-tester.h",
+ ] + MICROKERNEL_TEST_HDRS,
+ deps = MICROKERNEL_TEST_DEPS,
+)
+
+xnnpack_unit_test(
+ name = "x32_depthtospace2d_chw2hwc_test",
+ srcs = [
+ "test/x32-depthtospace2d-chw2hwc.cc",
+ "test/depthtospace-microkernel-tester.h",
] + MICROKERNEL_TEST_HDRS,
deps = MICROKERNEL_TEST_DEPS,
)
@@ -9601,19 +9608,10 @@
)
xnnpack_unit_test(
- name = "x8_lut_test",
+ name = "xx_fill_test",
srcs = [
- "test/x8-lut.cc",
- "test/lut-microkernel-tester.h",
- ] + MICROKERNEL_TEST_HDRS,
- deps = MICROKERNEL_TEST_DEPS,
-)
-
-xnnpack_unit_test(
- name = "x8_zip_test",
- srcs = [
- "test/x8-zip.cc",
- "test/zip-microkernel-tester.h",
+ "test/xx-fill.cc",
+ "test/fill-microkernel-tester.h",
] + MICROKERNEL_TEST_HDRS,
deps = MICROKERNEL_TEST_DEPS,
)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index df0e48c..bd7accc 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -418,8 +418,6 @@
src/x8-zip/x4-scalar.c
src/x8-zip/xm-scalar.c
src/x32-depthtospace2d-chw2hwc/scalar.c
- src/x32-fill/scalar-float.c
- src/x32-fill/scalar-int.c
src/x32-packx/x2-scalar.c
src/x32-packx/x3-scalar.c
src/x32-packx/x4-scalar.c
@@ -430,7 +428,8 @@
src/x32-zip/x3-scalar.c
src/x32-zip/x4-scalar.c
src/x32-zip/xm-scalar.c
- src/xx-copy/memcpy.c)
+ src/xx-copy/memcpy.c
+ src/xx-fill/scalar-x16.c)
SET(ALL_SCALAR_MICROKERNEL_SRCS
src/f32-argmaxpool/4x-scalar-c1.c
@@ -1029,8 +1028,6 @@
src/x8-zip/x4-scalar.c
src/x8-zip/xm-scalar.c
src/x32-depthtospace2d-chw2hwc/scalar.c
- src/x32-fill/scalar-float.c
- src/x32-fill/scalar-int.c
src/x32-packx/x2-scalar.c
src/x32-packx/x3-scalar.c
src/x32-packx/x4-scalar.c
@@ -1041,7 +1038,8 @@
src/x32-zip/x3-scalar.c
src/x32-zip/x4-scalar.c
src/x32-zip/xm-scalar.c
- src/xx-copy/memcpy.c)
+ src/xx-copy/memcpy.c
+ src/xx-fill/scalar-x16.c)
SET(PROD_NEON_MICROKERNEL_SRCS
src/f32-argmaxpool/4x-neon-c4.c
@@ -1148,14 +1146,14 @@
src/x8-zip/x3-neon.c
src/x8-zip/x4-neon.c
src/x8-zip/xm-neon.c
- src/x32-fill/neon.c
src/x32-packx/x4-neon-st4.c
src/x32-pad/neon.c
src/x32-unpool/neon.c
src/x32-zip/x2-neon.c
src/x32-zip/x3-neon.c
src/x32-zip/x4-neon.c
- src/x32-zip/xm-neon.c)
+ src/x32-zip/xm-neon.c
+ src/xx-fill/neon-x64.c)
SET(ALL_NEON_MICROKERNEL_SRCS
src/f32-argmaxpool/4x-neon-c4.c
@@ -1718,14 +1716,14 @@
src/x8-zip/x3-neon.c
src/x8-zip/x4-neon.c
src/x8-zip/xm-neon.c
- src/x32-fill/neon.c
src/x32-packx/x4-neon-st4.c
src/x32-pad/neon.c
src/x32-unpool/neon.c
src/x32-zip/x2-neon.c
src/x32-zip/x3-neon.c
src/x32-zip/x4-neon.c
- src/x32-zip/xm-neon.c)
+ src/x32-zip/xm-neon.c
+ src/xx-fill/neon-x64.c)
SET(PROD_NEONFMA_MICROKERNEL_SRCS
src/f32-dwconv/gen/up4x9-minmax-neonfma.c
@@ -2404,7 +2402,6 @@
src/f32-vunary/gen/vabs-sse-x8.c
src/f32-vunary/gen/vneg-sse-x8.c
src/f32-vunary/gen/vsqr-sse-x8.c
- src/x32-fill/sse.c
src/x32-packx/x4-sse.c
src/x32-pad/sse.c)
@@ -2579,7 +2576,6 @@
src/math/sqrt-sse-hh1mac.c
src/math/sqrt-sse-nr1mac.c
src/math/sqrt-sse-nr2mac.c
- src/x32-fill/sse.c
src/x32-packx/x4-sse.c
src/x32-pad/sse.c)
@@ -2639,7 +2635,8 @@
src/x32-zip/x2-sse2.c
src/x32-zip/x3-sse2.c
src/x32-zip/x4-sse2.c
- src/x32-zip/xm-sse2.c)
+ src/x32-zip/xm-sse2.c
+ src/xx-fill/sse2-x64.c)
SET(ALL_SSE2_MICROKERNEL_SRCS
src/f32-argmaxpool/4x-sse2-c4.c
@@ -2896,7 +2893,8 @@
src/x32-zip/x2-sse2.c
src/x32-zip/x3-sse2.c
src/x32-zip/x4-sse2.c
- src/x32-zip/xm-sse2.c)
+ src/x32-zip/xm-sse2.c
+ src/xx-fill/sse2-x64.c)
SET(PROD_SSSE3_MICROKERNEL_SRCS
src/f32-dwconv2d-chw/gen/3x3p1-minmax-ssse3-2x4-acc2.c
@@ -6711,15 +6709,6 @@
TARGET_LINK_LIBRARIES(u8-vclamp-test PRIVATE cpuinfo fp16 pthreadpool gtest gtest_main)
ADD_TEST(u8-vclamp-test u8-vclamp-test)
- ADD_EXECUTABLE(x32-fill-test test/x32-fill.cc $<TARGET_OBJECTS:all_microkernels>)
- SET_TARGET_PROPERTIES(x32-fill-test PROPERTIES
- CXX_STANDARD 11
- CXX_STANDARD_REQUIRED YES
- CXX_EXTENSIONS YES)
- TARGET_INCLUDE_DIRECTORIES(x32-fill-test PRIVATE include src test)
- TARGET_LINK_LIBRARIES(x32-fill-test PRIVATE cpuinfo fp16 pthreadpool gtest gtest_main)
- ADD_TEST(x32-fill-test x32-fill-test)
-
ADD_EXECUTABLE(x32-packx-test test/x32-packx.cc $<TARGET_OBJECTS:all_microkernels>)
SET_TARGET_PROPERTIES(x32-packx-test PROPERTIES
CXX_STANDARD 11
@@ -6782,6 +6771,15 @@
TARGET_INCLUDE_DIRECTORIES(x8-zip-test PRIVATE include src test)
TARGET_LINK_LIBRARIES(x8-zip-test PRIVATE cpuinfo fp16 pthreadpool gtest gtest_main)
ADD_TEST(x8-zip-test x8-zip-test)
+
+ ADD_EXECUTABLE(xx-fill-test test/xx-fill.cc $<TARGET_OBJECTS:all_microkernels>)
+ SET_TARGET_PROPERTIES(xx-fill-test PROPERTIES
+ CXX_STANDARD 11
+ CXX_STANDARD_REQUIRED YES
+ CXX_EXTENSIONS YES)
+ TARGET_INCLUDE_DIRECTORIES(xx-fill-test PRIVATE include src test)
+ TARGET_LINK_LIBRARIES(xx-fill-test PRIVATE cpuinfo fp16 pthreadpool gtest gtest_main)
+ ADD_TEST(xx-fill-test xx-fill-test)
ENDIF()
# ---[ XNNPACK microbenchmarks
diff --git a/src/init.c b/src/init.c
index 9244ca4..d4e7ec1 100644
--- a/src/init.c
+++ b/src/init.c
@@ -98,13 +98,6 @@
}
#endif
- /**************************** XX micro-kernels ****************************/
- #ifndef XNN_NO_XX_OPERATORS
- init_flags |= XNN_INIT_FLAG_XX;
-
- xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
- #endif
-
if (cpuinfo_has_arm_neon()) {
/**************************** QC8 micro-kernels ****************************/
#ifndef XNN_NO_QC8_OPERATORS
@@ -588,10 +581,6 @@
#ifndef XNN_NO_X32_OPERATORS
init_flags |= XNN_INIT_FLAG_X32;
- xnn_params.x32.fill = (struct fill_parameters) {
- .ukernel = (xnn_fill_ukernel_function) xnn_x32_fill_ukernel__neon,
- .row_tile = 1,
- };
xnn_params.x32.pad = (struct pad_parameters) {
.ukernel = (xnn_pad_ukernel_function) xnn_x32_pad_ukernel__neon,
.row_tile = 1,
@@ -611,7 +600,20 @@
};
#endif // XNN_NO_NCHW_OPERATORS
#endif // XNN_NO_X32_OPERATORS
+
+ /**************************** XX micro-kernels ****************************/
+ #ifndef XNN_NO_XX_OPERATORS
+ init_flags |= XNN_INIT_FLAG_XX;
+
+ xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
+ xnn_params.xx.fill = (struct fill_parameters) {
+ .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__neon_x64,
+ .row_tile = 1,
+ };
+ #endif // XNN_NO_XX_OPERATORS
+
} else if (!XNN_PLATFORM_MOBILE) {
+
/*************************** QU8 micro-kernels ***************************/
#ifndef XNN_NO_QS8_OPERATORS
init_flags |= XNN_INIT_FLAG_QS8;
@@ -944,10 +946,6 @@
#ifndef XNN_NO_X32_OPERATORS
init_flags |= XNN_INIT_FLAG_X32;
- xnn_params.x32.fill = (struct fill_parameters) {
- .ukernel = (xnn_fill_ukernel_function) xnn_x32_fill_ukernel__scalar_int,
- .row_tile = 1,
- };
xnn_params.x32.pad = (struct pad_parameters) {
.ukernel = (xnn_pad_ukernel_function) xnn_x32_pad_ukernel__scalar_int,
.row_tile = 1,
@@ -967,17 +965,21 @@
};
#endif // XNN_NO_NCHW_OPERATORS
#endif // XNN_NO_X32_OPERATORS
+
+ /**************************** XX micro-kernels ****************************/
+ #ifndef XNN_NO_XX_OPERATORS
+ init_flags |= XNN_INIT_FLAG_XX;
+
+ xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
+ xnn_params.xx.fill = (struct fill_parameters) {
+ .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__scalar_x16,
+ .row_tile = 1,
+ };
+ #endif // XNN_NO_XX_OPERATORS
}
#elif XNN_ARCH_ARM64
- /**************************** XX micro-kernels ****************************/
- #ifndef XNN_NO_XX_OPERATORS
- init_flags |= XNN_INIT_FLAG_XX;
-
- xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
- #endif
-
/**************************** QC8 micro-kernels ****************************/
#ifndef XNN_NO_QC8_OPERATORS
init_flags |= XNN_INIT_FLAG_QC8;
@@ -1968,10 +1970,6 @@
#ifndef XNN_NO_X32_OPERATORS
init_flags |= XNN_INIT_FLAG_X32;
- xnn_params.x32.fill = (struct fill_parameters) {
- .ukernel = (xnn_fill_ukernel_function) xnn_x32_fill_ukernel__neon,
- .row_tile = 1,
- };
xnn_params.x32.pad = (struct pad_parameters) {
.ukernel = (xnn_pad_ukernel_function) xnn_x32_pad_ukernel__neon,
.row_tile = 1,
@@ -1992,19 +1990,23 @@
#endif // XNN_NO_NCHW_OPERATORS
#endif // XNN_NO_X32_OPERATORS
-#elif XNN_ARCH_X86 || XNN_ARCH_X86_64
- if (!cpuinfo_has_x86_sse2()) {
- xnn_log_error("XNNPACK initialization failed: SSE2 is not supported");
- return;
- }
-
/**************************** XX micro-kernels ****************************/
#ifndef XNN_NO_XX_OPERATORS
init_flags |= XNN_INIT_FLAG_XX;
xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
+ xnn_params.xx.fill = (struct fill_parameters) {
+ .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__neon_x64,
+ .row_tile = 1,
+ };
#endif
+#elif XNN_ARCH_X86 || XNN_ARCH_X86_64
+ if (!cpuinfo_has_x86_sse2()) {
+ xnn_log_error("XNNPACK initialization failed: SSE2 is not supported");
+ return;
+ }
+
/**************************** QC8 micro-kernels ****************************/
#ifndef XNN_NO_QC8_OPERATORS
init_flags |= XNN_INIT_FLAG_QC8;
@@ -3019,10 +3021,6 @@
#ifndef XNN_NO_X32_OPERATORS
init_flags |= XNN_INIT_FLAG_X32;
- xnn_params.x32.fill = (struct fill_parameters) {
- .ukernel = (xnn_fill_ukernel_function) xnn_x32_fill_ukernel__sse,
- .row_tile = 1,
- };
xnn_params.x32.pad = (struct pad_parameters) {
.ukernel = (xnn_pad_ukernel_function) xnn_x32_pad_ukernel__sse,
.row_tile = 1,
@@ -3043,15 +3041,19 @@
#endif // XNN_NO_NCHW_OPERATORS
#endif // XNN_NO_X32_OPERATORS
-#elif XNN_ARCH_WASMSIMD
-
/**************************** XX micro-kernels ****************************/
#ifndef XNN_NO_XX_OPERATORS
init_flags |= XNN_INIT_FLAG_XX;
xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
+ xnn_params.xx.fill = (struct fill_parameters) {
+ .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__sse2_x64,
+ .row_tile = 1,
+ };
#endif
+#elif XNN_ARCH_WASMSIMD
+
/**************************** QC8 micro-kernels ****************************/
#ifndef XNN_NO_QS8_OPERATORS
init_flags |= XNN_INIT_FLAG_QC8;
@@ -3607,10 +3609,6 @@
#ifndef XNN_NO_X32_OPERATORS
init_flags |= XNN_INIT_FLAG_X32;
- xnn_params.x32.fill = (struct fill_parameters) {
- .ukernel = (xnn_fill_ukernel_function) xnn_x32_fill_ukernel__wasmsimd,
- .row_tile = 1,
- };
xnn_params.x32.pad = (struct pad_parameters) {
.ukernel = (xnn_pad_ukernel_function) xnn_x32_pad_ukernel__wasmsimd,
.row_tile = 1,
@@ -3631,15 +3629,19 @@
#endif // XNN_NO_NCHW_OPERATORS
#endif // XNN_NO_X32_OPERATORS
-#elif XNN_ARCH_WASM
-
/**************************** XX micro-kernels ****************************/
#ifndef XNN_NO_XX_OPERATORS
init_flags |= XNN_INIT_FLAG_XX;
xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
+ xnn_params.xx.fill = (struct fill_parameters) {
+ .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__wasmsimd_x64,
+ .row_tile = 1,
+ };
#endif
+#elif XNN_ARCH_WASM
+
/**************************** QC8 micro-kernels ****************************/
#ifndef XNN_NO_QC8_OPERATORS
init_flags |= XNN_INIT_FLAG_QC8;
@@ -4064,10 +4066,6 @@
#ifndef XNN_NO_X32_OPERATORS
init_flags |= XNN_INIT_FLAG_X32;
- xnn_params.x32.fill = (struct fill_parameters) {
- .ukernel = (xnn_fill_ukernel_function) xnn_x32_fill_ukernel__scalar_float,
- .row_tile = 1,
- };
xnn_params.x32.pad = (struct pad_parameters) {
.ukernel = (xnn_pad_ukernel_function) xnn_x32_pad_ukernel__scalar_float,
.row_tile = 1,
@@ -4088,6 +4086,17 @@
#endif // XNN_NO_NCHW_OPERATORS
#endif // XNN_NO_X32_OPERATORS
+ /**************************** XX micro-kernels ****************************/
+ #ifndef XNN_NO_XX_OPERATORS
+ init_flags |= XNN_INIT_FLAG_XX;
+
+ xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
+ xnn_params.xx.fill = (struct fill_parameters) {
+ .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__scalar_x16,
+ .row_tile = 1,
+ };
+ #endif
+
#elif XNN_ARCH_RISCV
/**************************** XX micro-kernels ****************************/
@@ -4413,7 +4422,7 @@
init_flags |= XNN_INIT_FLAG_X32;
xnn_params.x32.fill = (struct fill_parameters) {
- .ukernel = (xnn_fill_ukernel_function) xnn_x32_fill_ukernel__scalar_float,
+ .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__scalar_x16,
.row_tile = 1,
};
xnn_params.x32.pad = (struct pad_parameters) {
diff --git a/src/operator-run.c b/src/operator-run.c
index d2133e4..5a3bda7 100644
--- a/src/operator-run.c
+++ b/src/operator-run.c
@@ -759,7 +759,7 @@
&context->padding_value,
input, 0 /* input stride */, output, 0 /* output stride */);
} else {
- context->fill_ukernel(1 /* rows */, context->output_size[0], output, 0 /* output stride */, &context->padding_value);
+ context->fill_ukernel(1 /* rows */, context->output_size[0], output, 0 /* output stride */, context->padding_value);
}
}
diff --git a/src/operators/constant-pad-nd.c b/src/operators/constant-pad-nd.c
index d74df1a..ef1475d 100644
--- a/src/operators/constant-pad-nd.c
+++ b/src/operators/constant-pad-nd.c
@@ -148,7 +148,7 @@
.input = input,
.output = output,
.padding_value = constant_pad_op->pad_value,
- .fill_ukernel = xnn_params.x32.fill.ukernel,
+ .fill_ukernel = xnn_params.xx.fill.ukernel,
.pad_ukernel = xnn_params.x32.pad.ukernel,
};
diff --git a/src/x32-fill/neon.c b/src/x32-fill/neon.c
deleted file mode 100644
index fb5e3a0..0000000
--- a/src/x32-fill/neon.c
+++ /dev/null
@@ -1,49 +0,0 @@
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include <xnnpack/fill.h>
-
-
-void xnn_x32_fill_ukernel__neon(
- size_t rows,
- size_t channels,
- uint32_t* output,
- size_t output_stride,
- const uint32_t* fill_value)
-{
- assert(rows != 0);
- assert(channels != 0);
- assert(channels % sizeof(uint32_t) == 0);
- assert(fill_value != NULL);
-
- const size_t output_increment = output_stride - channels;
-
- const uint32x4_t vfill = vld1q_dup_u32(fill_value);
- do {
- size_t c = channels;
- for (; c >= 16 * sizeof(uint32_t); c -= 16 * sizeof(uint32_t)) {
- vst1q_u32(output, vfill); output += 4;
- vst1q_u32(output, vfill); output += 4;
- vst1q_u32(output, vfill); output += 4;
- vst1q_u32(output, vfill); output += 4;
- }
- for (; c >= 4 * sizeof(uint32_t); c -= 4 * sizeof(uint32_t)) {
- vst1q_u32(output, vfill); output += 4;
- }
- if XNN_UNLIKELY(c != 0) {
- if XNN_LIKELY(c & (2 * sizeof(uint32_t))) {
- vst1_u32(output, vget_low_u32(vfill)); output += 2;
- }
- if XNN_LIKELY(c & (1 * sizeof(uint32_t))) {
- vst1q_lane_u32(output, vfill, 0); output += 1;
- }
- }
- output = (void*) ((uintptr_t) output + output_increment);
- } while (--rows != 0);
-}
diff --git a/src/x32-fill/scalar-float.c b/src/x32-fill/scalar-float.c
deleted file mode 100644
index 360e71c..0000000
--- a/src/x32-fill/scalar-float.c
+++ /dev/null
@@ -1,44 +0,0 @@
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <xnnpack/fill.h>
-
-
-void xnn_x32_fill_ukernel__scalar_float(
- size_t rows,
- size_t channels,
- uint32_t* output,
- size_t output_stride,
- const uint32_t* fill_value)
-{
- assert(rows != 0);
- assert(channels != 0);
- assert(channels % sizeof(float) == 0);
- assert(fill_value != NULL);
-
- const size_t output_increment = output_stride - channels;
-
- const float vfill = *((const float*) fill_value);
- float* o = (float*) output;
- do {
- size_t c = channels;
- for (; c >= 4 * sizeof(uint32_t); c -= 4 * sizeof(uint32_t)) {
- o[0] = vfill;
- o[1] = vfill;
- o[2] = vfill;
- o[3] = vfill;
- o += 4;
- }
- if XNN_UNLIKELY(c != 0) {
- do {
- *o++ = vfill;
- c -= sizeof(uint32_t);
- } while (c != 0);
- }
- o = (void*) ((uintptr_t) o + output_increment);
- } while (--rows != 0);
-}
diff --git a/src/x32-fill/scalar-int.c b/src/x32-fill/scalar-int.c
deleted file mode 100644
index 8648576..0000000
--- a/src/x32-fill/scalar-int.c
+++ /dev/null
@@ -1,43 +0,0 @@
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <xnnpack/fill.h>
-
-
-void xnn_x32_fill_ukernel__scalar_int(
- size_t rows,
- size_t channels,
- uint32_t* output,
- size_t output_stride,
- const uint32_t* fill_value)
-{
- assert(rows != 0);
- assert(channels != 0);
- assert(channels % sizeof(uint32_t) == 0);
- assert(fill_value != NULL);
-
- const size_t output_increment = output_stride - channels;
-
- const uint32_t vfill = *fill_value;
- do {
- size_t c = channels;
- for (; c >= 4 * sizeof(uint32_t); c -= 4 * sizeof(uint32_t)) {
- output[0] = vfill;
- output[1] = vfill;
- output[2] = vfill;
- output[3] = vfill;
- output += 4;
- }
- if XNN_UNLIKELY(c != 0) {
- do {
- *output++ = vfill;
- c -= sizeof(uint32_t);
- } while (c != 0);
- }
- output = (void*) ((uintptr_t) output + output_increment);
- } while (--rows != 0);
-}
diff --git a/src/x32-fill/sse.c b/src/x32-fill/sse.c
deleted file mode 100644
index 3569c93..0000000
--- a/src/x32-fill/sse.c
+++ /dev/null
@@ -1,54 +0,0 @@
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <xmmintrin.h>
-
-#include <xnnpack/fill.h>
-
-
-void xnn_x32_fill_ukernel__sse(
- size_t rows,
- size_t channels,
- uint32_t* output,
- size_t output_stride,
- const uint32_t* fill_value)
-{
- assert(rows != 0);
- assert(channels != 0);
- assert(channels % sizeof(uint32_t) == 0);
- assert(fill_value != NULL);
-
- const size_t output_increment = output_stride - channels;
-
- const __m128 vfill = _mm_load1_ps((const float*) fill_value);
- float* o = (float*) output;
- do {
- size_t c = channels;
- for (; c >= 16 * sizeof(uint32_t); c -= 16 * sizeof(uint32_t)) {
- _mm_storeu_ps(o, vfill);
- _mm_storeu_ps(o + 4, vfill);
- _mm_storeu_ps(o + 8, vfill);
- _mm_storeu_ps(o + 12, vfill);
- o += 16;
- }
- for (; c >= 4 * sizeof(uint32_t); c -= 4 * sizeof(uint32_t)) {
- _mm_storeu_ps(o, vfill);
- o += 4;
- }
- if XNN_UNLIKELY(c != 0) {
- if XNN_LIKELY(c & (2 * sizeof(uint32_t))) {
- _mm_storel_pi((__m64*) o, vfill);
- o += 2;
- }
- if XNN_LIKELY(c & (1 * sizeof(uint32_t))) {
- _mm_store_ss(o, vfill);
- o += 1;
- }
- }
- o = (void*) ((uintptr_t) o + output_increment);
- } while (--rows != 0);
-}
diff --git a/src/x32-fill/wasmsimd.c b/src/x32-fill/wasmsimd.c
deleted file mode 100644
index 7ff2242..0000000
--- a/src/x32-fill/wasmsimd.c
+++ /dev/null
@@ -1,53 +0,0 @@
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <wasm_simd128.h>
-
-#include <xnnpack/fill.h>
-
-
-void xnn_x32_fill_ukernel__wasmsimd(
- size_t rows,
- size_t channels,
- uint32_t* output,
- size_t output_stride,
- const uint32_t* fill_value)
-{
- assert(rows != 0);
- assert(channels != 0);
- assert(channels % sizeof(uint32_t) == 0);
- assert(fill_value != NULL);
-
- const size_t output_increment = output_stride - channels;
-
- const v128_t vfill = wasm_v128_load32_splat(fill_value);
- do {
- size_t c = channels;
- for (; c >= 16 * sizeof(uint32_t); c -= 16 * sizeof(uint32_t)) {
- wasm_v128_store(output, vfill);
- wasm_v128_store(output + 4, vfill);
- wasm_v128_store(output + 8, vfill);
- wasm_v128_store(output + 12, vfill);
- output += 16;
- }
- for (; c >= 4 * sizeof(uint32_t); c -= 4 * sizeof(uint32_t)) {
- wasm_v128_store(output, vfill);
- output += 4;
- }
- if XNN_UNLIKELY(c != 0) {
- if XNN_LIKELY(c & (2 * sizeof(uint32_t))) {
- *((double*) output) = wasm_f64x2_extract_lane(vfill, 0);
- output += 2;
- }
- if XNN_LIKELY(c & (1 * sizeof(uint32_t))) {
- *((float*) output) = wasm_f32x4_extract_lane(vfill, 0);
- output += 1;
- }
- }
- output = (void*) ((uintptr_t) output + output_increment);
- } while (--rows != 0);
-}
diff --git a/src/xnnpack/fill.h b/src/xnnpack/fill.h
index 43fc96e..4c74388 100644
--- a/src/xnnpack/fill.h
+++ b/src/xnnpack/fill.h
@@ -20,15 +20,14 @@
XNN_INTERNAL void fn_name( \
size_t kernel_elements, \
size_t channels, \
- uint32_t* output, \
+ void* output, \
size_t output_stride, \
- const uint32_t* fill_value);
+ const uint32_t fill_pattern);
-DECLARE_FILL_UKERNEL_FUNCTION(xnn_x32_fill_ukernel__sse)
-DECLARE_FILL_UKERNEL_FUNCTION(xnn_x32_fill_ukernel__neon)
-DECLARE_FILL_UKERNEL_FUNCTION(xnn_x32_fill_ukernel__wasmsimd)
-DECLARE_FILL_UKERNEL_FUNCTION(xnn_x32_fill_ukernel__scalar_float)
-DECLARE_FILL_UKERNEL_FUNCTION(xnn_x32_fill_ukernel__scalar_int)
+DECLARE_FILL_UKERNEL_FUNCTION(xnn_xx_fill_ukernel__sse2_x64)
+DECLARE_FILL_UKERNEL_FUNCTION(xnn_xx_fill_ukernel__neon_x64)
+DECLARE_FILL_UKERNEL_FUNCTION(xnn_xx_fill_ukernel__wasmsimd_x64)
+DECLARE_FILL_UKERNEL_FUNCTION(xnn_xx_fill_ukernel__scalar_x16)
#ifdef __cplusplus
diff --git a/src/xnnpack/params.h b/src/xnnpack/params.h
index 0314b7f..57a880a 100644
--- a/src/xnnpack/params.h
+++ b/src/xnnpack/params.h
@@ -1284,14 +1284,7 @@
size_t channels,
void* output,
size_t output_stride,
- const void* fill_value);
-
-typedef void (*xnn_x32_fill_ukernel_function)(
- size_t rows,
- size_t channels,
- uint32_t* output,
- size_t output_stride,
- const uint32_t* fill_value);
+ const uint32_t fill_pattern);
typedef void (*xnn_depthtospace2d_chw2hwc_ukernel_function)(
size_t output_channels,
@@ -2538,9 +2531,6 @@
uint32_t init_flags;
struct xnn_allocator allocator;
struct {
- xnn_univector_ukernel_function copy;
- } xx;
- struct {
struct gemm_parameters gemm;
struct dwconv_parameters dwconv[XNN_MAX_QC8_DWCONV_UKERNELS];
} qc8;
@@ -2638,12 +2628,15 @@
} f32;
struct {
struct pad_parameters pad;
- struct fill_parameters fill;
xnn_unpool_ukernel_function unpool;
struct zip_parameters zip;
// Depth To Space 2D with CHW->HWC layout conversion.
struct depthtospace2d_chw2hwc_parameters depthtospace2d_chw2hwc;
} x32;
+ struct {
+ xnn_univector_ukernel_function copy;
+ struct fill_parameters fill;
+ } xx;
};
#ifdef __cplusplus
diff --git a/src/xx-fill/neon-x64.c b/src/xx-fill/neon-x64.c
new file mode 100644
index 0000000..57188f1
--- /dev/null
+++ b/src/xx-fill/neon-x64.c
@@ -0,0 +1,55 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/fill.h>
+
+
+void xnn_xx_fill_ukernel__neon_x64(
+ size_t rows,
+ size_t channels,
+ void* output,
+ size_t output_stride,
+ const uint32_t fill_pattern)
+{
+ assert(rows != 0);
+ assert(channels != 0);
+
+ const size_t output_increment = output_stride - channels;
+
+ const uint8x16_t vfill_pattern = vreinterpretq_u8_u32(vdupq_n_u32(fill_pattern));
+ do {
+ size_t c = channels;
+ for (; c >= 64 * sizeof(uint8_t); c -= 64 * sizeof(uint8_t)) {
+ vst1q_u8(output, vfill_pattern); output = ((uint8_t*) output + 16);
+ vst1q_u8(output, vfill_pattern); output = ((uint8_t*) output + 16);
+ vst1q_u8(output, vfill_pattern); output = ((uint8_t*) output + 16);
+ vst1q_u8(output, vfill_pattern); output = ((uint8_t*) output + 16);
+ }
+ for (; c >= 16 * sizeof(uint8_t); c -= 16 * sizeof(uint8_t)) {
+ vst1q_u8(output, vfill_pattern); output = ((uint8_t*) output + 16);
+ }
+ if XNN_UNLIKELY(c != 0) {
+ if XNN_LIKELY(c & (8 * sizeof(uint8_t))) {
+ vst1_u8(output, vget_low_u8(vfill_pattern)); output = ((uint8_t*) output + 8);
+ }
+ if XNN_LIKELY(c & (4 * sizeof(uint8_t))) {
+ vst1q_lane_u32(output, vreinterpretq_u32_u8(vfill_pattern), 0); output = ((uint8_t*) output + 4);
+ }
+ uint8x8_t vfill_subpattern = vget_low_u8(vfill_pattern);
+ if XNN_LIKELY(c & (2 * sizeof(uint8_t))) {
+ vst1_lane_u16(output, vreinterpret_u16_u8(vfill_subpattern), 0); output = ((uint8_t*) output + 2);
+ vfill_subpattern = vext_u8(vfill_subpattern, vfill_subpattern, 2);
+ }
+ if XNN_LIKELY(c & (1 * sizeof(uint8_t))) {
+ vst1_lane_u8(output, vfill_subpattern, 0); output = ((uint8_t*) output + 1);
+ }
+ }
+ output = (void*) ((uintptr_t) output + output_increment);
+ } while (--rows != 0);
+}
diff --git a/src/xx-fill/scalar-x16.c b/src/xx-fill/scalar-x16.c
new file mode 100644
index 0000000..a05eb0a
--- /dev/null
+++ b/src/xx-fill/scalar-x16.c
@@ -0,0 +1,55 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/fill.h>
+
+
+void xnn_xx_fill_ukernel__scalar_x16(
+ size_t rows,
+ size_t channels,
+ void* output,
+ size_t output_stride,
+ const uint32_t fill_pattern)
+{
+ assert(rows != 0);
+ assert(channels != 0);
+
+ const size_t output_increment = output_stride - channels;
+
+ do {
+ uint32_t vfill_pattern = fill_pattern;
+ size_t c = channels;
+ for (; c >= 16 * sizeof(uint8_t); c -= 16 * sizeof(uint8_t)) {
+ ((uint32_t*) output)[0] = vfill_pattern;
+ ((uint32_t*) output)[1] = vfill_pattern;
+ ((uint32_t*) output)[2] = vfill_pattern;
+ ((uint32_t*) output)[3] = vfill_pattern;
+ output = ((uint8_t*) output + 16);
+ }
+ if XNN_UNLIKELY(c != 0) {
+ if XNN_LIKELY(c & (8 * sizeof(uint8_t))) {
+ ((uint32_t*) output)[0] = vfill_pattern;
+ ((uint32_t*) output)[1] = vfill_pattern;
+ output = ((uint8_t*) output + 8);
+ }
+ if XNN_LIKELY(c & (4 * sizeof(uint8_t))) {
+ *((uint32_t*) output) = vfill_pattern;
+ output = ((uint8_t*) output + 4);
+ }
+ if XNN_LIKELY(c & (2 * sizeof(uint8_t))) {
+ *((uint16_t*) output) = (uint16_t) vfill_pattern;
+ vfill_pattern >>= 16;
+ output = ((uint8_t*) output + 2);
+ }
+ if XNN_LIKELY(c & (1 * sizeof(uint8_t))) {
+ *((uint8_t*) output) = (uint8_t) vfill_pattern;
+ output = ((uint8_t*) output + 1);
+ }
+ }
+ output = (void*) ((uintptr_t) output + output_increment);
+ } while (--rows != 0);
+}
diff --git a/src/xx-fill/sse2-x64.c b/src/xx-fill/sse2-x64.c
new file mode 100644
index 0000000..08a55cd
--- /dev/null
+++ b/src/xx-fill/sse2-x64.c
@@ -0,0 +1,61 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/fill.h>
+
+
+void xnn_xx_fill_ukernel__sse2_x64(
+ size_t rows,
+ size_t channels,
+ void* output,
+ size_t output_stride,
+ const uint32_t fill_pattern)
+{
+ assert(rows != 0);
+ assert(channels != 0);
+
+ const size_t output_increment = output_stride - channels;
+
+ const __m128i vfill = _mm_shuffle_epi32(_mm_cvtsi32_si128(fill_pattern), _MM_SHUFFLE(0, 0, 0, 0));
+ do {
+ size_t c = channels;
+ for (; c >= 64 * sizeof(uint8_t); c -= 64 * sizeof(uint8_t)) {
+ _mm_storeu_si128((__m128i*) output, vfill);
+ _mm_storeu_si128((__m128i*) output + 1, vfill);
+ _mm_storeu_si128((__m128i*) output + 2, vfill);
+ _mm_storeu_si128((__m128i*) output + 3, vfill);
+ output = ((uint8_t*) output + 64);
+ }
+ for (; c >= 16 * sizeof(uint8_t); c -= 16 * sizeof(uint8_t)) {
+ _mm_storeu_si128((__m128i*) output, vfill);
+ output = ((uint8_t*) output + 16);
+ }
+ if XNN_UNLIKELY(c != 0) {
+ if XNN_LIKELY(c & (8 * sizeof(uint8_t))) {
+ _mm_storel_epi64(output, vfill);
+ output = ((uint8_t*) output + 8);
+ }
+ if XNN_LIKELY(c & (4 * sizeof(uint8_t))) {
+ *((uint32_t*) output) = fill_pattern;
+ output = ((uint8_t*) output + 4);
+ }
+ uint32_t vfill_subpattern = fill_pattern;
+ if XNN_LIKELY(c & (2 * sizeof(uint8_t))) {
+ *((uint16_t*) output) = (uint16_t) vfill_subpattern;
+ vfill_subpattern >>= 16;
+ output = ((uint8_t*) output + 2);
+ }
+ if XNN_LIKELY(c & (1 * sizeof(uint8_t))) {
+ *((uint8_t*) output) = (uint8_t) vfill_subpattern;
+ output = ((uint8_t*) output + 1);
+ }
+ }
+ output = (void*) ((uintptr_t) output + output_increment);
+ } while (--rows != 0);
+}
diff --git a/src/xx-fill/wasmsimd-x64.c b/src/xx-fill/wasmsimd-x64.c
new file mode 100644
index 0000000..f629f2b
--- /dev/null
+++ b/src/xx-fill/wasmsimd-x64.c
@@ -0,0 +1,61 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/fill.h>
+
+
+void xnn_xx_fill_ukernel__wasmsimd_x64(
+ size_t rows,
+ size_t channels,
+ void* output,
+ size_t output_stride,
+ const uint32_t fill_pattern)
+{
+ assert(rows != 0);
+ assert(channels != 0);
+
+ const size_t output_increment = output_stride - channels;
+
+ const v128_t vfill_pattern = wasm_i32x4_splat(fill_pattern);
+ do {
+ size_t c = channels;
+ for (; c >= 64 * sizeof(uint8_t); c -= 64 * sizeof(uint8_t)) {
+ wasm_v128_store(output, vfill_pattern);
+ wasm_v128_store((uint8_t*) output + 16, vfill_pattern);
+ wasm_v128_store((uint8_t*) output + 32, vfill_pattern);
+ wasm_v128_store((uint8_t*) output + 48, vfill_pattern);
+ output = ((uint8_t*) output + 64);
+ }
+ for (; c >= 16 * sizeof(uint8_t); c -= 16 * sizeof(uint8_t)) {
+ wasm_v128_store(output, vfill_pattern);
+ output = ((uint8_t*) output + 16);
+ }
+ if XNN_UNLIKELY(c != 0) {
+ if XNN_LIKELY(c & (8 * sizeof(uint8_t))) {
+ *((double*) output) = wasm_f64x2_extract_lane(vfill_pattern, 0);
+ output = ((uint8_t*) output + 8);
+ }
+ uint32_t vfill_subpattern = fill_pattern;
+ if XNN_LIKELY(c & (4 * sizeof(uint8_t))) {
+ *((uint32_t*) output) = vfill_subpattern;
+ output = ((uint8_t*) output + 4);
+ }
+ if XNN_LIKELY(c & (2 * sizeof(uint8_t))) {
+ *((uint16_t*) output) = (uint16_t) vfill_subpattern;
+ vfill_subpattern >>= 16;
+ output = ((uint8_t*) output + 2);
+ }
+ if XNN_LIKELY(c & (1 * sizeof(uint8_t))) {
+ *((uint8_t*) output) = (uint8_t) vfill_subpattern;
+ output = ((uint8_t*) output + 1);
+ }
+ }
+ output = (void*) ((uintptr_t) output + output_increment);
+ } while (--rows != 0);
+}
diff --git a/test/fill-microkernel-tester.h b/test/fill-microkernel-tester.h
index e40df66..47466b6 100644
--- a/test/fill-microkernel-tester.h
+++ b/test/fill-microkernel-tester.h
@@ -64,32 +64,35 @@
return this->iterations_;
}
- void Test(xnn_x32_fill_ukernel_function fill) const {
+ void Test(xnn_fill_ukernel_function fill) const {
ASSERT_GE(output_stride(), channels());
std::random_device random_device;
auto rng = std::mt19937(random_device());
- auto u32rng = std::bind(std::uniform_int_distribution<uint32_t>(), rng);
+ auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), rng);
- std::vector<uint32_t> output((rows() - 1) * output_stride() + channels());
- std::vector<uint32_t> output_copy(output.size());
+ std::vector<uint8_t> output((rows() - 1) * output_stride() + channels());
+ std::vector<uint8_t> output_copy(output.size());
for (size_t iteration = 0; iteration < iterations(); iteration++) {
- std::generate(output.begin(), output.end(), std::ref(u32rng));
+ std::generate(output.begin(), output.end(), std::ref(u8rng));
std::copy(output.cbegin(), output.cend(), output_copy.begin());
- const uint32_t fill_value = u32rng();
+ std::array<uint8_t, 4> fill_pattern;
+ std::generate(fill_pattern.begin(), fill_pattern.end(), std::ref(u8rng));
+ uint32_t fill_value = 0;
+ memcpy(&fill_value, fill_pattern.data(), sizeof(fill_value));
// Call optimized micro-kernel.
fill(
rows(),
- channels() * sizeof(uint32_t),
+ channels() * sizeof(uint8_t),
output.data(),
- output_stride() * sizeof(uint32_t),
- &fill_value);
+ output_stride() * sizeof(uint8_t),
+ fill_value);
// Verify results.
for (size_t i = 0; i < rows(); i++) {
for (size_t c = 0; c < channels(); c++) {
- ASSERT_EQ(output[i * output_stride() + c], fill_value)
+ ASSERT_EQ(uint32_t(output[i * output_stride() + c]), uint32_t(fill_pattern[c % fill_pattern.size()]))
<< "at row " << i << " / " << rows()
<< ", channel " << c << " / " << channels()
<< ", fill value 0x" << std::hex << std::setw(8) << std::setfill('0') << fill_value
@@ -98,7 +101,7 @@
}
for (size_t i = 0; i + 1 < rows(); i++) {
for (size_t c = channels(); c < output_stride(); c++) {
- ASSERT_EQ(output[i * output_stride() + c], output_copy[i * output_stride() + c])
+ ASSERT_EQ(uint32_t(output[i * output_stride() + c]), uint32_t(output_copy[i * output_stride() + c]))
<< "at row " << i << " / " << rows()
<< ", channel " << c << " / " << channels()
<< ", original value 0x" << std::hex << std::setw(8) << std::setfill('0')
diff --git a/test/x32-fill.cc b/test/x32-fill.cc
deleted file mode 100644
index 9035e42..0000000
--- a/test/x32-fill.cc
+++ /dev/null
@@ -1,268 +0,0 @@
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <gtest/gtest.h>
-
-#include <xnnpack/common.h>
-#include <xnnpack/isa-checks.h>
-
-#include <xnnpack/fill.h>
-#include "fill-microkernel-tester.h"
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
- TEST(X32_FILL__NEON, channels_eq_4) {
- TEST_REQUIRES_ARM_NEON;
- FillMicrokernelTester()
- .channels(4)
- .Test(xnn_x32_fill_ukernel__neon);
- }
-
- TEST(X32_FILL__NEON, channels_div_4) {
- TEST_REQUIRES_ARM_NEON;
- for (size_t channels = 8; channels < 32; channels += 4) {
- FillMicrokernelTester()
- .channels(channels)
- .Test(xnn_x32_fill_ukernel__neon);
- }
- }
-
- TEST(X32_FILL__NEON, channels_lt_4) {
- TEST_REQUIRES_ARM_NEON;
- for (size_t channels = 1; channels < 4; channels++) {
- FillMicrokernelTester()
- .channels(channels)
- .Test(xnn_x32_fill_ukernel__neon);
- }
- }
-
- TEST(X32_FILL__NEON, channels_gt_4) {
- TEST_REQUIRES_ARM_NEON;
- for (size_t channels = 5; channels < 8; channels++) {
- FillMicrokernelTester()
- .channels(channels)
- .Test(xnn_x32_fill_ukernel__neon);
- }
- }
-
- TEST(X32_FILL__NEON, multiple_rows) {
- TEST_REQUIRES_ARM_NEON;
- for (size_t rows = 2; rows < 5; rows++) {
- for (size_t channels = 1; channels < 16; channels += 3) {
- FillMicrokernelTester()
- .channels(channels)
- .rows(rows)
- .Test(xnn_x32_fill_ukernel__neon);
- }
- }
- }
-
- TEST(X32_FILL__NEON, multiple_rows_with_output_stride) {
- TEST_REQUIRES_ARM_NEON;
- for (size_t rows = 2; rows < 5; rows++) {
- for (size_t channels = 1; channels < 16; channels += 3) {
- FillMicrokernelTester()
- .channels(channels)
- .rows(rows)
- .output_stride(17)
- .Test(xnn_x32_fill_ukernel__neon);
- }
- }
- }
-#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
- TEST(X32_FILL__SSE, channels_eq_4) {
- TEST_REQUIRES_X86_SSE;
- FillMicrokernelTester()
- .channels(4)
- .Test(xnn_x32_fill_ukernel__sse);
- }
-
- TEST(X32_FILL__SSE, channels_div_4) {
- TEST_REQUIRES_X86_SSE;
- for (size_t channels = 8; channels < 32; channels += 4) {
- FillMicrokernelTester()
- .channels(channels)
- .Test(xnn_x32_fill_ukernel__sse);
- }
- }
-
- TEST(X32_FILL__SSE, channels_lt_4) {
- TEST_REQUIRES_X86_SSE;
- for (size_t channels = 1; channels < 4; channels++) {
- FillMicrokernelTester()
- .channels(channels)
- .Test(xnn_x32_fill_ukernel__sse);
- }
- }
-
- TEST(X32_FILL__SSE, channels_gt_4) {
- TEST_REQUIRES_X86_SSE;
- for (size_t channels = 5; channels < 8; channels++) {
- FillMicrokernelTester()
- .channels(channels)
- .Test(xnn_x32_fill_ukernel__sse);
- }
- }
-
- TEST(X32_FILL__SSE, multiple_rows) {
- TEST_REQUIRES_X86_SSE;
- for (size_t rows = 2; rows < 5; rows++) {
- for (size_t channels = 1; channels < 16; channels += 3) {
- FillMicrokernelTester()
- .channels(channels)
- .rows(rows)
- .Test(xnn_x32_fill_ukernel__sse);
- }
- }
- }
-
- TEST(X32_FILL__SSE, multiple_rows_with_output_stride) {
- TEST_REQUIRES_X86_SSE;
- for (size_t rows = 2; rows < 5; rows++) {
- for (size_t channels = 1; channels < 16; channels += 3) {
- FillMicrokernelTester()
- .channels(channels)
- .rows(rows)
- .output_stride(17)
- .Test(xnn_x32_fill_ukernel__sse);
- }
- }
- }
-#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
-#if XNN_ARCH_WASMSIMD
- TEST(X32_FILL__WASMSIMD, channels_eq_4) {
- FillMicrokernelTester()
- .channels(4)
- .Test(xnn_x32_fill_ukernel__wasmsimd);
- }
-
- TEST(X32_FILL__WASMSIMD, channels_div_4) {
- for (size_t channels = 8; channels < 32; channels += 4) {
- FillMicrokernelTester()
- .channels(channels)
- .Test(xnn_x32_fill_ukernel__wasmsimd);
- }
- }
-
- TEST(X32_FILL__WASMSIMD, channels_lt_4) {
- for (size_t channels = 1; channels < 4; channels++) {
- FillMicrokernelTester()
- .channels(channels)
- .Test(xnn_x32_fill_ukernel__wasmsimd);
- }
- }
-
- TEST(X32_FILL__WASMSIMD, channels_gt_4) {
- for (size_t channels = 5; channels < 8; channels++) {
- FillMicrokernelTester()
- .channels(channels)
- .Test(xnn_x32_fill_ukernel__wasmsimd);
- }
- }
-
- TEST(X32_FILL__WASMSIMD, multiple_rows) {
- for (size_t rows = 2; rows < 5; rows++) {
- for (size_t channels = 1; channels < 16; channels += 3) {
- FillMicrokernelTester()
- .channels(channels)
- .rows(rows)
- .Test(xnn_x32_fill_ukernel__wasmsimd);
- }
- }
- }
-
- TEST(X32_FILL__WASMSIMD, multiple_rows_with_output_stride) {
- for (size_t rows = 2; rows < 5; rows++) {
- for (size_t channels = 1; channels < 16; channels += 3) {
- FillMicrokernelTester()
- .channels(channels)
- .rows(rows)
- .output_stride(17)
- .Test(xnn_x32_fill_ukernel__wasmsimd);
- }
- }
- }
-#endif // XNN_ARCH_WASMSIMD
-
-
-TEST(X32_FILL__SCALAR_FLOAT, eq_1) {
- FillMicrokernelTester()
- .channels(1)
- .Test(xnn_x32_fill_ukernel__scalar_float);
-}
-
-TEST(X32_FILL__SCALAR_FLOAT, channels_gt_1) {
- for (size_t channels = 2; channels < 10; channels++) {
- FillMicrokernelTester()
- .channels(channels)
- .Test(xnn_x32_fill_ukernel__scalar_float);
- }
-}
-
-TEST(X32_FILL__SCALAR_FLOAT, multiple_rows) {
- for (size_t rows = 2; rows < 5; rows++) {
- for (size_t channels = 1; channels < 16; channels += 3) {
- FillMicrokernelTester()
- .channels(channels)
- .rows(rows)
- .Test(xnn_x32_fill_ukernel__scalar_float);
- }
- }
-}
-
-TEST(X32_FILL__SCALAR_FLOAT, multiple_rows_with_output_stride) {
- for (size_t rows = 2; rows < 5; rows++) {
- for (size_t channels = 1; channels < 16; channels += 3) {
- FillMicrokernelTester()
- .channels(channels)
- .rows(rows)
- .output_stride(17)
- .Test(xnn_x32_fill_ukernel__scalar_float);
- }
- }
-}
-
-
-TEST(X32_FILL__SCALAR_INT, eq_1) {
- FillMicrokernelTester()
- .channels(1)
- .Test(xnn_x32_fill_ukernel__scalar_int);
-}
-
-TEST(X32_FILL__SCALAR_INT, channels_gt_1) {
- for (size_t channels = 2; channels < 10; channels++) {
- FillMicrokernelTester()
- .channels(channels)
- .Test(xnn_x32_fill_ukernel__scalar_int);
- }
-}
-
-TEST(X32_FILL__SCALAR_INT, multiple_rows) {
- for (size_t rows = 2; rows < 5; rows++) {
- for (size_t channels = 1; channels < 16; channels += 3) {
- FillMicrokernelTester()
- .channels(channels)
- .rows(rows)
- .Test(xnn_x32_fill_ukernel__scalar_int);
- }
- }
-}
-
-TEST(X32_FILL__SCALAR_INT, multiple_rows_with_output_stride) {
- for (size_t rows = 2; rows < 5; rows++) {
- for (size_t channels = 1; channels < 16; channels += 3) {
- FillMicrokernelTester()
- .channels(channels)
- .rows(rows)
- .output_stride(17)
- .Test(xnn_x32_fill_ukernel__scalar_int);
- }
- }
-}
diff --git a/test/xx-fill.cc b/test/xx-fill.cc
new file mode 100644
index 0000000..a8e4305
--- /dev/null
+++ b/test/xx-fill.cc
@@ -0,0 +1,324 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <gtest/gtest.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/isa-checks.h>
+
+#include <xnnpack/fill.h>
+#include "fill-microkernel-tester.h"
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+ TEST(XX_FILL__NEON_X64, channels_eq_1) {
+ TEST_REQUIRES_ARM_NEON;
+ FillMicrokernelTester()
+ .channels(1)
+ .Test(xnn_xx_fill_ukernel__neon_x64);
+ }
+
+ TEST(XX_FILL__NEON_X64, channels_eq_2) {
+ TEST_REQUIRES_ARM_NEON;
+ FillMicrokernelTester()
+ .channels(2)
+ .Test(xnn_xx_fill_ukernel__neon_x64);
+ }
+
+ TEST(XX_FILL__NEON_X64, channels_eq_4) {
+ TEST_REQUIRES_ARM_NEON;
+ FillMicrokernelTester()
+ .channels(4)
+ .Test(xnn_xx_fill_ukernel__neon_x64);
+ }
+
+ TEST(XX_FILL__NEON_X64, channels_eq_64) {
+ TEST_REQUIRES_ARM_NEON;
+ FillMicrokernelTester()
+ .channels(64)
+ .Test(xnn_xx_fill_ukernel__neon_x64);
+ }
+
+ TEST(XX_FILL__NEON_X64, channels_div_64) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t channels = 128; channels <= 192; channels += 64) {
+ FillMicrokernelTester()
+ .channels(channels)
+ .Test(xnn_xx_fill_ukernel__neon_x64);
+ }
+ }
+
+ TEST(XX_FILL__NEON_X64, channels_lt_64) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t channels = 1; channels < 64; channels++) {
+ FillMicrokernelTester()
+ .channels(channels)
+ .Test(xnn_xx_fill_ukernel__neon_x64);
+ }
+ }
+
+ TEST(XX_FILL__NEON_X64, channels_gt_64) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t channels = 65; channels < 128; channels++) {
+ FillMicrokernelTester()
+ .channels(channels)
+ .Test(xnn_xx_fill_ukernel__neon_x64);
+ }
+ }
+
+ TEST(XX_FILL__NEON_X64, multiple_rows) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t rows = 2; rows < 5; rows++) {
+ for (size_t channels = 1; channels < 192; channels += 15) {
+ FillMicrokernelTester()
+ .channels(channels)
+ .rows(rows)
+ .Test(xnn_xx_fill_ukernel__neon_x64);
+ }
+ }
+ }
+
+ TEST(XX_FILL__NEON_X64, multiple_rows_with_output_stride) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t rows = 2; rows < 5; rows++) {
+ for (size_t channels = 1; channels < 192; channels += 15) {
+ FillMicrokernelTester()
+ .channels(channels)
+ .rows(rows)
+ .output_stride(193)
+ .Test(xnn_xx_fill_ukernel__neon_x64);
+ }
+ }
+ }
+#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(XX_FILL__SSE2_X64, channels_eq_1) {
+ TEST_REQUIRES_X86_SSE2;
+ FillMicrokernelTester()
+ .channels(1)
+ .Test(xnn_xx_fill_ukernel__sse2_x64);
+ }
+
+ TEST(XX_FILL__SSE2_X64, channels_eq_2) {
+ TEST_REQUIRES_X86_SSE2;
+ FillMicrokernelTester()
+ .channels(2)
+ .Test(xnn_xx_fill_ukernel__sse2_x64);
+ }
+
+ TEST(XX_FILL__SSE2_X64, channels_eq_4) {
+ TEST_REQUIRES_X86_SSE2;
+ FillMicrokernelTester()
+ .channels(4)
+ .Test(xnn_xx_fill_ukernel__sse2_x64);
+ }
+
+ TEST(XX_FILL__SSE2_X64, channels_eq_64) {
+ TEST_REQUIRES_X86_SSE2;
+ FillMicrokernelTester()
+ .channels(64)
+ .Test(xnn_xx_fill_ukernel__sse2_x64);
+ }
+
+ TEST(XX_FILL__SSE2_X64, channels_div_64) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t channels = 128; channels <= 192; channels += 64) {
+ FillMicrokernelTester()
+ .channels(channels)
+ .Test(xnn_xx_fill_ukernel__sse2_x64);
+ }
+ }
+
+ TEST(XX_FILL__SSE2_X64, channels_lt_64) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t channels = 1; channels < 64; channels++) {
+ FillMicrokernelTester()
+ .channels(channels)
+ .Test(xnn_xx_fill_ukernel__sse2_x64);
+ }
+ }
+
+ TEST(XX_FILL__SSE2_X64, channels_gt_64) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t channels = 65; channels < 128; channels++) {
+ FillMicrokernelTester()
+ .channels(channels)
+ .Test(xnn_xx_fill_ukernel__sse2_x64);
+ }
+ }
+
+ TEST(XX_FILL__SSE2_X64, multiple_rows) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t rows = 2; rows < 5; rows++) {
+ for (size_t channels = 1; channels < 192; channels += 15) {
+ FillMicrokernelTester()
+ .channels(channels)
+ .rows(rows)
+ .Test(xnn_xx_fill_ukernel__sse2_x64);
+ }
+ }
+ }
+
+ TEST(XX_FILL__SSE2_X64, multiple_rows_with_output_stride) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t rows = 2; rows < 5; rows++) {
+ for (size_t channels = 1; channels < 192; channels += 15) {
+ FillMicrokernelTester()
+ .channels(channels)
+ .rows(rows)
+ .output_stride(193)
+ .Test(xnn_xx_fill_ukernel__sse2_x64);
+ }
+ }
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_WASMSIMD
+ TEST(XX_FILL__WASMSIMD_X64, channels_eq_1) {
+ FillMicrokernelTester()
+ .channels(1)
+ .Test(xnn_xx_fill_ukernel__wasmsimd_x64);
+ }
+
+ TEST(XX_FILL__WASMSIMD_X64, channels_eq_2) {
+ FillMicrokernelTester()
+ .channels(2)
+ .Test(xnn_xx_fill_ukernel__wasmsimd_x64);
+ }
+
+ TEST(XX_FILL__WASMSIMD_X64, channels_eq_4) {
+ FillMicrokernelTester()
+ .channels(4)
+ .Test(xnn_xx_fill_ukernel__wasmsimd_x64);
+ }
+
+ TEST(XX_FILL__WASMSIMD_X64, channels_eq_64) {
+ FillMicrokernelTester()
+ .channels(64)
+ .Test(xnn_xx_fill_ukernel__wasmsimd_x64);
+ }
+
+ TEST(XX_FILL__WASMSIMD_X64, channels_div_64) {
+ for (size_t channels = 128; channels <= 192; channels += 64) {
+ FillMicrokernelTester()
+ .channels(channels)
+ .Test(xnn_xx_fill_ukernel__wasmsimd_x64);
+ }
+ }
+
+ TEST(XX_FILL__WASMSIMD_X64, channels_lt_64) {
+ for (size_t channels = 1; channels < 64; channels++) {
+ FillMicrokernelTester()
+ .channels(channels)
+ .Test(xnn_xx_fill_ukernel__wasmsimd_x64);
+ }
+ }
+
+ TEST(XX_FILL__WASMSIMD_X64, channels_gt_64) {
+ for (size_t channels = 65; channels < 128; channels++) {
+ FillMicrokernelTester()
+ .channels(channels)
+ .Test(xnn_xx_fill_ukernel__wasmsimd_x64);
+ }
+ }
+
+ TEST(XX_FILL__WASMSIMD_X64, multiple_rows) {
+ for (size_t rows = 2; rows < 5; rows++) {
+ for (size_t channels = 1; channels < 192; channels += 15) {
+ FillMicrokernelTester()
+ .channels(channels)
+ .rows(rows)
+ .Test(xnn_xx_fill_ukernel__wasmsimd_x64);
+ }
+ }
+ }
+
+ TEST(XX_FILL__WASMSIMD_X64, multiple_rows_with_output_stride) {
+ for (size_t rows = 2; rows < 5; rows++) {
+ for (size_t channels = 1; channels < 192; channels += 15) {
+ FillMicrokernelTester()
+ .channels(channels)
+ .rows(rows)
+ .output_stride(193)
+ .Test(xnn_xx_fill_ukernel__wasmsimd_x64);
+ }
+ }
+ }
+#endif // XNN_ARCH_WASMSIMD
+
+
+TEST(XX_FILL__SCALAR_X16, channels_eq_1) {
+ FillMicrokernelTester()
+ .channels(1)
+ .Test(xnn_xx_fill_ukernel__scalar_x16);
+}
+
+TEST(XX_FILL__SCALAR_X16, channels_eq_2) {
+ FillMicrokernelTester()
+ .channels(2)
+ .Test(xnn_xx_fill_ukernel__scalar_x16);
+}
+
+TEST(XX_FILL__SCALAR_X16, channels_eq_4) {
+ FillMicrokernelTester()
+ .channels(4)
+ .Test(xnn_xx_fill_ukernel__scalar_x16);
+}
+
+TEST(XX_FILL__SCALAR_X16, channels_eq_16) {
+ FillMicrokernelTester()
+ .channels(16)
+ .Test(xnn_xx_fill_ukernel__scalar_x16);
+}
+
+TEST(XX_FILL__SCALAR_X16, channels_div_16) {
+ for (size_t channels = 32; channels <= 48; channels += 16) {
+ FillMicrokernelTester()
+ .channels(channels)
+ .Test(xnn_xx_fill_ukernel__scalar_x16);
+ }
+}
+
+TEST(XX_FILL__SCALAR_X16, channels_lt_16) {
+ for (size_t channels = 1; channels < 16; channels++) {
+ FillMicrokernelTester()
+ .channels(channels)
+ .Test(xnn_xx_fill_ukernel__scalar_x16);
+ }
+}
+
+TEST(XX_FILL__SCALAR_X16, channels_gt_16) {
+ for (size_t channels = 17; channels < 32; channels++) {
+ FillMicrokernelTester()
+ .channels(channels)
+ .Test(xnn_xx_fill_ukernel__scalar_x16);
+ }
+}
+
+TEST(XX_FILL__SCALAR_X16, multiple_rows) {
+ for (size_t rows = 2; rows < 5; rows++) {
+ for (size_t channels = 1; channels < 48; channels += 3) {
+ FillMicrokernelTester()
+ .channels(channels)
+ .rows(rows)
+ .Test(xnn_xx_fill_ukernel__scalar_x16);
+ }
+ }
+}
+
+TEST(XX_FILL__SCALAR_X16, multiple_rows_with_output_stride) {
+ for (size_t rows = 2; rows < 5; rows++) {
+ for (size_t channels = 1; channels < 48; channels += 3) {
+ FillMicrokernelTester()
+ .channels(channels)
+ .rows(rows)
+ .output_stride(53)
+ .Test(xnn_xx_fill_ukernel__scalar_x16);
+ }
+ }
+}