NEON-FP16 implementation of F16->F32 VCVT microkernels
PiperOrigin-RevId: 399533359
diff --git a/BUILD.bazel b/BUILD.bazel
index 94fa615..7408396 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -2696,6 +2696,14 @@
"src/xx-pad/neon.c",
]
+PROD_NEONFP16_MICROKERNEL_SRCS = [  # Empty in this change: no NEON-FP16 sources are listed for the neonfp16_prod_microkernels library.
+]
+
+ALL_NEONFP16_MICROKERNEL_SRCS = [  # Sources compiled into the neonfp16 bench and test microkernel libraries.
+ "src/f16-f32-vcvt/gen/vcvt-neonfp16-x8.c",  # F16->F32 VCVT kernels; NOTE(review): x8/x16 presumably name the per-iteration element count -- confirm.
+ "src/f16-f32-vcvt/gen/vcvt-neonfp16-x16.c",
+]
+
PROD_NEONFMA_MICROKERNEL_SRCS = [
"src/f32-dwconv/gen/up4x9-minmax-neonfma.c",
"src/f32-dwconv/gen/up4x25-minmax-neonfma-acc2.c",
@@ -6125,6 +6133,79 @@
)
xnnpack_cc_library(
+ name = "neonfp16_bench_microkernels",  # Benchmark build of the NEON-FP16 microkernels; compiles the ALL_ source list.
+ hdrs = INTERNAL_HDRS,
+ aarch32_copts = [  # NOTE(review): presumably applied only to 32-bit ARM builds -- confirm in the xnnpack_cc_library macro.
+ "-marm",  # Generate ARM (not Thumb) instructions.
+ "-march=armv7-a",
+ "-mfpu=neon-fp16",  # NEON with the half-precision (FP16) conversion extension.
+ ],
+ aarch32_srcs = ALL_NEONFP16_MICROKERNEL_SRCS,  # The same source list is used for AArch32 and AArch64.
+ aarch64_srcs = ALL_NEONFP16_MICROKERNEL_SRCS,
+ apple_aarch32_copts = [  # NOTE(review): presumably Apple-only AArch32 flags; verify how the macro combines them with aarch32_copts.
+ "-mcpu=cortex-a9",
+ "-mtune=generic",
+ ],
+ gcc_copts = xnnpack_gcc_std_copts(),
+ msvc_copts = xnnpack_msvc_std_copts(),
+ deps = [
+ ":tables",
+ "@FP16",
+ "@pthreadpool",
+ ],
+)
+
+xnnpack_cc_library(
+ name = "neonfp16_prod_microkernels",  # Production build of the NEON-FP16 microkernels; compiles the PROD_ source list.
+ hdrs = INTERNAL_HDRS,
+ aarch32_copts = [  # NOTE(review): presumably applied only to 32-bit ARM builds -- confirm in the xnnpack_cc_library macro.
+ "-marm",  # Generate ARM (not Thumb) instructions.
+ "-march=armv7-a",
+ "-mfpu=neon-fp16",  # NEON with the half-precision (FP16) conversion extension.
+ ],
+ aarch32_srcs = PROD_NEONFP16_MICROKERNEL_SRCS,  # PROD_ list is empty in this change, so no sources are compiled here yet.
+ aarch64_srcs = PROD_NEONFP16_MICROKERNEL_SRCS,
+ apple_aarch32_copts = [  # NOTE(review): presumably Apple-only AArch32 flags; verify how the macro combines them with aarch32_copts.
+ "-mcpu=cortex-a9",
+ "-mtune=generic",
+ ],
+ gcc_copts = xnnpack_gcc_std_copts(),
+ msvc_copts = xnnpack_msvc_std_copts(),
+ deps = [
+ ":tables",
+ "@FP16",
+ "@pthreadpool",
+ ],
+)
+
+xnnpack_cc_library(
+ name = "neonfp16_test_microkernels",  # Test build of the NEON-FP16 microkernels; compiles the ALL_ source list with test-only defines.
+ hdrs = INTERNAL_HDRS,
+ aarch32_copts = [  # NOTE(review): presumably applied only to 32-bit ARM builds -- confirm in the xnnpack_cc_library macro.
+ "-marm",  # Generate ARM (not Thumb) instructions.
+ "-march=armv7-a",
+ "-mfpu=neon-fp16",  # NEON with the half-precision (FP16) conversion extension.
+ ],
+ aarch32_srcs = ALL_NEONFP16_MICROKERNEL_SRCS,  # The same source list is used for AArch32 and AArch64.
+ aarch64_srcs = ALL_NEONFP16_MICROKERNEL_SRCS,
+ apple_aarch32_copts = [  # NOTE(review): presumably Apple-only AArch32 flags; verify how the macro combines them with aarch32_copts.
+ "-mcpu=cortex-a9",
+ "-mtune=generic",
+ ],
+ copts = [  # Only the test variant of the three neonfp16 libraries sets copts.
+ "-UNDEBUG",  # Undefine NDEBUG so assert() stays enabled.
+ "-DXNN_TEST_MODE=1",  # Define XNN_TEST_MODE as 1 for these sources.
+ ],
+ gcc_copts = xnnpack_gcc_std_copts(),
+ msvc_copts = xnnpack_msvc_std_copts(),
+ deps = [
+ ":tables",
+ "@FP16",
+ "@pthreadpool",
+ ],
+)
+
+xnnpack_cc_library(
name = "neonfma_bench_microkernels",
hdrs = INTERNAL_HDRS,
aarch32_copts = [
@@ -6962,12 +7043,14 @@
name = "bench_microkernels",
aarch32_ios_deps = [
":neon_bench_microkernels",
+ ":neonfp16_bench_microkernels",
":neonfma_bench_microkernels",
":neonv8_bench_microkernels",
":asm_microkernels",
],
aarch32_nonios_deps = [
":neon_bench_microkernels",
+ ":neonfp16_bench_microkernels",
":neonfma_bench_microkernels",
":neonv8_bench_microkernels",
":neondot_bench_microkernels",
@@ -6975,6 +7058,7 @@
],
aarch64_deps = [
":neon_bench_microkernels",
+ ":neonfp16_bench_microkernels",
":neonfma_bench_microkernels",
":neonv8_bench_microkernels",
":neonfp16arith_bench_microkernels",
@@ -7010,12 +7094,14 @@
name = "prod_microkernels",
aarch32_ios_deps = [
":neon_prod_microkernels",
+ ":neonfp16_prod_microkernels",
":neonfma_prod_microkernels",
":neonv8_prod_microkernels",
":asm_microkernels",
],
aarch32_nonios_deps = [
":neon_prod_microkernels",
+ ":neonfp16_prod_microkernels",
":neonfma_prod_microkernels",
":neonv8_prod_microkernels",
":neondot_prod_microkernels",
@@ -7023,6 +7109,7 @@
],
aarch64_deps = [
":neon_prod_microkernels",
+ ":neonfp16_prod_microkernels",
":neonfma_prod_microkernels",
":neonv8_prod_microkernels",
":neonfp16arith_prod_microkernels",
@@ -7058,12 +7145,14 @@
name = "test_microkernels",
aarch32_ios_deps = [
":neon_test_microkernels",
+ ":neonfp16_test_microkernels",
":neonfma_test_microkernels",
":neonv8_test_microkernels",
":asm_microkernels",
],
aarch32_nonios_deps = [
":neon_test_microkernels",
+ ":neonfp16_test_microkernels",
":neonfma_test_microkernels",
":neonv8_test_microkernels",
":neondot_test_microkernels",
@@ -7071,6 +7160,7 @@
],
aarch64_deps = [
":neon_test_microkernels",
+ ":neonfp16_test_microkernels",
":neonfma_test_microkernels",
":neonv8_test_microkernels",
":neonfp16arith_test_microkernels",