Make packing functions non-inline
PiperOrigin-RevId: 319844892
diff --git a/BUILD.bazel b/BUILD.bazel
index 88fcdc1..1281696 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -3116,6 +3116,36 @@
)
xnnpack_cc_library(
+ name = "packing",
+ srcs = ["src/packing.c"],
+ hdrs = INTERNAL_HDRS,
+ gcc_copts = xnnpack_gcc_std_copts(),
+ msvc_copts = xnnpack_msvc_std_copts(),
+ deps = [
+ "@FP16",
+ "@FXdiv",
+ "@pthreadpool",
+ ],
+)
+
+xnnpack_cc_library(
+ name = "packing_test_mode",
+ srcs = ["src/packing.c"],
+ hdrs = INTERNAL_HDRS,
+ copts = [
+ "-UNDEBUG",
+ "-DXNN_TEST_MODE=1",
+ ],
+ gcc_copts = xnnpack_gcc_std_copts(),
+ msvc_copts = xnnpack_msvc_std_copts(),
+ deps = [
+ "@FP16",
+ "@FXdiv",
+ "@pthreadpool",
+ ],
+)
+
+xnnpack_cc_library(
name = "operator_run",
srcs = ["src/operator-run.c"],
hdrs = INTERNAL_HDRS + LOGGING_HDRS,
@@ -3234,6 +3264,7 @@
deps = [
":indirection",
":logging_utils",
+ ":packing",
"@FP16",
"@FXdiv",
"@clog",
@@ -3265,6 +3296,7 @@
deps = [
":indirection_test_mode",
":logging_utils",
+ ":packing_test_mode",
"@FP16",
"@FXdiv",
"@clog",
@@ -3461,7 +3493,7 @@
"src/xnnpack/AlignedAllocator.h",
] + WEIGHTS_PACK_HDRS + MICROKERNEL_BENCHMARK_HDRS,
copts = xnnpack_optional_ruy_copts() + xnnpack_optional_gemmlowp_copts(),
- deps = MICROKERNEL_BENCHMARK_DEPS + xnnpack_optional_ruy_deps() + xnnpack_optional_gemmlowp_deps(),
+ deps = MICROKERNEL_BENCHMARK_DEPS + [":packing"] + xnnpack_optional_ruy_deps() + xnnpack_optional_gemmlowp_deps(),
)
xnnpack_benchmark(
@@ -3472,7 +3504,10 @@
"bench/google/conv.h",
"src/xnnpack/AlignedAllocator.h",
] + MICROKERNEL_BENCHMARK_HDRS,
- deps = MICROKERNEL_BENCHMARK_DEPS + [":indirection"],
+ deps = MICROKERNEL_BENCHMARK_DEPS + [
+ ":indirection",
+ ":packing",
+ ],
)
xnnpack_benchmark(
@@ -3482,7 +3517,9 @@
"bench/gemm.h",
"src/xnnpack/AlignedAllocator.h",
] + WEIGHTS_PACK_HDRS + MICROKERNEL_BENCHMARK_HDRS,
- deps = MICROKERNEL_BENCHMARK_DEPS,
+ deps = MICROKERNEL_BENCHMARK_DEPS + [
+ ":packing",
+ ],
)
xnnpack_benchmark(
@@ -3502,7 +3539,10 @@
"bench/conv.h",
"src/xnnpack/AlignedAllocator.h",
] + MICROKERNEL_BENCHMARK_HDRS,
- deps = MICROKERNEL_BENCHMARK_DEPS + [":indirection"],
+ deps = MICROKERNEL_BENCHMARK_DEPS + [
+ ":indirection",
+ ":packing",
+ ],
)
xnnpack_benchmark(
@@ -3512,7 +3552,9 @@
"bench/dconv.h",
"src/xnnpack/AlignedAllocator.h",
] + WEIGHTS_PACK_HDRS + MICROKERNEL_BENCHMARK_HDRS,
- deps = MICROKERNEL_BENCHMARK_DEPS,
+ deps = MICROKERNEL_BENCHMARK_DEPS + [
+ ":packing",
+ ],
)
xnnpack_benchmark(
@@ -3522,7 +3564,9 @@
"bench/dconv.h",
"src/xnnpack/AlignedAllocator.h",
] + WEIGHTS_PACK_HDRS + MICROKERNEL_BENCHMARK_HDRS,
- deps = MICROKERNEL_BENCHMARK_DEPS,
+ deps = MICROKERNEL_BENCHMARK_DEPS + [
+ ":packing",
+ ],
)
xnnpack_benchmark(
@@ -3533,7 +3577,10 @@
"bench/google/dwconv.h",
"src/xnnpack/AlignedAllocator.h",
] + MICROKERNEL_BENCHMARK_HDRS,
- deps = MICROKERNEL_BENCHMARK_DEPS + [":indirection"],
+ deps = MICROKERNEL_BENCHMARK_DEPS + [
+ ":indirection",
+ ":packing",
+ ],
)
xnnpack_benchmark(
@@ -3543,7 +3590,10 @@
"bench/dwconv.h",
"src/xnnpack/AlignedAllocator.h",
] + MICROKERNEL_BENCHMARK_HDRS,
- deps = MICROKERNEL_BENCHMARK_DEPS + [":indirection"],
+ deps = MICROKERNEL_BENCHMARK_DEPS + [
+ ":indirection",
+ ":packing",
+ ],
)
xnnpack_benchmark(
@@ -3553,7 +3603,10 @@
"bench/dwconv.h",
"src/xnnpack/AlignedAllocator.h",
] + WEIGHTS_PACK_HDRS + MICROKERNEL_BENCHMARK_HDRS,
- deps = MICROKERNEL_BENCHMARK_DEPS + [":indirection"],
+ deps = MICROKERNEL_BENCHMARK_DEPS + [
+ ":indirection",
+ ":packing",
+ ],
)
xnnpack_benchmark(
@@ -3564,7 +3617,7 @@
"src/xnnpack/AlignedAllocator.h",
] + WEIGHTS_PACK_HDRS + MICROKERNEL_BENCHMARK_HDRS,
copts = xnnpack_optional_ruy_copts(),
- deps = MICROKERNEL_BENCHMARK_DEPS + xnnpack_optional_ruy_deps(),
+ deps = MICROKERNEL_BENCHMARK_DEPS + [":packing"] + xnnpack_optional_ruy_deps(),
)
xnnpack_benchmark(
@@ -3674,7 +3727,10 @@
"bench/conv.h",
"src/xnnpack/AlignedAllocator.h",
] + WEIGHTS_PACK_HDRS + MICROKERNEL_BENCHMARK_HDRS,
- deps = MICROKERNEL_BENCHMARK_DEPS + [":im2col"],
+ deps = MICROKERNEL_BENCHMARK_DEPS + [
+ ":im2col",
+ ":packing",
+ ],
)
xnnpack_benchmark(
@@ -4035,7 +4091,7 @@
"test/gemm-microkernel-tester.h",
"src/xnnpack/AlignedAllocator.h",
] + WEIGHTS_PACK_HDRS + MICROKERNEL_TEST_HDRS,
- deps = MICROKERNEL_TEST_DEPS,
+ deps = MICROKERNEL_TEST_DEPS + [":packing"],
)
xnnpack_unit_test(
@@ -4103,7 +4159,7 @@
"test/gemm-microkernel-tester.h",
"src/xnnpack/AlignedAllocator.h",
] + WEIGHTS_PACK_HDRS + MICROKERNEL_TEST_HDRS,
- deps = MICROKERNEL_TEST_DEPS,
+ deps = MICROKERNEL_TEST_DEPS + [":packing"],
)
xnnpack_unit_test(
@@ -4113,7 +4169,7 @@
"test/gemm-microkernel-tester.h",
"src/xnnpack/AlignedAllocator.h",
] + WEIGHTS_PACK_HDRS + MICROKERNEL_TEST_HDRS,
- deps = MICROKERNEL_TEST_DEPS,
+ deps = MICROKERNEL_TEST_DEPS + [":packing"],
)
xnnpack_unit_test(
@@ -4123,7 +4179,7 @@
"test/gemm-microkernel-tester.h",
"src/xnnpack/AlignedAllocator.h",
] + WEIGHTS_PACK_HDRS + MICROKERNEL_TEST_HDRS,
- deps = MICROKERNEL_TEST_DEPS,
+ deps = MICROKERNEL_TEST_DEPS + [":packing"],
)
xnnpack_unit_test(
@@ -4133,7 +4189,7 @@
"test/gemm-microkernel-tester.h",
"src/xnnpack/AlignedAllocator.h",
] + WEIGHTS_PACK_HDRS + MICROKERNEL_TEST_HDRS,
- deps = MICROKERNEL_TEST_DEPS,
+ deps = MICROKERNEL_TEST_DEPS + [":packing"],
)
xnnpack_unit_test(
@@ -4143,7 +4199,7 @@
"test/conv-hwc-microkernel-tester.h",
"src/xnnpack/AlignedAllocator.h",
] + WEIGHTS_PACK_HDRS + MICROKERNEL_TEST_HDRS,
- deps = MICROKERNEL_TEST_DEPS,
+ deps = MICROKERNEL_TEST_DEPS + [":packing"],
)
xnnpack_unit_test(
@@ -4153,7 +4209,7 @@
"test/conv-hwc2chw-microkernel-tester.h",
"src/xnnpack/AlignedAllocator.h",
] + WEIGHTS_PACK_HDRS + MICROKERNEL_TEST_HDRS,
- deps = MICROKERNEL_TEST_DEPS,
+ deps = MICROKERNEL_TEST_DEPS + [":packing"],
)
xnnpack_unit_test(
@@ -4163,7 +4219,7 @@
"test/dwconv-microkernel-tester.h",
"src/xnnpack/AlignedAllocator.h",
] + WEIGHTS_PACK_HDRS + MICROKERNEL_TEST_HDRS,
- deps = MICROKERNEL_TEST_DEPS,
+ deps = MICROKERNEL_TEST_DEPS + [":packing"],
)
xnnpack_unit_test(
@@ -4173,7 +4229,7 @@
"test/dwconv-microkernel-tester.h",
"src/xnnpack/AlignedAllocator.h",
] + WEIGHTS_PACK_HDRS + MICROKERNEL_TEST_HDRS,
- deps = MICROKERNEL_TEST_DEPS,
+ deps = MICROKERNEL_TEST_DEPS + [":packing"],
)
xnnpack_unit_test(
@@ -4183,7 +4239,7 @@
"test/dwconv-microkernel-tester.h",
"src/xnnpack/AlignedAllocator.h",
] + WEIGHTS_PACK_HDRS + MICROKERNEL_TEST_HDRS,
- deps = MICROKERNEL_TEST_DEPS,
+ deps = MICROKERNEL_TEST_DEPS + [":packing"],
)
xnnpack_unit_test(
@@ -4193,7 +4249,7 @@
"test/dwconv-chw-microkernel-tester.h",
"src/xnnpack/AlignedAllocator.h",
] + WEIGHTS_PACK_HDRS + MICROKERNEL_TEST_HDRS,
- deps = MICROKERNEL_TEST_DEPS,
+ deps = MICROKERNEL_TEST_DEPS + [":packing"],
)
xnnpack_unit_test(
@@ -4233,7 +4289,7 @@
"test/gemm-microkernel-tester.h",
"src/xnnpack/AlignedAllocator.h",
] + WEIGHTS_PACK_HDRS + MICROKERNEL_TEST_HDRS,
- deps = MICROKERNEL_TEST_DEPS,
+ deps = MICROKERNEL_TEST_DEPS + [":packing"],
)
xnnpack_unit_test(
@@ -4243,7 +4299,7 @@
"test/gemm-microkernel-tester.h",
"src/xnnpack/AlignedAllocator.h",
] + WEIGHTS_PACK_HDRS + MICROKERNEL_TEST_HDRS,
- deps = MICROKERNEL_TEST_DEPS,
+ deps = MICROKERNEL_TEST_DEPS + [":packing"],
)
xnnpack_unit_test(
@@ -4253,7 +4309,7 @@
"test/gemm-microkernel-tester.h",
"src/xnnpack/AlignedAllocator.h",
] + WEIGHTS_PACK_HDRS + MICROKERNEL_TEST_HDRS,
- deps = MICROKERNEL_TEST_DEPS,
+ deps = MICROKERNEL_TEST_DEPS + [":packing"],
)
xnnpack_unit_test(
@@ -4263,7 +4319,7 @@
"test/gemm-microkernel-tester.h",
"src/xnnpack/AlignedAllocator.h",
] + WEIGHTS_PACK_HDRS + MICROKERNEL_TEST_HDRS,
- deps = MICROKERNEL_TEST_DEPS,
+ deps = MICROKERNEL_TEST_DEPS + [":packing"],
)
xnnpack_unit_test(
@@ -4310,7 +4366,7 @@
"test/gemm-microkernel-tester.h",
"src/xnnpack/AlignedAllocator.h",
] + WEIGHTS_PACK_HDRS + MICROKERNEL_TEST_HDRS,
- deps = MICROKERNEL_TEST_DEPS,
+ deps = MICROKERNEL_TEST_DEPS + [":packing"],
)
xnnpack_unit_test(
@@ -4692,7 +4748,7 @@
"test/vmulcaddc-microkernel-tester.h",
"src/xnnpack/AlignedAllocator.h",
] + WEIGHTS_PACK_HDRS + MICROKERNEL_TEST_HDRS,
- deps = MICROKERNEL_TEST_DEPS,
+ deps = MICROKERNEL_TEST_DEPS + [":packing"],
)
xnnpack_unit_test(
@@ -4702,7 +4758,7 @@
"test/vmulcaddc-microkernel-tester.h",
"src/xnnpack/AlignedAllocator.h",
] + WEIGHTS_PACK_HDRS + MICROKERNEL_TEST_HDRS,
- deps = MICROKERNEL_TEST_DEPS,
+ deps = MICROKERNEL_TEST_DEPS + [":packing"],
)
xnnpack_unit_test(
@@ -4893,7 +4949,7 @@
"test/gemm-microkernel-tester.h",
"src/xnnpack/AlignedAllocator.h",
] + WEIGHTS_PACK_HDRS + MICROKERNEL_TEST_HDRS,
- deps = MICROKERNEL_TEST_DEPS,
+ deps = MICROKERNEL_TEST_DEPS + [":packing"],
)
xnnpack_unit_test(
@@ -4903,7 +4959,7 @@
"test/dwconv-microkernel-tester.h",
"src/xnnpack/AlignedAllocator.h",
] + WEIGHTS_PACK_HDRS + MICROKERNEL_TEST_HDRS,
- deps = MICROKERNEL_TEST_DEPS,
+ deps = MICROKERNEL_TEST_DEPS + [":packing"],
)
xnnpack_unit_test(
@@ -4923,7 +4979,7 @@
"test/gemm-microkernel-tester.h",
"src/xnnpack/AlignedAllocator.h",
] + WEIGHTS_PACK_HDRS + MICROKERNEL_TEST_HDRS,
- deps = MICROKERNEL_TEST_DEPS,
+ deps = MICROKERNEL_TEST_DEPS + [":packing"],
)
xnnpack_unit_test(
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c603595..ff3e5c8 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -225,6 +225,7 @@
SET(XNNPACK_HOT_SRCS
src/indirection.c
+ src/packing.c
src/operator-run.c)
SET(XNNPACK_TABLE_SRCS
diff --git a/src/packing.c b/src/packing.c
new file mode 100644
index 0000000..d5357c3
--- /dev/null
+++ b/src/packing.c
@@ -0,0 +1,1300 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <stdint.h>
+#include <stddef.h>
+
+#include <xnnpack/math.h>
+#include <xnnpack/pack.h>
+
+
+// Packs quantized 8-bit GEMM weights + biases into the contiguous tiled
+// layout consumed by the q8 GEMM microkernels. The kernel matrix k is read
+// output-channel-major (GOI: k[n * kc + k_idx]); izp/kzp appear to be the
+// input/kernel quantization zero points -- TODO confirm against xnnpack/pack.h.
+// Each nr x kr tile is preceded by nr int32 bias slots; biases are adjusted
+// by the zero-point cross terms so the microkernel can skip that correction.
+//   g  - number of groups (each group packed back-to-back)
+//   nc - output channels, tiled in blocks of nr
+//   kc - input channels, tiled in blocks of kr
+//   b  - optional bias vector; may be NULL (correction terms still stored)
+void xnn_pack_q8_gemm_goi_w(
+  size_t g,
+  size_t nc,
+  size_t kc,
+  uint32_t nr,
+  uint32_t kr,
+  uint8_t izp,
+  uint8_t kzp,
+  const uint8_t* k,
+  const int32_t* b,
+  void* packed_w)
+{
+  // Constant part of the zero-point correction: kc * izp * kzp.
+  const int32_t boff = (int32_t) kc * (int32_t) izp * (int32_t) kzp;
+  do {
+    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
+      const size_t nr_block_size = min(nc - nr_block_start, nr);
+      // Remember the bias slots: they are patched below with per-channel sums.
+      int32_t* packed_b = (int32_t*) packed_w;
+      if XNN_LIKELY(b != NULL) {
+        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+          *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
+          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+        }
+      } else {
+        size_t n = nr_block_size;
+        do {
+          *((int32_t*) packed_w) = boff;
+          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+        } while (--n != 0);
+      }
+      // Skip (leave uninitialized) the padding bias slots of a partial nr tile.
+      packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
+      for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
+        const size_t kr_block_size = min(kc - kr_block_start, kr);
+        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+          int32_t ksum = 0;
+          for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
+            const uint8_t kv = k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
+            ksum += (int32_t) kv;
+            *((uint8_t*) packed_w) = kv;
+            packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
+          }
+          // Fold sum(k) * izp into the bias so the kernel needn't compute it.
+          packed_b[nr_block_offset] -= ksum * (int32_t) izp;
+          packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
+        }
+        packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
+      }
+    }
+    k += nc * kc;
+    if XNN_UNPREDICTABLE(b != NULL) {
+      b += nc;
+    }
+  } while (--g != 0);
+}
+
+// Same packed output layout as xnn_pack_q8_gemm_goi_w, but the kernel matrix
+// k is read input-channel-major (IO: k[k_idx * nc + n]) and there is no group
+// dimension. izp/kzp appear to be the input/kernel zero points -- TODO confirm.
+void xnn_pack_q8_gemm_io_w(
+  size_t nc,
+  size_t kc,
+  uint32_t nr,
+  uint32_t kr,
+  uint8_t izp,
+  uint8_t kzp,
+  const uint8_t* k,
+  const int32_t* b,
+  void* packed_w)
+{
+  // Constant part of the zero-point correction: kc * izp * kzp.
+  const int32_t boff = (int32_t) kc * (int32_t) izp * (int32_t) kzp;
+  for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
+    const size_t nr_block_size = min(nc - nr_block_start, nr);
+    // Bias slots, patched below with per-channel kernel sums.
+    int32_t* packed_b = (int32_t*) packed_w;
+    if XNN_LIKELY(b != NULL) {
+      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+        *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
+        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+      }
+    } else {
+      size_t n = nr_block_size;
+      do {
+        *((int32_t*) packed_w) = boff;
+        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+      } while (--n != 0);
+    }
+    // Skip padding bias slots of a partial nr tile.
+    packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
+    for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
+      const size_t kr_block_size = min(kc - kr_block_start, kr);
+      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+        int32_t ksum = 0;
+        for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
+          // IO layout: input channel is the outer index of k.
+          const uint8_t kv = k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
+          ksum += (int32_t) kv;
+          *((uint8_t*) packed_w) = kv;
+          packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
+        }
+        packed_b[nr_block_offset] -= ksum * (int32_t) izp;
+        packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
+      }
+      packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
+    }
+  }
+}
+
+// Packs quantized 8-bit convolution weights stored as GOKI
+// (group, output channel, kernel spatial position ks, input channel) plus
+// biases. Layout per nr tile: nr int32 bias slots, then ks spatial slices of
+// nr x kr weight blocks. The zero-point correction uses ks * kc elements per
+// output channel. izp/kzp appear to be input/kernel zero points -- TODO confirm.
+void xnn_pack_q8_conv_goki_w(
+  size_t g,
+  size_t nc,
+  size_t ks,
+  size_t kc,
+  uint32_t nr,
+  uint32_t kr,
+  uint8_t izp,
+  uint8_t kzp,
+  const uint8_t* k,
+  const int32_t* b,
+  void* packed_w)
+{
+  // Constant correction term covers all ks * kc weights of a channel.
+  const int32_t boff = (int32_t) ks * (int32_t) kc * (int32_t) izp * (int32_t) kzp;
+  do {
+    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
+      const size_t nr_block_size = min(nc - nr_block_start, nr);
+      // Bias slots, patched below with per-channel kernel sums.
+      int32_t* packed_b = (int32_t*) packed_w;
+      if XNN_LIKELY(b != NULL) {
+        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+          *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
+          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+        }
+      } else {
+        size_t n = nr_block_size;
+        do {
+          *((int32_t*) packed_w) = boff;
+          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+        } while (--n != 0);
+      }
+      // Skip padding bias slots of a partial nr tile.
+      packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
+      for (size_t ki = 0; ki < ks; ki++) {
+        for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
+          const size_t kr_block_size = min(kc - kr_block_start, kr);
+          for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+            int32_t ksum = 0;
+            for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
+              const uint8_t kv =
+                k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
+              ksum += (int32_t) kv;
+              *((uint8_t*) packed_w) = kv;
+              packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
+            }
+            packed_b[nr_block_offset] -= ksum * (int32_t) izp;
+            packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
+          }
+          packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
+        }
+      }
+    }
+    k += ks * kc * nc;
+    if XNN_UNPREDICTABLE(b != NULL) {
+      b += nc;
+    }
+  } while (--g != 0);
+}
+
+// Packs quantized 8-bit convolution weights stored as KGO
+// (kernel spatial position, group, output channel); there is no input-channel
+// dimension here (kc is implicitly 1, as used for depth-multiplied layouts).
+// One weight byte is written per kr stride; the remaining kr-1 bytes of each
+// block are left untouched. izp/kzp appear to be zero points -- TODO confirm.
+void xnn_pack_q8_conv_kgo_w(
+  size_t g,
+  size_t nc,
+  size_t ks,
+  uint32_t nr,
+  uint32_t kr,
+  uint8_t izp,
+  uint8_t kzp,
+  const uint8_t* k,
+  const int32_t* b,
+  void* packed_w)
+{
+  // Constant correction term covers the ks weights of a channel.
+  const int32_t boff = (int32_t) ks * (int32_t) izp * (int32_t) kzp;
+  for (size_t i = 0; i < g; i++) {
+    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
+      const size_t nr_block_size = min(nc - nr_block_start, nr);
+      // Bias slots, patched below with per-channel kernel sums.
+      int32_t* packed_b = (int32_t*) packed_w;
+      if XNN_LIKELY(b != NULL) {
+        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+          *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
+          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+        }
+      } else {
+        size_t n = nr_block_size;
+        do {
+          *((int32_t*) packed_w) = boff;
+          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+        } while (--n != 0);
+      }
+      // Skip padding bias slots of a partial nr tile.
+      packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
+      for (size_t ki = 0; ki < ks; ki++) {
+        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+          // KGO layout: spatial position is the outermost index of k.
+          const uint8_t kv =
+            k[ki * g * nc + (nr_block_start + nr_block_offset)];
+          *((uint8_t*) packed_w) = kv;
+          packed_b[nr_block_offset] -= (int32_t) kv * (int32_t) izp;
+          // Advance a full kr stride; only the first byte was written.
+          packed_w = (void*) ((uintptr_t) packed_w + kr * sizeof(uint8_t));
+        }
+        packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
+      }
+    }
+    k += nc;
+    if XNN_UNPREDICTABLE(b != NULL) {
+      b += nc;
+    }
+  }
+}
+
+// Packs quantized 8-bit deconvolution (transposed convolution) weights stored
+// as GOKI into per-subconvolution buffers. For each output phase (oy, ox) of
+// the stride grid (sh x sw), only the kernel taps that contribute to that
+// phase (ky = oy, oy+sh, ...; kx = ox, ox+sw, ...) are packed, and the start
+// of each phase's buffer is recorded in params[].weights during the first
+// group iteration. The zero-point correction per phase counts exactly the
+// taps packed for it. izp/kzp appear to be the input/kernel quantization
+// zero points -- TODO confirm against xnnpack/pack.h.
+void xnn_pack_q8_deconv_goki_w(
+  size_t g,
+  size_t nc,
+  size_t kh,
+  size_t kw,
+  size_t kc,
+  size_t sh,
+  size_t sw,
+  size_t nr,
+  size_t kr,
+  uint8_t izp,
+  uint8_t kzp,
+  const uint8_t* k,
+  const int32_t* b,
+  void* packed_w,
+  struct subconvolution_params* params)
+{
+  for (size_t i = 0; i < g; i++) {
+    for (size_t oy = 0; oy < sh; oy++) {
+      for (size_t ox = 0; ox < sw; ox++) {
+        if (i == 0) {
+          // Record where this subconvolution's packed weights begin.
+          (*params++).weights = packed_w;
+        }
+        // Correction term counts only the taps belonging to phase (oy, ox).
+        const int32_t boff = (int32_t) divide_round_up(kh - oy, sh) * (int32_t) divide_round_up(kw - ox, sw) * (int32_t) kc * (int32_t) izp * (int32_t) kzp;
+        for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
+          const size_t nr_block_size = min(nc - nr_block_start, nr);
+          // Bias slots, patched below with per-channel kernel sums.
+          int32_t* packed_b = (int32_t*) packed_w;
+          if XNN_LIKELY(b != NULL) {
+            for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+              *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
+              packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+            }
+          } else {
+            size_t n = nr_block_size;
+            do {
+              *((int32_t*) packed_w) = boff;
+              packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+            } while (--n != 0);
+          }
+          // Skip padding bias slots of a partial nr tile.
+          packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
+          for (size_t ky = oy; ky < kh; ky += sh) {
+            for (size_t kx = ox; kx < kw; kx += sw) {
+              for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
+                const size_t kr_block_size = min(kc - kr_block_start, kr);
+                for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+                  int32_t ksum = 0;
+                  for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
+                    const uint8_t kv =
+                      k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
+                    ksum += (int32_t) kv;
+                    *((uint8_t*) packed_w) = kv;
+                    packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
+                  }
+                  packed_b[nr_block_offset] -= ksum * (int32_t) izp;
+                  packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
+                }
+                packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
+              }
+            }
+          }
+        }
+      }
+    }
+    k += kh * kw * kc * nc;
+    if XNN_UNPREDICTABLE(b != NULL) {
+      b += nc;
+    }
+  }
+}
+
+// Packs quantized 8-bit depthwise convolution weights stored channel-major
+// (GHW: k[((c)*h + y)*w + x]) plus biases, in channel tiles of cr. Packed
+// spatial order is x-major (kernel columns outer, rows inner). izp/kzp appear
+// to be the input/kernel quantization zero points -- TODO confirm.
+void xnn_pack_q8_dwconv_ghw_w(
+  size_t h,
+  size_t w,
+  size_t c,
+  size_t cr,
+  uint8_t izp,
+  uint8_t kzp,
+  const uint8_t* k,
+  const int32_t* b,
+  void* packed_w)
+{
+  // Constant correction term covers all h * w taps of a channel.
+  const int32_t boff = (int32_t) h * (int32_t) w * (int32_t) izp * (int32_t) kzp;
+  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
+    const size_t cr_block_size = min(c - cr_block_start, cr);
+    // Bias slots, patched below with per-channel kernel sums.
+    int32_t* packed_b = (int32_t*) packed_w;
+    if XNN_LIKELY(b != NULL) {
+      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
+        *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset] + boff;
+        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+      }
+    } else {
+      size_t n = cr_block_size;
+      do {
+        *((int32_t*) packed_w) = boff;
+        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+      } while (--n != 0);
+    }
+    // Skip padding bias slots of a partial cr tile.
+    packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
+    for (size_t x = 0; x < w; x++) {
+      for (size_t y = 0; y < h; y++) {
+        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
+          const uint8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
+          packed_b[cr_block_offset] -= (int32_t) kv * (int32_t) izp;
+          *((uint8_t*) packed_w) = kv;
+          packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
+        }
+        packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(uint8_t));
+      }
+    }
+  }
+}
+
+// Same packed output as xnn_pack_q8_dwconv_ghw_w, but the source kernel is
+// spatial-major (HWG: k[(y*w + x)*c + c_idx]). Packed spatial order is still
+// x-major. izp/kzp appear to be quantization zero points -- TODO confirm.
+void xnn_pack_q8_dwconv_hwg_w(
+  size_t h,
+  size_t w,
+  size_t c,
+  size_t cr,
+  uint8_t izp,
+  uint8_t kzp,
+  const uint8_t* k,
+  const int32_t* b,
+  void* packed_w)
+{
+  // Constant correction term covers all h * w taps of a channel.
+  const int32_t boff = (int32_t) h * (int32_t) w * (int32_t) izp * (int32_t) kzp;
+  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
+    const size_t cr_block_size = min(c - cr_block_start, cr);
+    // Bias slots, patched below with per-channel kernel sums.
+    int32_t* packed_b = (int32_t*) packed_w;
+    if XNN_LIKELY(b != NULL) {
+      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
+        *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset] + boff;
+        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+      }
+    } else {
+      size_t n = cr_block_size;
+      do {
+        *((int32_t*) packed_w) = boff;
+        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+      } while (--n != 0);
+    }
+    // Skip padding bias slots of a partial cr tile.
+    packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
+    for (size_t x = 0; x < w; x++) {
+      for (size_t y = 0; y < h; y++) {
+        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
+          const uint8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
+          packed_b[cr_block_offset] -= (int32_t) kv * (int32_t) izp;
+          *((uint8_t*) packed_w) = kv;
+          packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
+        }
+        packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(uint8_t));
+      }
+    }
+  }
+}
+
+// Packs half-precision (uint16_t-encoded) GEMM weights + biases in GOI layout.
+// Layout per nr tile: nr bias slots, then kr-wide weight blocks. For the
+// portion of kc that is a multiple of skr = sr * kr, elements within each
+// skr-aligned span are rotated per output channel via sr_mask -- the sr
+// (shuffle/stride) interleave expected by the microkernels; the kc remainder
+// is packed without shuffling. Missing biases leave the slots uninitialized.
+void xnn_pack_f16_gemm_goi_w(
+  size_t g,
+  size_t nc,
+  size_t kc,
+  size_t nr,
+  size_t kr,
+  size_t sr,
+  const uint16_t* k,
+  const uint16_t* b,
+  uint16_t* packed_w)
+{
+  const size_t skr = sr * kr;
+  // Portion of kc handled with the sr shuffle; remainder handled plainly.
+  const size_t skc = round_down_po2(kc, skr);
+  const size_t sr_mask = (sr - 1) * kr;
+  do {
+    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
+      const size_t nr_block_size = min(nc - nr_block_start, nr);
+      if XNN_LIKELY(b != NULL) {
+        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+          packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
+        }
+      }
+      packed_w += nr;
+
+      for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
+        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
+            // Rotate kr blocks within each skr span, per output channel.
+            *packed_w++ =
+              k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
+          }
+        }
+        packed_w += (nr - nr_block_size) * kr;
+      }
+
+      // Tail of kc (< skr): packed in natural order, padded to kr.
+      for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
+        const size_t kr_block_size = min(kc - kr_block_start, kr);
+        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+          for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
+            *packed_w++ =
+              k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
+          }
+          packed_w += kr - kr_block_size;
+        }
+        packed_w += (nr - nr_block_size) * kr;
+      }
+    }
+    k += nc * kc;
+    if XNN_UNPREDICTABLE(b != NULL) {
+      b += nc;
+    }
+  } while (--g != 0);
+}
+
+// Same packed output as xnn_pack_f16_gemm_goi_w, but the source matrix k is
+// input-channel-major (IO: k[k_idx * nc + n]) and there is no group dimension.
+void xnn_pack_f16_gemm_io_w(
+  size_t nc,
+  size_t kc,
+  size_t nr,
+  size_t kr,
+  size_t sr,
+  const uint16_t* k,
+  const uint16_t* b,
+  uint16_t* packed_w)
+{
+  const size_t skr = sr * kr;
+  // Portion of kc handled with the sr shuffle; remainder handled plainly.
+  const size_t skc = round_down_po2(kc, skr);
+  const size_t sr_mask = (sr - 1) * kr;
+  for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
+    const size_t nr_block_size = min(nc - nr_block_start, nr);
+    if XNN_LIKELY(b != NULL) {
+      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+        packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
+      }
+    }
+    packed_w += nr;
+
+    for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
+      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+        for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
+          // Rotate kr blocks within each skr span, per output channel.
+          *packed_w++ =
+            k[(round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
+        }
+      }
+      packed_w += (nr - nr_block_size) * kr;
+    }
+
+    // Tail of kc (< skr): packed in natural order, padded to kr.
+    for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
+      const size_t kr_block_size = min(kc - kr_block_start, kr);
+      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+        for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
+          *packed_w++ =
+            k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
+        }
+        packed_w += kr - kr_block_size;
+      }
+      packed_w += (nr - nr_block_size) * kr;
+    }
+  }
+}
+
+// GOI packing for the GEMMINC (accumulating GEMM) variant: identical weight
+// layout to xnn_pack_f16_gemm_goi_w, but with no bias slots at all -- the
+// microkernel accumulates into an existing buffer instead of adding a bias.
+void xnn_pack_f16_gemminc_goi_w(
+  size_t g,
+  size_t nc,
+  size_t kc,
+  size_t nr,
+  size_t kr,
+  size_t sr,
+  const uint16_t* k,
+  uint16_t* packed_w)
+{
+  const size_t skr = sr * kr;
+  // Portion of kc handled with the sr shuffle; remainder handled plainly.
+  const size_t skc = round_down_po2(kc, skr);
+  const size_t sr_mask = (sr - 1) * kr;
+  do {
+    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
+      const size_t nr_block_size = min(nc - nr_block_start, nr);
+
+      for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
+        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
+            // Rotate kr blocks within each skr span, per output channel.
+            *packed_w++ =
+              k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
+          }
+        }
+        packed_w += (nr - nr_block_size) * kr;
+      }
+
+      // Tail of kc (< skr): packed in natural order, padded to kr.
+      for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
+        const size_t kr_block_size = min(kc - kr_block_start, kr);
+        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+          for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
+            *packed_w++ =
+              k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
+          }
+          packed_w += kr - kr_block_size;
+        }
+        packed_w += (nr - nr_block_size) * kr;
+      }
+    }
+    k += nc * kc;
+  } while (--g != 0);
+}
+
+// Packs half-precision convolution weights stored as GOKI (group, output
+// channel, spatial position ks, input channel) plus biases. Per nr tile:
+// nr bias slots, then for each spatial position an sr-shuffled kr-tiled kc
+// span (same shuffle scheme as xnn_pack_f16_gemm_goi_w).
+void xnn_pack_f16_conv_goki_w(
+  size_t g,
+  size_t nc,
+  size_t ks,
+  size_t kc,
+  size_t nr,
+  size_t kr,
+  size_t sr,
+  const uint16_t* k,
+  const uint16_t* b,
+  uint16_t* packed_w)
+{
+  const size_t skr = sr * kr;
+  // Portion of kc handled with the sr shuffle; remainder handled plainly.
+  const size_t skc = round_down_po2(kc, skr);
+  const size_t sr_mask = (sr - 1) * kr;
+  do {
+    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
+      const size_t nr_block_size = min(nc - nr_block_start, nr);
+      if XNN_LIKELY(b != NULL) {
+        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+          packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
+        }
+      }
+      packed_w += nr;
+
+      for (size_t ki = 0; ki < ks; ki++) {
+        for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
+          for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+            for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
+              // Rotate kr blocks within each skr span, per output channel.
+              *packed_w++ =
+                k[((nr_block_start + nr_block_offset) * ks + ki) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
+            }
+          }
+          packed_w += (nr - nr_block_size) * kr;
+        }
+
+        // Tail of kc (< skr): packed in natural order, padded to kr.
+        for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
+          const size_t kr_block_size = min(kc - kr_block_start, kr);
+          for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+            for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
+              *packed_w++ =
+                k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
+            }
+            packed_w += kr - kr_block_size;
+          }
+          packed_w += (nr - nr_block_size) * kr;
+        }
+      }
+    }
+    k += ks * kc * nc;
+    if XNN_UNPREDICTABLE(b != NULL) {
+      b += nc;
+    }
+  } while (--g != 0);
+}
+
+// Packs half-precision convolution weights stored as KGO (spatial position,
+// group, output channel) plus biases. One element is written per kr stride;
+// the remaining kr-1 slots of each block are left untouched. Missing biases
+// leave the bias slots uninitialized.
+void xnn_pack_f16_conv_kgo_w(
+  size_t g,
+  size_t nc,
+  size_t ks,
+  size_t nr,
+  size_t kr,
+  const uint16_t* k,
+  const uint16_t* b,
+  uint16_t* packed_w)
+{
+  for (size_t i = 0; i < g; i++) {
+    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
+      const size_t nr_block_size = min(nc - nr_block_start, nr);
+      if XNN_LIKELY(b != NULL) {
+        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+          packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
+        }
+      }
+      packed_w += nr;
+      for (size_t ki = 0; ki < ks; ki++) {
+        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+          // KGO layout: spatial position is the outermost index of k.
+          *packed_w =
+            k[ki * g * nc + (nr_block_start + nr_block_offset)];
+          // Advance a full kr stride; only the first slot was written.
+          packed_w += kr;
+        }
+        packed_w += (nr - nr_block_size) * kr;
+      }
+    }
+    k += nc;
+    if XNN_UNPREDICTABLE(b != NULL) {
+      b += nc;
+    }
+  }
+}
+
+// Packs half-precision direct-convolution (dconv) weights stored as OKI.
+// Partial output-channel tiles are padded by REPLICATING the last valid
+// channel (min(offset, nr_block_size - 1)) rather than zero-filling, for both
+// biases and weights; a NULL bias writes zeros. Packed order per tile:
+// nr biases, then for each kx, each input channel c, each ky, nr weights.
+// Note: b advances by nr per tile, so callers presumably supply an
+// nr-padded bias array -- TODO confirm against xnnpack/pack.h.
+void xnn_pack_f16_dconv_oki_w(
+  size_t nc,
+  size_t kc,
+  size_t nr,
+  size_t kh,
+  size_t kw,
+  const uint16_t* k,
+  const uint16_t* b,
+  uint16_t* packed_w)
+{
+  for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
+    const size_t nr_block_size = min(nc - nr_block_start, nr);
+    if XNN_LIKELY(b != NULL) {
+      for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
+        // Clamp to the last valid channel for the padded tail.
+        *packed_w++ = b[min(nr_block_offset, nr_block_size - 1)];
+      }
+    } else {
+      size_t n = nr;
+      do {
+        *packed_w++ = 0;
+      } while (--n != 0);
+    }
+
+    for (size_t kx = 0; kx < kw; kx++) {
+      for (size_t c = 0; c < kc; c++) {
+        for (size_t ky = 0; ky < kh; ky++) {
+          for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
+            *packed_w++ = k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) * kh + ky) * kw + kx) * kc + c];
+          }
+        }
+      }
+    }
+    if XNN_UNPREDICTABLE(b != NULL) {
+      b += nr;
+    }
+  }
+}
+
+// Packs half-precision deconvolution (transposed convolution) weights stored
+// as GOKI into per-subconvolution buffers: for each output phase (oy, ox) of
+// the stride grid (sh x sw), only the taps contributing to that phase are
+// packed, with the same sr shuffle as xnn_pack_f16_gemm_goi_w applied to kc.
+// The start of each phase's buffer is recorded in params[].weights during the
+// first group iteration.
+void xnn_pack_f16_deconv_goki_w(
+  size_t g,
+  size_t nc,
+  size_t kh,
+  size_t kw,
+  size_t kc,
+  size_t sh,
+  size_t sw,
+  size_t nr,
+  size_t kr,
+  size_t sr,
+  const uint16_t* k,
+  const uint16_t* b,
+  uint16_t* packed_w,
+  struct subconvolution_params* params)
+{
+  const size_t skr = sr * kr;
+  // Portion of kc handled with the sr shuffle; remainder handled plainly.
+  const size_t skc = round_down_po2(kc, skr);
+  const size_t sr_mask = (sr - 1) * kr;
+  for (size_t i = 0; i < g; i++) {
+    for (size_t oy = 0; oy < sh; oy++) {
+      for (size_t ox = 0; ox < sw; ox++) {
+        if (i == 0) {
+          // Record where this subconvolution's packed weights begin.
+          (*params++).weights = packed_w;
+        }
+        for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
+          const size_t nr_block_size = min(nc - nr_block_start, nr);
+          if XNN_LIKELY(b != NULL) {
+            for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+              packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
+            }
+          }
+          packed_w += nr;
+          // Only the taps belonging to phase (oy, ox) are packed.
+          for (size_t ky = oy; ky < kh; ky += sh) {
+            for (size_t kx = ox; kx < kw; kx += sw) {
+              for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
+                for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+                  for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
+                    // Rotate kr blocks within each skr span, per output channel.
+                    *packed_w++ =
+                      k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
+                  }
+                }
+                packed_w += (nr - nr_block_size) * kr;
+              }
+
+              // Tail of kc (< skr): packed in natural order, padded to kr.
+              for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
+                const size_t kr_block_size = min(kc - kr_block_start, kr);
+                for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+                  for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
+                    *packed_w++ =
+                      k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
+                  }
+                  packed_w += kr - kr_block_size;
+                }
+                packed_w += (nr - nr_block_size) * kr;
+              }
+            }
+          }
+        }
+      }
+    }
+    k += kh * kw * kc * nc;
+    if XNN_UNPREDICTABLE(b != NULL) {
+      b += nc;
+    }
+  }
+}
+
+void xnn_pack_f16_dwconv_ghw_w(
+ size_t h,
+ size_t w,
+ size_t c,
+ size_t cr,
+ const uint16_t* k,
+ const uint16_t* b,
+ uint16_t* packed_w)
+{
+ for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
+ const size_t cr_block_size = min(c - cr_block_start, cr);
+ if XNN_LIKELY(b != NULL) {
+ for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
+ *packed_w++ = b[cr_block_start + cr_block_offset];
+ }
+ } else {
+ size_t n = cr_block_size;
+ do {
+ *packed_w++ = 0;
+ } while (--n != 0);
+ }
+ packed_w += cr - cr_block_size;
+ for (size_t x = 0; x < w; x++) {
+ for (size_t y = 0; y < h; y++) {
+ for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
+ const uint16_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
+ *packed_w++ = kv;
+ }
+ packed_w += cr - cr_block_size;
+ }
+ }
+ }
+}
+
+void xnn_pack_f16_dwconv_hwg_w(
+ size_t h,
+ size_t w,
+ size_t c,
+ size_t cr,
+ const uint16_t* k,
+ const uint16_t* b,
+ uint16_t* packed_w)
+{
+ for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
+ const size_t cr_block_size = min(c - cr_block_start, cr);
+ if XNN_LIKELY(b != NULL) {
+ for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
+ *packed_w++ = b[cr_block_start + cr_block_offset];
+ }
+ } else {
+ size_t n = cr_block_size;
+ do {
+ *packed_w++ = 0;
+ } while (--n != 0);
+ }
+ packed_w += cr - cr_block_size;
+ for (size_t x = 0; x < w; x++) {
+ for (size_t y = 0; y < h; y++) {
+ for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
+ const uint16_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
+ *packed_w++ = kv;
+ }
+ packed_w += cr - cr_block_size;
+ }
+ }
+ }
+}
+
+void xnn_pack_f16_chw_dwconv_ghw_w(
+ size_t kernel_size,
+ size_t groups,
+ const uint16_t* kernel,
+ const uint16_t* bias,
+ uint16_t* packed_weights)
+{
+ for (size_t g = 0; g < groups; g++) {
+ if XNN_LIKELY(bias != NULL) {
+ *packed_weights = *bias++;
+ } else {
+ *packed_weights = 0;
+ }
+ packed_weights += 1;
+ for (size_t i = 0; i < kernel_size; i++) {
+ *packed_weights++ = kernel[g * kernel_size + i];
+ }
+ }
+}
+
+void xnn_pack_f32_gemm_goi_w(
+ size_t g,
+ size_t nc,
+ size_t kc,
+ size_t nr,
+ size_t kr,
+ size_t sr,
+ const float* k,
+ const float* b,
+ float* packed_w)
+{
+ const size_t skr = sr * kr;
+ const size_t skc = round_down_po2(kc, skr);
+ const size_t sr_mask = (sr - 1) * kr;
+ do {
+ for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
+ const size_t nr_block_size = min(nc - nr_block_start, nr);
+ if XNN_LIKELY(b != NULL) {
+ for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+ packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
+ }
+ }
+ packed_w += nr;
+
+ for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
+ for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+ for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
+ *packed_w++ =
+ k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
+ }
+ }
+ packed_w += (nr - nr_block_size) * kr;
+ }
+
+ for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
+ const size_t kr_block_size = min(kc - kr_block_start, kr);
+ for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+ for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
+ *packed_w++ =
+ k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
+ }
+ packed_w += kr - kr_block_size;
+ }
+ packed_w += (nr - nr_block_size) * kr;
+ }
+ }
+ k += nc * kc;
+ if XNN_UNPREDICTABLE(b != NULL) {
+ b += nc;
+ }
+ } while (--g != 0);
+}
+
+void xnn_pack_f32_gemm_io_w(
+ size_t nc,
+ size_t kc,
+ size_t nr,
+ size_t kr,
+ size_t sr,
+ const float* k,
+ const float* b,
+ float* packed_w)
+{
+ const size_t skr = sr * kr;
+ const size_t skc = round_down_po2(kc, skr);
+ const size_t sr_mask = (sr - 1) * kr;
+ for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
+ const size_t nr_block_size = min(nc - nr_block_start, nr);
+ if XNN_LIKELY(b != NULL) {
+ for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+ packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
+ }
+ }
+ packed_w += nr;
+
+ for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
+ for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+ for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
+ *packed_w++ =
+ k[(round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
+ }
+ }
+ packed_w += (nr - nr_block_size) * kr;
+ }
+
+ for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
+ const size_t kr_block_size = min(kc - kr_block_start, kr);
+ for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+ for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
+ *packed_w++ =
+ k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
+ }
+ packed_w += kr - kr_block_size;
+ }
+ packed_w += (nr - nr_block_size) * kr;
+ }
+ }
+}
+
+void xnn_pack_f32_gemminc_goi_w(
+ size_t g,
+ size_t nc,
+ size_t kc,
+ size_t nr,
+ size_t kr,
+ size_t sr,
+ const float* k,
+ float* packed_w)
+{
+ const size_t skr = sr * kr;
+ const size_t skc = round_down_po2(kc, skr);
+ const size_t sr_mask = (sr - 1) * kr;
+ do {
+ for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
+ const size_t nr_block_size = min(nc - nr_block_start, nr);
+
+ for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
+ for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+ for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
+ *packed_w++ =
+ k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
+ }
+ }
+ packed_w += (nr - nr_block_size) * kr;
+ }
+
+ for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
+ const size_t kr_block_size = min(kc - kr_block_start, kr);
+ for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+ for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
+ *packed_w++ =
+ k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
+ }
+ packed_w += kr - kr_block_size;
+ }
+ packed_w += (nr - nr_block_size) * kr;
+ }
+ }
+ k += nc * kc;
+ } while (--g != 0);
+}
+
+void xnn_pack_f32_conv_goki_w(
+ size_t g,
+ size_t nc,
+ size_t ks,
+ size_t kc,
+ size_t nr,
+ size_t kr,
+ size_t sr,
+ const float* k,
+ const float* b,
+ float* packed_w)
+{
+ const size_t skr = sr * kr;
+ const size_t skc = round_down_po2(kc, skr);
+ const size_t sr_mask = (sr - 1) * kr;
+ do {
+ for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
+ const size_t nr_block_size = min(nc - nr_block_start, nr);
+ if XNN_LIKELY(b != NULL) {
+ for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+ packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
+ }
+ }
+ packed_w += nr;
+
+ for (size_t ki = 0; ki < ks; ki++) {
+ for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
+ for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+ for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
+ *packed_w++ =
+ k[((nr_block_start + nr_block_offset) * ks + ki) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
+ }
+ }
+ packed_w += (nr - nr_block_size) * kr;
+ }
+
+ for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
+ const size_t kr_block_size = min(kc - kr_block_start, kr);
+ for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+ for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
+ *packed_w++ =
+ k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
+ }
+ packed_w += kr - kr_block_size;
+ }
+ packed_w += (nr - nr_block_size) * kr;
+ }
+ }
+ }
+ k += ks * kc * nc;
+ if XNN_UNPREDICTABLE(b != NULL) {
+ b += nc;
+ }
+ } while (--g != 0);
+}
+
+void xnn_pack_f32_conv_kgo_w(
+ size_t g,
+ size_t nc,
+ size_t ks,
+ size_t nr,
+ size_t kr,
+ const float* k,
+ const float* b,
+ float* packed_w)
+{
+ for (size_t i = 0; i < g; i++) {
+ for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
+ const size_t nr_block_size = min(nc - nr_block_start, nr);
+ if XNN_LIKELY(b != NULL) {
+ for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+ packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
+ }
+ }
+ packed_w += nr;
+ for (size_t ki = 0; ki < ks; ki++) {
+ for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+ *packed_w =
+ k[ki * g * nc + (nr_block_start + nr_block_offset)];
+ packed_w += kr;
+ }
+ packed_w += (nr - nr_block_size) * kr;
+ }
+ }
+ k += nc;
+ if XNN_UNPREDICTABLE(b != NULL) {
+ b += nc;
+ }
+ }
+}
+
+void xnn_pack_f32_dconv_oki_w(
+ size_t nc,
+ size_t kc,
+ size_t nr,
+ size_t kh,
+ size_t kw,
+ const float* k,
+ const float* b,
+ float* packed_w)
+{
+ for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
+ const size_t nr_block_size = min(nc - nr_block_start, nr);
+ if XNN_LIKELY(b != NULL) {
+ for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
+ *packed_w++ = b[min(nr_block_offset, nr_block_size - 1)];
+ }
+ } else {
+ size_t n = nr;
+ do {
+ *packed_w++ = 0.0f;
+ } while (--n != 0);
+ }
+
+ for (size_t kx = 0; kx < kw; kx++) {
+ for (size_t c = 0; c < kc; c++) {
+ for (size_t ky = 0; ky < kh; ky++) {
+ for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
+ *packed_w++ = k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) * kh + ky) * kw + kx) * kc + c];
+ }
+ }
+ }
+ }
+ if XNN_UNPREDICTABLE(b != NULL) {
+ b += nr;
+ }
+ }
+}
+
+void xnn_pack_f32_deconv_goki_w(
+ size_t g,
+ size_t nc,
+ size_t kh,
+ size_t kw,
+ size_t kc,
+ size_t sh,
+ size_t sw,
+ size_t nr,
+ size_t kr,
+ size_t sr,
+ const float* k,
+ const float* b,
+ float* packed_w,
+ struct subconvolution_params* params)
+{
+ const size_t skr = sr * kr;
+ const size_t skc = round_down_po2(kc, skr);
+ const size_t sr_mask = (sr - 1) * kr;
+ for (size_t i = 0; i < g; i++) {
+ for (size_t oy = 0; oy < sh; oy++) {
+ for (size_t ox = 0; ox < sw; ox++) {
+ if (i == 0) {
+ (*params++).weights = packed_w;
+ }
+ for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
+ const size_t nr_block_size = min(nc - nr_block_start, nr);
+ if XNN_LIKELY(b != NULL) {
+ for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+ packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
+ }
+ }
+ packed_w += nr;
+ for (size_t ky = oy; ky < kh; ky += sh) {
+ for (size_t kx = ox; kx < kw; kx += sw) {
+ for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
+ for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+ for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
+ *packed_w++ =
+ k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
+ }
+ }
+ packed_w += (nr - nr_block_size) * kr;
+ }
+
+ for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
+ const size_t kr_block_size = min(kc - kr_block_start, kr);
+ for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+ for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
+ *packed_w++ =
+ k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
+ }
+ packed_w += kr - kr_block_size;
+ }
+ packed_w += (nr - nr_block_size) * kr;
+ }
+ }
+ }
+ }
+ }
+ }
+ k += kh * kw * kc * nc;
+ if XNN_UNPREDICTABLE(b != NULL) {
+ b += nc;
+ }
+ }
+}
+
+void xnn_pack_f32_dwconv_ghw_w(
+ size_t h,
+ size_t w,
+ size_t c,
+ size_t cr,
+ const float* k,
+ const float* b,
+ float* packed_w)
+{
+ for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
+ const size_t cr_block_size = min(c - cr_block_start, cr);
+ if XNN_LIKELY(b != NULL) {
+ for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
+ *packed_w++ = b[cr_block_start + cr_block_offset];
+ }
+ } else {
+ size_t n = cr_block_size;
+ do {
+ *packed_w++ = 0.0f;
+ } while (--n != 0);
+ }
+ packed_w += cr - cr_block_size;
+ for (size_t x = 0; x < w; x++) {
+ for (size_t y = 0; y < h; y++) {
+ for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
+ const float kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
+ *packed_w++ = kv;
+ }
+ packed_w += cr - cr_block_size;
+ }
+ }
+ }
+}
+
+void xnn_pack_f32_dwconv_hwg_w(
+ size_t h,
+ size_t w,
+ size_t c,
+ size_t cr,
+ const float* k,
+ const float* b,
+ float* packed_w)
+{
+ for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
+ const size_t cr_block_size = min(c - cr_block_start, cr);
+ if XNN_LIKELY(b != NULL) {
+ for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
+ *packed_w++ = b[cr_block_start + cr_block_offset];
+ }
+ } else {
+ size_t n = cr_block_size;
+ do {
+ *packed_w++ = 0.0f;
+ } while (--n != 0);
+ }
+ packed_w += cr - cr_block_size;
+ for (size_t x = 0; x < w; x++) {
+ for (size_t y = 0; y < h; y++) {
+ for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
+ const float kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
+ *packed_w++ = kv;
+ }
+ packed_w += cr - cr_block_size;
+ }
+ }
+ }
+}
+
+void xnn_pack_f32_chw_dwconv_ghw_w(
+ size_t kernel_size,
+ size_t groups,
+ const float* kernel,
+ const float* bias,
+ float* packed_weights)
+{
+ for (size_t g = 0; g < groups; g++) {
+ if XNN_LIKELY(bias != NULL) {
+ *packed_weights = *bias++;
+ } else {
+ *packed_weights = 0.0f;
+ }
+ packed_weights += 1;
+ for (size_t i = 0; i < kernel_size; i++) {
+ *packed_weights++ = kernel[g * kernel_size + i];
+ }
+ }
+}
+
+void xnn_pack_f32_chw_dwconv_hwg_w(
+ size_t kernel_size,
+ size_t groups,
+ const float* kernel,
+ const float* bias,
+ float* packed_weights)
+{
+ for (size_t g = 0; g < groups; g++) {
+ if XNN_LIKELY(bias != NULL) {
+ *packed_weights = *bias++;
+ } else {
+ *packed_weights = 0.0f;
+ }
+ packed_weights += 1;
+ for (size_t i = 0; i < kernel_size; i++) {
+ *packed_weights++ = kernel[i * groups + g];
+ }
+ }
+}
+
+void xnn_pack_f32_vmulcaddc_w(
+ size_t c,
+ size_t cr,
+ const float* s,
+ const float* b,
+ float* packed_w)
+{
+ for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
+ const size_t cr_block_size = min(c - cr_block_start, cr);
+ for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
+ *packed_w++ = s[cr_block_start + cr_block_offset];
+ }
+ packed_w += cr - cr_block_size;
+ if XNN_LIKELY(b != NULL) {
+ for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
+ *packed_w++ = b[cr_block_start + cr_block_offset];
+ }
+ } else {
+ size_t n = cr_block_size;
+ do {
+ *packed_w++ = 0.0f;
+ } while (--n != 0);
+ }
+ packed_w += cr - cr_block_size;
+ }
+}
+
+void xnn_pack_f16_vmulcaddc_w(
+ size_t c,
+ size_t cr,
+ const uint16_t* s,
+ const uint16_t* b,
+ uint16_t* packed_w)
+{
+ for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
+ const size_t cr_block_size = min(c - cr_block_start, cr);
+ for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
+ *packed_w++ = s[cr_block_start + cr_block_offset];
+ }
+ packed_w += cr - cr_block_size;
+ if XNN_LIKELY(b != NULL) {
+ for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
+ *packed_w++ = b[cr_block_start + cr_block_offset];
+ }
+ } else {
+ size_t n = cr_block_size;
+ do {
+ *packed_w++ = 0;
+ } while (--n != 0);
+ }
+ packed_w += cr - cr_block_size;
+ }
+}
diff --git a/src/xnnpack/pack.h b/src/xnnpack/pack.h
index 95c55dd..4c7e13d 100644
--- a/src/xnnpack/pack.h
+++ b/src/xnnpack/pack.h
@@ -9,11 +9,17 @@
#pragma once
#include <stdint.h>
-#include <xnnpack/math.h>
+#include <stddef.h>
+
+#include <xnnpack/common.h>
#include <xnnpack/operator.h>
-static inline void xnn_pack_q8_gemm_goi_w(
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+XNN_INTERNAL void xnn_pack_q8_gemm_goi_w(
size_t g,
size_t nc,
size_t kc,
@@ -23,50 +29,9 @@
uint8_t kzp,
const uint8_t* k,
const int32_t* b,
- void* packed_w)
-{
- const int32_t boff = (int32_t) kc * (int32_t) izp * (int32_t) kzp;
- do {
- for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
- const size_t nr_block_size = min(nc - nr_block_start, nr);
- int32_t* packed_b = (int32_t*) packed_w;
- if XNN_LIKELY(b != NULL) {
- for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
- *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
- packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
- }
- } else {
- size_t n = nr_block_size;
- do {
- *((int32_t*) packed_w) = boff;
- packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
- } while (--n != 0);
- }
- packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
- for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
- const size_t kr_block_size = min(kc - kr_block_start, kr);
- for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
- int32_t ksum = 0;
- for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
- const uint8_t kv = k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
- ksum += (int32_t) kv;
- *((uint8_t*) packed_w) = kv;
- packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
- }
- packed_b[nr_block_offset] -= ksum * (int32_t) izp;
- packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
- }
- packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
- }
- }
- k += nc * kc;
- if XNN_UNPREDICTABLE(b != NULL) {
- b += nc;
- }
- } while (--g != 0);
-}
+ void* packed_w);
-static inline void xnn_pack_q8_gemm_io_w(
+XNN_INTERNAL void xnn_pack_q8_gemm_io_w(
size_t nc,
size_t kc,
uint32_t nr,
@@ -75,44 +40,9 @@
uint8_t kzp,
const uint8_t* k,
const int32_t* b,
- void* packed_w)
-{
- const int32_t boff = (int32_t) kc * (int32_t) izp * (int32_t) kzp;
- for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
- const size_t nr_block_size = min(nc - nr_block_start, nr);
- int32_t* packed_b = (int32_t*) packed_w;
- if XNN_LIKELY(b != NULL) {
- for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
- *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
- packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
- }
- } else {
- size_t n = nr_block_size;
- do {
- *((int32_t*) packed_w) = boff;
- packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
- } while (--n != 0);
- }
- packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
- for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
- const size_t kr_block_size = min(kc - kr_block_start, kr);
- for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
- int32_t ksum = 0;
- for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
- const uint8_t kv = k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
- ksum += (int32_t) kv;
- *((uint8_t*) packed_w) = kv;
- packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
- }
- packed_b[nr_block_offset] -= ksum * (int32_t) izp;
- packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
- }
- packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
- }
- }
-}
+ void* packed_w);
-static inline void xnn_pack_q8_conv_goki_w(
+XNN_INTERNAL void xnn_pack_q8_conv_goki_w(
size_t g,
size_t nc,
size_t ks,
@@ -123,53 +53,9 @@
uint8_t kzp,
const uint8_t* k,
const int32_t* b,
- void* packed_w)
-{
- const int32_t boff = (int32_t) ks * (int32_t) kc * (int32_t) izp * (int32_t) kzp;
- do {
- for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
- const size_t nr_block_size = min(nc - nr_block_start, nr);
- int32_t* packed_b = (int32_t*) packed_w;
- if XNN_LIKELY(b != NULL) {
- for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
- *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
- packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
- }
- } else {
- size_t n = nr_block_size;
- do {
- *((int32_t*) packed_w) = boff;
- packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
- } while (--n != 0);
- }
- packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
- for (size_t ki = 0; ki < ks; ki++) {
- for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
- const size_t kr_block_size = min(kc - kr_block_start, kr);
- for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
- int32_t ksum = 0;
- for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
- const uint8_t kv =
- k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
- ksum += (int32_t) kv;
- *((uint8_t*) packed_w) = kv;
- packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
- }
- packed_b[nr_block_offset] -= ksum * (int32_t) izp;
- packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
- }
- packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
- }
- }
- }
- k += ks * kc * nc;
- if XNN_UNPREDICTABLE(b != NULL) {
- b += nc;
- }
- } while (--g != 0);
-}
+ void* packed_w);
-static inline void xnn_pack_q8_conv_kgo_w(
+XNN_INTERNAL void xnn_pack_q8_conv_kgo_w(
size_t g,
size_t nc,
size_t ks,
@@ -179,45 +65,9 @@
uint8_t kzp,
const uint8_t* k,
const int32_t* b,
- void* packed_w)
-{
- const int32_t boff = (int32_t) ks * (int32_t) izp * (int32_t) kzp;
- for (size_t i = 0; i < g; i++) {
- for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
- const size_t nr_block_size = min(nc - nr_block_start, nr);
- int32_t* packed_b = (int32_t*) packed_w;
- if XNN_LIKELY(b != NULL) {
- for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
- *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
- packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
- }
- } else {
- size_t n = nr_block_size;
- do {
- *((int32_t*) packed_w) = boff;
- packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
- } while (--n != 0);
- }
- packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
- for (size_t ki = 0; ki < ks; ki++) {
- for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
- const uint8_t kv =
- k[ki * g * nc + (nr_block_start + nr_block_offset)];
- *((uint8_t*) packed_w) = kv;
- packed_b[nr_block_offset] -= (int32_t) kv * (int32_t) izp;
- packed_w = (void*) ((uintptr_t) packed_w + kr * sizeof(uint8_t));
- }
- packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
- }
- }
- k += nc;
- if XNN_UNPREDICTABLE(b != NULL) {
- b += nc;
- }
- }
-}
+ void* packed_w);
-static inline void xnn_pack_q8_deconv_goki_w(
+XNN_INTERNAL void xnn_pack_q8_deconv_goki_w(
size_t g,
size_t nc,
size_t kh,
@@ -232,62 +82,9 @@
const uint8_t* k,
const int32_t* b,
void* packed_w,
- struct subconvolution_params* params)
-{
- for (size_t i = 0; i < g; i++) {
- for (size_t oy = 0; oy < sh; oy++) {
- for (size_t ox = 0; ox < sw; ox++) {
- if (i == 0) {
- (*params++).weights = packed_w;
- }
- const int32_t boff = (int32_t) divide_round_up(kh - oy, sh) * (int32_t) divide_round_up(kw - ox, sw) * (int32_t) kc * (int32_t) izp * (int32_t) kzp;
- for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
- const size_t nr_block_size = min(nc - nr_block_start, nr);
- int32_t* packed_b = (int32_t*) packed_w;
- if XNN_LIKELY(b != 0) {
- for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
- *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
- packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
- }
- } else {
- size_t n = nr_block_size;
- do {
- *((int32_t*) packed_w) = boff;
- packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
- } while (--n != 0);
- }
- packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
- for (size_t ky = oy; ky < kh; ky += sh) {
- for (size_t kx = ox; kx < kw; kx += sw) {
- for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
- const size_t kr_block_size = min(kc - kr_block_start, kr);
- for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
- int32_t ksum = 0;
- for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
- const uint8_t kv =
- k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
- ksum += (int32_t) kv;
- *((uint8_t*) packed_w) = kv;
- packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
- }
- packed_b[nr_block_offset] -= ksum * (int32_t) izp;
- packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
- }
- packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
- }
- }
- }
- }
- }
- }
- k += kh * kw * kc * nc;
- if XNN_UNPREDICTABLE(b != NULL) {
- b += nc;
- }
- }
-}
+ struct subconvolution_params* params);
-static inline void xnn_pack_q8_dwconv_ghw_w(
+XNN_INTERNAL void xnn_pack_q8_dwconv_ghw_w(
size_t h,
size_t w,
size_t c,
@@ -296,40 +93,9 @@
uint8_t kzp,
const uint8_t* k,
const int32_t* b,
- void* packed_w)
-{
- const int32_t boff = (int32_t) h * (int32_t) w * (int32_t) izp * (int32_t) kzp;
- for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
- const size_t cr_block_size = min(c - cr_block_start, cr);
- int32_t* packed_b = (int32_t*) packed_w;
- if XNN_LIKELY(b != NULL) {
- for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
- *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset] + boff;
- packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
- }
- } else {
- size_t n = cr_block_size;
- do {
- *((int32_t*) packed_w) = boff;
- packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
- } while (--n != 0);
- }
- packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
- for (size_t x = 0; x < w; x++) {
- for (size_t y = 0; y < h; y++) {
- for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
- const uint8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
- packed_b[cr_block_offset] -= (int32_t) kv * (int32_t) izp;
- *((uint8_t*) packed_w) = kv;
- packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
- }
- packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(uint8_t));
- }
- }
- }
-}
+ void* packed_w);
-static inline void xnn_pack_q8_dwconv_hwg_w(
+XNN_INTERNAL void xnn_pack_q8_dwconv_hwg_w(
size_t h,
size_t w,
size_t c,
@@ -338,40 +104,9 @@
uint8_t kzp,
const uint8_t* k,
const int32_t* b,
- void* packed_w)
-{
- const int32_t boff = (int32_t) h * (int32_t) w * (int32_t) izp * (int32_t) kzp;
- for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
- const size_t cr_block_size = min(c - cr_block_start, cr);
- int32_t* packed_b = (int32_t*) packed_w;
- if XNN_LIKELY(b != NULL) {
- for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
- *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset] + boff;
- packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
- }
- } else {
- size_t n = cr_block_size;
- do {
- *((int32_t*) packed_w) = boff;
- packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
- } while (--n != 0);
- }
- packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
- for (size_t x = 0; x < w; x++) {
- for (size_t y = 0; y < h; y++) {
- for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
- const uint8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
- packed_b[cr_block_offset] -= (int32_t) kv * (int32_t) izp;
- *((uint8_t*) packed_w) = kv;
- packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
- }
- packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(uint8_t));
- }
- }
- }
-}
+ void* packed_w);
-static inline void xnn_pack_f16_gemm_goi_w(
+XNN_INTERNAL void xnn_pack_f16_gemm_goi_w(
size_t g,
size_t nc,
size_t kc,
@@ -380,51 +115,9 @@
size_t sr,
const uint16_t* k,
const uint16_t* b,
- uint16_t* packed_w)
-{
- const size_t skr = sr * kr;
- const size_t skc = round_down_po2(kc, skr);
- const size_t sr_mask = (sr - 1) * kr;
- do {
- for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
- const size_t nr_block_size = min(nc - nr_block_start, nr);
- if XNN_LIKELY(b != NULL) {
- for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
- packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
- }
- }
- packed_w += nr;
+ uint16_t* packed_w);
- for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
- for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
- for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
- *packed_w++ =
- k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
- }
- }
- packed_w += (nr - nr_block_size) * kr;
- }
-
- for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
- const size_t kr_block_size = min(kc - kr_block_start, kr);
- for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
- for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
- *packed_w++ =
- k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
- }
- packed_w += kr - kr_block_size;
- }
- packed_w += (nr - nr_block_size) * kr;
- }
- }
- k += nc * kc;
- if XNN_UNPREDICTABLE(b != NULL) {
- b += nc;
- }
- } while (--g != 0);
-}
-
-static inline void xnn_pack_f16_gemm_io_w(
+XNN_INTERNAL void xnn_pack_f16_gemm_io_w(
size_t nc,
size_t kc,
size_t nr,
@@ -432,45 +125,9 @@
size_t sr,
const uint16_t* k,
const uint16_t* b,
- uint16_t* packed_w)
-{
- const size_t skr = sr * kr;
- const size_t skc = round_down_po2(kc, skr);
- const size_t sr_mask = (sr - 1) * kr;
- for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
- const size_t nr_block_size = min(nc - nr_block_start, nr);
- if XNN_LIKELY(b != NULL) {
- for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
- packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
- }
- }
- packed_w += nr;
+ uint16_t* packed_w);
- for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
- for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
- for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
- *packed_w++ =
- k[(round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
- }
- }
- packed_w += (nr - nr_block_size) * kr;
- }
-
- for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
- const size_t kr_block_size = min(kc - kr_block_start, kr);
- for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
- for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
- *packed_w++ =
- k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
- }
- packed_w += kr - kr_block_size;
- }
- packed_w += (nr - nr_block_size) * kr;
- }
- }
-}
-
-static inline void xnn_pack_f16_gemminc_goi_w(
+XNN_INTERNAL void xnn_pack_f16_gemminc_goi_w(
size_t g,
size_t nc,
size_t kc,
@@ -478,42 +135,9 @@
size_t kr,
size_t sr,
const uint16_t* k,
- uint16_t* packed_w)
-{
- const size_t skr = sr * kr;
- const size_t skc = round_down_po2(kc, skr);
- const size_t sr_mask = (sr - 1) * kr;
- do {
- for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
- const size_t nr_block_size = min(nc - nr_block_start, nr);
+ uint16_t* packed_w);
- for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
- for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
- for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
- *packed_w++ =
- k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
- }
- }
- packed_w += (nr - nr_block_size) * kr;
- }
-
- for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
- const size_t kr_block_size = min(kc - kr_block_start, kr);
- for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
- for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
- *packed_w++ =
- k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
- }
- packed_w += kr - kr_block_size;
- }
- packed_w += (nr - nr_block_size) * kr;
- }
- }
- k += nc * kc;
- } while (--g != 0);
-}
-
-static inline void xnn_pack_f16_conv_goki_w(
+XNN_INTERNAL void xnn_pack_f16_conv_goki_w(
size_t g,
size_t nc,
size_t ks,
@@ -523,53 +147,9 @@
size_t sr,
const uint16_t* k,
const uint16_t* b,
- uint16_t* packed_w)
-{
- const size_t skr = sr * kr;
- const size_t skc = round_down_po2(kc, skr);
- const size_t sr_mask = (sr - 1) * kr;
- do {
- for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
- const size_t nr_block_size = min(nc - nr_block_start, nr);
- if XNN_LIKELY(b != NULL) {
- for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
- packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
- }
- }
- packed_w += nr;
+ uint16_t* packed_w);
- for (size_t ki = 0; ki < ks; ki++) {
- for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
- for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
- for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
- *packed_w++ =
- k[((nr_block_start + nr_block_offset) * ks + ki) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
- }
- }
- packed_w += (nr - nr_block_size) * kr;
- }
-
- for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
- const size_t kr_block_size = min(kc - kr_block_start, kr);
- for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
- for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
- *packed_w++ =
- k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
- }
- packed_w += kr - kr_block_size;
- }
- packed_w += (nr - nr_block_size) * kr;
- }
- }
- }
- k += ks * kc * nc;
- if XNN_UNPREDICTABLE(b != NULL) {
- b += nc;
- }
- } while (--g != 0);
-}
-
-static inline void xnn_pack_f16_conv_kgo_w(
+XNN_INTERNAL void xnn_pack_f16_conv_kgo_w(
size_t g,
size_t nc,
size_t ks,
@@ -577,34 +157,9 @@
size_t kr,
const uint16_t* k,
const uint16_t* b,
- uint16_t* packed_w)
-{
- for (size_t i = 0; i < g; i++) {
- for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
- const size_t nr_block_size = min(nc - nr_block_start, nr);
- if XNN_LIKELY(b != NULL) {
- for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
- packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
- }
- }
- packed_w += nr;
- for (size_t ki = 0; ki < ks; ki++) {
- for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
- *packed_w =
- k[ki * g * nc + (nr_block_start + nr_block_offset)];
- packed_w += kr;
- }
- packed_w += (nr - nr_block_size) * kr;
- }
- }
- k += nc;
- if XNN_UNPREDICTABLE(b != NULL) {
- b += nc;
- }
- }
-}
+ uint16_t* packed_w);
-static inline void xnn_pack_f16_dconv_oki_w(
+XNN_INTERNAL void xnn_pack_f16_dconv_oki_w(
size_t nc,
size_t kc,
size_t nr,
@@ -612,37 +167,9 @@
size_t kw,
const uint16_t* k,
const uint16_t* b,
- uint16_t* packed_w)
-{
- for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
- const size_t nr_block_size = min(nc - nr_block_start, nr);
- if XNN_LIKELY(b != NULL) {
- for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
- *packed_w++ = b[min(nr_block_offset, nr_block_size - 1)];
- }
- } else {
- size_t n = nr;
- do {
- *packed_w++ = 0;
- } while (--n != 0);
- }
+ uint16_t* packed_w);
- for (size_t kx = 0; kx < kw; kx++) {
- for (size_t c = 0; c < kc; c++) {
- for (size_t ky = 0; ky < kh; ky++) {
- for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
- *packed_w++ = k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) * kh + ky) * kw + kx) * kc + c];
- }
- }
- }
- }
- if XNN_UNPREDICTABLE(b != NULL) {
- b += nr;
- }
- }
-}
-
-static inline void xnn_pack_f16_deconv_goki_w(
+XNN_INTERNAL void xnn_pack_f16_deconv_goki_w(
size_t g,
size_t nc,
size_t kh,
@@ -656,149 +183,34 @@
const uint16_t* k,
const uint16_t* b,
uint16_t* packed_w,
- struct subconvolution_params* params)
-{
- const size_t skr = sr * kr;
- const size_t skc = round_down_po2(kc, skr);
- const size_t sr_mask = (sr - 1) * kr;
- for (size_t i = 0; i < g; i++) {
- for (size_t oy = 0; oy < sh; oy++) {
- for (size_t ox = 0; ox < sw; ox++) {
- if (i == 0) {
- (*params++).weights = packed_w;
- }
- for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
- const size_t nr_block_size = min(nc - nr_block_start, nr);
- if XNN_LIKELY(b != NULL) {
- for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
- packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
- }
- }
- packed_w += nr;
- for (size_t ky = oy; ky < kh; ky += sh) {
- for (size_t kx = ox; kx < kw; kx += sw) {
- for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
- for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
- for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
- *packed_w++ =
- k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
- }
- }
- packed_w += (nr - nr_block_size) * kr;
- }
+ struct subconvolution_params* params);
- for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
- const size_t kr_block_size = min(kc - kr_block_start, kr);
- for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
- for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
- *packed_w++ =
- k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
- }
- packed_w += kr - kr_block_size;
- }
- packed_w += (nr - nr_block_size) * kr;
- }
- }
- }
- }
- }
- }
- k += kh * kw * kc * nc;
- if XNN_UNPREDICTABLE(b != NULL) {
- b += nc;
- }
- }
-}
-
-static inline void xnn_pack_f16_dwconv_ghw_w(
+XNN_INTERNAL void xnn_pack_f16_dwconv_ghw_w(
size_t h,
size_t w,
size_t c,
size_t cr,
const uint16_t* k,
const uint16_t* b,
- uint16_t* packed_w)
-{
- for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
- const size_t cr_block_size = min(c - cr_block_start, cr);
- if XNN_LIKELY(b != NULL) {
- for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
- *packed_w++ = b[cr_block_start + cr_block_offset];
- }
- } else {
- size_t n = cr_block_size;
- do {
- *packed_w++ = 0;
- } while (--n != 0);
- }
- packed_w += cr - cr_block_size;
- for (size_t x = 0; x < w; x++) {
- for (size_t y = 0; y < h; y++) {
- for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
- const uint16_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
- *packed_w++ = kv;
- }
- packed_w += cr - cr_block_size;
- }
- }
- }
-}
+ uint16_t* packed_w);
-static inline void xnn_pack_f16_dwconv_hwg_w(
+XNN_INTERNAL void xnn_pack_f16_dwconv_hwg_w(
size_t h,
size_t w,
size_t c,
size_t cr,
const uint16_t* k,
const uint16_t* b,
- uint16_t* packed_w)
-{
- for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
- const size_t cr_block_size = min(c - cr_block_start, cr);
- if XNN_LIKELY(b != NULL) {
- for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
- *packed_w++ = b[cr_block_start + cr_block_offset];
- }
- } else {
- size_t n = cr_block_size;
- do {
- *packed_w++ = 0;
- } while (--n != 0);
- }
- packed_w += cr - cr_block_size;
- for (size_t x = 0; x < w; x++) {
- for (size_t y = 0; y < h; y++) {
- for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
- const uint16_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
- *packed_w++ = kv;
- }
- packed_w += cr - cr_block_size;
- }
- }
- }
-}
+ uint16_t* packed_w);
-static inline void xnn_pack_f16_chw_dwconv_ghw_w(
+XNN_INTERNAL void xnn_pack_f16_chw_dwconv_ghw_w(
size_t kernel_size,
size_t groups,
const uint16_t* kernel,
const uint16_t* bias,
- uint16_t* packed_weights)
-{
- for (size_t g = 0; g < groups; g++) {
- if XNN_LIKELY(bias != NULL) {
- *packed_weights = *bias++;
- } else {
- *packed_weights = 0;
- }
- packed_weights += 1;
- for (size_t i = 0; i < kernel_size; i++) {
- *packed_weights++ = kernel[g * kernel_size + i];
- }
- }
-}
+ uint16_t* packed_weights);
-static inline void xnn_pack_f32_gemm_goi_w(
+XNN_INTERNAL void xnn_pack_f32_gemm_goi_w(
size_t g,
size_t nc,
size_t kc,
@@ -807,51 +219,9 @@
size_t sr,
const float* k,
const float* b,
- float* packed_w)
-{
- const size_t skr = sr * kr;
- const size_t skc = round_down_po2(kc, skr);
- const size_t sr_mask = (sr - 1) * kr;
- do {
- for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
- const size_t nr_block_size = min(nc - nr_block_start, nr);
- if XNN_LIKELY(b != NULL) {
- for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
- packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
- }
- }
- packed_w += nr;
+ float* packed_w);
- for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
- for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
- for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
- *packed_w++ =
- k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
- }
- }
- packed_w += (nr - nr_block_size) * kr;
- }
-
- for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
- const size_t kr_block_size = min(kc - kr_block_start, kr);
- for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
- for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
- *packed_w++ =
- k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
- }
- packed_w += kr - kr_block_size;
- }
- packed_w += (nr - nr_block_size) * kr;
- }
- }
- k += nc * kc;
- if XNN_UNPREDICTABLE(b != NULL) {
- b += nc;
- }
- } while (--g != 0);
-}
-
-static inline void xnn_pack_f32_gemm_io_w(
+XNN_INTERNAL void xnn_pack_f32_gemm_io_w(
size_t nc,
size_t kc,
size_t nr,
@@ -859,45 +229,9 @@
size_t sr,
const float* k,
const float* b,
- float* packed_w)
-{
- const size_t skr = sr * kr;
- const size_t skc = round_down_po2(kc, skr);
- const size_t sr_mask = (sr - 1) * kr;
- for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
- const size_t nr_block_size = min(nc - nr_block_start, nr);
- if XNN_LIKELY(b != NULL) {
- for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
- packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
- }
- }
- packed_w += nr;
+ float* packed_w);
- for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
- for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
- for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
- *packed_w++ =
- k[(round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
- }
- }
- packed_w += (nr - nr_block_size) * kr;
- }
-
- for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
- const size_t kr_block_size = min(kc - kr_block_start, kr);
- for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
- for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
- *packed_w++ =
- k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
- }
- packed_w += kr - kr_block_size;
- }
- packed_w += (nr - nr_block_size) * kr;
- }
- }
-}
-
-static inline void xnn_pack_f32_gemminc_goi_w(
+XNN_INTERNAL void xnn_pack_f32_gemminc_goi_w(
size_t g,
size_t nc,
size_t kc,
@@ -905,42 +239,9 @@
size_t kr,
size_t sr,
const float* k,
- float* packed_w)
-{
- const size_t skr = sr * kr;
- const size_t skc = round_down_po2(kc, skr);
- const size_t sr_mask = (sr - 1) * kr;
- do {
- for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
- const size_t nr_block_size = min(nc - nr_block_start, nr);
+ float* packed_w);
- for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
- for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
- for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
- *packed_w++ =
- k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
- }
- }
- packed_w += (nr - nr_block_size) * kr;
- }
-
- for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
- const size_t kr_block_size = min(kc - kr_block_start, kr);
- for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
- for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
- *packed_w++ =
- k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
- }
- packed_w += kr - kr_block_size;
- }
- packed_w += (nr - nr_block_size) * kr;
- }
- }
- k += nc * kc;
- } while (--g != 0);
-}
-
-static inline void xnn_pack_f32_conv_goki_w(
+XNN_INTERNAL void xnn_pack_f32_conv_goki_w(
size_t g,
size_t nc,
size_t ks,
@@ -950,53 +251,9 @@
size_t sr,
const float* k,
const float* b,
- float* packed_w)
-{
- const size_t skr = sr * kr;
- const size_t skc = round_down_po2(kc, skr);
- const size_t sr_mask = (sr - 1) * kr;
- do {
- for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
- const size_t nr_block_size = min(nc - nr_block_start, nr);
- if XNN_LIKELY(b != NULL) {
- for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
- packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
- }
- }
- packed_w += nr;
+ float* packed_w);
- for (size_t ki = 0; ki < ks; ki++) {
- for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
- for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
- for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
- *packed_w++ =
- k[((nr_block_start + nr_block_offset) * ks + ki) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
- }
- }
- packed_w += (nr - nr_block_size) * kr;
- }
-
- for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
- const size_t kr_block_size = min(kc - kr_block_start, kr);
- for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
- for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
- *packed_w++ =
- k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
- }
- packed_w += kr - kr_block_size;
- }
- packed_w += (nr - nr_block_size) * kr;
- }
- }
- }
- k += ks * kc * nc;
- if XNN_UNPREDICTABLE(b != NULL) {
- b += nc;
- }
- } while (--g != 0);
-}
-
-static inline void xnn_pack_f32_conv_kgo_w(
+XNN_INTERNAL void xnn_pack_f32_conv_kgo_w(
size_t g,
size_t nc,
size_t ks,
@@ -1004,34 +261,9 @@
size_t kr,
const float* k,
const float* b,
- float* packed_w)
-{
- for (size_t i = 0; i < g; i++) {
- for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
- const size_t nr_block_size = min(nc - nr_block_start, nr);
- if XNN_LIKELY(b != NULL) {
- for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
- packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
- }
- }
- packed_w += nr;
- for (size_t ki = 0; ki < ks; ki++) {
- for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
- *packed_w =
- k[ki * g * nc + (nr_block_start + nr_block_offset)];
- packed_w += kr;
- }
- packed_w += (nr - nr_block_size) * kr;
- }
- }
- k += nc;
- if XNN_UNPREDICTABLE(b != NULL) {
- b += nc;
- }
- }
-}
+ float* packed_w);
-static inline void xnn_pack_f32_dconv_oki_w(
+XNN_INTERNAL void xnn_pack_f32_dconv_oki_w(
size_t nc,
size_t kc,
size_t nr,
@@ -1039,37 +271,9 @@
size_t kw,
const float* k,
const float* b,
- float* packed_w)
-{
- for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
- const size_t nr_block_size = min(nc - nr_block_start, nr);
- if XNN_LIKELY(b != NULL) {
- for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
- *packed_w++ = b[min(nr_block_offset, nr_block_size - 1)];
- }
- } else {
- size_t n = nr;
- do {
- *packed_w++ = 0.0f;
- } while (--n != 0);
- }
+ float* packed_w);
- for (size_t kx = 0; kx < kw; kx++) {
- for (size_t c = 0; c < kc; c++) {
- for (size_t ky = 0; ky < kh; ky++) {
- for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
- *packed_w++ = k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) * kh + ky) * kw + kx) * kc + c];
- }
- }
- }
- }
- if XNN_UNPREDICTABLE(b != NULL) {
- b += nr;
- }
- }
-}
-
-static inline void xnn_pack_f32_deconv_goki_w(
+XNN_INTERNAL void xnn_pack_f32_deconv_goki_w(
size_t g,
size_t nc,
size_t kh,
@@ -1083,218 +287,54 @@
const float* k,
const float* b,
float* packed_w,
- struct subconvolution_params* params)
-{
- const size_t skr = sr * kr;
- const size_t skc = round_down_po2(kc, skr);
- const size_t sr_mask = (sr - 1) * kr;
- for (size_t i = 0; i < g; i++) {
- for (size_t oy = 0; oy < sh; oy++) {
- for (size_t ox = 0; ox < sw; ox++) {
- if (i == 0) {
- (*params++).weights = packed_w;
- }
- for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
- const size_t nr_block_size = min(nc - nr_block_start, nr);
- if XNN_LIKELY(b != NULL) {
- for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
- packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
- }
- }
- packed_w += nr;
- for (size_t ky = oy; ky < kh; ky += sh) {
- for (size_t kx = ox; kx < kw; kx += sw) {
- for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
- for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
- for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
- *packed_w++ =
- k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
- }
- }
- packed_w += (nr - nr_block_size) * kr;
- }
+ struct subconvolution_params* params);
- for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
- const size_t kr_block_size = min(kc - kr_block_start, kr);
- for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
- for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
- *packed_w++ =
- k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
- }
- packed_w += kr - kr_block_size;
- }
- packed_w += (nr - nr_block_size) * kr;
- }
- }
- }
- }
- }
- }
- k += kh * kw * kc * nc;
- if XNN_UNPREDICTABLE(b != NULL) {
- b += nc;
- }
- }
-}
-
-static inline void xnn_pack_f32_dwconv_ghw_w(
+XNN_INTERNAL void xnn_pack_f32_dwconv_ghw_w(
size_t h,
size_t w,
size_t c,
size_t cr,
const float* k,
const float* b,
- float* packed_w)
-{
- for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
- const size_t cr_block_size = min(c - cr_block_start, cr);
- if XNN_LIKELY(b != NULL) {
- for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
- *packed_w++ = b[cr_block_start + cr_block_offset];
- }
- } else {
- size_t n = cr_block_size;
- do {
- *packed_w++ = 0.0f;
- } while (--n != 0);
- }
- packed_w += cr - cr_block_size;
- for (size_t x = 0; x < w; x++) {
- for (size_t y = 0; y < h; y++) {
- for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
- const float kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
- *packed_w++ = kv;
- }
- packed_w += cr - cr_block_size;
- }
- }
- }
-}
+ float* packed_w);
-static inline void xnn_pack_f32_dwconv_hwg_w(
+XNN_INTERNAL void xnn_pack_f32_dwconv_hwg_w(
size_t h,
size_t w,
size_t c,
size_t cr,
const float* k,
const float* b,
- float* packed_w)
-{
- for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
- const size_t cr_block_size = min(c - cr_block_start, cr);
- if XNN_LIKELY(b != NULL) {
- for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
- *packed_w++ = b[cr_block_start + cr_block_offset];
- }
- } else {
- size_t n = cr_block_size;
- do {
- *packed_w++ = 0.0f;
- } while (--n != 0);
- }
- packed_w += cr - cr_block_size;
- for (size_t x = 0; x < w; x++) {
- for (size_t y = 0; y < h; y++) {
- for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
- const float kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
- *packed_w++ = kv;
- }
- packed_w += cr - cr_block_size;
- }
- }
- }
-}
+ float* packed_w);
-static inline void xnn_pack_f32_chw_dwconv_ghw_w(
+XNN_INTERNAL void xnn_pack_f32_chw_dwconv_ghw_w(
size_t kernel_size,
size_t groups,
const float* kernel,
const float* bias,
- float* packed_weights)
-{
- for (size_t g = 0; g < groups; g++) {
- if XNN_LIKELY(bias != NULL) {
- *packed_weights = *bias++;
- } else {
- *packed_weights = 0.0f;
- }
- packed_weights += 1;
- for (size_t i = 0; i < kernel_size; i++) {
- *packed_weights++ = kernel[g * kernel_size + i];
- }
- }
-}
+ float* packed_weights);
-static inline void xnn_pack_f32_chw_dwconv_hwg_w(
+XNN_INTERNAL void xnn_pack_f32_chw_dwconv_hwg_w(
size_t kernel_size,
size_t groups,
const float* kernel,
const float* bias,
- float* packed_weights)
-{
- for (size_t g = 0; g < groups; g++) {
- if XNN_LIKELY(bias != NULL) {
- *packed_weights = *bias++;
- } else {
- *packed_weights = 0.0f;
- }
- packed_weights += 1;
- for (size_t i = 0; i < kernel_size; i++) {
- *packed_weights++ = kernel[i * groups + g];
- }
- }
-}
+ float* packed_weights);
-static inline void xnn_pack_f16_vmulcaddc_w(
+XNN_INTERNAL void xnn_pack_f16_vmulcaddc_w(
size_t c,
size_t cr,
const uint16_t* s,
const uint16_t* b,
- uint16_t* packed_w)
-{
- for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
- const size_t cr_block_size = min(c - cr_block_start, cr);
- for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
- *packed_w++ = s[cr_block_start + cr_block_offset];
- }
- packed_w += cr - cr_block_size;
- if XNN_LIKELY(b != NULL) {
- for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
- *packed_w++ = b[cr_block_start + cr_block_offset];
- }
- } else {
- size_t n = cr_block_size;
- do {
- *packed_w++ = 0;
- } while (--n != 0);
- }
- packed_w += cr - cr_block_size;
- }
-}
+ uint16_t* packed_w);
-static inline void xnn_pack_f32_vmulcaddc_w(
+XNN_INTERNAL void xnn_pack_f32_vmulcaddc_w(
size_t c,
size_t cr,
const float* s,
const float* b,
- float* packed_w)
-{
- for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
- const size_t cr_block_size = min(c - cr_block_start, cr);
- for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
- *packed_w++ = s[cr_block_start + cr_block_offset];
- }
- packed_w += cr - cr_block_size;
- if XNN_LIKELY(b != NULL) {
- for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
- *packed_w++ = b[cr_block_start + cr_block_offset];
- }
- } else {
- size_t n = cr_block_size;
- do {
- *packed_w++ = 0.0f;
- } while (--n != 0);
- }
- packed_w += cr - cr_block_size;
- }
-}
+ float* packed_w);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif