Make packing functions non-inline

PiperOrigin-RevId: 319844892
diff --git a/BUILD.bazel b/BUILD.bazel
index 88fcdc1..1281696 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -3116,6 +3116,36 @@
 )
 
 xnnpack_cc_library(
+    name = "packing",
+    srcs = ["src/packing.c"],
+    hdrs = INTERNAL_HDRS,
+    gcc_copts = xnnpack_gcc_std_copts(),
+    msvc_copts = xnnpack_msvc_std_copts(),
+    deps = [
+        "@FP16",
+        "@FXdiv",
+        "@pthreadpool",
+    ],
+)
+
+xnnpack_cc_library(
+    name = "packing_test_mode",
+    srcs = ["src/packing.c"],
+    hdrs = INTERNAL_HDRS,
+    copts = [
+        "-UNDEBUG",
+        "-DXNN_TEST_MODE=1",
+    ],
+    gcc_copts = xnnpack_gcc_std_copts(),
+    msvc_copts = xnnpack_msvc_std_copts(),
+    deps = [
+        "@FP16",
+        "@FXdiv",
+        "@pthreadpool",
+    ],
+)
+
+xnnpack_cc_library(
     name = "operator_run",
     srcs = ["src/operator-run.c"],
     hdrs = INTERNAL_HDRS + LOGGING_HDRS,
@@ -3234,6 +3264,7 @@
     deps = [
         ":indirection",
         ":logging_utils",
+        ":packing",
         "@FP16",
         "@FXdiv",
         "@clog",
@@ -3265,6 +3296,7 @@
     deps = [
         ":indirection_test_mode",
         ":logging_utils",
+        ":packing_test_mode",
         "@FP16",
         "@FXdiv",
         "@clog",
@@ -3461,7 +3493,7 @@
         "src/xnnpack/AlignedAllocator.h",
     ] + WEIGHTS_PACK_HDRS + MICROKERNEL_BENCHMARK_HDRS,
     copts = xnnpack_optional_ruy_copts() + xnnpack_optional_gemmlowp_copts(),
-    deps = MICROKERNEL_BENCHMARK_DEPS + xnnpack_optional_ruy_deps() + xnnpack_optional_gemmlowp_deps(),
+    deps = MICROKERNEL_BENCHMARK_DEPS + [":packing"] + xnnpack_optional_ruy_deps() + xnnpack_optional_gemmlowp_deps(),
 )
 
 xnnpack_benchmark(
@@ -3472,7 +3504,10 @@
         "bench/google/conv.h",
         "src/xnnpack/AlignedAllocator.h",
     ] + MICROKERNEL_BENCHMARK_HDRS,
-    deps = MICROKERNEL_BENCHMARK_DEPS + [":indirection"],
+    deps = MICROKERNEL_BENCHMARK_DEPS + [
+        ":indirection",
+        ":packing",
+    ],
 )
 
 xnnpack_benchmark(
@@ -3482,7 +3517,9 @@
         "bench/gemm.h",
         "src/xnnpack/AlignedAllocator.h",
     ] + WEIGHTS_PACK_HDRS + MICROKERNEL_BENCHMARK_HDRS,
-    deps = MICROKERNEL_BENCHMARK_DEPS,
+    deps = MICROKERNEL_BENCHMARK_DEPS + [
+        ":packing",
+    ],
 )
 
 xnnpack_benchmark(
@@ -3502,7 +3539,10 @@
         "bench/conv.h",
         "src/xnnpack/AlignedAllocator.h",
     ] + MICROKERNEL_BENCHMARK_HDRS,
-    deps = MICROKERNEL_BENCHMARK_DEPS + [":indirection"],
+    deps = MICROKERNEL_BENCHMARK_DEPS + [
+        ":indirection",
+        ":packing",
+    ],
 )
 
 xnnpack_benchmark(
@@ -3512,7 +3552,9 @@
         "bench/dconv.h",
         "src/xnnpack/AlignedAllocator.h",
     ] + WEIGHTS_PACK_HDRS + MICROKERNEL_BENCHMARK_HDRS,
-    deps = MICROKERNEL_BENCHMARK_DEPS,
+    deps = MICROKERNEL_BENCHMARK_DEPS + [
+        ":packing",
+    ],
 )
 
 xnnpack_benchmark(
@@ -3522,7 +3564,9 @@
         "bench/dconv.h",
         "src/xnnpack/AlignedAllocator.h",
     ] + WEIGHTS_PACK_HDRS + MICROKERNEL_BENCHMARK_HDRS,
-    deps = MICROKERNEL_BENCHMARK_DEPS,
+    deps = MICROKERNEL_BENCHMARK_DEPS + [
+        ":packing",
+    ],
 )
 
 xnnpack_benchmark(
@@ -3533,7 +3577,10 @@
         "bench/google/dwconv.h",
         "src/xnnpack/AlignedAllocator.h",
     ] + MICROKERNEL_BENCHMARK_HDRS,
-    deps = MICROKERNEL_BENCHMARK_DEPS + [":indirection"],
+    deps = MICROKERNEL_BENCHMARK_DEPS + [
+        ":indirection",
+        ":packing",
+    ],
 )
 
 xnnpack_benchmark(
@@ -3543,7 +3590,10 @@
         "bench/dwconv.h",
         "src/xnnpack/AlignedAllocator.h",
     ] + MICROKERNEL_BENCHMARK_HDRS,
-    deps = MICROKERNEL_BENCHMARK_DEPS + [":indirection"],
+    deps = MICROKERNEL_BENCHMARK_DEPS + [
+        ":indirection",
+        ":packing",
+    ],
 )
 
 xnnpack_benchmark(
@@ -3553,7 +3603,10 @@
         "bench/dwconv.h",
         "src/xnnpack/AlignedAllocator.h",
     ] + WEIGHTS_PACK_HDRS + MICROKERNEL_BENCHMARK_HDRS,
-    deps = MICROKERNEL_BENCHMARK_DEPS + [":indirection"],
+    deps = MICROKERNEL_BENCHMARK_DEPS + [
+        ":indirection",
+        ":packing",
+    ],
 )
 
 xnnpack_benchmark(
@@ -3564,7 +3617,7 @@
         "src/xnnpack/AlignedAllocator.h",
     ] + WEIGHTS_PACK_HDRS + MICROKERNEL_BENCHMARK_HDRS,
     copts = xnnpack_optional_ruy_copts(),
-    deps = MICROKERNEL_BENCHMARK_DEPS + xnnpack_optional_ruy_deps(),
+    deps = MICROKERNEL_BENCHMARK_DEPS + [":packing"] + xnnpack_optional_ruy_deps(),
 )
 
 xnnpack_benchmark(
@@ -3674,7 +3727,10 @@
         "bench/conv.h",
         "src/xnnpack/AlignedAllocator.h",
     ] + WEIGHTS_PACK_HDRS + MICROKERNEL_BENCHMARK_HDRS,
-    deps = MICROKERNEL_BENCHMARK_DEPS + [":im2col"],
+    deps = MICROKERNEL_BENCHMARK_DEPS + [
+        ":im2col",
+        ":packing",
+    ],
 )
 
 xnnpack_benchmark(
@@ -4035,7 +4091,7 @@
         "test/gemm-microkernel-tester.h",
         "src/xnnpack/AlignedAllocator.h",
     ] + WEIGHTS_PACK_HDRS + MICROKERNEL_TEST_HDRS,
-    deps = MICROKERNEL_TEST_DEPS,
+    deps = MICROKERNEL_TEST_DEPS + [":packing"],
 )
 
 xnnpack_unit_test(
@@ -4103,7 +4159,7 @@
         "test/gemm-microkernel-tester.h",
         "src/xnnpack/AlignedAllocator.h",
     ] + WEIGHTS_PACK_HDRS + MICROKERNEL_TEST_HDRS,
-    deps = MICROKERNEL_TEST_DEPS,
+    deps = MICROKERNEL_TEST_DEPS + [":packing"],
 )
 
 xnnpack_unit_test(
@@ -4113,7 +4169,7 @@
         "test/gemm-microkernel-tester.h",
         "src/xnnpack/AlignedAllocator.h",
     ] + WEIGHTS_PACK_HDRS + MICROKERNEL_TEST_HDRS,
-    deps = MICROKERNEL_TEST_DEPS,
+    deps = MICROKERNEL_TEST_DEPS + [":packing"],
 )
 
 xnnpack_unit_test(
@@ -4123,7 +4179,7 @@
         "test/gemm-microkernel-tester.h",
         "src/xnnpack/AlignedAllocator.h",
     ] + WEIGHTS_PACK_HDRS + MICROKERNEL_TEST_HDRS,
-    deps = MICROKERNEL_TEST_DEPS,
+    deps = MICROKERNEL_TEST_DEPS + [":packing"],
 )
 
 xnnpack_unit_test(
@@ -4133,7 +4189,7 @@
         "test/gemm-microkernel-tester.h",
         "src/xnnpack/AlignedAllocator.h",
     ] + WEIGHTS_PACK_HDRS + MICROKERNEL_TEST_HDRS,
-    deps = MICROKERNEL_TEST_DEPS,
+    deps = MICROKERNEL_TEST_DEPS + [":packing"],
 )
 
 xnnpack_unit_test(
@@ -4143,7 +4199,7 @@
         "test/conv-hwc-microkernel-tester.h",
         "src/xnnpack/AlignedAllocator.h",
     ] + WEIGHTS_PACK_HDRS + MICROKERNEL_TEST_HDRS,
-    deps = MICROKERNEL_TEST_DEPS,
+    deps = MICROKERNEL_TEST_DEPS + [":packing"],
 )
 
 xnnpack_unit_test(
@@ -4153,7 +4209,7 @@
         "test/conv-hwc2chw-microkernel-tester.h",
         "src/xnnpack/AlignedAllocator.h",
     ] + WEIGHTS_PACK_HDRS + MICROKERNEL_TEST_HDRS,
-    deps = MICROKERNEL_TEST_DEPS,
+    deps = MICROKERNEL_TEST_DEPS + [":packing"],
 )
 
 xnnpack_unit_test(
@@ -4163,7 +4219,7 @@
         "test/dwconv-microkernel-tester.h",
         "src/xnnpack/AlignedAllocator.h",
     ] + WEIGHTS_PACK_HDRS + MICROKERNEL_TEST_HDRS,
-    deps = MICROKERNEL_TEST_DEPS,
+    deps = MICROKERNEL_TEST_DEPS + [":packing"],
 )
 
 xnnpack_unit_test(
@@ -4173,7 +4229,7 @@
         "test/dwconv-microkernel-tester.h",
         "src/xnnpack/AlignedAllocator.h",
     ] + WEIGHTS_PACK_HDRS + MICROKERNEL_TEST_HDRS,
-    deps = MICROKERNEL_TEST_DEPS,
+    deps = MICROKERNEL_TEST_DEPS + [":packing"],
 )
 
 xnnpack_unit_test(
@@ -4183,7 +4239,7 @@
         "test/dwconv-microkernel-tester.h",
         "src/xnnpack/AlignedAllocator.h",
     ] + WEIGHTS_PACK_HDRS + MICROKERNEL_TEST_HDRS,
-    deps = MICROKERNEL_TEST_DEPS,
+    deps = MICROKERNEL_TEST_DEPS + [":packing"],
 )
 
 xnnpack_unit_test(
@@ -4193,7 +4249,7 @@
         "test/dwconv-chw-microkernel-tester.h",
         "src/xnnpack/AlignedAllocator.h",
     ] + WEIGHTS_PACK_HDRS + MICROKERNEL_TEST_HDRS,
-    deps = MICROKERNEL_TEST_DEPS,
+    deps = MICROKERNEL_TEST_DEPS + [":packing"],
 )
 
 xnnpack_unit_test(
@@ -4233,7 +4289,7 @@
         "test/gemm-microkernel-tester.h",
         "src/xnnpack/AlignedAllocator.h",
     ] + WEIGHTS_PACK_HDRS + MICROKERNEL_TEST_HDRS,
-    deps = MICROKERNEL_TEST_DEPS,
+    deps = MICROKERNEL_TEST_DEPS + [":packing"],
 )
 
 xnnpack_unit_test(
@@ -4243,7 +4299,7 @@
         "test/gemm-microkernel-tester.h",
         "src/xnnpack/AlignedAllocator.h",
     ] + WEIGHTS_PACK_HDRS + MICROKERNEL_TEST_HDRS,
-    deps = MICROKERNEL_TEST_DEPS,
+    deps = MICROKERNEL_TEST_DEPS + [":packing"],
 )
 
 xnnpack_unit_test(
@@ -4253,7 +4309,7 @@
         "test/gemm-microkernel-tester.h",
         "src/xnnpack/AlignedAllocator.h",
     ] + WEIGHTS_PACK_HDRS + MICROKERNEL_TEST_HDRS,
-    deps = MICROKERNEL_TEST_DEPS,
+    deps = MICROKERNEL_TEST_DEPS + [":packing"],
 )
 
 xnnpack_unit_test(
@@ -4263,7 +4319,7 @@
         "test/gemm-microkernel-tester.h",
         "src/xnnpack/AlignedAllocator.h",
     ] + WEIGHTS_PACK_HDRS + MICROKERNEL_TEST_HDRS,
-    deps = MICROKERNEL_TEST_DEPS,
+    deps = MICROKERNEL_TEST_DEPS + [":packing"],
 )
 
 xnnpack_unit_test(
@@ -4310,7 +4366,7 @@
         "test/gemm-microkernel-tester.h",
         "src/xnnpack/AlignedAllocator.h",
     ] + WEIGHTS_PACK_HDRS + MICROKERNEL_TEST_HDRS,
-    deps = MICROKERNEL_TEST_DEPS,
+    deps = MICROKERNEL_TEST_DEPS + [":packing"],
 )
 
 xnnpack_unit_test(
@@ -4692,7 +4748,7 @@
         "test/vmulcaddc-microkernel-tester.h",
         "src/xnnpack/AlignedAllocator.h",
     ] + WEIGHTS_PACK_HDRS + MICROKERNEL_TEST_HDRS,
-    deps = MICROKERNEL_TEST_DEPS,
+    deps = MICROKERNEL_TEST_DEPS + [":packing"],
 )
 
 xnnpack_unit_test(
@@ -4702,7 +4758,7 @@
         "test/vmulcaddc-microkernel-tester.h",
         "src/xnnpack/AlignedAllocator.h",
     ] + WEIGHTS_PACK_HDRS + MICROKERNEL_TEST_HDRS,
-    deps = MICROKERNEL_TEST_DEPS,
+    deps = MICROKERNEL_TEST_DEPS + [":packing"],
 )
 
 xnnpack_unit_test(
@@ -4893,7 +4949,7 @@
         "test/gemm-microkernel-tester.h",
         "src/xnnpack/AlignedAllocator.h",
     ] + WEIGHTS_PACK_HDRS + MICROKERNEL_TEST_HDRS,
-    deps = MICROKERNEL_TEST_DEPS,
+    deps = MICROKERNEL_TEST_DEPS + [":packing"],
 )
 
 xnnpack_unit_test(
@@ -4903,7 +4959,7 @@
         "test/dwconv-microkernel-tester.h",
         "src/xnnpack/AlignedAllocator.h",
     ] + WEIGHTS_PACK_HDRS + MICROKERNEL_TEST_HDRS,
-    deps = MICROKERNEL_TEST_DEPS,
+    deps = MICROKERNEL_TEST_DEPS + [":packing"],
 )
 
 xnnpack_unit_test(
@@ -4923,7 +4979,7 @@
         "test/gemm-microkernel-tester.h",
         "src/xnnpack/AlignedAllocator.h",
     ] + WEIGHTS_PACK_HDRS + MICROKERNEL_TEST_HDRS,
-    deps = MICROKERNEL_TEST_DEPS,
+    deps = MICROKERNEL_TEST_DEPS + [":packing"],
 )
 
 xnnpack_unit_test(
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c603595..ff3e5c8 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -225,6 +225,7 @@
 
 SET(XNNPACK_HOT_SRCS
   src/indirection.c
+  src/packing.c
   src/operator-run.c)
 
 SET(XNNPACK_TABLE_SRCS
diff --git a/src/packing.c b/src/packing.c
new file mode 100644
index 0000000..d5357c3
--- /dev/null
+++ b/src/packing.c
@@ -0,0 +1,1300 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <stdint.h>
+#include <stddef.h>
+
+#include <xnnpack/math.h>
+#include <xnnpack/pack.h>
+
+
+void xnn_pack_q8_gemm_goi_w(  // Packs, per group, uint8 weights read as k[n * kc + i] plus int32 biases into packed_w, in tiles of nr rows by kr elements.
+  size_t g,  // number of groups; must be >= 1 because the outer do/while runs before --g is tested
+  size_t nc,  // rows of k (and bias entries) per group
+  size_t kc,  // elements per row of k
+  uint32_t nr,  // rows per packed tile; a partial last tile still occupies nr slots
+  uint32_t kr,  // consecutive elements of one row packed per step; a short tail still occupies kr bytes
+  uint8_t izp,  // presumably the input zero point -- TODO confirm; used in boff and in the bias correction
+  uint8_t kzp,  // presumably the kernel zero point -- TODO confirm; used only in boff
+  const uint8_t* k,  // source weights
+  const int32_t* b,  // source biases, or NULL to start every bias at boff
+  void* packed_w)  // destination cursor; advanced past everything written or skipped
+{
+  const int32_t boff = (int32_t) kc * (int32_t) izp * (int32_t) kzp;  // constant term added to every packed bias
+  do {
+    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
+      const size_t nr_block_size = min(nc - nr_block_start, nr);  // rows actually present in this tile
+      int32_t* packed_b = (int32_t*) packed_w;  // remember where this tile's biases live; they are adjusted below
+      if XNN_LIKELY(b != NULL) {
+        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+          *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
+          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+        }
+      } else {
+        size_t n = nr_block_size;  // always >= 1 here since nr_block_start < nc
+        do {
+          *((int32_t*) packed_w) = boff;
+          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+        } while (--n != 0);
+      }
+      packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));  // skip unused bias slots of a partial tile (not written here)
+      for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
+        const size_t kr_block_size = min(kc - kr_block_start, kr);  // elements actually present in this step
+        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+          int32_t ksum = 0;  // sum of the weights copied for this row in this step
+          for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
+            const uint8_t kv = k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
+            ksum += (int32_t) kv;
+            *((uint8_t*) packed_w) = kv;
+            packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
+          }
+          packed_b[nr_block_offset] -= ksum * (int32_t) izp;  // fold izp * sum(weights) into the row's bias
+          packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));  // skip the kr tail padding (not written here)
+        }
+        packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));  // skip the slots of rows missing from a partial tile
+      }
+    }
+    k += nc * kc;  // advance to the next group's weights
+    if XNN_UNPREDICTABLE(b != NULL) {
+      b += nc;  // advance to the next group's biases
+    }
+  } while (--g != 0);
+}
+
+void xnn_pack_q8_gemm_io_w(  // Same packed layout as the goi variant for a single group, but the source is read as k[i * nc + n].
+  size_t nc,  // number of packed rows (and bias entries); also the row stride of k
+  size_t kc,  // number of elements packed per row
+  uint32_t nr,  // rows per packed tile; a partial last tile still occupies nr slots
+  uint32_t kr,  // consecutive elements of one row packed per step; a short tail still occupies kr bytes
+  uint8_t izp,  // presumably the input zero point -- TODO confirm
+  uint8_t kzp,  // presumably the kernel zero point -- TODO confirm
+  const uint8_t* k,  // source weights
+  const int32_t* b,  // source biases, or NULL to start every bias at boff
+  void* packed_w)  // destination cursor; advanced past everything written or skipped
+{
+  const int32_t boff = (int32_t) kc * (int32_t) izp * (int32_t) kzp;  // constant term added to every packed bias
+  for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
+    const size_t nr_block_size = min(nc - nr_block_start, nr);  // rows actually present in this tile
+    int32_t* packed_b = (int32_t*) packed_w;  // start of this tile's biases, adjusted while weights are copied
+    if XNN_LIKELY(b != NULL) {
+      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+        *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
+        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+      }
+    } else {
+      size_t n = nr_block_size;  // always >= 1 here since nr_block_start < nc
+      do {
+        *((int32_t*) packed_w) = boff;
+        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+      } while (--n != 0);
+    }
+    packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));  // skip unused bias slots of a partial tile (not written here)
+    for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
+      const size_t kr_block_size = min(kc - kr_block_start, kr);  // elements actually present in this step
+      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+        int32_t ksum = 0;  // sum of the weights copied for this row in this step
+        for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
+          const uint8_t kv = k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];  // transposed read relative to the goi variant
+          ksum += (int32_t) kv;
+          *((uint8_t*) packed_w) = kv;
+          packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
+        }
+        packed_b[nr_block_offset] -= ksum * (int32_t) izp;  // fold izp * sum(weights) into the row's bias
+        packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));  // skip the kr tail padding (not written here)
+      }
+      packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));  // skip the slots of rows missing from a partial tile
+    }
+  }
+}
+
+void xnn_pack_q8_conv_goki_w(  // Packs, per group, uint8 weights read as k[(n * ks + ki) * kc + i] plus int32 biases; within a tile, data for ki = 0..ks-1 is laid out back to back.
+  size_t g,  // number of groups; must be >= 1 because the outer do/while runs before --g is tested
+  size_t nc,  // rows (and bias entries) per group
+  size_t ks,  // number of ki positions per row (presumably the kernel size -- TODO confirm)
+  size_t kc,  // elements per (row, ki) pair
+  uint32_t nr,  // rows per packed tile; a partial last tile still occupies nr slots
+  uint32_t kr,  // consecutive elements packed per step; a short tail still occupies kr bytes
+  uint8_t izp,  // presumably the input zero point -- TODO confirm
+  uint8_t kzp,  // presumably the kernel zero point -- TODO confirm
+  const uint8_t* k,  // source weights
+  const int32_t* b,  // source biases, or NULL to start every bias at boff
+  void* packed_w)  // destination cursor; advanced past everything written or skipped
+{
+  const int32_t boff = (int32_t) ks * (int32_t) kc * (int32_t) izp * (int32_t) kzp;  // constant term added to every packed bias
+  do {
+    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
+      const size_t nr_block_size = min(nc - nr_block_start, nr);  // rows actually present in this tile
+      int32_t* packed_b = (int32_t*) packed_w;  // start of this tile's biases, adjusted while weights are copied
+      if XNN_LIKELY(b != NULL) {
+        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+          *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
+          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+        }
+      } else {
+        size_t n = nr_block_size;  // always >= 1 here since nr_block_start < nc
+        do {
+          *((int32_t*) packed_w) = boff;
+          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+        } while (--n != 0);
+      }
+      packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));  // skip unused bias slots of a partial tile (not written here)
+      for (size_t ki = 0; ki < ks; ki++) {
+        for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
+          const size_t kr_block_size = min(kc - kr_block_start, kr);  // elements actually present in this step
+          for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+            int32_t ksum = 0;  // sum of the weights copied for this row in this step
+            for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
+              const uint8_t kv =
+                k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
+              ksum += (int32_t) kv;
+              *((uint8_t*) packed_w) = kv;
+              packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
+            }
+            packed_b[nr_block_offset] -= ksum * (int32_t) izp;  // fold izp * sum(weights) into the row's bias
+            packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));  // skip the kr tail padding (not written here)
+          }
+          packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));  // skip the slots of rows missing from a partial tile
+        }
+      }
+    }
+    k += ks * kc * nc;  // advance to the next group's weights
+    if XNN_UNPREDICTABLE(b != NULL) {
+      b += nc;  // advance to the next group's biases
+    }
+  } while (--g != 0);
+}
+
+void xnn_pack_q8_conv_kgo_w(  // Packs, per group, uint8 weights read as k[ki * g * nc + n] (one value per row and ki) plus int32 biases; each value lands in the first byte of a kr-byte slot.
+  size_t g,  // number of groups; g == 0 packs nothing
+  size_t nc,  // rows (and bias entries) per group
+  size_t ks,  // number of ki positions per row (presumably the kernel size -- TODO confirm)
+  uint32_t nr,  // rows per packed tile; a partial last tile still occupies nr slots
+  uint32_t kr,  // bytes reserved per packed weight; only the first byte of each slot is written
+  uint8_t izp,  // presumably the input zero point -- TODO confirm
+  uint8_t kzp,  // presumably the kernel zero point -- TODO confirm
+  const uint8_t* k,  // source weights
+  const int32_t* b,  // source biases, or NULL to start every bias at boff
+  void* packed_w)  // destination cursor; advanced past everything written or skipped
+{
+  const int32_t boff = (int32_t) ks * (int32_t) izp * (int32_t) kzp;  // constant term added to every packed bias
+  for (size_t i = 0; i < g; i++) {
+    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
+      const size_t nr_block_size = min(nc - nr_block_start, nr);  // rows actually present in this tile
+      int32_t* packed_b = (int32_t*) packed_w;  // start of this tile's biases, adjusted while weights are copied
+      if XNN_LIKELY(b != NULL) {
+        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+          *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
+          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+        }
+      } else {
+        size_t n = nr_block_size;  // always >= 1 here since nr_block_start < nc
+        do {
+          *((int32_t*) packed_w) = boff;
+          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+        } while (--n != 0);
+      }
+      packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));  // skip unused bias slots of a partial tile (not written here)
+      for (size_t ki = 0; ki < ks; ki++) {
+        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+          const uint8_t kv =
+            k[ki * g * nc + (nr_block_start + nr_block_offset)];  // k has already been offset by nc per preceding group
+          *((uint8_t*) packed_w) = kv;
+          packed_b[nr_block_offset] -= (int32_t) kv * (int32_t) izp;  // fold izp * weight into the row's bias
+          packed_w = (void*) ((uintptr_t) packed_w + kr * sizeof(uint8_t));  // the remaining kr - 1 bytes of the slot are skipped, not written
+        }
+        packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));  // skip the slots of rows missing from a partial tile
+      }
+    }
+    k += nc;  // advance to the next group's column range within each ki row
+    if XNN_UNPREDICTABLE(b != NULL) {
+      b += nc;  // advance to the next group's biases
+    }
+  }
+}
+
+void xnn_pack_q8_deconv_goki_w(  // Packs uint8 weights read as k[((n * kh + ky) * kw + kx) * kc + i] plus int32 biases, once per (oy, ox) offset in an sh x sw grid, using only taps ky = oy, oy + sh, ... and kx = ox, ox + sw, ...
+  size_t g,  // number of groups; g == 0 packs nothing
+  size_t nc,  // rows (and bias entries) per group
+  size_t kh,  // extent of the ky index (presumably kernel height -- TODO confirm)
+  size_t kw,  // extent of the kx index (presumably kernel width -- TODO confirm)
+  size_t kc,  // elements per (row, ky, kx) triple
+  size_t sh,  // step between ky taps and number of oy offsets (presumably stride height -- TODO confirm)
+  size_t sw,  // step between kx taps and number of ox offsets (presumably stride width -- TODO confirm)
+  size_t nr,  // rows per packed tile; a partial last tile still occupies nr slots
+  size_t kr,  // consecutive elements packed per step; a short tail still occupies kr bytes
+  uint8_t izp,  // presumably the input zero point -- TODO confirm
+  uint8_t kzp,  // presumably the kernel zero point -- TODO confirm
+  const uint8_t* k,  // source weights
+  const int32_t* b,  // source biases, or NULL to start every bias at boff
+  void* packed_w,  // destination cursor; advanced past everything written or skipped
+  struct subconvolution_params* params)  // receives, for the first group only, the packed start address of each (oy, ox) offset in iteration order
+{
+  for (size_t i = 0; i < g; i++) {
+    for (size_t oy = 0; oy < sh; oy++) {
+      for (size_t ox = 0; ox < sw; ox++) {
+        if (i == 0) {
+          (*params++).weights = packed_w;  // record where this offset's packed data begins, then move to the next params entry
+        }
+        const int32_t boff = (int32_t) divide_round_up(kh - oy, sh) * (int32_t) divide_round_up(kw - ox, sw) * (int32_t) kc * (int32_t) izp * (int32_t) kzp;  // presumably (taps visited below) * kc * izp * kzp; NOTE(review): kh - oy and kw - ox are unsigned, so this assumes sh <= kh and sw <= kw -- confirm with callers
+        for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
+          const size_t nr_block_size = min(nc - nr_block_start, nr);  // rows actually present in this tile
+          int32_t* packed_b = (int32_t*) packed_w;  // start of this tile's biases, adjusted while weights are copied
+          if XNN_LIKELY(b != NULL) {
+            for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+              *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
+              packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+            }
+          } else {
+            size_t n = nr_block_size;  // always >= 1 here since nr_block_start < nc
+            do {
+              *((int32_t*) packed_w) = boff;
+              packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+            } while (--n != 0);
+          }
+          packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));  // skip unused bias slots of a partial tile (not written here)
+          for (size_t ky = oy; ky < kh; ky += sh) {
+            for (size_t kx = ox; kx < kw; kx += sw) {
+              for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
+                const size_t kr_block_size = min(kc - kr_block_start, kr);  // elements actually present in this step
+                for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+                  int32_t ksum = 0;  // sum of the weights copied for this row in this step
+                  for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
+                    const uint8_t kv =
+                      k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
+                    ksum += (int32_t) kv;
+                    *((uint8_t*) packed_w) = kv;
+                    packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
+                  }
+                  packed_b[nr_block_offset] -= ksum * (int32_t) izp;  // fold izp * sum(weights) into the row's bias
+                  packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));  // skip the kr tail padding (not written here)
+                }
+                packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));  // skip the slots of rows missing from a partial tile
+              }
+            }
+          }
+        }
+      }
+    }
+    k += kh * kw * kc * nc;  // advance to the next group's weights
+    if XNN_UNPREDICTABLE(b != NULL) {
+      b += nc;  // advance to the next group's biases
+    }
+  }
+}
+
+void xnn_pack_q8_dwconv_ghw_w(  // Packs uint8 weights read as k[(ch * h + y) * w + x] plus int32 biases into tiles of cr channels; within a tile, taps are emitted with x as the outer loop and y as the inner loop.
+  size_t h,  // extent of the y index
+  size_t w,  // extent of the x index
+  size_t c,  // number of channels (and bias entries)
+  size_t cr,  // channels per packed tile; a partial last tile still occupies cr slots
+  uint8_t izp,  // presumably the input zero point -- TODO confirm
+  uint8_t kzp,  // presumably the kernel zero point -- TODO confirm
+  const uint8_t* k,  // source weights
+  const int32_t* b,  // source biases, or NULL to start every bias at boff
+  void* packed_w)  // destination cursor; advanced past everything written or skipped
+{
+  const int32_t boff = (int32_t) h * (int32_t) w * (int32_t) izp * (int32_t) kzp;  // constant term added to every packed bias
+  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
+    const size_t cr_block_size = min(c - cr_block_start, cr);  // channels actually present in this tile
+    int32_t* packed_b = (int32_t*) packed_w;  // start of this tile's biases, adjusted while weights are copied
+    if XNN_LIKELY(b != NULL) {
+      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
+        *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset] + boff;
+        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+      }
+    } else {
+      size_t n = cr_block_size;  // always >= 1 here since cr_block_start < c
+      do {
+        *((int32_t*) packed_w) = boff;
+        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+      } while (--n != 0);
+    }
+    packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));  // skip unused bias slots of a partial tile (not written here)
+    for (size_t x = 0; x < w; x++) {
+      for (size_t y = 0; y < h; y++) {
+        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
+          const uint8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
+          packed_b[cr_block_offset] -= (int32_t) kv * (int32_t) izp;  // fold izp * weight into the channel's bias
+          *((uint8_t*) packed_w) = kv;
+          packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
+        }
+        packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(uint8_t));  // skip the slots of channels missing from a partial tile
+      }
+    }
+  }
+}
+
+void xnn_pack_q8_dwconv_hwg_w(  // Same packed layout as the ghw variant, but the source is read as k[(y * w + x) * c + ch].
+  size_t h,  // extent of the y index
+  size_t w,  // extent of the x index
+  size_t c,  // number of channels (and bias entries); also the innermost stride of k
+  size_t cr,  // channels per packed tile; a partial last tile still occupies cr slots
+  uint8_t izp,  // presumably the input zero point -- TODO confirm
+  uint8_t kzp,  // presumably the kernel zero point -- TODO confirm
+  const uint8_t* k,  // source weights
+  const int32_t* b,  // source biases, or NULL to start every bias at boff
+  void* packed_w)  // destination cursor; advanced past everything written or skipped
+{
+  const int32_t boff = (int32_t) h * (int32_t) w * (int32_t) izp * (int32_t) kzp;  // constant term added to every packed bias
+  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
+    const size_t cr_block_size = min(c - cr_block_start, cr);  // channels actually present in this tile
+    int32_t* packed_b = (int32_t*) packed_w;  // start of this tile's biases, adjusted while weights are copied
+    if XNN_LIKELY(b != NULL) {
+      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
+        *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset] + boff;
+        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+      }
+    } else {
+      size_t n = cr_block_size;  // always >= 1 here since cr_block_start < c
+      do {
+        *((int32_t*) packed_w) = boff;
+        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+      } while (--n != 0);
+    }
+    packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));  // skip unused bias slots of a partial tile (not written here)
+    for (size_t x = 0; x < w; x++) {
+      for (size_t y = 0; y < h; y++) {
+        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
+          const uint8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
+          packed_b[cr_block_offset] -= (int32_t) kv * (int32_t) izp;  // fold izp * weight into the channel's bias
+          *((uint8_t*) packed_w) = kv;
+          packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
+        }
+        packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(uint8_t));  // skip the slots of channels missing from a partial tile
+      }
+    }
+  }
+}
+
+void xnn_pack_f16_gemm_goi_w(  // Packs, per group, 16-bit weights read from rows k[n * kc + ...] plus 16-bit biases into packed_w as tiles of nr rows by kr elements; values are copied verbatim.
+  size_t g,  // number of groups; must be >= 1 because the outer do/while runs before --g is tested
+  size_t nc,  // rows of k (and bias entries) per group
+  size_t kc,  // elements per row of k
+  size_t nr,  // rows per packed tile; a partial last tile still occupies nr slots
+  size_t kr,  // consecutive elements of one row packed per step
+  size_t sr,  // shuffle factor: sr * kr is the window within which kr-chunks are permuted; NOTE(review): the mask arithmetic presumably needs sr and kr to be powers of two -- confirm
+  const uint16_t* k,  // source weights
+  const uint16_t* b,  // source biases, or NULL to leave the bias slots untouched
+  uint16_t* packed_w)  // destination cursor; advanced past everything written or skipped
+{
+  const size_t skr = sr * kr;  // width of one shuffle window
+  const size_t skc = round_down_po2(kc, skr);  // presumably kc rounded down to a multiple of skr; elements at or past skc are packed unshuffled
+  const size_t sr_mask = (sr - 1) * kr;  // mask selecting the kr-chunk offset inside a shuffle window
+  do {
+    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
+      const size_t nr_block_size = min(nc - nr_block_start, nr);  // rows actually present in this tile
+      if XNN_LIKELY(b != NULL) {
+        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+          packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
+        }
+      }
+      packed_w += nr;  // bias area is always nr entries, whether or not anything was written
+
+      for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
+        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
+            *packed_w++ =
+              k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];  // window base + row-dependent kr-chunk inside the window + element inside the chunk
+          }
+        }
+        packed_w += (nr - nr_block_size) * kr;  // skip the slots of rows missing from a partial tile
+      }
+
+      for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
+        const size_t kr_block_size = min(kc - kr_block_start, kr);  // elements actually present in this step
+        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+          for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
+            *packed_w++ =
+              k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
+          }
+          packed_w += kr - kr_block_size;  // skip the kr tail padding (not written here)
+        }
+        packed_w += (nr - nr_block_size) * kr;  // skip the slots of rows missing from a partial tile
+      }
+    }
+    k += nc * kc;  // advance to the next group's weights
+    if XNN_UNPREDICTABLE(b != NULL) {
+      b += nc;  // advance to the next group's biases
+    }
+  } while (--g != 0);
+}
+
+void xnn_pack_f16_gemm_io_w(  // Same packed layout as the f16 goi variant for a single group, but the source is read as k[i * nc + n].
+  size_t nc,  // number of packed rows (and bias entries); also the row stride of k
+  size_t kc,  // number of elements packed per row
+  size_t nr,  // rows per packed tile; a partial last tile still occupies nr slots
+  size_t kr,  // consecutive elements of one row packed per step
+  size_t sr,  // shuffle factor: sr * kr is the window within which kr-chunks are permuted; NOTE(review): presumably sr and kr must be powers of two -- confirm
+  const uint16_t* k,  // source weights
+  const uint16_t* b,  // source biases, or NULL to leave the bias slots untouched
+  uint16_t* packed_w)  // destination cursor; advanced past everything written or skipped
+{
+  const size_t skr = sr * kr;  // width of one shuffle window
+  const size_t skc = round_down_po2(kc, skr);  // presumably kc rounded down to a multiple of skr; elements at or past skc are packed unshuffled
+  const size_t sr_mask = (sr - 1) * kr;  // mask selecting the kr-chunk offset inside a shuffle window
+  for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
+    const size_t nr_block_size = min(nc - nr_block_start, nr);  // rows actually present in this tile
+    if XNN_LIKELY(b != NULL) {
+      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+        packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
+      }
+    }
+    packed_w += nr;  // bias area is always nr entries, whether or not anything was written
+
+    for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
+      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+        for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
+          *packed_w++ =
+            k[(round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];  // shuffled element index times nc, plus the row index
+        }
+      }
+      packed_w += (nr - nr_block_size) * kr;  // skip the slots of rows missing from a partial tile
+    }
+
+    for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
+      const size_t kr_block_size = min(kc - kr_block_start, kr);  // elements actually present in this step
+      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+        for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
+          *packed_w++ =
+            k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
+        }
+        packed_w += kr - kr_block_size;  // skip the kr tail padding (not written here)
+      }
+      packed_w += (nr - nr_block_size) * kr;  // skip the slots of rows missing from a partial tile
+    }
+  }
+}
+
+void xnn_pack_f16_gemminc_goi_w(  // Packs, per group, 16-bit weights read from rows k[n * kc + ...] into tiles of nr rows by kr elements; unlike the gemm variant there is no bias parameter and no bias area in the output.
+  size_t g,  // number of groups; must be >= 1 because the outer do/while runs before --g is tested
+  size_t nc,  // rows of k per group
+  size_t kc,  // elements per row of k
+  size_t nr,  // rows per packed tile; a partial last tile still occupies nr slots
+  size_t kr,  // consecutive elements of one row packed per step
+  size_t sr,  // shuffle factor: sr * kr is the window within which kr-chunks are permuted; NOTE(review): presumably sr and kr must be powers of two -- confirm
+  const uint16_t* k,  // source weights
+  uint16_t* packed_w)  // destination cursor; advanced past everything written or skipped
+{
+  const size_t skr = sr * kr;  // width of one shuffle window
+  const size_t skc = round_down_po2(kc, skr);  // presumably kc rounded down to a multiple of skr; elements at or past skc are packed unshuffled
+  const size_t sr_mask = (sr - 1) * kr;  // mask selecting the kr-chunk offset inside a shuffle window
+  do {
+    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
+      const size_t nr_block_size = min(nc - nr_block_start, nr);  // rows actually present in this tile
+
+      for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
+        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
+            *packed_w++ =
+              k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];  // window base + row-dependent kr-chunk inside the window + element inside the chunk
+          }
+        }
+        packed_w += (nr - nr_block_size) * kr;  // skip the slots of rows missing from a partial tile
+      }
+
+      for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
+        const size_t kr_block_size = min(kc - kr_block_start, kr);  // elements actually present in this step
+        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+          for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
+            *packed_w++ =
+              k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
+          }
+          packed_w += kr - kr_block_size;  // skip the kr tail padding (not written here)
+        }
+        packed_w += (nr - nr_block_size) * kr;  // skip the slots of rows missing from a partial tile
+      }
+    }
+    k += nc * kc;  // advance to the next group's weights
+  } while (--g != 0);
+}
+
+void xnn_pack_f16_conv_goki_w(  // Packs, per group, 16-bit weights read from k[(n * ks + ki) * kc + ...] plus 16-bit biases; within a tile, data for ki = 0..ks-1 is laid out back to back.
+  size_t g,  // number of groups; must be >= 1 because the outer do/while runs before --g is tested
+  size_t nc,  // rows (and bias entries) per group
+  size_t ks,  // number of ki positions per row (presumably the kernel size -- TODO confirm)
+  size_t kc,  // elements per (row, ki) pair
+  size_t nr,  // rows per packed tile; a partial last tile still occupies nr slots
+  size_t kr,  // consecutive elements packed per step
+  size_t sr,  // shuffle factor: sr * kr is the window within which kr-chunks are permuted; NOTE(review): presumably sr and kr must be powers of two -- confirm
+  const uint16_t* k,  // source weights
+  const uint16_t* b,  // source biases, or NULL to leave the bias slots untouched
+  uint16_t* packed_w)  // destination cursor; advanced past everything written or skipped
+{
+  const size_t skr = sr * kr;  // width of one shuffle window
+  const size_t skc = round_down_po2(kc, skr);  // presumably kc rounded down to a multiple of skr; elements at or past skc are packed unshuffled
+  const size_t sr_mask = (sr - 1) * kr;  // mask selecting the kr-chunk offset inside a shuffle window
+  do {
+    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
+      const size_t nr_block_size = min(nc - nr_block_start, nr);  // rows actually present in this tile
+      if XNN_LIKELY(b != NULL) {
+        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+          packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
+        }
+      }
+      packed_w += nr;  // bias area is always nr entries, whether or not anything was written
+
+      for (size_t ki = 0; ki < ks; ki++) {
+        for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
+          for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+            for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
+              *packed_w++ =
+                k[((nr_block_start + nr_block_offset) * ks + ki) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];  // window base + row-dependent kr-chunk inside the window + element inside the chunk
+            }
+          }
+          packed_w += (nr - nr_block_size) * kr;  // skip the slots of rows missing from a partial tile
+        }
+
+        for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
+          const size_t kr_block_size = min(kc - kr_block_start, kr);  // elements actually present in this step
+          for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+            for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
+              *packed_w++ =
+                k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
+            }
+            packed_w += kr - kr_block_size;  // skip the kr tail padding (not written here)
+          }
+          packed_w += (nr - nr_block_size) * kr;  // skip the slots of rows missing from a partial tile
+        }
+      }
+    }
+    k += ks * kc * nc;  // advance to the next group's weights
+    if XNN_UNPREDICTABLE(b != NULL) {
+      b += nc;  // advance to the next group's biases
+    }
+  } while (--g != 0);
+}
+
+// Packs KGO weights (indexed k[ki][group][n]) and optional bias; one weight per kr-wide slot.
+void xnn_pack_f16_conv_kgo_w(
+  size_t g,
+  size_t nc,
+  size_t ks,
+  size_t nr,
+  size_t kr,
+  const uint16_t* k,
+  const uint16_t* b,
+  uint16_t* packed_w)
+{
+  for (size_t group = 0; group < g; group++) {
+    for (size_t n_begin = 0; n_begin < nc; n_begin += nr) {
+      const size_t n_count = min(nc - n_begin, nr);
+      if XNN_LIKELY(b != NULL) {
+        for (size_t n = 0; n < n_count; n++) {
+          packed_w[n] = b[n_begin + n];
+        }
+      }
+      packed_w += nr;
+      for (size_t tap = 0; tap < ks; tap++) {
+        const uint16_t* tap_weights = &k[tap * g * nc + n_begin];
+        for (size_t n = 0; n < n_count; n++) {
+          packed_w[n * kr] = tap_weights[n];
+        }
+        packed_w += nr * kr;
+      }
+    }
+    k += nc;
+    if XNN_UNPREDICTABLE(b != NULL) {
+      b += nc;
+    }
+  }
+}
+
+void xnn_pack_f16_dconv_oki_w(  // packs weights indexed k[n][ky][kx][c] plus bias into nr-wide blocks
+  size_t nc,
+  size_t kc,
+  size_t nr,
+  size_t kh,
+  size_t kw,
+  const uint16_t* k,
+  const uint16_t* b,
+  uint16_t* packed_w)
+{
+  for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
+    const size_t nr_block_size = min(nc - nr_block_start, nr);
+    if XNN_LIKELY(b != NULL) {
+      for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
+        *packed_w++ = b[min(nr_block_offset, nr_block_size - 1)];  // b is block-relative; tail slots repeat the last valid bias
+      }
+    } else {
+      size_t n = nr;
+      do {
+        *packed_w++ = 0;  // no bias supplied: all nr slots are written as zero
+      } while (--n != 0);
+    }
+
+    for (size_t kx = 0; kx < kw; kx++) {  // packed order: kx outermost, then input channel c, then ky
+      for (size_t c = 0; c < kc; c++) {
+        for (size_t ky = 0; ky < kh; ky++) {
+          for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
+            *packed_w++ = k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) * kh + ky) * kw + kx) * kc + c];  // slots past nr_block_size re-read the last valid channel
+          }
+        }
+      }
+    }
+    if XNN_UNPREDICTABLE(b != NULL) {
+      b += nr;  // move to the next block's bias; this is why b is indexed from 0 above
+    }
+  }
+}
+
+void xnn_pack_f16_deconv_goki_w(  // packs k[g][nc][kh][kw][kc] as sh*sw strided sub-kernels per group
+  size_t g,
+  size_t nc,
+  size_t kh,
+  size_t kw,
+  size_t kc,
+  size_t sh,
+  size_t sw,
+  size_t nr,
+  size_t kr,
+  size_t sr,
+  const uint16_t* k,
+  const uint16_t* b,
+  uint16_t* packed_w,
+  struct subconvolution_params* params)
+{
+  const size_t skr = sr * kr;
+  const size_t skc = round_down_po2(kc, skr);  // presumably kc rounded down to a multiple of skr
+  const size_t sr_mask = (sr - 1) * kr;
+  for (size_t i = 0; i < g; i++) {
+    for (size_t oy = 0; oy < sh; oy++) {
+      for (size_t ox = 0; ox < sw; ox++) {
+        if (i == 0) {
+          (*params++).weights = packed_w;  // first group only: record where this (oy, ox) sub-kernel starts
+        }
+        for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
+          const size_t nr_block_size = min(nc - nr_block_start, nr);
+          if XNN_LIKELY(b != NULL) {
+            for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+              packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
+            }
+          }
+          packed_w += nr;  // bias slots are reserved per sub-kernel; left unwritten when b is NULL
+          for (size_t ky = oy; ky < kh; ky += sh) {  // kernel rows oy, oy + sh, ...
+            for (size_t kx = ox; kx < kw; kx += sw) {  // kernel columns ox, ox + sw, ...
+              for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {  // main part [0, skc)
+                for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+                  for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
+                    *packed_w++ =
+                      k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
+                  }
+                }
+                packed_w += (nr - nr_block_size) * kr;  // skip tiles of absent output channels
+              }
+
+              for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {  // remainder [skc, kc), copied in order
+                const size_t kr_block_size = min(kc - kr_block_start, kr);
+                for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+                  for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
+                    *packed_w++ =
+                      k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
+                  }
+                  packed_w += kr - kr_block_size;  // skip past the unfilled tail of a short kr-block
+                }
+                packed_w += (nr - nr_block_size) * kr;
+              }
+            }
+          }
+        }
+      }
+    }
+    k += kh * kw * kc * nc;  // next group's weights
+    if XNN_UNPREDICTABLE(b != NULL) {
+      b += nc;  // next group's bias
+    }
+  }
+}
+
+// Packs depthwise weights indexed k[channel][y][x] (plus bias) into blocks of cr channels.
+void xnn_pack_f16_dwconv_ghw_w(
+  size_t h,
+  size_t w,
+  size_t c,
+  size_t cr,
+  const uint16_t* k,
+  const uint16_t* b,
+  uint16_t* packed_w)
+{
+  for (size_t c_begin = 0; c_begin < c; c_begin += cr) {
+    const size_t c_count = min(c - c_begin, cr);
+    if XNN_LIKELY(b != NULL) {
+      for (size_t ch = 0; ch < c_count; ch++) {
+        packed_w[ch] = b[c_begin + ch];
+      }
+    } else {
+      size_t ch = 0;
+      do {
+        packed_w[ch] = 0;
+      } while (++ch != c_count);
+    }
+    packed_w += cr;
+    for (size_t x = 0; x < w; x++) {
+      for (size_t y = 0; y < h; y++) {
+        for (size_t ch = 0; ch < c_count; ch++) {
+          packed_w[ch] = k[((c_begin + ch) * h + y) * w + x];
+        }
+        packed_w += cr;
+      }
+    }
+  }
+}
+
+// Packs depthwise weights indexed k[y][x][channel] (plus bias) into blocks of cr channels.
+void xnn_pack_f16_dwconv_hwg_w(
+  size_t h,
+  size_t w,
+  size_t c,
+  size_t cr,
+  const uint16_t* k,
+  const uint16_t* b,
+  uint16_t* packed_w)
+{
+  for (size_t c_begin = 0; c_begin < c; c_begin += cr) {
+    const size_t c_count = min(c - c_begin, cr);
+    if XNN_LIKELY(b != NULL) {
+      for (size_t ch = 0; ch < c_count; ch++) {
+        packed_w[ch] = b[c_begin + ch];
+      }
+    } else {
+      size_t ch = 0;
+      do {
+        packed_w[ch] = 0;
+      } while (++ch != c_count);
+    }
+    packed_w += cr;
+    for (size_t x = 0; x < w; x++) {
+      for (size_t y = 0; y < h; y++) {
+        for (size_t ch = 0; ch < c_count; ch++) {
+          packed_w[ch] = k[(y * w + x) * c + (c_begin + ch)];
+        }
+        packed_w += cr;
+      }
+    }
+  }
+}
+
+void xnn_pack_f16_chw_dwconv_ghw_w(
+  size_t kernel_size,
+  size_t groups,
+  const uint16_t* kernel,
+  const uint16_t* bias,
+  uint16_t* packed_weights)
+{
+  for (size_t group = 0; group < groups; group++) {  // per group: one bias value, then kernel_size weights
+    uint16_t group_bias = 0;
+    if XNN_LIKELY(bias != NULL) {
+      group_bias = bias[group];
+    }
+    packed_weights[0] = group_bias;
+    for (size_t tap = 0; tap < kernel_size; tap++) {
+      packed_weights[1 + tap] = kernel[group * kernel_size + tap];
+    }
+    packed_weights += 1 + kernel_size;
+  }
+}
+
+// Packs grouped GOI weights k[g][nc][kc] and optional bias b[g][nc] into nr x kr tiles.
+// g == 0 packs nothing: the group loop is counted, like the kgo/deconv packers in this file.
+void xnn_pack_f32_gemm_goi_w(
+  size_t g,
+  size_t nc,
+  size_t kc,
+  size_t nr,
+  size_t kr,
+  size_t sr,
+  const float* k,
+  const float* b,
+  float* packed_w)
+{
+  const size_t skr = sr * kr;
+  const size_t skc = round_down_po2(kc, skr);  // presumably kc rounded down to a multiple of skr
+  const size_t sr_mask = (sr - 1) * kr;
+  for (size_t group = 0; group < g; group++) {
+    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
+      const size_t nr_block_size = min(nc - nr_block_start, nr);
+      if XNN_LIKELY(b != NULL) {
+        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+          packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
+        }
+      }
+      packed_w += nr;  // nr bias slots are always reserved; they stay unwritten when b is NULL
+      // Main part [0, skc): the kc offset is permuted per output channel through sr_mask.
+      for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
+        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
+            *packed_w++ = k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
+          }
+        }
+        packed_w += (nr - nr_block_size) * kr;  // skip the tiles of absent output channels
+      }
+      // Remainder [skc, kc): copied in order; a short kr-block is skipped past, not filled.
+      for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
+        const size_t kr_block_size = min(kc - kr_block_start, kr);
+        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+          for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
+            *packed_w++ = k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
+          }
+          packed_w += kr - kr_block_size;
+        }
+        packed_w += (nr - nr_block_size) * kr;
+      }
+    }
+    k += nc * kc;  // next group's weights
+    if XNN_UNPREDICTABLE(b != NULL) {
+      b += nc;  // next group's bias
+    }
+  }
+}
+
+void xnn_pack_f32_gemm_io_w(  // packs weights indexed k[kc][nc] plus optional bias b[nc] into nr x kr tiles
+  size_t nc,
+  size_t kc,
+  size_t nr,
+  size_t kr,
+  size_t sr,
+  const float* k,
+  const float* b,
+  float* packed_w)
+{
+  const size_t skr = sr * kr;
+  const size_t skc = round_down_po2(kc, skr);  // presumably kc rounded down to a multiple of skr
+  const size_t sr_mask = (sr - 1) * kr;
+  for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
+    const size_t nr_block_size = min(nc - nr_block_start, nr);
+    if XNN_LIKELY(b != NULL) {
+      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+        packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
+      }
+    }
+    packed_w += nr;  // nr bias slots are always reserved; they stay unwritten when b is NULL
+
+    for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {  // main part [0, skc)
+      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+        for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
+          *packed_w++ =
+            k[(round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
+        }
+      }
+      packed_w += (nr - nr_block_size) * kr;  // skip the tiles of absent output channels
+    }
+
+    for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {  // remainder [skc, kc), copied in order
+      const size_t kr_block_size = min(kc - kr_block_start, kr);
+      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+        for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
+          *packed_w++ =
+            k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
+        }
+        packed_w += kr - kr_block_size;  // skip past the unfilled tail of a short kr-block
+      }
+      packed_w += (nr - nr_block_size) * kr;
+    }
+  }
+}
+
+// Packs grouped GOI weights k[g][nc][kc] into nr x kr tiles; no bias slots are emitted.
+// g == 0 packs nothing: the group loop is counted, like the kgo/deconv packers in this file.
+void xnn_pack_f32_gemminc_goi_w(
+  size_t g,
+  size_t nc,
+  size_t kc,
+  size_t nr,
+  size_t kr,
+  size_t sr,
+  const float* k,
+  float* packed_w)
+{
+  const size_t skr = sr * kr;
+  const size_t skc = round_down_po2(kc, skr);  // presumably kc rounded down to a multiple of skr
+  const size_t sr_mask = (sr - 1) * kr;
+  for (size_t group = 0; group < g; group++) {
+    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
+      const size_t nr_block_size = min(nc - nr_block_start, nr);
+      // Main part [0, skc): the kc offset is permuted per output channel through sr_mask.
+      for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
+        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
+            *packed_w++ = k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
+          }
+        }
+        packed_w += (nr - nr_block_size) * kr;  // skip the tiles of absent output channels
+      }
+      // Remainder [skc, kc): copied in order; a short kr-block is skipped past, not filled.
+      for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
+        const size_t kr_block_size = min(kc - kr_block_start, kr);
+        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+          for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
+            *packed_w++ = k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
+          }
+          packed_w += kr - kr_block_size;
+        }
+        packed_w += (nr - nr_block_size) * kr;
+      }
+    }
+    k += nc * kc;  // next group's weights
+  }
+}
+
+// Packs grouped GOKI weights k[g][nc][ks][kc] and optional bias b[g][nc] into nr x kr tiles.
+// g == 0 packs nothing: the group loop is counted, like the kgo/deconv packers in this file.
+void xnn_pack_f32_conv_goki_w(
+  size_t g,
+  size_t nc,
+  size_t ks,
+  size_t kc,
+  size_t nr,
+  size_t kr,
+  size_t sr,
+  const float* k,
+  const float* b,
+  float* packed_w)
+{
+  const size_t skr = sr * kr;
+  const size_t skc = round_down_po2(kc, skr);  // presumably kc rounded down to a multiple of skr
+  const size_t sr_mask = (sr - 1) * kr;
+  for (size_t group = 0; group < g; group++) {
+    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
+      const size_t nr_block_size = min(nc - nr_block_start, nr);
+      if XNN_LIKELY(b != NULL) {
+        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+          packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
+        }
+      }
+      packed_w += nr;  // nr bias slots are always reserved; they stay unwritten when b is NULL
+      for (size_t ki = 0; ki < ks; ki++) {
+        // Main part [0, skc): the kc offset is permuted per output channel through sr_mask.
+        for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
+          for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+            for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
+              *packed_w++ = k[((nr_block_start + nr_block_offset) * ks + ki) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
+            }
+          }
+          packed_w += (nr - nr_block_size) * kr;  // skip the tiles of absent output channels
+        }
+        // Remainder [skc, kc): copied in order; a short kr-block is skipped past, not filled.
+        for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
+          const size_t kr_block_size = min(kc - kr_block_start, kr);
+          for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+            for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
+              *packed_w++ = k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
+            }
+            packed_w += kr - kr_block_size;
+          }
+          packed_w += (nr - nr_block_size) * kr;
+        }
+      }
+    }
+    k += ks * kc * nc;  // next group's weights
+    if XNN_UNPREDICTABLE(b != NULL) {
+      b += nc;  // next group's bias
+    }
+  }
+}
+
+// Packs KGO weights (indexed k[ki][group][n]) and optional bias; one weight per kr-wide slot.
+void xnn_pack_f32_conv_kgo_w(
+  size_t g,
+  size_t nc,
+  size_t ks,
+  size_t nr,
+  size_t kr,
+  const float* k,
+  const float* b,
+  float* packed_w)
+{
+  for (size_t group = 0; group < g; group++) {
+    for (size_t n_begin = 0; n_begin < nc; n_begin += nr) {
+      const size_t n_count = min(nc - n_begin, nr);
+      if XNN_LIKELY(b != NULL) {
+        for (size_t n = 0; n < n_count; n++) {
+          packed_w[n] = b[n_begin + n];
+        }
+      }
+      packed_w += nr;
+      for (size_t tap = 0; tap < ks; tap++) {
+        const float* tap_weights = &k[tap * g * nc + n_begin];
+        for (size_t n = 0; n < n_count; n++) {
+          packed_w[n * kr] = tap_weights[n];
+        }
+        packed_w += nr * kr;
+      }
+    }
+    k += nc;
+    if XNN_UNPREDICTABLE(b != NULL) {
+      b += nc;
+    }
+  }
+}
+
+void xnn_pack_f32_dconv_oki_w(  // packs weights indexed k[n][ky][kx][c] plus bias into nr-wide blocks
+  size_t nc,
+  size_t kc,
+  size_t nr,
+  size_t kh,
+  size_t kw,
+  const float* k,
+  const float* b,
+  float* packed_w)
+{
+  for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
+    const size_t nr_block_size = min(nc - nr_block_start, nr);
+    if XNN_LIKELY(b != NULL) {
+      for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
+        *packed_w++ = b[min(nr_block_offset, nr_block_size - 1)];  // b is block-relative; tail slots repeat the last valid bias
+      }
+    } else {
+      size_t n = nr;
+      do {
+        *packed_w++ = 0.0f;  // no bias supplied: all nr slots are written as zero
+      } while (--n != 0);
+    }
+
+    for (size_t kx = 0; kx < kw; kx++) {  // packed order: kx outermost, then input channel c, then ky
+      for (size_t c = 0; c < kc; c++) {
+        for (size_t ky = 0; ky < kh; ky++) {
+          for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
+            *packed_w++ = k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) * kh + ky) * kw + kx) * kc + c];  // slots past nr_block_size re-read the last valid channel
+          }
+        }
+      }
+    }
+    if XNN_UNPREDICTABLE(b != NULL) {
+      b += nr;  // move to the next block's bias; this is why b is indexed from 0 above
+    }
+  }
+}
+
+void xnn_pack_f32_deconv_goki_w(  // packs k[g][nc][kh][kw][kc] as sh*sw strided sub-kernels per group
+  size_t g,
+  size_t nc,
+  size_t kh,
+  size_t kw,
+  size_t kc,
+  size_t sh,
+  size_t sw,
+  size_t nr,
+  size_t kr,
+  size_t sr,
+  const float* k,
+  const float* b,
+  float* packed_w,
+  struct subconvolution_params* params)
+{
+  const size_t skr = sr * kr;
+  const size_t skc = round_down_po2(kc, skr);  // presumably kc rounded down to a multiple of skr
+  const size_t sr_mask = (sr - 1) * kr;
+  for (size_t i = 0; i < g; i++) {
+    for (size_t oy = 0; oy < sh; oy++) {
+      for (size_t ox = 0; ox < sw; ox++) {
+        if (i == 0) {
+          (*params++).weights = packed_w;  // first group only: record where this (oy, ox) sub-kernel starts
+        }
+        for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
+          const size_t nr_block_size = min(nc - nr_block_start, nr);
+          if XNN_LIKELY(b != NULL) {
+            for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+              packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
+            }
+          }
+          packed_w += nr;  // bias slots are reserved per sub-kernel; left unwritten when b is NULL
+          for (size_t ky = oy; ky < kh; ky += sh) {  // kernel rows oy, oy + sh, ...
+            for (size_t kx = ox; kx < kw; kx += sw) {  // kernel columns ox, ox + sw, ...
+              for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {  // main part [0, skc)
+                for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+                  for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
+                    *packed_w++ =
+                      k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
+                  }
+                }
+                packed_w += (nr - nr_block_size) * kr;  // skip tiles of absent output channels
+              }
+
+              for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {  // remainder [skc, kc), copied in order
+                const size_t kr_block_size = min(kc - kr_block_start, kr);
+                for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+                  for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
+                    *packed_w++ =
+                      k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
+                  }
+                  packed_w += kr - kr_block_size;  // skip past the unfilled tail of a short kr-block
+                }
+                packed_w += (nr - nr_block_size) * kr;
+              }
+            }
+          }
+        }
+      }
+    }
+    k += kh * kw * kc * nc;  // next group's weights
+    if XNN_UNPREDICTABLE(b != NULL) {
+      b += nc;  // next group's bias
+    }
+  }
+}
+
+// Packs depthwise weights indexed k[channel][y][x] (plus bias) into blocks of cr channels.
+void xnn_pack_f32_dwconv_ghw_w(
+  size_t h,
+  size_t w,
+  size_t c,
+  size_t cr,
+  const float* k,
+  const float* b,
+  float* packed_w)
+{
+  for (size_t c_begin = 0; c_begin < c; c_begin += cr) {
+    const size_t c_count = min(c - c_begin, cr);
+    if XNN_LIKELY(b != NULL) {
+      for (size_t ch = 0; ch < c_count; ch++) {
+        packed_w[ch] = b[c_begin + ch];
+      }
+    } else {
+      size_t ch = 0;
+      do {
+        packed_w[ch] = 0.0f;
+      } while (++ch != c_count);
+    }
+    packed_w += cr;
+    for (size_t x = 0; x < w; x++) {
+      for (size_t y = 0; y < h; y++) {
+        for (size_t ch = 0; ch < c_count; ch++) {
+          packed_w[ch] = k[((c_begin + ch) * h + y) * w + x];
+        }
+        packed_w += cr;
+      }
+    }
+  }
+}
+
+// Packs depthwise weights indexed k[y][x][channel] (plus bias) into blocks of cr channels.
+void xnn_pack_f32_dwconv_hwg_w(
+  size_t h,
+  size_t w,
+  size_t c,
+  size_t cr,
+  const float* k,
+  const float* b,
+  float* packed_w)
+{
+  for (size_t c_begin = 0; c_begin < c; c_begin += cr) {
+    const size_t c_count = min(c - c_begin, cr);
+    if XNN_LIKELY(b != NULL) {
+      for (size_t ch = 0; ch < c_count; ch++) {
+        packed_w[ch] = b[c_begin + ch];
+      }
+    } else {
+      size_t ch = 0;
+      do {
+        packed_w[ch] = 0.0f;
+      } while (++ch != c_count);
+    }
+    packed_w += cr;
+    for (size_t x = 0; x < w; x++) {
+      for (size_t y = 0; y < h; y++) {
+        for (size_t ch = 0; ch < c_count; ch++) {
+          packed_w[ch] = k[(y * w + x) * c + (c_begin + ch)];
+        }
+        packed_w += cr;
+      }
+    }
+  }
+}
+
+void xnn_pack_f32_chw_dwconv_ghw_w(
+  size_t kernel_size,
+  size_t groups,
+  const float* kernel,
+  const float* bias,
+  float* packed_weights)
+{
+  for (size_t group = 0; group < groups; group++) {  // per group: one bias value, then kernel_size weights
+    float group_bias = 0.0f;
+    if XNN_LIKELY(bias != NULL) {
+      group_bias = bias[group];
+    }
+    packed_weights[0] = group_bias;
+    for (size_t tap = 0; tap < kernel_size; tap++) {
+      packed_weights[1 + tap] = kernel[group * kernel_size + tap];
+    }
+    packed_weights += 1 + kernel_size;
+  }
+}
+
+void xnn_pack_f32_chw_dwconv_hwg_w(
+  size_t kernel_size,
+  size_t groups,
+  const float* kernel,
+  const float* bias,
+  float* packed_weights)
+{
+  for (size_t group = 0; group < groups; group++) {  // per group: one bias value, then kernel_size weights
+    float group_bias = 0.0f;
+    if XNN_LIKELY(bias != NULL) {
+      group_bias = bias[group];
+    }
+    packed_weights[0] = group_bias;
+    for (size_t tap = 0; tap < kernel_size; tap++) {
+      packed_weights[1 + tap] = kernel[tap * groups + group];  // source is tap-major, group-minor
+    }
+    packed_weights += 1 + kernel_size;
+  }
+}
+
+void xnn_pack_f32_vmulcaddc_w(
+  size_t c,
+  size_t cr,
+  const float* s,
+  const float* b,
+  float* packed_w)
+{
+  for (size_t c_begin = 0; c_begin < c; c_begin += cr) {
+    const size_t c_count = min(c - c_begin, cr);
+    float* bias_row = packed_w + cr;  // each block is a cr-wide scale row followed by a cr-wide bias row
+    for (size_t ch = 0; ch < c_count; ch++) {
+      packed_w[ch] = s[c_begin + ch];
+    }
+    if XNN_LIKELY(b != NULL) {
+      for (size_t ch = 0; ch < c_count; ch++) {
+        bias_row[ch] = b[c_begin + ch];
+      }
+    } else {
+      size_t ch = 0;
+      do {
+        bias_row[ch] = 0.0f;  // no bias supplied: zero the valid bias entries
+      } while (++ch != c_count);
+    }
+    packed_w = bias_row + cr;
+  }
+}
+
+void xnn_pack_f16_vmulcaddc_w(
+  size_t c,
+  size_t cr,
+  const uint16_t* s,
+  const uint16_t* b,
+  uint16_t* packed_w)
+{
+  for (size_t c_begin = 0; c_begin < c; c_begin += cr) {
+    const size_t c_count = min(c - c_begin, cr);
+    uint16_t* bias_row = packed_w + cr;  // each block is a cr-wide scale row followed by a cr-wide bias row
+    for (size_t ch = 0; ch < c_count; ch++) {
+      packed_w[ch] = s[c_begin + ch];
+    }
+    if XNN_LIKELY(b != NULL) {
+      for (size_t ch = 0; ch < c_count; ch++) {
+        bias_row[ch] = b[c_begin + ch];
+      }
+    } else {
+      size_t ch = 0;
+      do {
+        bias_row[ch] = 0;  // no bias supplied: zero the valid bias entries
+      } while (++ch != c_count);
+    }
+    packed_w = bias_row + cr;
+  }
+}
diff --git a/src/xnnpack/pack.h b/src/xnnpack/pack.h
index 95c55dd..4c7e13d 100644
--- a/src/xnnpack/pack.h
+++ b/src/xnnpack/pack.h
@@ -9,11 +9,17 @@
 #pragma once
 
 #include <stdint.h>
-#include <xnnpack/math.h>
+#include <stddef.h>
+
+#include <xnnpack/common.h>
 #include <xnnpack/operator.h>
 
 
-static inline void xnn_pack_q8_gemm_goi_w(
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+XNN_INTERNAL void xnn_pack_q8_gemm_goi_w(
   size_t g,
   size_t nc,
   size_t kc,
@@ -23,50 +29,9 @@
   uint8_t kzp,
   const uint8_t* k,
   const int32_t* b,
-  void* packed_w)
-{
-  const int32_t boff = (int32_t) kc * (int32_t) izp * (int32_t) kzp;
-  do {
-    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
-      const size_t nr_block_size = min(nc - nr_block_start, nr);
-      int32_t* packed_b = (int32_t*) packed_w;
-      if XNN_LIKELY(b != NULL) {
-        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
-          *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
-          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
-        }
-      } else {
-        size_t n = nr_block_size;
-        do {
-          *((int32_t*) packed_w) = boff;
-          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
-        } while (--n != 0);
-      }
-      packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
-      for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
-        const size_t kr_block_size = min(kc - kr_block_start, kr);
-        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
-          int32_t ksum = 0;
-          for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
-            const uint8_t kv = k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
-            ksum += (int32_t) kv;
-            *((uint8_t*) packed_w) = kv;
-            packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
-          }
-          packed_b[nr_block_offset] -= ksum * (int32_t) izp;
-          packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
-        }
-        packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
-      }
-    }
-    k += nc * kc;
-    if XNN_UNPREDICTABLE(b != NULL) {
-      b += nc;
-    }
-  } while (--g != 0);
-}
+  void* packed_w);
 
-static inline void xnn_pack_q8_gemm_io_w(
+XNN_INTERNAL void xnn_pack_q8_gemm_io_w(
   size_t nc,
   size_t kc,
   uint32_t nr,
@@ -75,44 +40,9 @@
   uint8_t kzp,
   const uint8_t* k,
   const int32_t* b,
-  void* packed_w)
-{
-  const int32_t boff = (int32_t) kc * (int32_t) izp * (int32_t) kzp;
-  for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
-    const size_t nr_block_size = min(nc - nr_block_start, nr);
-    int32_t* packed_b = (int32_t*) packed_w;
-    if XNN_LIKELY(b != NULL) {
-      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
-        *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
-        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
-      }
-    } else {
-      size_t n = nr_block_size;
-      do {
-        *((int32_t*) packed_w) = boff;
-        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
-      } while (--n != 0);
-    }
-    packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
-    for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
-      const size_t kr_block_size = min(kc - kr_block_start, kr);
-      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
-        int32_t ksum = 0;
-        for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
-          const uint8_t kv = k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
-          ksum += (int32_t) kv;
-          *((uint8_t*) packed_w) = kv;
-          packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
-        }
-        packed_b[nr_block_offset] -= ksum * (int32_t) izp;
-        packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
-      }
-      packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
-    }
-  }
-}
+  void* packed_w);
 
-static inline void xnn_pack_q8_conv_goki_w(
+XNN_INTERNAL void xnn_pack_q8_conv_goki_w(
   size_t g,
   size_t nc,
   size_t ks,
@@ -123,53 +53,9 @@
   uint8_t kzp,
   const uint8_t* k,
   const int32_t* b,
-  void* packed_w)
-{
-  const int32_t boff = (int32_t) ks * (int32_t) kc * (int32_t) izp * (int32_t) kzp;
-  do {
-    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
-      const size_t nr_block_size = min(nc - nr_block_start, nr);
-      int32_t* packed_b = (int32_t*) packed_w;
-      if XNN_LIKELY(b != NULL) {
-        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
-          *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
-          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
-        }
-      } else {
-        size_t n = nr_block_size;
-        do {
-          *((int32_t*) packed_w) = boff;
-          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
-        } while (--n != 0);
-      }
-      packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
-      for (size_t ki = 0; ki < ks; ki++) {
-        for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
-          const size_t kr_block_size = min(kc - kr_block_start, kr);
-          for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
-            int32_t ksum = 0;
-            for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
-              const uint8_t kv =
-                k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
-              ksum += (int32_t) kv;
-              *((uint8_t*) packed_w) = kv;
-              packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
-            }
-            packed_b[nr_block_offset] -= ksum * (int32_t) izp;
-            packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
-          }
-          packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
-        }
-      }
-    }
-    k += ks * kc * nc;
-    if XNN_UNPREDICTABLE(b != NULL) {
-      b += nc;
-    }
-  } while (--g != 0);
-}
+  void* packed_w);
 
-static inline void xnn_pack_q8_conv_kgo_w(
+XNN_INTERNAL void xnn_pack_q8_conv_kgo_w(
   size_t g,
   size_t nc,
   size_t ks,
@@ -179,45 +65,9 @@
   uint8_t kzp,
   const uint8_t* k,
   const int32_t* b,
-  void* packed_w)
-{
-  const int32_t boff = (int32_t) ks * (int32_t) izp * (int32_t) kzp;
-  for (size_t i = 0; i < g; i++) {
-    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
-      const size_t nr_block_size = min(nc - nr_block_start, nr);
-      int32_t* packed_b = (int32_t*) packed_w;
-      if XNN_LIKELY(b != NULL) {
-        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
-          *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
-          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
-        }
-      } else {
-        size_t n = nr_block_size;
-        do {
-          *((int32_t*) packed_w) = boff;
-          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
-        } while (--n != 0);
-      }
-      packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
-      for (size_t ki = 0; ki < ks; ki++) {
-        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
-          const uint8_t kv =
-            k[ki * g * nc + (nr_block_start + nr_block_offset)];
-          *((uint8_t*) packed_w) = kv;
-          packed_b[nr_block_offset] -= (int32_t) kv * (int32_t) izp;
-          packed_w = (void*) ((uintptr_t) packed_w + kr * sizeof(uint8_t));
-        }
-        packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
-      }
-    }
-    k += nc;
-    if XNN_UNPREDICTABLE(b != NULL) {
-      b += nc;
-    }
-  }
-}
+  void* packed_w);
 
-static inline void xnn_pack_q8_deconv_goki_w(
+XNN_INTERNAL void xnn_pack_q8_deconv_goki_w(
   size_t g,
   size_t nc,
   size_t kh,
@@ -232,62 +82,9 @@
   const uint8_t* k,
   const int32_t* b,
   void* packed_w,
-  struct subconvolution_params* params)
-{
-  for (size_t i = 0; i < g; i++) {
-    for (size_t oy = 0; oy < sh; oy++) {
-      for (size_t ox = 0; ox < sw; ox++) {
-        if (i == 0) {
-          (*params++).weights = packed_w;
-        }
-        const int32_t boff = (int32_t) divide_round_up(kh - oy, sh) * (int32_t) divide_round_up(kw - ox, sw) * (int32_t) kc * (int32_t) izp * (int32_t) kzp;
-        for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
-          const size_t nr_block_size = min(nc - nr_block_start, nr);
-          int32_t* packed_b = (int32_t*) packed_w;
-          if XNN_LIKELY(b != 0) {
-            for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
-              *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
-              packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
-            }
-          } else {
-            size_t n = nr_block_size;
-            do {
-              *((int32_t*) packed_w) = boff;
-              packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
-            } while (--n != 0);
-          }
-          packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
-          for (size_t ky = oy; ky < kh; ky += sh) {
-            for (size_t kx = ox; kx < kw; kx += sw) {
-              for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
-                const size_t kr_block_size = min(kc - kr_block_start, kr);
-                for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
-                  int32_t ksum = 0;
-                  for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
-                    const uint8_t kv =
-                      k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
-                    ksum += (int32_t) kv;
-                    *((uint8_t*) packed_w) = kv;
-                    packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
-                  }
-                  packed_b[nr_block_offset] -= ksum * (int32_t) izp;
-                  packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
-                }
-                packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
-              }
-            }
-          }
-        }
-      }
-    }
-    k += kh * kw * kc * nc;
-    if XNN_UNPREDICTABLE(b != NULL) {
-      b += nc;
-    }
-  }
-}
+  struct subconvolution_params* params);
 
-static inline void xnn_pack_q8_dwconv_ghw_w(
+XNN_INTERNAL void xnn_pack_q8_dwconv_ghw_w(
   size_t h,
   size_t w,
   size_t c,
@@ -296,40 +93,9 @@
   uint8_t kzp,
   const uint8_t* k,
   const int32_t* b,
-  void* packed_w)
-{
-  const int32_t boff = (int32_t) h * (int32_t) w * (int32_t) izp * (int32_t) kzp;
-  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
-    const size_t cr_block_size = min(c - cr_block_start, cr);
-    int32_t* packed_b = (int32_t*) packed_w;
-    if XNN_LIKELY(b != NULL) {
-      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
-        *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset] + boff;
-        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
-      }
-    } else {
-      size_t n = cr_block_size;
-      do {
-        *((int32_t*) packed_w) = boff;
-        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
-      } while (--n != 0);
-    }
-    packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
-    for (size_t x = 0; x < w; x++) {
-      for (size_t y = 0; y < h; y++) {
-        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
-          const uint8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
-          packed_b[cr_block_offset] -= (int32_t) kv * (int32_t) izp;
-          *((uint8_t*) packed_w) = kv;
-          packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
-        }
-        packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(uint8_t));
-      }
-    }
-  }
-}
+  void* packed_w);
 
-static inline void xnn_pack_q8_dwconv_hwg_w(
+XNN_INTERNAL void xnn_pack_q8_dwconv_hwg_w(
   size_t h,
   size_t w,
   size_t c,
@@ -338,40 +104,9 @@
   uint8_t kzp,
   const uint8_t* k,
   const int32_t* b,
-  void* packed_w)
-{
-  const int32_t boff = (int32_t) h * (int32_t) w * (int32_t) izp * (int32_t) kzp;
-  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
-    const size_t cr_block_size = min(c - cr_block_start, cr);
-    int32_t* packed_b = (int32_t*) packed_w;
-    if XNN_LIKELY(b != NULL) {
-      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
-        *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset] + boff;
-        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
-      }
-    } else {
-      size_t n = cr_block_size;
-      do {
-        *((int32_t*) packed_w) = boff;
-        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
-      } while (--n != 0);
-    }
-    packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
-    for (size_t x = 0; x < w; x++) {
-      for (size_t y = 0; y < h; y++) {
-        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
-          const uint8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
-          packed_b[cr_block_offset] -= (int32_t) kv * (int32_t) izp;
-          *((uint8_t*) packed_w) = kv;
-          packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
-        }
-        packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(uint8_t));
-      }
-    }
-  }
-}
+  void* packed_w);
 
-static inline void xnn_pack_f16_gemm_goi_w(
+XNN_INTERNAL void xnn_pack_f16_gemm_goi_w(
   size_t g,
   size_t nc,
   size_t kc,
@@ -380,51 +115,9 @@
   size_t sr,
   const uint16_t* k,
   const uint16_t* b,
-  uint16_t* packed_w)
-{
-  const size_t skr = sr * kr;
-  const size_t skc = round_down_po2(kc, skr);
-  const size_t sr_mask = (sr - 1) * kr;
-  do {
-    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
-      const size_t nr_block_size = min(nc - nr_block_start, nr);
-      if XNN_LIKELY(b != NULL) {
-        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
-          packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
-        }
-      }
-      packed_w += nr;
+  uint16_t* packed_w);
 
-      for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
-        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
-          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
-            *packed_w++ =
-              k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
-          }
-        }
-        packed_w += (nr - nr_block_size) * kr;
-      }
-
-      for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
-        const size_t kr_block_size = min(kc - kr_block_start, kr);
-        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
-          for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
-            *packed_w++ =
-              k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
-          }
-          packed_w += kr - kr_block_size;
-        }
-        packed_w += (nr - nr_block_size) * kr;
-      }
-    }
-    k += nc * kc;
-    if XNN_UNPREDICTABLE(b != NULL) {
-      b += nc;
-    }
-  } while (--g != 0);
-}
-
-static inline void xnn_pack_f16_gemm_io_w(
+XNN_INTERNAL void xnn_pack_f16_gemm_io_w(
   size_t nc,
   size_t kc,
   size_t nr,
@@ -432,45 +125,9 @@
   size_t sr,
   const uint16_t* k,
   const uint16_t* b,
-  uint16_t* packed_w)
-{
-  const size_t skr = sr * kr;
-  const size_t skc = round_down_po2(kc, skr);
-  const size_t sr_mask = (sr - 1) * kr;
-  for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
-    const size_t nr_block_size = min(nc - nr_block_start, nr);
-    if XNN_LIKELY(b != NULL) {
-      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
-        packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
-      }
-    }
-    packed_w += nr;
+  uint16_t* packed_w);
 
-    for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
-      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
-        for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
-          *packed_w++ =
-            k[(round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
-        }
-      }
-      packed_w += (nr - nr_block_size) * kr;
-    }
-
-    for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
-      const size_t kr_block_size = min(kc - kr_block_start, kr);
-      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
-        for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
-          *packed_w++ =
-            k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
-        }
-        packed_w += kr - kr_block_size;
-      }
-      packed_w += (nr - nr_block_size) * kr;
-    }
-  }
-}
-
-static inline void xnn_pack_f16_gemminc_goi_w(
+XNN_INTERNAL void xnn_pack_f16_gemminc_goi_w(
   size_t g,
   size_t nc,
   size_t kc,
@@ -478,42 +135,9 @@
   size_t kr,
   size_t sr,
   const uint16_t* k,
-  uint16_t* packed_w)
-{
-  const size_t skr = sr * kr;
-  const size_t skc = round_down_po2(kc, skr);
-  const size_t sr_mask = (sr - 1) * kr;
-  do {
-    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
-      const size_t nr_block_size = min(nc - nr_block_start, nr);
+  uint16_t* packed_w);
 
-      for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
-        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
-          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
-            *packed_w++ =
-              k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
-          }
-        }
-        packed_w += (nr - nr_block_size) * kr;
-      }
-
-      for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
-        const size_t kr_block_size = min(kc - kr_block_start, kr);
-        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
-          for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
-            *packed_w++ =
-              k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
-          }
-          packed_w += kr - kr_block_size;
-        }
-        packed_w += (nr - nr_block_size) * kr;
-      }
-    }
-    k += nc * kc;
-  } while (--g != 0);
-}
-
-static inline void xnn_pack_f16_conv_goki_w(
+XNN_INTERNAL void xnn_pack_f16_conv_goki_w(
   size_t g,
   size_t nc,
   size_t ks,
@@ -523,53 +147,9 @@
   size_t sr,
   const uint16_t* k,
   const uint16_t* b,
-  uint16_t* packed_w)
-{
-  const size_t skr = sr * kr;
-  const size_t skc = round_down_po2(kc, skr);
-  const size_t sr_mask = (sr - 1) * kr;
-  do {
-    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
-      const size_t nr_block_size = min(nc - nr_block_start, nr);
-      if XNN_LIKELY(b != NULL) {
-        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
-          packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
-        }
-      }
-      packed_w += nr;
+  uint16_t* packed_w);
 
-      for (size_t ki = 0; ki < ks; ki++) {
-        for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
-          for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
-            for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
-              *packed_w++ =
-                k[((nr_block_start + nr_block_offset) * ks + ki) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
-            }
-          }
-          packed_w += (nr - nr_block_size) * kr;
-        }
-
-        for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
-          const size_t kr_block_size = min(kc - kr_block_start, kr);
-          for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
-            for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
-              *packed_w++ =
-                k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
-            }
-            packed_w += kr - kr_block_size;
-          }
-          packed_w += (nr - nr_block_size) * kr;
-        }
-      }
-    }
-    k += ks * kc * nc;
-    if XNN_UNPREDICTABLE(b != NULL) {
-      b += nc;
-    }
-  } while (--g != 0);
-}
-
-static inline void xnn_pack_f16_conv_kgo_w(
+XNN_INTERNAL void xnn_pack_f16_conv_kgo_w(
   size_t g,
   size_t nc,
   size_t ks,
@@ -577,34 +157,9 @@
   size_t kr,
   const uint16_t* k,
   const uint16_t* b,
-  uint16_t* packed_w)
-{
-  for (size_t i = 0; i < g; i++) {
-    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
-      const size_t nr_block_size = min(nc - nr_block_start, nr);
-      if XNN_LIKELY(b != NULL) {
-        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
-          packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
-        }
-      }
-      packed_w += nr;
-      for (size_t ki = 0; ki < ks; ki++) {
-        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
-          *packed_w =
-            k[ki * g * nc + (nr_block_start + nr_block_offset)];
-          packed_w += kr;
-        }
-        packed_w += (nr - nr_block_size) * kr;
-      }
-    }
-    k += nc;
-    if XNN_UNPREDICTABLE(b != NULL) {
-      b += nc;
-    }
-  }
-}
+  uint16_t* packed_w);
 
-static inline void xnn_pack_f16_dconv_oki_w(
+XNN_INTERNAL void xnn_pack_f16_dconv_oki_w(
   size_t nc,
   size_t kc,
   size_t nr,
@@ -612,37 +167,9 @@
   size_t kw,
   const uint16_t* k,
   const uint16_t* b,
-  uint16_t* packed_w)
-{
-  for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
-    const size_t nr_block_size = min(nc - nr_block_start, nr);
-    if XNN_LIKELY(b != NULL) {
-      for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
-        *packed_w++ = b[min(nr_block_offset, nr_block_size - 1)];
-      }
-    } else {
-      size_t n = nr;
-      do {
-        *packed_w++ = 0;
-      } while (--n != 0);
-    }
+  uint16_t* packed_w);
 
-    for (size_t kx = 0; kx < kw; kx++) {
-      for (size_t c = 0; c < kc; c++) {
-        for (size_t ky = 0; ky < kh; ky++) {
-          for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
-            *packed_w++ = k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) * kh + ky) * kw + kx) * kc + c];
-          }
-        }
-      }
-    }
-    if XNN_UNPREDICTABLE(b != NULL) {
-      b += nr;
-    }
-  }
-}
-
-static inline void xnn_pack_f16_deconv_goki_w(
+XNN_INTERNAL void xnn_pack_f16_deconv_goki_w(
   size_t g,
   size_t nc,
   size_t kh,
@@ -656,149 +183,34 @@
   const uint16_t* k,
   const uint16_t* b,
   uint16_t* packed_w,
-  struct subconvolution_params* params)
-{
-  const size_t skr = sr * kr;
-  const size_t skc = round_down_po2(kc, skr);
-  const size_t sr_mask = (sr - 1) * kr;
-  for (size_t i = 0; i < g; i++) {
-    for (size_t oy = 0; oy < sh; oy++) {
-      for (size_t ox = 0; ox < sw; ox++) {
-        if (i == 0) {
-          (*params++).weights = packed_w;
-        }
-        for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
-          const size_t nr_block_size = min(nc - nr_block_start, nr);
-          if XNN_LIKELY(b != NULL) {
-            for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
-              packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
-            }
-          }
-          packed_w += nr;
-          for (size_t ky = oy; ky < kh; ky += sh) {
-            for (size_t kx = ox; kx < kw; kx += sw) {
-              for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
-                for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
-                  for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
-                    *packed_w++ =
-                      k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
-                  }
-                }
-                packed_w += (nr - nr_block_size) * kr;
-              }
+  struct subconvolution_params* params);
 
-              for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
-                const size_t kr_block_size = min(kc - kr_block_start, kr);
-                for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
-                  for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
-                    *packed_w++ =
-                      k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
-                  }
-                  packed_w += kr - kr_block_size;
-                }
-                packed_w += (nr - nr_block_size) * kr;
-              }
-            }
-          }
-        }
-      }
-    }
-    k += kh * kw * kc * nc;
-    if XNN_UNPREDICTABLE(b != NULL) {
-      b += nc;
-    }
-  }
-}
-
-static inline void xnn_pack_f16_dwconv_ghw_w(
+XNN_INTERNAL void xnn_pack_f16_dwconv_ghw_w(
   size_t h,
   size_t w,
   size_t c,
   size_t cr,
   const uint16_t* k,
   const uint16_t* b,
-  uint16_t* packed_w)
-{
-  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
-    const size_t cr_block_size = min(c - cr_block_start, cr);
-    if XNN_LIKELY(b != NULL) {
-      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
-        *packed_w++ = b[cr_block_start + cr_block_offset];
-      }
-    } else {
-      size_t n = cr_block_size;
-      do {
-        *packed_w++ = 0;
-      } while (--n != 0);
-    }
-    packed_w += cr - cr_block_size;
-    for (size_t x = 0; x < w; x++) {
-      for (size_t y = 0; y < h; y++) {
-        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
-          const uint16_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
-          *packed_w++ = kv;
-        }
-        packed_w += cr - cr_block_size;
-      }
-    }
-  }
-}
+  uint16_t* packed_w);
 
-static inline void xnn_pack_f16_dwconv_hwg_w(
+XNN_INTERNAL void xnn_pack_f16_dwconv_hwg_w(
   size_t h,
   size_t w,
   size_t c,
   size_t cr,
   const uint16_t* k,
   const uint16_t* b,
-  uint16_t* packed_w)
-{
-  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
-    const size_t cr_block_size = min(c - cr_block_start, cr);
-    if XNN_LIKELY(b != NULL) {
-      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
-        *packed_w++ = b[cr_block_start + cr_block_offset];
-      }
-    } else {
-      size_t n = cr_block_size;
-      do {
-        *packed_w++ = 0;
-      } while (--n != 0);
-    }
-    packed_w += cr - cr_block_size;
-    for (size_t x = 0; x < w; x++) {
-      for (size_t y = 0; y < h; y++) {
-        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
-          const uint16_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
-          *packed_w++ = kv;
-        }
-        packed_w += cr - cr_block_size;
-      }
-    }
-  }
-}
+  uint16_t* packed_w);
 
-static inline void xnn_pack_f16_chw_dwconv_ghw_w(
+XNN_INTERNAL void xnn_pack_f16_chw_dwconv_ghw_w(
   size_t kernel_size,
   size_t groups,
   const uint16_t* kernel,
   const uint16_t* bias,
-  uint16_t* packed_weights)
-{
-  for (size_t g = 0; g < groups; g++) {
-    if XNN_LIKELY(bias != NULL) {
-      *packed_weights = *bias++;
-    } else {
-      *packed_weights = 0;
-    }
-    packed_weights += 1;
-    for (size_t i = 0; i < kernel_size; i++) {
-      *packed_weights++ = kernel[g * kernel_size + i];
-    }
-  }
-}
+  uint16_t* packed_weights);
 
-static inline void xnn_pack_f32_gemm_goi_w(
+XNN_INTERNAL void xnn_pack_f32_gemm_goi_w(
   size_t g,
   size_t nc,
   size_t kc,
@@ -807,51 +219,9 @@
   size_t sr,
   const float* k,
   const float* b,
-  float* packed_w)
-{
-  const size_t skr = sr * kr;
-  const size_t skc = round_down_po2(kc, skr);
-  const size_t sr_mask = (sr - 1) * kr;
-  do {
-    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
-      const size_t nr_block_size = min(nc - nr_block_start, nr);
-      if XNN_LIKELY(b != NULL) {
-        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
-          packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
-        }
-      }
-      packed_w += nr;
+  float* packed_w);
 
-      for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
-        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
-          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
-            *packed_w++ =
-              k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
-          }
-        }
-        packed_w += (nr - nr_block_size) * kr;
-      }
-
-      for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
-        const size_t kr_block_size = min(kc - kr_block_start, kr);
-        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
-          for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
-            *packed_w++ =
-              k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
-          }
-          packed_w += kr - kr_block_size;
-        }
-        packed_w += (nr - nr_block_size) * kr;
-      }
-    }
-    k += nc * kc;
-    if XNN_UNPREDICTABLE(b != NULL) {
-      b += nc;
-    }
-  } while (--g != 0);
-}
-
-static inline void xnn_pack_f32_gemm_io_w(
+XNN_INTERNAL void xnn_pack_f32_gemm_io_w(
   size_t nc,
   size_t kc,
   size_t nr,
@@ -859,45 +229,9 @@
   size_t sr,
   const float* k,
   const float* b,
-  float* packed_w)
-{
-  const size_t skr = sr * kr;
-  const size_t skc = round_down_po2(kc, skr);
-  const size_t sr_mask = (sr - 1) * kr;
-  for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
-    const size_t nr_block_size = min(nc - nr_block_start, nr);
-    if XNN_LIKELY(b != NULL) {
-      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
-        packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
-      }
-    }
-    packed_w += nr;
+  float* packed_w);
 
-    for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
-      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
-        for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
-          *packed_w++ =
-            k[(round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
-        }
-      }
-      packed_w += (nr - nr_block_size) * kr;
-    }
-
-    for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
-      const size_t kr_block_size = min(kc - kr_block_start, kr);
-      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
-        for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
-          *packed_w++ =
-            k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
-        }
-        packed_w += kr - kr_block_size;
-      }
-      packed_w += (nr - nr_block_size) * kr;
-    }
-  }
-}
-
-static inline void xnn_pack_f32_gemminc_goi_w(
+XNN_INTERNAL void xnn_pack_f32_gemminc_goi_w(
   size_t g,
   size_t nc,
   size_t kc,
@@ -905,42 +239,9 @@
   size_t kr,
   size_t sr,
   const float* k,
-  float* packed_w)
-{
-  const size_t skr = sr * kr;
-  const size_t skc = round_down_po2(kc, skr);
-  const size_t sr_mask = (sr - 1) * kr;
-  do {
-    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
-      const size_t nr_block_size = min(nc - nr_block_start, nr);
+  float* packed_w);
 
-      for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
-        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
-          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
-            *packed_w++ =
-              k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
-          }
-        }
-        packed_w += (nr - nr_block_size) * kr;
-      }
-
-      for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
-        const size_t kr_block_size = min(kc - kr_block_start, kr);
-        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
-          for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
-            *packed_w++ =
-              k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
-          }
-          packed_w += kr - kr_block_size;
-        }
-        packed_w += (nr - nr_block_size) * kr;
-      }
-    }
-    k += nc * kc;
-  } while (--g != 0);
-}
-
-static inline void xnn_pack_f32_conv_goki_w(
+XNN_INTERNAL void xnn_pack_f32_conv_goki_w(
   size_t g,
   size_t nc,
   size_t ks,
@@ -950,53 +251,9 @@
   size_t sr,
   const float* k,
   const float* b,
-  float* packed_w)
-{
-  const size_t skr = sr * kr;
-  const size_t skc = round_down_po2(kc, skr);
-  const size_t sr_mask = (sr - 1) * kr;
-  do {
-    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
-      const size_t nr_block_size = min(nc - nr_block_start, nr);
-      if XNN_LIKELY(b != NULL) {
-        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
-          packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
-        }
-      }
-      packed_w += nr;
+  float* packed_w);
 
-      for (size_t ki = 0; ki < ks; ki++) {
-        for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
-          for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
-            for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
-              *packed_w++ =
-                k[((nr_block_start + nr_block_offset) * ks + ki) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
-            }
-          }
-          packed_w += (nr - nr_block_size) * kr;
-        }
-
-        for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
-          const size_t kr_block_size = min(kc - kr_block_start, kr);
-          for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
-            for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
-              *packed_w++ =
-                k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
-            }
-            packed_w += kr - kr_block_size;
-          }
-          packed_w += (nr - nr_block_size) * kr;
-        }
-      }
-    }
-    k += ks * kc * nc;
-    if XNN_UNPREDICTABLE(b != NULL) {
-      b += nc;
-    }
-  } while (--g != 0);
-}
-
-static inline void xnn_pack_f32_conv_kgo_w(
+XNN_INTERNAL void xnn_pack_f32_conv_kgo_w(
   size_t g,
   size_t nc,
   size_t ks,
@@ -1004,34 +261,9 @@
   size_t kr,
   const float* k,
   const float* b,
-  float* packed_w)
-{
-  for (size_t i = 0; i < g; i++) {
-    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
-      const size_t nr_block_size = min(nc - nr_block_start, nr);
-      if XNN_LIKELY(b != NULL) {
-        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
-          packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
-        }
-      }
-      packed_w += nr;
-      for (size_t ki = 0; ki < ks; ki++) {
-        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
-          *packed_w =
-            k[ki * g * nc + (nr_block_start + nr_block_offset)];
-          packed_w += kr;
-        }
-        packed_w += (nr - nr_block_size) * kr;
-      }
-    }
-    k += nc;
-    if XNN_UNPREDICTABLE(b != NULL) {
-      b += nc;
-    }
-  }
-}
+  float* packed_w);
 
-static inline void xnn_pack_f32_dconv_oki_w(
+XNN_INTERNAL void xnn_pack_f32_dconv_oki_w(
   size_t nc,
   size_t kc,
   size_t nr,
@@ -1039,37 +271,9 @@
   size_t kw,
   const float* k,
   const float* b,
-  float* packed_w)
-{
-  for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
-    const size_t nr_block_size = min(nc - nr_block_start, nr);
-    if XNN_LIKELY(b != NULL) {
-      for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
-        *packed_w++ = b[min(nr_block_offset, nr_block_size - 1)];
-      }
-    } else {
-      size_t n = nr;
-      do {
-        *packed_w++ = 0.0f;
-      } while (--n != 0);
-    }
+  float* packed_w);
 
-    for (size_t kx = 0; kx < kw; kx++) {
-      for (size_t c = 0; c < kc; c++) {
-        for (size_t ky = 0; ky < kh; ky++) {
-          for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
-            *packed_w++ = k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) * kh + ky) * kw + kx) * kc + c];
-          }
-        }
-      }
-    }
-    if XNN_UNPREDICTABLE(b != NULL) {
-      b += nr;
-    }
-  }
-}
-
-static inline void xnn_pack_f32_deconv_goki_w(
+XNN_INTERNAL void xnn_pack_f32_deconv_goki_w(
   size_t g,
   size_t nc,
   size_t kh,
@@ -1083,218 +287,54 @@
   const float* k,
   const float* b,
   float* packed_w,
-  struct subconvolution_params* params)
-{
-  const size_t skr = sr * kr;
-  const size_t skc = round_down_po2(kc, skr);
-  const size_t sr_mask = (sr - 1) * kr;
-  for (size_t i = 0; i < g; i++) {
-    for (size_t oy = 0; oy < sh; oy++) {
-      for (size_t ox = 0; ox < sw; ox++) {
-        if (i == 0) {
-          (*params++).weights = packed_w;
-        }
-        for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
-          const size_t nr_block_size = min(nc - nr_block_start, nr);
-          if XNN_LIKELY(b != NULL) {
-            for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
-              packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
-            }
-          }
-          packed_w += nr;
-          for (size_t ky = oy; ky < kh; ky += sh) {
-            for (size_t kx = ox; kx < kw; kx += sw) {
-              for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
-                for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
-                  for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
-                    *packed_w++ =
-                      k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
-                  }
-                }
-                packed_w += (nr - nr_block_size) * kr;
-              }
+  struct subconvolution_params* params);
 
-              for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
-                const size_t kr_block_size = min(kc - kr_block_start, kr);
-                for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
-                  for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
-                    *packed_w++ =
-                      k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
-                  }
-                  packed_w += kr - kr_block_size;
-                }
-                packed_w += (nr - nr_block_size) * kr;
-              }
-            }
-          }
-        }
-      }
-    }
-    k += kh * kw * kc * nc;
-    if XNN_UNPREDICTABLE(b != NULL) {
-      b += nc;
-    }
-  }
-}
-
-static inline void xnn_pack_f32_dwconv_ghw_w(
+XNN_INTERNAL void xnn_pack_f32_dwconv_ghw_w(
   size_t h,
   size_t w,
   size_t c,
   size_t cr,
   const float* k,
   const float* b,
-  float* packed_w)
-{
-  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
-    const size_t cr_block_size = min(c - cr_block_start, cr);
-    if XNN_LIKELY(b != NULL) {
-      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
-        *packed_w++ = b[cr_block_start + cr_block_offset];
-      }
-    } else {
-      size_t n = cr_block_size;
-      do {
-        *packed_w++ = 0.0f;
-      } while (--n != 0);
-    }
-    packed_w += cr - cr_block_size;
-    for (size_t x = 0; x < w; x++) {
-      for (size_t y = 0; y < h; y++) {
-        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
-          const float kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
-          *packed_w++ = kv;
-        }
-        packed_w += cr - cr_block_size;
-      }
-    }
-  }
-}
+  float* packed_w);
 
-static inline void xnn_pack_f32_dwconv_hwg_w(
+XNN_INTERNAL void xnn_pack_f32_dwconv_hwg_w(
   size_t h,
   size_t w,
   size_t c,
   size_t cr,
   const float* k,
   const float* b,
-  float* packed_w)
-{
-  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
-    const size_t cr_block_size = min(c - cr_block_start, cr);
-    if XNN_LIKELY(b != NULL) {
-      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
-        *packed_w++ = b[cr_block_start + cr_block_offset];
-      }
-    } else {
-      size_t n = cr_block_size;
-      do {
-        *packed_w++ = 0.0f;
-      } while (--n != 0);
-    }
-    packed_w += cr - cr_block_size;
-    for (size_t x = 0; x < w; x++) {
-      for (size_t y = 0; y < h; y++) {
-        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
-          const float kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
-          *packed_w++ = kv;
-        }
-        packed_w += cr - cr_block_size;
-      }
-    }
-  }
-}
+  float* packed_w);
 
-static inline void xnn_pack_f32_chw_dwconv_ghw_w(
+XNN_INTERNAL void xnn_pack_f32_chw_dwconv_ghw_w(
   size_t kernel_size,
   size_t groups,
   const float* kernel,
   const float* bias,
-  float* packed_weights)
-{
-  for (size_t g = 0; g < groups; g++) {
-    if XNN_LIKELY(bias != NULL) {
-      *packed_weights = *bias++;
-    } else {
-      *packed_weights = 0.0f;
-    }
-    packed_weights += 1;
-    for (size_t i = 0; i < kernel_size; i++) {
-      *packed_weights++ = kernel[g * kernel_size + i];
-    }
-  }
-}
+  float* packed_weights);
 
-static inline void xnn_pack_f32_chw_dwconv_hwg_w(
+XNN_INTERNAL void xnn_pack_f32_chw_dwconv_hwg_w(
   size_t kernel_size,
   size_t groups,
   const float* kernel,
   const float* bias,
-  float* packed_weights)
-{
-  for (size_t g = 0; g < groups; g++) {
-    if XNN_LIKELY(bias != NULL) {
-      *packed_weights = *bias++;
-    } else {
-      *packed_weights = 0.0f;
-    }
-    packed_weights += 1;
-    for (size_t i = 0; i < kernel_size; i++) {
-      *packed_weights++ = kernel[i * groups + g];
-    }
-  }
-}
+  float* packed_weights);
 
-static inline void xnn_pack_f16_vmulcaddc_w(
+XNN_INTERNAL void xnn_pack_f16_vmulcaddc_w(
   size_t c,
   size_t cr,
   const uint16_t* s,
   const uint16_t* b,
-  uint16_t* packed_w)
-{
-  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
-    const size_t cr_block_size = min(c - cr_block_start, cr);
-    for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
-      *packed_w++ = s[cr_block_start + cr_block_offset];
-    }
-    packed_w += cr - cr_block_size;
-    if XNN_LIKELY(b != NULL) {
-      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
-        *packed_w++ = b[cr_block_start + cr_block_offset];
-      }
-    } else {
-      size_t n = cr_block_size;
-      do {
-        *packed_w++ = 0;
-      } while (--n != 0);
-    }
-    packed_w += cr - cr_block_size;
-  }
-}
+  uint16_t* packed_w);
 
-static inline void xnn_pack_f32_vmulcaddc_w(
+XNN_INTERNAL void xnn_pack_f32_vmulcaddc_w(
   size_t c,
   size_t cr,
   const float* s,
   const float* b,
-  float* packed_w)
-{
-  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
-    const size_t cr_block_size = min(c - cr_block_start, cr);
-    for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
-      *packed_w++ = s[cr_block_start + cr_block_offset];
-    }
-    packed_w += cr - cr_block_size;
-    if XNN_LIKELY(b != NULL) {
-      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
-        *packed_w++ = b[cr_block_start + cr_block_offset];
-      }
-    } else {
-      size_t n = cr_block_size;
-      do {
-        *packed_w++ = 0.0f;
-      } while (--n != 0);
-    }
-    packed_w += cr - cr_block_size;
-  }
-}
+  float* packed_w);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif