Initial open-source release

PiperOrigin-RevId: 271685289
diff --git a/src/xnnpack/AlignedAllocator.h b/src/xnnpack/AlignedAllocator.h
new file mode 100644
index 0000000..ee12481
--- /dev/null
+++ b/src/xnnpack/AlignedAllocator.h
@@ -0,0 +1,104 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once

+#include <cstddef>
+#include <limits>
+#include <memory>
+#include <new>
+#include <type_traits>
+#include <utility>
+
+#include <stdlib.h>
+#ifdef __ANDROID__
+  #include <malloc.h>  // memalign
+#endif
+
+template <typename T, size_t Alignment>
+class AlignedAllocator;
+
+template <size_t Alignment>
+class AlignedAllocator<void, Alignment> {
+ public:
+  typedef void* pointer;
+  typedef const void* const_pointer;
+  typedef void value_type;
+
+  template <class U>
+  struct rebind {
+    typedef AlignedAllocator<U, Alignment> other;
+  };
+};
+
+template <typename T, size_t Alignment>
+class AlignedAllocator {
+ public:
+  typedef T value_type;
+  typedef T* pointer;
+  typedef const T* const_pointer;
+  typedef T& reference;
+  typedef const T& const_reference;
+  typedef size_t size_type;
+  typedef ptrdiff_t difference_type;
+
+#if __cplusplus >= 201402L
+  typedef std::true_type propagate_on_container_move_assignment;
+#endif
+
+  template <class U>
+  struct rebind {
+    typedef AlignedAllocator<U, Alignment> other;
+  };
+
+ public:
+  inline AlignedAllocator() noexcept {}
+
+  template <class U>
+  inline AlignedAllocator(
+      const AlignedAllocator<U, Alignment>& other) noexcept {}
+
+  inline size_type max_size() const noexcept {
+    return (std::numeric_limits<size_type>::max() - size_type(Alignment)) /
+        sizeof(T);
+  }
+
+  inline pointer address(reference x) const noexcept {
+    return std::addressof(x);
+  }
+
+  inline const_pointer address(const_reference x) const noexcept {
+    return std::addressof(x);
+  }
+
+  inline pointer allocate(
+      size_type n,
+      typename AlignedAllocator<void, Alignment>::const_pointer hint = 0) {
+#if defined(__ANDROID__)
+    void* memory = memalign(Alignment, n * sizeof(T));
+    if (memory == 0) {
+#if !defined(__GNUC__) || defined(__EXCEPTIONS)
+      throw std::bad_alloc();
+#endif
+    }
+#else
+    void* memory = nullptr;
+    if (posix_memalign(&memory, Alignment, n * sizeof(T)) != 0) {
+#if !defined(__GNUC__) || defined(__EXCEPTIONS)
+      throw std::bad_alloc();
+#endif
+    }
+#endif
+    return static_cast<pointer>(memory);
+  }
+
+  inline void deallocate(pointer p, size_type n) noexcept {
+    free(static_cast<void*>(p));
+  }
+
+  template <class U, class... Args>
+  inline void construct(U* p, Args&&... args) {
+    ::new (static_cast<void*>(p)) U(std::forward<Args>(args)...);
+  }
+
+  template <class U>
+  inline void destroy(U* p) {
+    p->~U();
+  }
+};
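+
+// Illustrative usage sketch (not part of this header's API): an STL
+// container whose storage is aligned for SIMD access. The alignment must be
+// a power of two; posix_memalign additionally requires a multiple of
+// sizeof(void*).
+//
+//   #include <vector>
+//
+//   std::vector<float, AlignedAllocator<float, 64>> buffer(1024);
+//   // buffer.data() is now 64-byte aligned.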
diff --git a/src/xnnpack/allocator.h b/src/xnnpack/allocator.h
new file mode 100644
index 0000000..303aa37
--- /dev/null
+++ b/src/xnnpack/allocator.h
@@ -0,0 +1,47 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#ifdef __ANDROID__
+  #include <malloc.h>
+#endif
+
+#include <cpuinfo.h>
+
+extern int posix_memalign(void **memptr, size_t alignment, size_t size);
+
+
+#define XNN_ALLOCATION_ALIGNMENT 16
+
+
+inline static void* xnn_allocate_memory(size_t memory_size) {
+  void* memory_ptr = NULL;
+#if CPUINFO_ARCH_ASMJS || CPUINFO_ARCH_WASM
+  memory_ptr = malloc(memory_size);
+#elif defined(__ANDROID__)
+  memory_ptr = memalign(XNN_ALLOCATION_ALIGNMENT, memory_size);
+#else
+  if (posix_memalign(&memory_ptr, XNN_ALLOCATION_ALIGNMENT, memory_size) != 0) {
+    return NULL;
+  }
+#endif
+  return memory_ptr;
+}
+
+inline static void* xnn_allocate_zero_memory(size_t memory_size) {
+  void* memory_ptr = xnn_allocate_memory(memory_size);
+  if (memory_ptr != NULL) {
+    memset(memory_ptr, 0, memory_size);
+  }
+  return memory_ptr;
+}
+
+inline static void xnn_release_memory(void* memory_ptr) {
+  free(memory_ptr);
+}
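+
+// Illustrative usage sketch; `size` and `scratch` are hypothetical names:
+//
+//   float* scratch = (float*) xnn_allocate_zero_memory(size * sizeof(float));
+//   if (scratch == NULL) {
+//     // handle allocation failure
+//   }
+//   ...
+//   xnn_release_memory(scratch);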
diff --git a/src/xnnpack/argmaxpool.h b/src/xnnpack/argmaxpool.h
new file mode 100644
index 0000000..5b9776d
--- /dev/null
+++ b/src/xnnpack/argmaxpool.h
@@ -0,0 +1,60 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <xnnpack/params.h>
+#include <xnnpack/common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
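+// Naming note (inferred from the kernel names below): "upN" kernels are
+// unipass and handle pooling windows of up to N elements in a single pass;
+// "mpXpYq" kernels are multipass, processing X elements in the first pass
+// and up to Y elements in each subsequent pass.
+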
+#define DECLARE_F32_ARGMAXPOOL_UNIPASS_UKERNEL_FUNCTION(fn_name) \
+  XNN_INTERNAL void fn_name(                                     \
+      size_t n,                                                  \
+      size_t ks,                                                 \
+      size_t kc,                                                 \
+      const float** x,                                           \
+      float* y,                                                  \
+      uint32_t* i,                                               \
+      size_t x_increment,                                        \
+      size_t y_increment,                                        \
+      const union xnn_f32_output_params* params);
+
+DECLARE_F32_ARGMAXPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_argmaxpool_ukernel_up4__psimd)
+DECLARE_F32_ARGMAXPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_argmaxpool_ukernel_up4__scalar)
+DECLARE_F32_ARGMAXPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_argmaxpool_ukernel_up4__sse2)
+DECLARE_F32_ARGMAXPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_argmaxpool_ukernel_up9__psimd)
+DECLARE_F32_ARGMAXPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_argmaxpool_ukernel_up9__scalar)
+DECLARE_F32_ARGMAXPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_argmaxpool_ukernel_up9__sse2)
+
+
+#define DECLARE_F32_ARGMAXPOOL_MULTIPASS_UKERNEL_FUNCTION(fn_name) \
+  XNN_INTERNAL void fn_name(                                       \
+      size_t n,                                                    \
+      size_t ks,                                                   \
+      size_t kc,                                                   \
+      const float** x,                                             \
+      float* ab,                                                   \
+      uint32_t* ib,                                                \
+      float* y,                                                    \
+      uint32_t* i,                                                 \
+      size_t x_increment,                                          \
+      size_t y_increment,                                          \
+      const union xnn_f32_output_params* params);
+
+DECLARE_F32_ARGMAXPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd)
+DECLARE_F32_ARGMAXPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_argmaxpool_ukernel_mp9p8q__scalar)
+DECLARE_F32_ARGMAXPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2)
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
diff --git a/src/xnnpack/assembly.h b/src/xnnpack/assembly.h
new file mode 100644
index 0000000..4ed7270
--- /dev/null
+++ b/src/xnnpack/assembly.h
@@ -0,0 +1,32 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#ifdef __ELF__
+  .macro BEGIN_FUNCTION name
+    .text
+    .p2align 4
+    .global \name
+    .type \name, %function
+    \name:
+  .endm
+
+  .macro END_FUNCTION name
+    .size \name, .-\name
+  .endm
+#elif defined(__MACH__)
+  .macro BEGIN_FUNCTION name
+    .text
+    .p2align 4
+    .global _\name
+    .private_extern _\name
+    _\name:
+  .endm
+
+  .macro END_FUNCTION name
+  .endm
+#endif
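+
+// Illustrative usage sketch in a preprocessed AArch64 assembly source
+// (.S file); the micro-kernel named here is declared in <xnnpack/gemm.h>:
+//
+//   #include <xnnpack/assembly.h>
+//
+//   BEGIN_FUNCTION xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128
+//       ...
+//       ret
+//   END_FUNCTION xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128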
diff --git a/src/xnnpack/avgpool.h b/src/xnnpack/avgpool.h
new file mode 100644
index 0000000..5fd51b9
--- /dev/null
+++ b/src/xnnpack/avgpool.h
@@ -0,0 +1,96 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <xnnpack/params.h>
+#include <xnnpack/common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#define DECLARE_F32_AVGPOOL_MULTIPASS_UKERNEL_FUNCTION(fn_name) \
+  XNN_INTERNAL void fn_name(                                    \
+      size_t n,                                                 \
+      size_t ks,                                                \
+      size_t kc,                                                \
+      const float** x,                                          \
+      const float* zero,                                        \
+      float* buffer,                                            \
+      float* y,                                                 \
+      size_t x_increment,                                       \
+      size_t y_increment,                                       \
+      const union xnn_f32_avgpool_params* params);
+
+DECLARE_F32_AVGPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_avgpool_ukernel_mp9p8q__neon)
+DECLARE_F32_AVGPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_avgpool_ukernel_mp9p8q__psimd)
+DECLARE_F32_AVGPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_avgpool_ukernel_mp9p8q__scalar)
+DECLARE_F32_AVGPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_avgpool_ukernel_mp9p8q__sse)
+
+
+#define DECLARE_F32_AVGPOOL_UNIPASS_UKERNEL_FUNCTION(fn_name) \
+  XNN_INTERNAL void fn_name(                                  \
+      size_t n,                                               \
+      size_t ks,                                              \
+      size_t kc,                                              \
+      const float** x,                                        \
+      const float* zero,                                      \
+      float* y,                                               \
+      size_t x_increment,                                     \
+      size_t y_increment,                                     \
+      const union xnn_f32_avgpool_params* params);
+
+DECLARE_F32_AVGPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_avgpool_ukernel_up9__neon)
+DECLARE_F32_AVGPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_avgpool_ukernel_up9__psimd)
+DECLARE_F32_AVGPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_avgpool_ukernel_up9__scalar)
+DECLARE_F32_AVGPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_avgpool_ukernel_up9__sse)
+
+
+#define DECLARE_Q8_AVGPOOL_MULTIPASS_UKERNEL_FUNCTION(fn_name)           \
+  XNN_INTERNAL void fn_name(                                             \
+      size_t n,                                                          \
+      size_t ks,                                                         \
+      size_t kc,                                                         \
+      const uint8_t** x,                                                 \
+      const uint8_t* zero,                                               \
+      int32_t* buffer,                                                   \
+      uint8_t* y,                                                        \
+      size_t x_increment,                                                \
+      size_t y_increment,                                                \
+      const union xnn_q8_avgpool_params* params);
+
+DECLARE_Q8_AVGPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_q8_avgpool_ukernel_mp9p8q__neon)
+DECLARE_Q8_AVGPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_q8_avgpool_ukernel_mp9p8q__scalar)
+DECLARE_Q8_AVGPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_q8_avgpool_ukernel_mp9p8q__sse2)
+
+
+#define DECLARE_Q8_AVGPOOL_UNIPASS_UKERNEL_FUNCTION(fn_name)             \
+  XNN_INTERNAL void fn_name(                                             \
+      size_t n,                                                          \
+      size_t ks,                                                         \
+      size_t kc,                                                         \
+      const uint8_t** x,                                                 \
+      const uint8_t* zero,                                               \
+      uint8_t* y,                                                        \
+      size_t x_increment,                                                \
+      size_t y_increment,                                                \
+      const union xnn_q8_avgpool_params* params);
+
+DECLARE_Q8_AVGPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_q8_avgpool_ukernel_up9__neon)
+DECLARE_Q8_AVGPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_q8_avgpool_ukernel_up9__scalar)
+DECLARE_Q8_AVGPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_q8_avgpool_ukernel_up9__sse2)
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
diff --git a/src/xnnpack/clamp.h b/src/xnnpack/clamp.h
new file mode 100644
index 0000000..db19d28
--- /dev/null
+++ b/src/xnnpack/clamp.h
@@ -0,0 +1,49 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <xnnpack/params.h>
+#include <xnnpack/common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#define DECLARE_F32_CLAMP_UKERNEL_FUNCTION(fn_name)   \
+  XNN_INTERNAL void fn_name(                          \
+      size_t n,                                       \
+      const float* x,                                 \
+      float* y,                                       \
+      const union xnn_f32_output_params* params);
+
+DECLARE_F32_CLAMP_UKERNEL_FUNCTION(xnn_f32_clamp_ukernel__neon)
+DECLARE_F32_CLAMP_UKERNEL_FUNCTION(xnn_f32_clamp_ukernel__psimd)
+DECLARE_F32_CLAMP_UKERNEL_FUNCTION(xnn_f32_clamp_ukernel__scalar)
+DECLARE_F32_CLAMP_UKERNEL_FUNCTION(xnn_f32_clamp_ukernel__sse)
+
+
+#define DECLARE_U8_CLAMP_UKERNEL_FUNCTION(fn_name) \
+  XNN_INTERNAL void fn_name(                       \
+      size_t n,                                    \
+      const uint8_t* x,                            \
+      uint8_t* y,                                  \
+      const union xnn_u8_output_params* params);
+
+DECLARE_U8_CLAMP_UKERNEL_FUNCTION(xnn_u8_clamp_ukernel__neon)
+DECLARE_U8_CLAMP_UKERNEL_FUNCTION(xnn_u8_clamp_ukernel__scalar)
+DECLARE_U8_CLAMP_UKERNEL_FUNCTION(xnn_u8_clamp_ukernel__sse2)
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
diff --git a/src/xnnpack/common.h b/src/xnnpack/common.h
new file mode 100644
index 0000000..0fc7011
--- /dev/null
+++ b/src/xnnpack/common.h
@@ -0,0 +1,67 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+
+#if defined(__GNUC__)
+  #if defined(__clang__) || (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5))
+    #define XNN_UNREACHABLE do { __builtin_unreachable(); } while (0)
+  #else
+    #define XNN_UNREACHABLE do { __builtin_trap(); } while (0)
+  #endif
+#elif defined(_MSC_VER)
+  #define XNN_UNREACHABLE __assume(0)
+#else
+  #define XNN_UNREACHABLE do { } while (0)
+#endif
+
+#if defined(__GNUC__)
+  #define XNN_ALIGN(alignment) __attribute__((__aligned__(alignment)))
+#elif defined(_MSC_VER)
+  #define XNN_ALIGN(alignment) __declspec(align(alignment))
+#else
+  #error "Platform-specific implementation of XNN_ALIGN required"
+#endif
+
+#define XNN_COUNT_OF(array) (sizeof(array) / sizeof(0[array]))
+
+#if defined(__GNUC__)
+  #define XNN_LIKELY(condition) (__builtin_expect(!!(condition), 1))
+  #define XNN_UNLIKELY(condition) (__builtin_expect(!!(condition), 0))
+#else
+  #define XNN_LIKELY(condition) (!!(condition))
+  #define XNN_UNLIKELY(condition) (!!(condition))
+#endif
+
+// TODO - __builtin_expect_with_probability for GCC 9+
+#if defined(__clang__) && (__has_builtin(__builtin_unpredictable))
+  #define XNN_UNPREDICTABLE(condition) (__builtin_unpredictable(!!(condition)))
+#else
+  #define XNN_UNPREDICTABLE(condition) (!!(condition))
+#endif
+
+#if defined(__GNUC__)
+  #define XNN_INLINE inline __attribute__((__always_inline__))
+#else
+  #define XNN_INLINE inline
+#endif
+
+#ifndef XNN_INTERNAL
+  #if defined(__ELF__)
+    #define XNN_INTERNAL __attribute__((__visibility__("internal")))
+  #elif defined(__MACH__)
+    #define XNN_INTERNAL __attribute__((__visibility__("hidden")))
+  #else
+    #define XNN_INTERNAL
+  #endif
+#endif
+
+#ifndef XNN_PRIVATE
+  #if defined(__ELF__)
+    #define XNN_PRIVATE __attribute__((__visibility__("hidden")))
+  #elif defined(__MACH__)
+    #define XNN_PRIVATE __attribute__((__visibility__("hidden")))
+  #else
+    #define XNN_PRIVATE
+  #endif
+#endif
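+
+// Illustrative usage sketch: branch annotations in a micro-kernel hot path.
+// The macros degrade to plain conditions where the builtins are unavailable,
+// so they are safe in portable code.
+//
+//   if XNN_UNPREDICTABLE(remainder != 0) { ... }  // data-dependent branch
+//   if XNN_UNLIKELY(channels == 0) { return; }    // rare edge case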
diff --git a/src/xnnpack/compute.h b/src/xnnpack/compute.h
new file mode 100644
index 0000000..fc8693a
--- /dev/null
+++ b/src/xnnpack/compute.h
@@ -0,0 +1,709 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <xnnpack.h>
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/params.h>
+
+
+enum xnn_parallelization_type {
+  xnn_parallelization_type_invalid = 0,
+  xnn_parallelization_type_1d,
+  xnn_parallelization_type_1d_tile_1d,
+  xnn_parallelization_type_2d,
+  xnn_parallelization_type_2d_tile_1d,
+  xnn_parallelization_type_2d_tile_2d,
+  xnn_parallelization_type_3d_tile_2d,
+  xnn_parallelization_type_4d_tile_2d,
+  xnn_parallelization_type_5d_tile_2d,
+  xnn_parallelization_type_6d_tile_2d,
+};
+
+struct compute_parameters {
+  enum xnn_parallelization_type type;
+  union {
+    pthreadpool_task_1d_t task_1d;
+    pthreadpool_task_1d_tile_1d_t task_1d_tile_1d;
+    pthreadpool_task_2d_t task_2d;
+    pthreadpool_task_2d_tile_1d_t task_2d_tile_1d;
+    pthreadpool_task_2d_tile_2d_t task_2d_tile_2d;
+    pthreadpool_task_3d_tile_2d_t task_3d_tile_2d;
+    pthreadpool_task_4d_tile_2d_t task_4d_tile_2d;
+    pthreadpool_task_5d_tile_2d_t task_5d_tile_2d;
+    pthreadpool_task_6d_tile_2d_t task_6d_tile_2d;
+  };
+  size_t range[6];
+  size_t tile[2];
+};
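+
+// Dispatch sketch (an assumption about how the operator runner consumes this
+// struct, not a declaration in this header): the runner switches on `type`
+// and forwards to the matching pthreadpool entry point, e.g.
+//
+//   case xnn_parallelization_type_2d_tile_2d:
+//     pthreadpool_parallelize_2d_tile_2d(
+//         threadpool, compute->task_2d_tile_2d, context,
+//         compute->range[0], compute->range[1],
+//         compute->tile[0], compute->tile[1],
+//         0 /* flags */);
+//     break;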
+
+struct gemm_context {
+  size_t k_scaled;
+  const void* a;
+  size_t a_stride;
+  const void* packed_w;
+  size_t w_stride;
+  size_t wg_stride;
+  void* c;
+  size_t cm_stride;
+  size_t cn_stride;
+  size_t cg_stride;
+  uint32_t log2_csize;
+  xnn_gemm_ukernel_function ukernel;
+  union {
+    union xnn_q8_gemm_params q8;
+    union xnn_f32_output_params f32;
+  } params;
+};
+
+#ifndef __cplusplus
+  XNN_PRIVATE void xnn_compute_ggemm(
+      const struct gemm_context context[restrict static 1],
+      size_t group_index,
+      size_t mr_block_start,
+      size_t nr_block_start,
+      size_t mr_block_size,
+      size_t nr_block_size);
+
+  XNN_PRIVATE void xnn_compute_gemm(
+      const struct gemm_context context[restrict static 1],
+      size_t mr_block_start,
+      size_t nr_block_start,
+      size_t mr_block_size,
+      size_t nr_block_size);
+#endif
+
+// Context for Sparse Matrix-Dense Matrix Multiplication.
+// C [MxN] := A [MxK] * B [KxN] + bias [N]
+// A and C are dense matrices with row-major storage, B is a sparse matrix.
+struct spmm_context {
+  // N dimension of the B and C matrices.
+  // Corresponds to number of output channels in 1x1 convolution.
+  size_t n;
+  // Input matrix A.
+  const void* a;
+  // Packed bias elements and non-zero filter elements.
+  const void* packed_weights;
+  // Input pointer increments, in bytes, after each processed non-zero weight.
+  const int32_t* input_increments;
+  // Number of non-zero filter elements for each N (output channel) dimension.
+  const uint32_t* output_channel_nonzeros;
+  // Output matrix C.
+  void* c;
+  // Stride, in bytes, between matrices A corresponding to different images in batched 1x1 convolution.
+  size_t batched_a_stride;
+  // Stride, in bytes, between matrices C corresponding to different images in batched 1x1 convolution.
+  size_t batched_c_stride;
+  // Micro-kernel function pointer.
+  xnn_spmm_ukernel_function ukernel;
+  // Output activation parameters.
+  union {
+    union xnn_f32_output_params f32;
+  } params;
+};
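+
+// Worked example (illustrative): with M=2, K=3, N=2 and B's only non-zeros
+// at B[0][0] and B[2][1], each output element is
+//   C[m][n] = bias[n] + sum over non-zero B[k][n] of A[m][k] * B[k][n],
+// so output_channel_nonzeros would be {1, 1}, packed_weights would
+// interleave each bias with its channel's non-zeros, and input_increments
+// would encode the byte steps between the A elements paired with
+// consecutive non-zeros.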
+
+#ifndef __cplusplus
+  XNN_PRIVATE void xnn_compute_spmm(
+    const struct spmm_context context[restrict static 1],
+    size_t batch_index,
+    size_t mr_block_start,
+    size_t mr_block_size);
+#endif
+
+struct igemm_context {
+  size_t ks;
+  size_t ks_scaled;
+  size_t kc;
+  size_t w_stride;
+  const void** indirect_a;
+  size_t a_offset;
+  void* zero;
+  const void* packed_w;
+  void* c;
+  size_t cm_stride;
+  size_t cn_stride;
+  size_t ga_stride;
+  size_t gw_stride;
+  size_t gc_stride;
+  size_t ba_stride;
+  size_t bc_stride;
+  uint32_t log2_csize;
+  xnn_igemm_ukernel_function ukernel;
+  union {
+    union xnn_q8_gemm_params q8;
+    union xnn_f32_output_params f32;
+  } params;
+};
+
+#ifndef __cplusplus
+  XNN_PRIVATE void xnn_compute_gigemm(
+      const struct igemm_context context[restrict static 1],
+      size_t batch_index,
+      size_t group_index,
+      size_t mr_block_start,
+      size_t nr_block_start,
+      size_t mr_block_size,
+      size_t nr_block_size);
+
+  XNN_PRIVATE void xnn_compute_igemm(
+      const struct igemm_context context[restrict static 1],
+      size_t batch_index,
+      size_t mr_block_start,
+      size_t nr_block_start,
+      size_t mr_block_size,
+      size_t nr_block_size);
+#endif
+
+struct subconv_context {
+  const struct subconvolution_params* subconvolution_params;
+  size_t kc;
+  size_t a_offset;
+  void* zero;
+  size_t cx_stride;
+  size_t cy_stride;
+  size_t cn_stride;
+  size_t ga_stride;
+  size_t gw_stride;
+  size_t gc_stride;
+  size_t ba_stride;
+  size_t bc_stride;
+  uint32_t log2_csize;
+  xnn_igemm_ukernel_function ukernel;
+  union {
+    union xnn_q8_gemm_params q8;
+    union xnn_f32_output_params f32;
+  } params;
+};
+
+#ifndef __cplusplus
+  XNN_PRIVATE void xnn_compute_gsubconv2d(
+      const struct subconv_context context[restrict static 1],
+      size_t batch_index,
+      size_t group_index,
+      size_t subkernel_index,
+      size_t slice_y,
+      size_t slice_x_start,
+      size_t nr_block_start,
+      size_t slice_x_max,
+      size_t nr_block_size);
+
+  XNN_PRIVATE void xnn_compute_subconv2d(
+      const struct subconv_context context[restrict static 1],
+      size_t batch_index,
+      size_t subkernel_index,
+      size_t slice_y,
+      size_t slice_x_start,
+      size_t nr_block_start,
+      size_t slice_x_max,
+      size_t nr_block_size);
+#endif
+
+struct dconv2d_context {
+  size_t input_height;
+  size_t input_width;
+  const void* input;
+  size_t input_batch_stride;
+  const void* zero;
+  const void* packed_weights;
+  void* output;
+  size_t output_batch_stride;
+  size_t input_padding_top;
+  size_t output_channels;
+  size_t output_height_stride;
+  size_t output_channel_stride;
+  union {
+    xnn_conv_hwc2spchw_ukernel_function hwc2spchw_ukernel;
+  };
+  union {
+    union xnn_f32_output_params f32;
+  } params;
+};
+
+#ifndef __cplusplus
+  XNN_PRIVATE void xnn_compute_dconv2d_hwc2spchw(
+      const struct dconv2d_context context[restrict static 1],
+      size_t batch_index,
+      size_t output_y_start,
+      size_t output_y_slice);
+#endif
+
+struct dwconv_context {
+  size_t groups;
+  const void** indirection_buffer;
+  size_t indirection_buffer_row_stride;
+  size_t indirection_buffer_col_stride;
+  const void* packed_weights;
+  void* output;
+  size_t output_width;
+  size_t output_row_stride;
+  size_t output_col_increment;
+  union {
+    union xnn_q8_gemm_params q8;
+    union xnn_f32_output_params f32;
+  } params;
+  union {
+    xnn_dwconv_up_ukernel_function unipass_ukernel;
+  };
+};
+
+#ifndef __cplusplus
+  XNN_PRIVATE void xnn_compute_dwconv_unipass(
+      const struct dwconv_context context[restrict static 1],
+      size_t output_y);
+#endif
+
+struct dwconv2d_context {
+  size_t output_height;
+  size_t input_width;
+  const void* input;
+  size_t input_channel_stride;
+  size_t input_batch_stride;
+  const void* packed_weights;
+  size_t weights_channel_stride;
+  void* output;
+  size_t output_channel_stride;
+  size_t output_batch_stride;
+  size_t input_tuple_stride;
+  size_t output_tuple_stride;
+  size_t input_pixel_stride;
+  size_t output_pixel_stride;
+  union {
+    union xnn_f32_spchw_params f32;
+  } params;
+  union {
+    xnn_dwconv_spchw_ukernel_function spchw_ukernel;
+  };
+};
+
+#ifndef __cplusplus
+  XNN_PRIVATE void xnn_compute_dwconv2d_spchw(
+      const struct dwconv2d_context context[restrict static 1],
+      size_t batch_index,
+      size_t channel);
+#endif
+
+struct max_pooling_context {
+  const void** indirect_input;
+  size_t indirect_input_batch_stride;
+  size_t indirect_input_height_stride;
+  void* output;
+  size_t output_batch_stride;
+  size_t output_height_stride;
+  size_t output_width;
+  size_t pooling_size;
+  size_t channels;
+  size_t input_increment;
+  size_t output_increment;
+  union {
+    union xnn_u8_output_params u8;
+    union xnn_f32_output_params f32;
+  } params;
+  xnn_maxpool_ukernel_function ukernel;
+};
+
+#ifndef __cplusplus
+  XNN_PRIVATE void xnn_compute_max_pooling(
+      const struct max_pooling_context context[restrict static 1],
+      size_t batch_index,
+      size_t output_y);
+#endif
+
+struct unpooling_context {
+  const void* input;
+  size_t input_height_stride;
+  size_t input_width_stride;
+  const uint32_t* index;
+  size_t index_height_stride;
+  size_t index_width_stride;
+  void** indirect_output;
+  size_t indirect_output_height_stride;
+  size_t indirect_output_width_stride;
+  size_t pooling_size;
+  size_t channels;
+  uint32_t fill_value;
+  xnn_unpool_ukernel_function ukernel;
+};
+
+#ifndef __cplusplus
+  XNN_PRIVATE void xnn_compute_unpooling(
+      const struct unpooling_context context[restrict static 1],
+      size_t input_y,
+      size_t input_x);
+#endif
+
+struct argmax_pooling_context {
+  const void** indirect_input;
+  size_t indirect_input_batch_stride;
+  size_t indirect_input_height_stride;
+  void* output;
+  size_t output_batch_stride;
+  size_t output_height_stride;
+  size_t output_width;
+  uint32_t* index;
+  size_t index_batch_stride;
+  size_t index_height_stride;
+  size_t pooling_size;
+  size_t channels;
+  size_t input_increment;
+  size_t output_increment;
+  union {
+    union xnn_f32_output_params f32;
+  } params;
+  union {
+    xnn_argmaxpool_up_ukernel_function unipass_ukernel;
+    xnn_argmaxpool_mp_ukernel_function multipass_ukernel;
+  };
+};
+
+#ifndef __cplusplus
+  XNN_PRIVATE void xnn_compute_argmax_pooling_unipass(
+      const struct argmax_pooling_context context[restrict static 1],
+      size_t batch_index,
+      size_t output_y);
+
+  XNN_PRIVATE void xnn_compute_argmax_pooling_multipass(
+      const struct argmax_pooling_context context[restrict static 1],
+      size_t batch_index,
+      size_t output_y);
+#endif
+
+struct average_pooling_context {
+  const void** indirect_input;
+  size_t indirect_input_batch_stride;
+  size_t indirect_input_height_stride;
+  void* output;
+  size_t output_batch_stride;
+  size_t output_height_stride;
+  size_t output_width;
+  size_t pooling_size;
+  size_t channels;
+  const void* zero;
+  size_t input_increment;
+  size_t output_increment;
+  union {
+    union xnn_q8_avgpool_params q8;
+    union xnn_f32_avgpool_params f32;
+  } params;
+  union {
+    xnn_avgpool_up_ukernel_function unipass_ukernel;
+    xnn_avgpool_mp_ukernel_function multipass_ukernel;
+  };
+};
+
+#ifndef __cplusplus
+  XNN_PRIVATE void xnn_compute_average_pooling_unipass(
+      const struct average_pooling_context context[restrict static 1],
+      size_t batch_index,
+      size_t output_y);
+
+  XNN_PRIVATE void xnn_compute_average_pooling_multipass(
+      const struct average_pooling_context context[restrict static 1],
+      size_t batch_index,
+      size_t output_y);
+#endif
+
+struct pixelwise_average_pooling_context {
+  const void** indirect_input;
+  size_t indirect_input_batch_stride;
+  size_t indirect_input_height_stride;
+  const void* pixelwise_buffer;
+  size_t pixelwise_buffer_height_stride;
+  void* output;
+  size_t output_batch_stride;
+  size_t output_height_stride;
+  size_t output_width;
+  size_t pooling_size;
+  size_t channels;
+  const void* zero;
+  size_t input_increment;
+  size_t output_increment;
+  union {
+    union xnn_u8_output_params u8;
+    union xnn_f32_output_params f32;
+  } params;
+  union {
+    xnn_pavgpool_up_ukernel_function unipass_ukernel;
+    xnn_pavgpool_mp_ukernel_function multipass_ukernel;
+  };
+};
+
+#ifndef __cplusplus
+  XNN_PRIVATE void xnn_compute_pixelwise_average_pooling_unipass(
+      const struct pixelwise_average_pooling_context context[restrict static 1],
+      size_t batch_index,
+      size_t output_y);
+
+  XNN_PRIVATE void xnn_compute_pixelwise_average_pooling_multipass(
+      const struct pixelwise_average_pooling_context context[restrict static 1],
+      size_t batch_index,
+      size_t output_y);
+#endif
+
+struct global_average_pooling_context {
+  const void* input;
+  const void* zero;
+  size_t input_pixel_stride;
+  size_t input_batch_stride;
+  size_t input_elements;
+  size_t channels;
+  void* output;
+  size_t output_batch_stride;
+  union {
+    union xnn_q8_avgpool_params q8;
+    union xnn_f32_avgpool_params f32;
+  } params;
+  union {
+    xnn_gavgpool_up_ukernel_function unipass_ukernel;
+    xnn_gavgpool_mp_ukernel_function multipass_ukernel;
+  };
+};
+
+#ifndef __cplusplus
+  XNN_PRIVATE void xnn_compute_global_average_pooling_unipass(
+      const struct global_average_pooling_context context[restrict static 1],
+      size_t batch_index);
+
+  XNN_PRIVATE void xnn_compute_global_average_pooling_multipass(
+      const struct global_average_pooling_context context[restrict static 1],
+      size_t batch_index);
+#endif
+
+struct global_average_pooling_spnchw_context {
+  size_t input_elements;
+  const void* input;
+  size_t input_channel_stride;
+  size_t input_batch_stride;
+  void* output;
+  size_t output_channel_stride;
+  size_t output_batch_stride;
+  xnn_gavgpool_spchw_ukernel_function ukernel;
+  union {
+    union xnn_f32_gavgpool_params f32;
+  } params;
+};
+
+#ifndef __cplusplus
+  XNN_PRIVATE void xnn_compute_global_average_pooling_spnchw(
+      const struct global_average_pooling_spnchw_context context[restrict static 1],
+      size_t batch_index,
+      size_t channels_start,
+      size_t channels_slice);
+#endif
+
+struct add_strided_context {
+  size_t n;
+  const void* a;
+  size_t a_stride;
+  const void* b;
+  size_t b_stride;
+  void* y;
+  size_t y_stride;
+  union {
+    union xnn_q8_add_params q8;
+    union xnn_f32_output_params f32;
+  } params;
+  xnn_vadd_ukernel_function ukernel;
+};
+
+#ifndef __cplusplus
+  XNN_PRIVATE void xnn_compute_add_strided(
+      const struct add_strided_context context[restrict static 1],
+      size_t batch_index,
+      size_t batch_range);
+#endif
+
+struct add_contiguous_context {
+  const void* a;
+  const void* b;
+  void* y;
+  union {
+    union xnn_q8_add_params q8;
+    union xnn_f32_output_params f32;
+  } params;
+  xnn_vadd_ukernel_function ukernel;
+};
+
+#ifndef __cplusplus
+  XNN_PRIVATE void xnn_compute_add_contiguous(
+      const struct add_contiguous_context context[restrict static 1],
+      size_t offset,
+      size_t size);
+#endif
+
+struct channel_shuffle_context {
+  const void* x;
+  size_t x_stride;
+  void* y;
+  size_t y_stride;
+  size_t n;
+  size_t m;
+  union {
+    xnn_zipc_ukernel_function fixed_ukernel;
+    xnn_zipv_ukernel_function variable_ukernel;
+  };
+};
+
+#ifndef __cplusplus
+  XNN_PRIVATE void xnn_compute_channel_shuffle_fixed(
+      const struct channel_shuffle_context context[restrict static 1],
+      size_t index);
+
+  XNN_PRIVATE void xnn_compute_channel_shuffle_variable(
+      const struct channel_shuffle_context context[restrict static 1],
+      size_t index);
+#endif
+
+struct lut_strided_context {
+  size_t n;
+  const void* x;
+  size_t x_stride;
+  const void* t;
+  void* y;
+  size_t y_stride;
+  xnn_x8_lut_ukernel_function ukernel;
+};
+
+#ifndef __cplusplus
+  XNN_PRIVATE void xnn_compute_lut_strided(
+      const struct lut_strided_context context[restrict static 1],
+      size_t batch_index);
+#endif
+
+struct lut_contiguous_context {
+  const void* x;
+  size_t x_stride;
+  const void* t;
+  void* y;
+  size_t y_stride;
+  xnn_x8_lut_ukernel_function ukernel;
+};
+
+#ifndef __cplusplus
+  XNN_PRIVATE void xnn_compute_lut_contiguous(
+      const struct lut_contiguous_context context[restrict static 1],
+      size_t offset,
+      size_t size);
+#endif
+
+struct univector_strided_context {
+  size_t n;
+  const void* x;
+  size_t x_stride;
+  void* y;
+  size_t y_stride;
+  xnn_univector_ukernel_function ukernel;
+  union {
+    union xnn_u8_output_params u8_output;
+    union xnn_f32_output_params f32_output;
+    union xnn_f32_hswish_params f32_hswish;
+  } params;
+};
+
+#ifndef __cplusplus
+  XNN_PRIVATE void xnn_compute_univector_strided(
+      const struct univector_strided_context context[restrict static 1],
+      size_t batch_index,
+      size_t batch_range);
+#endif
+
+struct univector_contiguous_context {
+  const void* x;
+  size_t x_stride;
+  void* y;
+  size_t y_stride;
+  xnn_univector_ukernel_function ukernel;
+  union {
+    union xnn_u8_output_params u8_output;
+    union xnn_f32_output_params f32_output;
+    union xnn_f32_hswish_params f32_hswish;
+  } params;
+};
+
+#ifndef __cplusplus
+  XNN_PRIVATE void xnn_compute_univector_contiguous(
+      const struct univector_contiguous_context context[restrict static 1],
+      size_t offset,
+      size_t size);
+#endif
+
+struct prelu_context {
+  size_t n;
+  const void* x;
+  size_t x_stride;
+  const void* w;
+  void* y;
+  size_t y_stride;
+  xnn_prelu_ukernel_function ukernel;
+  union xnn_f32_output_params params;
+};
+
+#ifndef __cplusplus
+  XNN_PRIVATE void xnn_compute_prelu(
+      const struct prelu_context context[restrict static 1],
+      size_t batch_start,
+      size_t batch_range);
+#endif
+
+struct vmulcaddc_context {
+  size_t n;
+  const void* x;
+  size_t x_stride;
+  const void* w;
+  void* y;
+  size_t y_stride;
+  xnn_vmulcaddc_ukernel_function ukernel;
+  union {
+    union xnn_f32_output_params f32;
+  } params;
+};
+
+#ifndef __cplusplus
+  XNN_PRIVATE void xnn_compute_vmulcaddc(
+      const struct vmulcaddc_context context[restrict static 1],
+      size_t batch_start,
+      size_t batch_size);
+#endif
+
+struct channel_pad_context {
+  size_t n;
+  size_t l;
+  size_t r;
+  uint32_t c;
+  const void* x;
+  size_t x_stride;
+  void* y;
+  size_t y_stride;
+  xnn_pad_ukernel_function ukernel;
+};
+
+#ifndef __cplusplus
+  XNN_PRIVATE void xnn_compute_channel_pad(
+      const struct channel_pad_context context[restrict static 1],
+      size_t batch_start,
+      size_t batch_range);
+#endif
+
+struct u8_softargmax_context {
+  size_t n;
+  const uint8_t* x;
+  size_t x_stride;
+  const uint32_t* t;
+  uint8_t* y;
+  size_t y_stride;
+  xnn_u8_rmax_ukernel_function rmax_ukernel;
+  xnn_u8_lut32norm_ukernel_function lut_norm_ukernel;
+};
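+
+// Computation sketch (inferred from the members above): softargmax runs in
+// two stages per batch row: rmax_ukernel finds the row maximum, then
+// lut_norm_ukernel maps each input through the 32-bit lookup table t
+// (indexed relative to that maximum) and normalizes the accumulated sum
+// into the uint8 output y.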
+
+#ifndef __cplusplus
+  XNN_PRIVATE void xnn_compute_u8_softargmax(
+      const struct u8_softargmax_context context[restrict static 1],
+      size_t batch_index);
+#endif
diff --git a/src/xnnpack/conv.h b/src/xnnpack/conv.h
new file mode 100644
index 0000000..c1bdec3
--- /dev/null
+++ b/src/xnnpack/conv.h
@@ -0,0 +1,63 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <xnnpack/params.h>
+#include <xnnpack/common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
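+// Layout note (inferred from the kernel names below): "hwc" kernels consume
+// HWC (channels-last) input and produce HWC output, while "hwc2spchw"
+// kernels consume HWC input and produce output in the spatially-packed CHW
+// ("spchw") layout used by the spchw depthwise and pooling kernels.
+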
+#define DECLARE_F32_CONV_HWC_UKERNEL_FUNCTION(fn_name) \
+  XNN_INTERNAL void fn_name(                           \
+      size_t input_height,                             \
+      size_t input_width,                              \
+      size_t output_y_start,                           \
+      size_t output_y_end,                             \
+      const float* input,                              \
+      const float* zero,                               \
+      const float* weights,                            \
+      float* output,                                   \
+      size_t input_padding_top,                        \
+      size_t output_channels,                          \
+      size_t output_height_stride,                     \
+      size_t output_width_stride,                      \
+      const union xnn_f32_output_params* params);
+
+DECLARE_F32_CONV_HWC_UKERNEL_FUNCTION(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x2)
+DECLARE_F32_CONV_HWC_UKERNEL_FUNCTION(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neonfma_2x2)
+
+
+#define DECLARE_F32_CONV_HWC2SPCHW_UKERNEL_FUNCTION(fn_name) \
+  XNN_INTERNAL void fn_name(                                 \
+      size_t input_height,                                   \
+      size_t input_width,                                    \
+      size_t output_y_start,                                 \
+      size_t output_y_end,                                   \
+      const float* input,                                    \
+      const float* zero,                                     \
+      const float* weights,                                  \
+      float* output,                                         \
+      size_t input_padding_top,                              \
+      size_t output_channels,                                \
+      size_t output_height_stride,                           \
+      size_t output_channel_stride,                          \
+      const union xnn_f32_output_params* params);
+
+DECLARE_F32_CONV_HWC2SPCHW_UKERNEL_FUNCTION(xnn_f32_conv_hwc2spchw_ukernel_3x3s2p1c3x4__neonfma_2x2)
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
diff --git a/src/xnnpack/dwconv.h b/src/xnnpack/dwconv.h
new file mode 100644
index 0000000..dc52a61
--- /dev/null
+++ b/src/xnnpack/dwconv.h
@@ -0,0 +1,88 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <xnnpack/params.h>
+#include <xnnpack/common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
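+// Naming note (inferred from the kernel names below): "upCxK" kernels are
+// unipass micro-kernels processing C channels per iteration of a K-element
+// kernel (e.g. up4x9 covers 4 channels of a 3x3 kernel); "spchw" kernels
+// operate on spatially-packed CHW data, with kernel size, stride, and
+// padding encoded in the name (e.g. 3x3s2p1 = 3x3 kernel, stride 2,
+// padding 1).
+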
+#define DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(fn_name) \
+  XNN_INTERNAL void fn_name(                                 \
+    size_t channels,                                         \
+    size_t output_width,                                     \
+    const float** input,                                     \
+    const float* weights,                                    \
+    float* output,                                           \
+    size_t input_stride,                                     \
+    size_t output_increment,                                 \
+    const union xnn_f32_output_params* params);
+
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up1x25__scalar)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up1x4__scalar)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up1x9__scalar)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up4x25__psimd)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up4x25__sse)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up4x4__psimd)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up4x4__sse)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up4x9__aarch64_neonfma)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up4x9__aarch64_neonfma_cortex_a55)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up4x9__neon)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up4x9__neonfma)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up4x9__psimd)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up4x9__sse)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up8x9__neonfma)
+
+
+#define DECLARE_Q8_DWCONV_UNIPASS_UKERNEL_FUNCTION(fn_name) \
+  XNN_INTERNAL void fn_name(                                \
+    size_t channels,                                        \
+    size_t output_width,                                    \
+    const uint8_t** input,                                  \
+    const void* weights,                                    \
+    uint8_t* output,                                        \
+    size_t input_stride,                                    \
+    size_t output_increment,                                \
+    const union xnn_q8_gemm_params* params);
+
+DECLARE_Q8_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_q8_dwconv_ukernel_up1x9__scalar)
+DECLARE_Q8_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_q8_dwconv_ukernel_up8x9__aarch32_neon)
+DECLARE_Q8_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_q8_dwconv_ukernel_up8x9__neon)
+DECLARE_Q8_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_q8_dwconv_ukernel_up8x9__sse2)
+
+
+#define DECLARE_F32_DWCONV_SPCHW_UKERNEL_FUNCTION(fn_name) \
+  XNN_INTERNAL void fn_name(                               \
+    size_t m,                                              \
+    size_t n,                                              \
+    const float* input,                                    \
+    const float* weights,                                  \
+    float* output,                                         \
+    size_t input_tuple_stride,                             \
+    size_t output_tuple_stride,                            \
+    size_t input_height_stride,                            \
+    size_t output_height_stride,                           \
+    const union xnn_f32_spchw_params* params);
+
+DECLARE_F32_DWCONV_SPCHW_UKERNEL_FUNCTION(xnn_f32_dwconv_spchw_ukernel_3x3p1__neonfma)
+DECLARE_F32_DWCONV_SPCHW_UKERNEL_FUNCTION(xnn_f32_dwconv_spchw_ukernel_5x5p2__neonfma)
+DECLARE_F32_DWCONV_SPCHW_UKERNEL_FUNCTION(xnn_f32_dwconv_spchw_ukernel_3x3p1__sse)
+DECLARE_F32_DWCONV_SPCHW_UKERNEL_FUNCTION(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__neonfma)
+DECLARE_F32_DWCONV_SPCHW_UKERNEL_FUNCTION(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__neonfma)
+DECLARE_F32_DWCONV_SPCHW_UKERNEL_FUNCTION(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__sse)
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
diff --git a/src/xnnpack/gavgpool.h b/src/xnnpack/gavgpool.h
new file mode 100644
index 0000000..b567196
--- /dev/null
+++ b/src/xnnpack/gavgpool.h
@@ -0,0 +1,99 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <xnnpack/params.h>
+#include <xnnpack/common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#define DECLARE_F32_GAVGPOOL_MULTIPASS_UKERNEL_FUNCTION(fn_name) \
+  XNN_INTERNAL void fn_name(                                     \
+      size_t m,                                                  \
+      size_t n,                                                  \
+      const float* x,                                            \
+      size_t x_stride,                                           \
+      const float* zero,                                         \
+      float* buffer,                                             \
+      float* y,                                                  \
+      const union xnn_f32_avgpool_params* params);
+
+DECLARE_F32_GAVGPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_gavgpool_ukernel_mp7p7q__neon)
+DECLARE_F32_GAVGPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_gavgpool_ukernel_mp7p7q__psimd)
+DECLARE_F32_GAVGPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_gavgpool_ukernel_mp7p7q__scalar)
+DECLARE_F32_GAVGPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_gavgpool_ukernel_mp7p7q__sse)
+
+
+#define DECLARE_F32_GAVGPOOL_UNIPASS_UKERNEL_FUNCTION(fn_name) \
+  XNN_INTERNAL void fn_name(                                   \
+      size_t m,                                                \
+      size_t n,                                                \
+      const float* x,                                          \
+      size_t x_stride,                                         \
+      const float* zero,                                       \
+      float* y,                                                \
+      const union xnn_f32_avgpool_params* params);
+
+DECLARE_F32_GAVGPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_gavgpool_ukernel_up7__neon)
+DECLARE_F32_GAVGPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_gavgpool_ukernel_up7__psimd)
+DECLARE_F32_GAVGPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_gavgpool_ukernel_up7__scalar)
+DECLARE_F32_GAVGPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_gavgpool_ukernel_up7__sse)
+
+
+#define DECLARE_Q8_GAVGPOOL_MULTIPASS_UKERNEL_FUNCTION(fn_name)          \
+  XNN_INTERNAL void fn_name(                                             \
+      size_t m,                                                          \
+      size_t n,                                                          \
+      const uint8_t* x,                                                  \
+      size_t x_stride,                                                   \
+      const uint8_t* zero,                                               \
+      int32_t* buffer,                                                   \
+      uint8_t* y,                                                        \
+      const union xnn_q8_avgpool_params* params);
+
+DECLARE_Q8_GAVGPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_q8_gavgpool_ukernel_mp7p7q__neon)
+DECLARE_Q8_GAVGPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_q8_gavgpool_ukernel_mp7p7q__scalar)
+DECLARE_Q8_GAVGPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_q8_gavgpool_ukernel_mp7p7q__sse2)
+
+
+#define DECLARE_Q8_GAVGPOOL_UNIPASS_UKERNEL_FUNCTION(fn_name)            \
+  XNN_INTERNAL void fn_name(                                             \
+      size_t m,                                                          \
+      size_t n,                                                          \
+      const uint8_t* x,                                                  \
+      size_t x_stride,                                                   \
+      const uint8_t* zero,                                               \
+      uint8_t* y,                                                        \
+      const union xnn_q8_avgpool_params* params);
+
+DECLARE_Q8_GAVGPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_q8_gavgpool_ukernel_up7__neon)
+DECLARE_Q8_GAVGPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_q8_gavgpool_ukernel_up7__scalar)
+DECLARE_Q8_GAVGPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_q8_gavgpool_ukernel_up7__sse2)
+
+
+#define DECLARE_F32_GAVGPOOL_SPCHW_UKERNEL_FUNCTION(fn_name) \
+  XNN_INTERNAL void fn_name(                                 \
+      size_t elements,                                       \
+      size_t channels,                                       \
+      const float* input,                                    \
+      float* output,                                         \
+      const union xnn_f32_gavgpool_params* params);
+
+DECLARE_F32_GAVGPOOL_SPCHW_UKERNEL_FUNCTION(xnn_f32_gavgpool_spchw_ukernel__neon_x4)
+DECLARE_F32_GAVGPOOL_SPCHW_UKERNEL_FUNCTION(xnn_f32_gavgpool_spchw_ukernel__sse_x4)
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
diff --git a/src/xnnpack/gemm.h b/src/xnnpack/gemm.h
new file mode 100644
index 0000000..27f591d
--- /dev/null
+++ b/src/xnnpack/gemm.h
@@ -0,0 +1,189 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <xnnpack/params.h>
+#include <xnnpack/common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#define DECLARE_F32_GEMM_UKERNEL_FUNCTION(fn_name) \
+  XNN_INTERNAL void fn_name(                       \
+      size_t mr,                                   \
+      size_t nr,                                   \
+      size_t k,                                    \
+      const float* a,                              \
+      size_t a_stride,                             \
+      const float* w,                              \
+      float* c,                                    \
+      size_t cm_stride,                            \
+      size_t cn_stride,                            \
+      const union xnn_f32_output_params* params);
+
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x4__scalar)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__neon_ld64)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__neonfma_ld64)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__psimd_loadsplat)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__psimd_splat)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__sse_dup)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__sse_load1)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8s4__psimd)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8s4__sse)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_2x4__scalar)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x12__neon_ld64)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x12__neonfma_ld64)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x2__neon_ld64)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x2__neonfma_ld64)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x2__scalar)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x4__scalar)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__neon_ld128)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__neon_ld64)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__neonfma_ld128)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__neonfma_ld64)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__psimd_loadsplat)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__psimd_splat)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__sse_dup)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__sse_load1)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8s4__psimd)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8s4__sse)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_5x8__neon_ld64)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_5x8__neonfma_ld64)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__neon_ld64)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__neonfma_ld64)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__psimd_loadsplat)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__psimd_splat)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8s4__psimd)
+
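+// Calling-convention sketch (an assumption pieced together from the
+// parameters above and the k_scaled/log2_csize fields of gemm_context in
+// <xnnpack/compute.h>, not a documented contract): an MxN micro-kernel
+// computes an up-to-MxN output tile, with k and the strides in bytes, e.g.
+//
+//   xnn_f32_gemm_ukernel_4x8__sse_load1(
+//       mr, nr, kc * sizeof(float),
+//       a, a_stride, packed_w,
+//       c, cm_stride, cn_stride, &params);
+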
+#define DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(fn_name) \
+  XNN_INTERNAL void fn_name(                          \
+      size_t mr,                                      \
+      size_t nr,                                      \
+      size_t k,                                       \
+      const float* a,                                 \
+      size_t a_stride,                                \
+      const float* w,                                 \
+      float* c,                                       \
+      size_t cm_stride,                               \
+      size_t cn_stride,                               \
+      const float* acc,                               \
+      const union xnn_f32_output_params* params);
+
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x4__scalar)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__neon_ld64)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__neonfma_ld64)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__psimd_loadsplat)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__psimd_splat)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__sse_dup)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__sse_load1)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8s4__psimd)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8s4__sse)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_2x4__scalar)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x12__neon_ld64)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x12__neonfma_ld64)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x4__scalar)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__neon_ld128)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__neon_ld64)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__neonfma_ld128)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__neonfma_ld64)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__psimd_loadsplat)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__psimd_splat)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__sse_dup)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__sse_load1)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8s4__psimd)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8s4__sse)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_5x8__neon_ld64)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_5x8__neonfma_ld64)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__neon_ld64)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__neonfma_ld64)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__psimd_loadsplat)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__psimd_splat)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8s4__psimd)
+
+
+#define DECLARE_F16_GEMM_UKERNEL_FUNCTION(fn_name) \
+  XNN_INTERNAL void fn_name(                       \
+      size_t mr,                                   \
+      size_t nr,                                   \
+      size_t k,                                    \
+      const void* a,                               \
+      size_t a_stride,                             \
+      const void* w,                               \
+      void* c,                                     \
+      size_t cm_stride,                            \
+      size_t cn_stride,                            \
+      const struct xnn_f16_output_params* params);
+
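+// F16 kernels operate on IEEE half-precision values. Since C has no portable
+// 16-bit floating-point type, inputs and outputs are passed as void* and
+// reinterpreted inside the (NEON FP16 arithmetic) kernels below; the
+// uint16_t fields of struct xnn_f16_output_params hold the raw bit patterns.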
+DECLARE_F16_GEMM_UKERNEL_FUNCTION(xnn_f16_gemm_ukernel_4x8__neonfp16arith_ld64)
+DECLARE_F16_GEMM_UKERNEL_FUNCTION(xnn_f16_gemm_ukernel_6x8__neonfp16arith_ld64)
+DECLARE_F16_GEMM_UKERNEL_FUNCTION(xnn_f16_gemm_ukernel_8x8__neonfp16arith_ld64)
+
+
+#define DECLARE_Q8_GEMM_UKERNEL_FUNCTION(fn_name) \
+  XNN_INTERNAL void fn_name(                      \
+      size_t mr,                                  \
+      size_t nr,                                  \
+      size_t k,                                   \
+      const uint8_t* a,                           \
+      size_t a_stride,                            \
+      const void* w,                              \
+      uint8_t* c,                                 \
+      size_t cm_stride,                           \
+      size_t cn_stride,                           \
+      const union xnn_q8_gemm_params* params);
+
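+// Q8 microkernels operate on 8-bit quantized data. The int32 accumulators
+// are requantized to uint8, roughly as
+//   y = output_zero_point + clamp((acc * multiplier) >> shift,
+//                                 output_min_less_zero_point,
+//                                 output_max_less_zero_point)
+// where the remainder_mask/remainder_threshold fields of
+// union xnn_q8_gemm_params (see params.h) implement round-to-nearest in the
+// shift.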
+DECLARE_Q8_GEMM_UKERNEL_FUNCTION(xnn_q8_gemm_ukernel_2x2__scalar)
+DECLARE_Q8_GEMM_UKERNEL_FUNCTION(xnn_q8_gemm_ukernel_2x4c8__neon)
+DECLARE_Q8_GEMM_UKERNEL_FUNCTION(xnn_q8_gemm_ukernel_2x4c8__sse2)
+DECLARE_Q8_GEMM_UKERNEL_FUNCTION(xnn_q8_gemm_ukernel_3x3c8__neon)
+DECLARE_Q8_GEMM_UKERNEL_FUNCTION(xnn_q8_gemm_ukernel_4x4c2__sse2)
+DECLARE_Q8_GEMM_UKERNEL_FUNCTION(xnn_q8_gemm_ukernel_4x8__aarch32_neon)
+DECLARE_Q8_GEMM_UKERNEL_FUNCTION(xnn_q8_gemm_ukernel_4x8__neon)
+DECLARE_Q8_GEMM_UKERNEL_FUNCTION(xnn_q8_gemm_ukernel_6x4__neon)
+DECLARE_Q8_GEMM_UKERNEL_FUNCTION(xnn_q8_gemm_ukernel_8x8__aarch64_neon)
+DECLARE_Q8_GEMM_UKERNEL_FUNCTION(xnn_q8_gemm_ukernel_8x8__neon)
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
diff --git a/src/xnnpack/hswish.h b/src/xnnpack/hswish.h
new file mode 100644
index 0000000..8d0ab93
--- /dev/null
+++ b/src/xnnpack/hswish.h
@@ -0,0 +1,35 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <xnnpack/params.h>
+#include <xnnpack/common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#define DECLARE_F32_HSWISH_UKERNEL_FUNCTION(fn_name) \
+  XNN_INTERNAL void fn_name(                         \
+      size_t n,                                      \
+      const float* x,                                \
+      float* y,                                      \
+      const union xnn_f32_hswish_params* params);
+
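+// hswish(x) = x * min(max(x + 3, 0), 6) / 6. The microkernels evaluate it as
+// x * clamp(x * sixth + half, 0, one) using the constants carried in
+// union xnn_f32_hswish_params.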
+DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__neon)
+DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__neonfma)
+DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__psimd)
+DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__scalar)
+DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__sse)
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
diff --git a/src/xnnpack/igemm.h b/src/xnnpack/igemm.h
new file mode 100644
index 0000000..4d30c6f
--- /dev/null
+++ b/src/xnnpack/igemm.h
@@ -0,0 +1,105 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <xnnpack/params.h>
+#include <xnnpack/common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#define DECLARE_F32_IGEMM_UKERNEL_FUNCTION(fn_name) \
+  XNN_INTERNAL void fn_name(                        \
+      size_t mr,                                    \
+      size_t nr,                                    \
+      size_t kc,                                    \
+      size_t ks,                                    \
+      const float** a,                              \
+      const float* w,                               \
+      float* c,                                     \
+      size_t cm_stride,                             \
+      size_t cn_stride,                             \
+      size_t a_offset,                              \
+      const float* zero,                            \
+      const union xnn_f32_output_params* params);
+
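+// IGEMM (indirect GEMM) microkernels read the left-hand operand through an
+// indirection buffer instead of a dense matrix: `a` holds kernel_size * mr
+// row pointers, and `ks` is passed pre-scaled as
+// kernel_size * mr * sizeof(void*) (see subconvolution_params in operator.h).
+// `a_offset` is added to every loaded pointer that does not equal `zero`,
+// and `zero` points to a zero-initialized buffer standing in for padding
+// taps.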
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x4__scalar)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__neon_ld64)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__psimd_loadsplat)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__psimd_splat)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__sse_dup)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__sse_load1)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8s4__psimd)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8s4__sse)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_2x4__scalar)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x12__neon_ld64)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x12__neonfma_ld64)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x2__neon_ld64)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x2__neonfma_ld64)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x2__scalar)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x2c4__psimd)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x2c4__sse)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x4__neon_ld64)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x4__neonfma_ld64)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x4__scalar)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__neon_ld128)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__neon_ld64)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__neonfma_ld128)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__neonfma_ld64)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__psimd_loadsplat)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__psimd_splat)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__sse_dup)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__sse_load1)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8s4__psimd)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8s4__sse)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__neon_ld64)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__neonfma_ld64)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__psimd_loadsplat)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__psimd_splat)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8s4__psimd)
+
+
+#define DECLARE_Q8_IGEMM_UKERNEL_FUNCTION(fn_name) \
+  XNN_INTERNAL void fn_name(                       \
+      size_t mr,                                   \
+      size_t nr,                                   \
+      size_t kc,                                   \
+      size_t ks,                                   \
+      const uint8_t** a,                           \
+      const void* w,                               \
+      uint8_t* c,                                  \
+      size_t cm_stride,                            \
+      size_t cn_stride,                            \
+      size_t a_offset,                             \
+      const uint8_t* zero,                         \
+      const union xnn_q8_gemm_params* params);
+
+DECLARE_Q8_IGEMM_UKERNEL_FUNCTION(xnn_q8_igemm_ukernel_2x2__scalar)
+DECLARE_Q8_IGEMM_UKERNEL_FUNCTION(xnn_q8_igemm_ukernel_4x4c2__sse2)
+DECLARE_Q8_IGEMM_UKERNEL_FUNCTION(xnn_q8_igemm_ukernel_4x8__neon)
+DECLARE_Q8_IGEMM_UKERNEL_FUNCTION(xnn_q8_igemm_ukernel_8x8__neon)
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
diff --git a/src/xnnpack/im2col.h b/src/xnnpack/im2col.h
new file mode 100644
index 0000000..07323e3
--- /dev/null
+++ b/src/xnnpack/im2col.h
@@ -0,0 +1,37 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <xnnpack.h>
+#include <xnnpack/common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
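+// Rearranges the input image so that each output pixel's receptive field
+// becomes one contiguous row of the output matrix, allowing the convolution
+// to be computed as a single dense GEMM over the result.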
+XNN_INTERNAL void xnn_im2col_conv2d(
+  size_t output_height,
+  size_t output_width,
+  size_t kernel_height,
+  size_t kernel_width,
+  size_t subsampling_height,
+  size_t subsampling_width,
+  size_t dilation_height,
+  size_t dilation_width,
+  size_t input_width,
+  size_t input_padding_top,
+  size_t input_padding_left,
+  size_t group_input_channels_in_bytes,
+  size_t input_pixel_stride_in_bytes,
+  const void* input,
+  void* output);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
diff --git a/src/xnnpack/indirection.h b/src/xnnpack/indirection.h
new file mode 100644
index 0000000..60be1f6
--- /dev/null
+++ b/src/xnnpack/indirection.h
@@ -0,0 +1,57 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <xnnpack.h>
+#include <xnnpack/common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
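+// These helpers fill the operator's indirection buffer: for every output
+// pixel they store pointers to the input rows (or to the operator's zero
+// buffer, for padding taps) that the indirect microkernels then dereference,
+// so the kernels themselves never see the tensor's padding or stride layout.
+// log2_element_size selects the element width (e.g. 2 for 4-byte elements).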
+XNN_INTERNAL void xnn_indirection_init_conv2d(
+  xnn_operator_t op,
+  size_t output_tile_size,
+  uint32_t log2_element_size);
+
+XNN_INTERNAL void xnn_indirection_init_dwconv2d(
+  xnn_operator_t op,
+  size_t batch_start,
+  size_t step_height,
+  size_t step_width,
+  uint32_t log2_element_size);
+
+XNN_INTERNAL void xnn_indirection_init_deconv2d(
+  xnn_operator_t op,
+  size_t output_tile_size,
+  uint32_t log2_element_size);
+
+XNN_INTERNAL void xnn_indirection_init_subconv2d(
+  xnn_operator_t op,
+  size_t output_tile_size,
+  uint32_t log2_element_size);
+
+XNN_INTERNAL void xnn_indirection_init_maxpool2d(
+  xnn_operator_t op,
+  size_t batch_start,
+  size_t step_height,
+  size_t step_width,
+  uint32_t log2_element_size);
+
+XNN_INTERNAL void xnn_indirection_init_unpool2d(
+  xnn_operator_t op,
+  size_t batch_start,
+  uint32_t log2_element_size);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
diff --git a/src/xnnpack/isa-checks.h b/src/xnnpack/isa-checks.h
new file mode 100644
index 0000000..0bdf97c
--- /dev/null
+++ b/src/xnnpack/isa-checks.h
@@ -0,0 +1,79 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <cpuinfo.h>
+
+
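+// Each TEST_REQUIRES_* macro skips the enclosing googletest test at run time
+// when the host CPU lacks the corresponding ISA extension; on PNaCl/WAsm
+// SIMD builds psimd support is implied, so TEST_REQUIRES_PSIMD expands to
+// nothing there. Illustrative usage (hypothetical test name):
+//
+//   TEST(F32_GEMM_4x8__sse_load1, k_eq_4) {
+//     TEST_REQUIRES_X86_SSE;
+//     // ... body runs only when SSE is available ...
+//   }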
+#if CPUINFO_ARCH_PNACL || CPUINFO_ARCH_WASMSIMD
+  #define TEST_REQUIRES_PSIMD
+#else
+  #define TEST_REQUIRES_PSIMD \
+    do { \
+      if (!cpuinfo_initialize() || !(cpuinfo_has_arm_neon() || cpuinfo_has_x86_sse2())) { \
+        GTEST_SKIP(); \
+      } \
+    } while (0)
+#endif
+
+#define TEST_REQUIRES_X86_SSE \
+  do { \
+    if (!cpuinfo_initialize() || !cpuinfo_has_x86_sse()) { \
+      GTEST_SKIP(); \
+    } \
+  } while (0)
+
+#define TEST_REQUIRES_X86_SSE2 \
+  do { \
+    if (!cpuinfo_initialize() || !cpuinfo_has_x86_sse2()) { \
+      GTEST_SKIP(); \
+    } \
+  } while (0)
+
+#define TEST_REQUIRES_X86_AVX \
+  do { \
+    if (!cpuinfo_initialize() || !cpuinfo_has_x86_avx()) { \
+      GTEST_SKIP(); \
+    } \
+  } while (0)
+
+#define TEST_REQUIRES_X86_AVX2 \
+  do { \
+    if (!cpuinfo_initialize() || !cpuinfo_has_x86_avx2()) { \
+      GTEST_SKIP(); \
+    } \
+  } while (0)
+
+#define TEST_REQUIRES_X86_AVX512F \
+  do { \
+    if (!cpuinfo_initialize() || !cpuinfo_has_x86_avx512f()) { \
+      GTEST_SKIP(); \
+    } \
+  } while (0)
+
+#define TEST_REQUIRES_ARM_NEON \
+  do { \
+    if (!cpuinfo_initialize() || !cpuinfo_has_arm_neon()) { \
+      GTEST_SKIP(); \
+    } \
+  } while (0)
+
+#define TEST_REQUIRES_ARM_NEON_FMA \
+  do { \
+    if (!cpuinfo_initialize() || !cpuinfo_has_arm_neon_fma()) { \
+      GTEST_SKIP(); \
+    } \
+  } while (0)
+
+#define TEST_REQUIRES_ARM_NEON_FP16_ARITH \
+  do { \
+    if (!cpuinfo_initialize() || !cpuinfo_has_arm_neon_fp16_arith()) { \
+      GTEST_SKIP(); \
+    } \
+  } while (0)
diff --git a/src/xnnpack/log.h b/src/xnnpack/log.h
new file mode 100644
index 0000000..9eb5abf
--- /dev/null
+++ b/src/xnnpack/log.h
@@ -0,0 +1,23 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <inttypes.h>
+
+#include <clog.h>
+
+#ifndef XNN_LOG_LEVEL
+#define XNN_LOG_LEVEL CLOG_DEBUG
+#endif
+
+CLOG_DEFINE_LOG_DEBUG(xnn_log_debug, "XNNPACK", XNN_LOG_LEVEL);
+CLOG_DEFINE_LOG_INFO(xnn_log_info, "XNNPACK", XNN_LOG_LEVEL);
+CLOG_DEFINE_LOG_WARNING(xnn_log_warning, "XNNPACK", XNN_LOG_LEVEL);
+CLOG_DEFINE_LOG_ERROR(xnn_log_error, "XNNPACK", XNN_LOG_LEVEL);
+CLOG_DEFINE_LOG_FATAL(xnn_log_fatal, "XNNPACK", XNN_LOG_LEVEL);
diff --git a/src/xnnpack/lut.h b/src/xnnpack/lut.h
new file mode 100644
index 0000000..49b0ec4
--- /dev/null
+++ b/src/xnnpack/lut.h
@@ -0,0 +1,44 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <xnnpack/params.h>
+#include <xnnpack/common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#define DECLARE_X8_LUT_UKERNEL_FUNCTION(fn_name) \
+  XNN_INTERNAL void fn_name(                     \
+      size_t n,                                  \
+      const uint8_t* x,                          \
+      const uint8_t* t,                          \
+      uint8_t* y);
+
+DECLARE_X8_LUT_UKERNEL_FUNCTION(xnn_x8_lut_ukernel__scalar)
+
+
+#define DECLARE_U8_LUT32NORM_UKERNEL_FUNCTION(fn_name) \
+  XNN_INTERNAL void fn_name(                           \
+      size_t n,                                        \
+      const uint8_t* x,                                \
+      const uint32_t* t,                               \
+      uint8_t* y);
+
+DECLARE_U8_LUT32NORM_UKERNEL_FUNCTION(xnn_u8_lut32norm_ukernel__scalar)
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
diff --git a/src/xnnpack/math.h b/src/xnnpack/math.h
new file mode 100644
index 0000000..60e46dc
--- /dev/null
+++ b/src/xnnpack/math.h
@@ -0,0 +1,64 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stddef.h>
+#include <assert.h>
+
+inline static size_t min(size_t a, size_t b) {
+  return a < b ? a : b;
+}
+
+inline static size_t max(size_t a, size_t b) {
+  return a > b ? a : b;
+}
+
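+// doz = "difference or zero": saturating subtraction, equal to max(a, b) - b.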
+inline static size_t doz(size_t a, size_t b) {
+  return a >= b ? a - b : 0;
+}
+
+inline static size_t divide_round_up(size_t n, size_t q) {
+  return n % q == 0 ? n / q : n / q + 1;
+}
+
+inline static size_t round_up(size_t n, size_t q) {
+  return divide_round_up(n, q) * q;
+}
+
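+// Rounds n down to a multiple of q, which must be a nonzero power of 2:
+// -q has ones in and above the log2(q) bit, so n & -q clears the remainder.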
+inline static size_t round_down_po2(size_t n, size_t q) {
+  assert(q != 0);
+  assert((q & (q - 1)) == 0);
+  return n & -q;
+}
+
+inline static size_t round_up_po2(size_t n, size_t q) {
+  return round_down_po2(n + q - 1, q);
+}
+
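+// Computes (a - b) mod m for a, b already reduced into [0, m); adding m back
+// undoes the unsigned wrap-around when a < b.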
+inline static size_t subtract_modulo(size_t a, size_t b, size_t m) {
+  assert(a < m);
+  assert(b < m);
+  return a >= b ? a - b : a - b + m;
+}
+
+inline static float math_min_f32(float a, float b) {
+  #if defined(__wasm__)
+    return __builtin_wasm_min_f32(a, b);
+  #else
+    return a < b ? a : b;
+  #endif
+}
+
+inline static float math_max_f32(float a, float b) {
+  #if defined(__wasm__)
+    return __builtin_wasm_max_f32(a, b);
+  #else
+    return a > b ? a : b;
+  #endif
+}
diff --git a/src/xnnpack/maxpool.h b/src/xnnpack/maxpool.h
new file mode 100644
index 0000000..1c134d7
--- /dev/null
+++ b/src/xnnpack/maxpool.h
@@ -0,0 +1,56 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <xnnpack/params.h>
+#include <xnnpack/common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#define DECLARE_F32_MAXPOOL_UKERNEL_FUNCTION(fn_name) \
+  XNN_INTERNAL void fn_name(                          \
+      size_t n,                                       \
+      size_t ks,                                      \
+      size_t kc,                                      \
+      const float** x,                                \
+      float* y,                                       \
+      size_t x_increment,                             \
+      size_t y_increment,                             \
+      const union xnn_f32_output_params* params);
+
+DECLARE_F32_MAXPOOL_UKERNEL_FUNCTION(xnn_f32_maxpool_ukernel_9p8q__psimd)
+DECLARE_F32_MAXPOOL_UKERNEL_FUNCTION(xnn_f32_maxpool_ukernel_9p8q__scalar)
+DECLARE_F32_MAXPOOL_UKERNEL_FUNCTION(xnn_f32_maxpool_ukernel_9p8q__sse)
+
+
+#define DECLARE_U8_MAXPOOL_UKERNEL_FUNCTION(fn_name) \
+  XNN_INTERNAL void fn_name(                         \
+      size_t n,                                      \
+      size_t ks,                                     \
+      size_t kc,                                     \
+      const uint8_t** x,                             \
+      uint8_t* y,                                    \
+      size_t x_increment,                            \
+      size_t y_increment,                            \
+      const union xnn_u8_output_params* params);
+
+DECLARE_U8_MAXPOOL_UKERNEL_FUNCTION(xnn_u8_maxpool_ukernel_9p8q__neon)
+DECLARE_U8_MAXPOOL_UKERNEL_FUNCTION(xnn_u8_maxpool_ukernel_9p8q__scalar)
+DECLARE_U8_MAXPOOL_UKERNEL_FUNCTION(xnn_u8_maxpool_ukernel_9p8q__sse2)
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
diff --git a/src/xnnpack/operator.h b/src/xnnpack/operator.h
new file mode 100644
index 0000000..a34d6fd
--- /dev/null
+++ b/src/xnnpack/operator.h
@@ -0,0 +1,275 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <pthreadpool.h>
+
+#include <xnnpack/requantization.h>
+#include <xnnpack/compute.h>
+
+
+enum xnn_ukernel_type {
+  xnn_ukernel_type_none = 0,
+  xnn_ukernel_type_add,
+  xnn_ukernel_type_argmax_pooling,
+  xnn_ukernel_type_average_pooling,
+  xnn_ukernel_type_channel_shuffle,
+  xnn_ukernel_type_clamp,
+  xnn_ukernel_type_igemm,
+  xnn_ukernel_type_dconv2d_hwc2spchw,
+  xnn_ukernel_type_dwconv,
+  xnn_ukernel_type_gemm,
+  xnn_ukernel_type_global_average_pooling,
+  xnn_ukernel_type_hswish,
+  xnn_ukernel_type_lut,
+  xnn_ukernel_type_max_pooling,
+  xnn_ukernel_type_pad,
+  xnn_ukernel_type_pixelwise_average_pooling,
+  xnn_ukernel_type_prelu,
+  xnn_ukernel_type_softargmax,
+  xnn_ukernel_type_spmm,
+  xnn_ukernel_type_subconv2d,
+  xnn_ukernel_type_unpooling,
+  xnn_ukernel_type_vmulcaddc,
+};
+
+enum xnn_operator_type {
+  xnn_operator_type_none = 0,
+  xnn_operator_type_add_f32,
+  xnn_operator_type_add_q8,
+  xnn_operator_type_argmax_pooling_f32,
+  xnn_operator_type_average_pooling_f32,
+  xnn_operator_type_average_pooling_q8,
+  xnn_operator_type_channel_pad_x32,
+  xnn_operator_type_channel_shuffle_x8,
+  xnn_operator_type_channel_shuffle_x32,
+  xnn_operator_type_clamp_f32,
+  xnn_operator_type_clamp_u8,
+  xnn_operator_type_convolution_f32,
+  xnn_operator_type_convolution_spnchw_f32,
+  xnn_operator_type_convolution_q8,
+  xnn_operator_type_deconvolution_f32,
+  xnn_operator_type_deconvolution_q8,
+  xnn_operator_type_fully_connected_f32,
+  xnn_operator_type_fully_connected_q8,
+  xnn_operator_type_global_average_pooling_f32,
+  xnn_operator_type_global_average_pooling_q8,
+  xnn_operator_type_global_average_pooling_spnchw_f32,
+  xnn_operator_type_hswish_f32,
+  xnn_operator_type_leaky_relu_q8,
+  xnn_operator_type_max_pooling_f32,
+  xnn_operator_type_max_pooling_u8,
+  xnn_operator_type_prelu_f32,
+  xnn_operator_type_sigmoid_q8,
+  xnn_operator_type_softargmax_q8,
+  xnn_operator_type_unpooling_x32,
+};
+
+struct xnn_ukernel_dconv2d {
+  union {
+    xnn_conv_hwc2spchw_ukernel_function hwc2spchw_function;
+    xnn_conv_hwc_ukernel_function hwc_function;
+  };
+  uint8_t output_height_tile;
+  uint8_t output_channel_tile;
+};
+
+struct xnn_ukernel_dwconv {
+  union {
+    xnn_dwconv_up_ukernel_function unipass_function;
+    xnn_dwconv_mp_ukernel_function multipass_function;
+  };
+  uint8_t mr;
+  uint8_t qr;
+};
+
+// Direct 2D Depthwise Convolution
+struct xnn_ukernel_dwconv2d {
+  union {
+    xnn_dwconv_spchw_ukernel_function spchw_function;
+  };
+  uint8_t input_width_tile;
+  uint8_t output_width_tile;
+};
+
+struct xnn_ukernel_gemm {
+  xnn_gemm_ukernel_function default_function;
+  xnn_gemm_ukernel_function mr1_function;
+  uint8_t mr;
+  uint8_t nr;
+  uint8_t kr;
+};
+
+struct xnn_ukernel_igemm {
+  xnn_igemm_ukernel_function default_function;
+  xnn_igemm_ukernel_function mr1_function;
+  uint8_t mr;
+  uint8_t nr;
+  uint8_t kr;
+};
+
+struct xnn_ukernel_spmm {
+  xnn_spmm_ukernel_function function;
+  uint8_t mr;
+};
+
+struct xnn_ukernel_vmulcaddc {
+  xnn_vmulcaddc_ukernel_function function;
+  uint8_t mr;
+};
+
+struct xnn_ukernel {
+  enum xnn_ukernel_type type;
+  union {
+    struct xnn_ukernel_dconv2d dconv2d;
+    struct xnn_ukernel_dwconv dwconv;
+    struct xnn_ukernel_dwconv2d dwconv2d;
+    struct xnn_ukernel_gemm gemm;
+    struct xnn_ukernel_igemm igemm;
+    struct xnn_ukernel_spmm spmm;
+    struct xnn_ukernel_vmulcaddc vmulcaddc;
+  };
+};
+
+enum xnn_run_state {
+  xnn_run_state_invalid = 0,
+  xnn_run_state_ready,
+  xnn_run_state_skip,
+};
+
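+// One entry per (oy, ox) output-stride phase of a deconvolution; filled by
+// the xnn_pack_*_deconv_goki_w packing routines below (in pack.h) and
+// consumed by the subconv2d compute path.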
+struct subconvolution_params {
+  void* weights;
+  size_t w_stride;
+  const void** indirection_buffer;
+  void* output;
+  size_t slice_width;
+  size_t slice_height;
+  size_t indirection_y_stride;
+  size_t indirection_x_stride;
+  /* kernel_size * mr * sizeof(void*) */
+  size_t scaled_kernel_size;
+};
+
+struct xnn_operator {
+  size_t batch_size;
+  uint32_t padding_top;
+  uint32_t padding_right;
+  uint32_t padding_bottom;
+  uint32_t padding_left;
+  uint32_t adjustment_height;
+  uint32_t adjustment_width;
+  uint32_t kernel_height;
+  uint32_t kernel_width;
+  uint32_t stride_height;
+  uint32_t stride_width;
+  uint32_t dilation_height;
+  uint32_t dilation_width;
+  uint32_t groups;
+  size_t group_channels;
+  size_t group_input_channels;
+  size_t group_output_channels;
+  size_t channels;
+
+  size_t pad_before_channels;
+  size_t pad_after_channels;
+  uint32_t pad_value;
+
+  size_t input_height;
+  size_t input_width;
+  size_t input_pixel_stride;
+  const void* input;
+  const void** indirection_buffer;
+  void* a_sum;
+
+  size_t input2_pixel_stride;
+  const void* input2;
+
+  size_t output_height;
+  size_t output_width;
+  size_t output_pixel_stride;
+  void* output;
+
+  void* packed_weights;
+  // Total number of non-zero kernel elements when weights use sparse representation.
+  size_t num_nonzero_values;
+  // Total number of non-zero kernel blocks when weights use sparse representation.
+  size_t num_nonzero_blocks;
+  // Total number of output channel blocks when weights use sparse representation.
+  size_t num_output_channel_blocks;
+  // Input channel corresponding to the first non-zero kernel element.
+  size_t first_input_channel;
+
+  float input_scale;
+  float output_scale;
+  uint8_t input_zero_point;
+  uint8_t kernel_zero_point;
+  uint8_t output_zero_point;
+  uint8_t output_min;
+  uint8_t output_max;
+
+  size_t valid_batch_size;
+  size_t last_input_height;
+  size_t last_input_width;
+  const void* last_input;
+  void* last_output;
+
+  void* zero_buffer;
+  void* lookup_table;
+  void* pixelwise_buffer;
+  struct subconvolution_params* subconvolution_buffer;
+
+  union {
+    union xnn_f32_avgpool_params f32_avgpool_params;
+    union xnn_f32_gavgpool_params f32_gavgpool_params;
+    union xnn_f32_hswish_params f32_hswish_params;
+    union xnn_f32_output_params f32_output_params;
+    union xnn_f32_spchw_params f32_spchw_params;
+    union xnn_q8_add_params q8_add_params;
+    union xnn_q8_avgpool_params q8_avgpool_params;
+    union xnn_q8_gemm_params q8_gemm_params;
+    union xnn_u8_output_params u8_output_params;
+  };
+  enum xnn_operator_type type;
+  struct xnn_ukernel ukernel;
+
+  struct compute_parameters compute;
+  struct compute_parameters compute2;
+  union {
+    struct add_contiguous_context add_contiguous;
+    struct add_strided_context add_strided;
+    struct argmax_pooling_context argmax_pooling;
+    struct average_pooling_context average_pooling;
+    struct channel_pad_context channel_pad;
+    struct channel_shuffle_context channel_shuffle;
+    struct dconv2d_context dconv2d;
+    struct dwconv2d_context dwconv2d;
+    struct dwconv_context dwconv;
+    struct gemm_context gemm;
+    struct global_average_pooling_context global_average_pooling;
+    struct global_average_pooling_spnchw_context global_average_pooling_spnchw;
+    struct igemm_context igemm;
+    struct lut_contiguous_context lut_contiguous;
+    struct lut_strided_context lut_strided;
+    struct max_pooling_context max_pooling;
+    struct pixelwise_average_pooling_context pixelwise_average_pooling;
+    struct prelu_context prelu;
+    struct spmm_context spmm;
+    struct subconv_context subconv;
+    struct u8_softargmax_context u8_softargmax;
+    struct univector_contiguous_context univector_contiguous;
+    struct univector_strided_context univector_strided;
+    struct unpooling_context unpooling;
+    struct vmulcaddc_context vmulcaddc;
+  } context;
+
+  enum xnn_run_state state;
+};
diff --git a/src/xnnpack/pack.h b/src/xnnpack/pack.h
new file mode 100644
index 0000000..4bc31c2
--- /dev/null
+++ b/src/xnnpack/pack.h
@@ -0,0 +1,646 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stdint.h>
+#include <xnnpack/math.h>
+#include <xnnpack/operator.h>
+
+
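+// Weight-packing helpers. GEMM/IGEMM microkernels consume weights as tiles
+// of nr bias values followed by interleaved kr-element slivers of nr kernel
+// rows, padded out to full nr/kr blocks.
+//
+// For Q8 packing, the input-independent zero-point terms are folded into the
+// bias up front. With input zero point izp and kernel zero point kzp, each
+// accumulator is
+//
+//   sum_k (a[k] - izp) * (w[k] - kzp)
+//     = sum_k a[k]*w[k] - kzp * sum_k a[k] - izp * sum_k w[k] + kc*izp*kzp
+//
+// The two terms that do not depend on the input, kc*izp*kzp (boff below) and
+// -izp * sum_k w[k] (the per-row ksum adjustment), are added to the packed
+// bias so the microkernels never have to recompute them.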
+static inline void xnn_pack_q8_gemm_goi_w(
+  size_t g,
+  size_t nc,
+  size_t kc,
+  uint32_t nr,
+  uint32_t kr,
+  uint8_t izp,
+  uint8_t kzp,
+  const uint8_t* k,
+  const int32_t* b,
+  void* packed_w)
+{
+  const int32_t boff = (int32_t) kc * (int32_t) izp * (int32_t) kzp;
+  do {
+    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
+      const size_t nr_block_size = min(nc - nr_block_start, nr);
+      int32_t* packed_b = (int32_t*) packed_w;
+      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+        *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
+        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+      }
+      packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
+      for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
+        const size_t kr_block_size = min(kc - kr_block_start, kr);
+        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+          int32_t ksum = 0;
+          for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
+            const uint8_t kv = k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
+            ksum += (int32_t) kv;
+            *((uint8_t*) packed_w) = kv;
+            packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
+          }
+          packed_b[nr_block_offset] -= ksum * (int32_t) izp;
+          packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
+        }
+        packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
+      }
+    }
+    k += nc * kc;
+    b += nc;
+  } while (--g != 0);
+}
+
+static inline void xnn_pack_q8_conv_goki_w(
+  size_t g,
+  size_t nc,
+  size_t ks,
+  size_t kc,
+  uint32_t nr,
+  uint32_t kr,
+  uint8_t izp,
+  uint8_t kzp,
+  const uint8_t* k,
+  const int32_t* b,
+  void* packed_w)
+{
+  const int32_t boff = (int32_t) ks * (int32_t) kc * (int32_t) izp * (int32_t) kzp;
+  do {
+    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
+      const size_t nr_block_size = min(nc - nr_block_start, nr);
+      int32_t* packed_b = (int32_t*) packed_w;
+      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+        *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
+        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+      }
+      packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
+      for (size_t ki = 0; ki < ks; ki++) {
+        for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
+          const size_t kr_block_size = min(kc - kr_block_start, kr);
+          for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+            int32_t ksum = 0;
+            for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
+              const uint8_t kv =
+                k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
+              ksum += (int32_t) kv;
+              *((uint8_t*) packed_w) = kv;
+              packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
+            }
+            packed_b[nr_block_offset] -= ksum * (int32_t) izp;
+            packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
+          }
+          packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
+        }
+      }
+    }
+    k += ks * kc * nc;
+    b += nc;
+  } while (--g != 0);
+}
+
+static inline void xnn_pack_q8_conv_kgo_w(
+  size_t g,
+  size_t nc,
+  size_t ks,
+  uint32_t nr,
+  uint32_t kr,
+  uint8_t izp,
+  uint8_t kzp,
+  const uint8_t* k,
+  const int32_t* b,
+  void* packed_w)
+{
+  const int32_t boff = (int32_t) ks * (int32_t) izp * (int32_t) kzp;
+  for (size_t i = 0; i < g; i++) {
+    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
+      const size_t nr_block_size = min(nc - nr_block_start, nr);
+      int32_t* packed_b = (int32_t*) packed_w;
+      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+        *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
+        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+      }
+      packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
+      for (size_t ki = 0; ki < ks; ki++) {
+        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+          const uint8_t kv =
+            k[ki * g * nc + (nr_block_start + nr_block_offset)];
+          *((uint8_t*) packed_w) = kv;
+          packed_b[nr_block_offset] -= (int32_t) kv * (int32_t) izp;
+          packed_w = (void*) ((uintptr_t) packed_w + kr * sizeof(uint8_t));
+        }
+        packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
+      }
+    }
+    k += nc;
+    b += nc;
+  }
+}
+
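+// Deconvolution is packed as sh*sw independent "subconvolutions": the
+// (oy, ox) subkernel keeps only the taps with ky = oy (mod sh) and
+// kx = ox (mod sw), and params[oy*sw + ox].weights records where each
+// subkernel's packed weights begin.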
+static inline void xnn_pack_q8_deconv_goki_w(
+  size_t g,
+  size_t nc,
+  size_t kh,
+  size_t kw,
+  size_t kc,
+  size_t sh,
+  size_t sw,
+  size_t nr,
+  size_t kr,
+  uint8_t izp,
+  uint8_t kzp,
+  const uint8_t* k,
+  const int32_t* b,
+  void* packed_w,
+  struct subconvolution_params* params)
+{
+  for (size_t i = 0; i < g; i++) {
+    for (size_t oy = 0; oy < sh; oy++) {
+      for (size_t ox = 0; ox < sw; ox++) {
+        if (i == 0) {
+          (*params++).weights = packed_w;
+        }
+        const int32_t boff = (int32_t) divide_round_up(kh - oy, sh) * (int32_t) divide_round_up(kw - ox, sw) * (int32_t) kc * (int32_t) izp * (int32_t) kzp;
+        for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
+          const size_t nr_block_size = min(nc - nr_block_start, nr);
+          int32_t* packed_b = (int32_t*) packed_w;
+          for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+            *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
+            packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+          }
+          packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
+          for (size_t ky = oy; ky < kh; ky += sh) {
+            for (size_t kx = ox; kx < kw; kx += sw) {
+              for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
+                const size_t kr_block_size = min(kc - kr_block_start, kr);
+                for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+                  int32_t ksum = 0;
+                  for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
+                    const uint8_t kv =
+                      k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
+                    ksum += (int32_t) kv;
+                    *((uint8_t*) packed_w) = kv;
+                    packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
+                  }
+                  packed_b[nr_block_offset] -= ksum * (int32_t) izp;
+                  packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
+                }
+                packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
+              }
+            }
+          }
+        }
+      }
+    }
+    k += kh * kw * kc * nc;
+    b += nc;
+  }
+}
+
+static inline void xnn_pack_q8_dwconv_ghw_w(
+  size_t h,
+  size_t w,
+  size_t c,
+  size_t cr,
+  uint8_t izp,
+  uint8_t kzp,
+  const uint8_t* k,
+  const int32_t* b,
+  void* packed_w)
+{
+  const int32_t boff = (int32_t) h * (int32_t) w * (int32_t) izp * (int32_t) kzp;
+  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
+    const size_t cr_block_size = min(c - cr_block_start, cr);
+    int32_t* packed_b = (int32_t*) packed_w;
+    for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
+      *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset] + boff;
+      packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+    }
+    packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
+    for (size_t x = 0; x < w; x++) {
+      for (size_t y = 0; y < h; y++) {
+        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
+          const uint8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
+          packed_b[cr_block_offset] -= (int32_t) kv * (int32_t) izp;
+          *((uint8_t*) packed_w) = kv;
+          packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
+        }
+        packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(uint8_t));
+      }
+    }
+  }
+}
+
+static inline void xnn_pack_q8_dwconv_hwg_w(
+  size_t h,
+  size_t w,
+  size_t c,
+  size_t cr,
+  uint8_t izp,
+  uint8_t kzp,
+  const uint8_t* k,
+  const int32_t* b,
+  void* packed_w)
+{
+  const int32_t boff = (int32_t) h * (int32_t) w * (int32_t) izp * (int32_t) kzp;
+  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
+    const size_t cr_block_size = min(c - cr_block_start, cr);
+    int32_t* packed_b = (int32_t*) packed_w;
+    for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
+      *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset] + boff;
+      packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+    }
+    packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
+    for (size_t x = 0; x < w; x++) {
+      for (size_t y = 0; y < h; y++) {
+        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
+          const uint8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
+          packed_b[cr_block_offset] -= (int32_t) kv * (int32_t) izp;
+          *((uint8_t*) packed_w) = kv;
+          packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
+        }
+        packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(uint8_t));
+      }
+    }
+  }
+}
+
+static inline void xnn_pack_f16_gemm_goi_w(
+  size_t g,
+  size_t nc,
+  size_t kc,
+  size_t nr,
+  size_t kr,
+  const uint16_t* k,
+  const uint16_t* b,
+  uint16_t* packed_w)
+{
+  do {
+    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
+      const size_t nr_block_size = min(nc - nr_block_start, nr);
+      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+        *packed_w++ = b[nr_block_start + nr_block_offset];
+      }
+      packed_w += nr - nr_block_size;
+      for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
+        const size_t kr_block_size = min(kc - kr_block_start, kr);
+        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+          for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
+            *packed_w++ =
+              k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
+          }
+          packed_w += kr - kr_block_size;
+        }
+        packed_w += (nr - nr_block_size) * kr;
+      }
+    }
+    k += nc * kc;
+    b += nc;
+  } while (--g != 0);
+}
+
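+// Packs f32 weights for (I)GEMM. `sr` supports the shuffled "s4"-style
+// kernels (e.g. xnn_f32_gemm_ukernel_4x8s4__sse): within each sr*kr
+// superblock of the K dimension, row nr_block_offset starts its kr-slivers
+// rotated by nr_block_offset * kr elements (the sr_mask arithmetic below),
+// matching the in-register rotation those kernels perform; the kc remainder
+// past the last full superblock is packed unshuffled.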
+static inline void xnn_pack_f32_gemm_goi_w(
+  size_t g,
+  size_t nc,
+  size_t kc,
+  size_t nr,
+  size_t kr,
+  size_t sr,
+  const float* k,
+  const float* b,
+  float* packed_w)
+{
+  const size_t skr = sr * kr;
+  const size_t skc = round_down_po2(kc, skr);
+  const size_t sr_mask = (sr - 1) * kr;
+  do {
+    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
+      const size_t nr_block_size = min(nc - nr_block_start, nr);
+      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+        *packed_w++ = b[nr_block_start + nr_block_offset];
+      }
+      packed_w += nr - nr_block_size;
+
+      for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
+        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
+            *packed_w++ =
+              k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
+          }
+        }
+        packed_w += (nr - nr_block_size) * kr;
+      }
+
+      for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
+        const size_t kr_block_size = min(kc - kr_block_start, kr);
+        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+          for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
+            *packed_w++ =
+              k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
+          }
+          packed_w += kr - kr_block_size;
+        }
+        packed_w += (nr - nr_block_size) * kr;
+      }
+    }
+    k += nc * kc;
+    b += nc;
+  } while (--g != 0);
+}
+
+static inline void xnn_pack_f32_gemminc_goi_w(
+  size_t g,
+  size_t nc,
+  size_t kc,
+  size_t nr,
+  size_t kr,
+  size_t sr,
+  const float* k,
+  float* packed_w)
+{
+  const size_t skr = sr * kr;
+  const size_t skc = round_down_po2(kc, skr);
+  const size_t sr_mask = (sr - 1) * kr;
+  do {
+    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
+      const size_t nr_block_size = min(nc - nr_block_start, nr);
+
+      for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
+        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
+            *packed_w++ =
+              k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
+          }
+        }
+        packed_w += (nr - nr_block_size) * kr;
+      }
+
+      for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
+        const size_t kr_block_size = min(kc - kr_block_start, kr);
+        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+          for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
+            *packed_w++ =
+              k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
+          }
+          packed_w += kr - kr_block_size;
+        }
+        packed_w += (nr - nr_block_size) * kr;
+      }
+    }
+    k += nc * kc;
+  } while (--g != 0);
+}
+
+static inline void xnn_pack_f32_conv_goki_w(
+  size_t g,
+  size_t nc,
+  size_t ks,
+  size_t kc,
+  size_t nr,
+  size_t kr,
+  size_t sr,
+  const float* k,
+  const float* b,
+  float* packed_w)
+{
+  const size_t skr = sr * kr;
+  const size_t skc = round_down_po2(kc, skr);
+  const size_t sr_mask = (sr - 1) * kr;
+  do {
+    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
+      const size_t nr_block_size = min(nc - nr_block_start, nr);
+      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+        *packed_w++ = b[nr_block_start + nr_block_offset];
+      }
+      packed_w += nr - nr_block_size;
+
+      for (size_t ki = 0; ki < ks; ki++) {
+        for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
+          for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+            for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
+              *packed_w++ =
+                k[((nr_block_start + nr_block_offset) * ks + ki) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
+            }
+          }
+          packed_w += (nr - nr_block_size) * kr;
+        }
+
+        for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
+          const size_t kr_block_size = min(kc - kr_block_start, kr);
+          for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+            for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
+              *packed_w++ =
+                k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
+            }
+            packed_w += kr - kr_block_size;
+          }
+          packed_w += (nr - nr_block_size) * kr;
+        }
+      }
+    }
+    k += ks * kc * nc;
+    b += nc;
+  } while (--g != 0);
+}
+
+static inline void xnn_pack_f32_conv_kgo_w(
+  size_t g,
+  size_t nc,
+  size_t ks,
+  size_t nr,
+  size_t kr,
+  const float* k,
+  const float* b,
+  float* packed_w)
+{
+  for (size_t i = 0; i < g; i++) {
+    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
+      const size_t nr_block_size = min(nc - nr_block_start, nr);
+      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+        *packed_w++ = b[nr_block_start + nr_block_offset];
+      }
+      packed_w += nr - nr_block_size;
+      for (size_t ki = 0; ki < ks; ki++) {
+        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+          *packed_w =
+            k[ki * g * nc + (nr_block_start + nr_block_offset)];
+          packed_w += kr;
+        }
+        packed_w += (nr - nr_block_size) * kr;
+      }
+    }
+    k += nc;
+    b += nc;
+  }
+}
+
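+// Packs weights for the direct-convolution (HWC) kernels. A partial final nr
+// block is padded by replicating the last valid output channel's bias and
+// weights (the min(nr_block_offset, nr_block_size - 1) indexing below)
+// rather than zero-filling, so the kernel always reads a full block of nr
+// channels.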
+static inline void xnn_pack_f32_dconv_oki_w(
+  size_t nc,
+  size_t kc,
+  size_t nr,
+  size_t kh,
+  size_t kw,
+  const float* k,
+  const float* b,
+  float* packed_w)
+{
+  for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
+    const size_t nr_block_size = min(nc - nr_block_start, nr);
+    for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
+      *packed_w++ = b[nr_block_start + min(nr_block_offset, nr_block_size - 1)];
+    }
+
+    for (size_t kx = 0; kx < kw; kx++) {
+      for (size_t c = 0; c < kc; c++) {
+        for (size_t ky = 0; ky < kh; ky++) {
+          for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
+            *packed_w++ = k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) * kh + ky) * kw + kx) * kc + c];
+          }
+        }
+      }
+    }
+  }
+}
+
+static inline void xnn_pack_f32_deconv_goki_w(
+  size_t g,
+  size_t nc,
+  size_t kh,
+  size_t kw,
+  size_t kc,
+  size_t sh,
+  size_t sw,
+  size_t nr,
+  size_t kr,
+  const float* k,
+  const float* b,
+  float* packed_w,
+  struct subconvolution_params* params)
+{
+  for (size_t i = 0; i < g; i++) {
+    for (size_t oy = 0; oy < sh; oy++) {
+      for (size_t ox = 0; ox < sw; ox++) {
+        if (i == 0) {
+          (*params++).weights = packed_w;
+        }
+        for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
+          const size_t nr_block_size = min(nc - nr_block_start, nr);
+          for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+            *packed_w++ = b[nr_block_start + nr_block_offset];
+          }
+          packed_w += nr - nr_block_size;
+          for (size_t ky = oy; ky < kh; ky += sh) {
+            for (size_t kx = ox; kx < kw; kx += sw) {
+              for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
+                const size_t kr_block_size = min(kc - kr_block_start, kr);
+                for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+                  for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
+                    *packed_w++ =
+                      k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
+                  }
+                  packed_w += kr - kr_block_size;
+                }
+                packed_w += (nr - nr_block_size) * kr;
+              }
+            }
+          }
+        }
+      }
+    }
+    k += kh * kw * kc * nc;
+    b += nc;
+  }
+}
+
+static inline void xnn_pack_f32_dwconv_ghw_w(
+  size_t h,
+  size_t w,
+  size_t c,
+  size_t cr,
+  const float* k,
+  const float* b,
+  float* packed_w)
+{
+  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
+    const size_t cr_block_size = min(c - cr_block_start, cr);
+    for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
+      *packed_w++ = b[cr_block_start + cr_block_offset];
+    }
+    packed_w += cr - cr_block_size;
+    for (size_t x = 0; x < w; x++) {
+      for (size_t y = 0; y < h; y++) {
+        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
+          const float kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
+          *packed_w++ = kv;
+        }
+        packed_w += cr - cr_block_size;
+      }
+    }
+  }
+}
+
+static inline void xnn_pack_f32_dwconv_hwg_w(
+  size_t h,
+  size_t w,
+  size_t c,
+  size_t cr,
+  const float* k,
+  const float* b,
+  float* packed_w)
+{
+  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
+    const size_t cr_block_size = min(c - cr_block_start, cr);
+    for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
+      *packed_w++ = b[cr_block_start + cr_block_offset];
+    }
+    packed_w += cr - cr_block_size;
+    for (size_t x = 0; x < w; x++) {
+      for (size_t y = 0; y < h; y++) {
+        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
+          const float kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
+          *packed_w++ = kv;
+        }
+        packed_w += cr - cr_block_size;
+      }
+    }
+  }
+}
+
+static inline void xnn_pack_f32_spchw_dwconv_ghw_w(
+  size_t kernel_size,
+  size_t groups,
+  const float* kernel,
+  const float* bias,
+  float* packed_weights)
+{
+  for (size_t g = 0; g < groups; g++) {
+    *packed_weights++ = *bias++;
+    for (size_t i = 0; i < kernel_size; i++) {
+      *packed_weights++ = kernel[g * kernel_size + i];
+    }
+  }
+}
+
+static inline void xnn_pack_f32_vmulcaddc_w(
+  size_t c,
+  size_t cr,
+  const float* s,
+  const float* b,
+  float* packed_w)
+{
+  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
+    const size_t cr_block_size = min(c - cr_block_start, cr);
+    for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
+      packed_w[cr_block_offset] = s[cr_block_start + cr_block_offset];
+    }
+    packed_w += cr;
+    for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
+      packed_w[cr_block_offset] = b[cr_block_start + cr_block_offset];
+    }
+    packed_w += cr;
+  }
+}
diff --git a/src/xnnpack/packx.h b/src/xnnpack/packx.h
new file mode 100644
index 0000000..20b3bc1
--- /dev/null
+++ b/src/xnnpack/packx.h
@@ -0,0 +1,36 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <xnnpack/common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
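+// Pre-packs an m-row panel of the 32-bit left-hand operand (x, with rows
+// x_stride apart) into the contiguous layout expected by the pre-packed
+// GEMM ("ppmm") microkernels.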
+#define DECLARE_X32_PACKX_UKERNEL_FUNCTION(fn_name) \
+  XNN_INTERNAL void fn_name(                         \
+      size_t m,                                      \
+      size_t k,                                      \
+      const uint32_t* x,                             \
+      size_t x_stride,                               \
+      uint32_t* y);
+
+DECLARE_X32_PACKX_UKERNEL_FUNCTION(xnn_x32_packx_ukernel_2x__scalar)
+DECLARE_X32_PACKX_UKERNEL_FUNCTION(xnn_x32_packx_ukernel_3x__scalar)
+DECLARE_X32_PACKX_UKERNEL_FUNCTION(xnn_x32_packx_ukernel_4x__neon_st4)
+DECLARE_X32_PACKX_UKERNEL_FUNCTION(xnn_x32_packx_ukernel_4x__psimd)
+DECLARE_X32_PACKX_UKERNEL_FUNCTION(xnn_x32_packx_ukernel_4x__scalar)
+DECLARE_X32_PACKX_UKERNEL_FUNCTION(xnn_x32_packx_ukernel_4x__sse)
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
diff --git a/src/xnnpack/pad.h b/src/xnnpack/pad.h
new file mode 100644
index 0000000..3cb8103
--- /dev/null
+++ b/src/xnnpack/pad.h
@@ -0,0 +1,39 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <xnnpack/params.h>
+#include <xnnpack/common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#define DECLARE_PAD_UKERNEL_FUNCTION(fn_name) \
+  XNN_INTERNAL void fn_name(                  \
+    size_t m,                                 \
+    size_t n,                                 \
+    size_t l,                                 \
+    size_t r,                                 \
+    uint32_t c,                               \
+    const void* input,                        \
+    size_t input_stride,                      \
+    void* output,                             \
+    size_t output_stride);
+
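+// Pads m rows of data: in each row, l leading and r trailing padding filled
+// with the 32-bit pattern c surround n bytes copied from the input (the size
+// parameters are presumably byte counts, since these x32 kernels move whole
+// 4-byte words); backs the channel-pad operator.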
+DECLARE_PAD_UKERNEL_FUNCTION(xnn_x32_pad_x2__neon)
+DECLARE_PAD_UKERNEL_FUNCTION(xnn_x32_pad_x2__psimd)
+DECLARE_PAD_UKERNEL_FUNCTION(xnn_x32_pad_x2__scalar)
+DECLARE_PAD_UKERNEL_FUNCTION(xnn_x32_pad_x2__sse2)
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
diff --git a/src/xnnpack/params.h b/src/xnnpack/params.h
new file mode 100644
index 0000000..30e8393
--- /dev/null
+++ b/src/xnnpack/params.h
@@ -0,0 +1,1304 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include <cpuinfo.h>
+
+#include <xnnpack/common.h>
+
+#define XNN_INTERNAL_EXTRA_BYTES 32
+
+struct xnn_f16_output_params {
+  uint16_t scale;
+  uint16_t max;
+  uint16_t min;
+};
+
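+// Where a SIMD variant of a parameter struct exists, each scalar constant is
+// replicated across all lanes of an aligned vector (e.g. float max[4] for
+// SSE) so microkernels can fetch parameters with a single aligned load.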
+union xnn_f32_output_params {
+  struct {
+    float max;
+    float min;
+  } scalar;
+#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+  struct {
+    XNN_ALIGN(16) float max[4];
+    XNN_ALIGN(16) float min[4];
+  } sse;
+#endif /* CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 */
+};
+
+union xnn_f32_spchw_params {
+  struct {
+    float max;
+    float min;
+  } scalar;
+#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+  struct {
+    XNN_ALIGN(16) uint32_t mask_even[4]; // used by stride 2 kernels
+    XNN_ALIGN(16) uint32_t mask_odd[4];  // used by stride 2 kernels
+    XNN_ALIGN(16) uint32_t mask[4]; // used by stride 1 kernels
+    float min;
+    float max;
+  } neon;
+#elif CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+  struct {
+    XNN_ALIGN(16) uint32_t mask_even[4]; // used by stride 2 kernels
+    XNN_ALIGN(16) uint32_t mask_odd[4];  // used by stride 2 kernels
+    XNN_ALIGN(16) uint32_t mask[4]; // used by stride 1 kernels
+    XNN_ALIGN(16) float max[4];
+    XNN_ALIGN(16) float min[4];
+  } sse;
+#endif /* CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 */
+};
+
+union xnn_u8_output_params {
+  struct {
+    int32_t max;
+    int32_t min;
+  } scalar;
+#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+  struct {
+    uint8_t max;
+    uint8_t min;
+  } neon;
+#endif /* CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 */
+#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+  struct {
+    XNN_ALIGN(16) uint8_t max[16];
+    XNN_ALIGN(16) uint8_t min[16];
+  } sse2;
+#endif /* CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 */
+};
+
+union xnn_f32_avgpool_params {
+  struct {
+    float multiplier;
+    float output_min;
+    float output_max;
+  } scalar;
+#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+  struct {
+    XNN_ALIGN(16) float multiplier[4];
+    XNN_ALIGN(16) float output_max[4];
+    XNN_ALIGN(16) float output_min[4];
+  } sse2;
+#endif /* CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 */
+#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+  struct {
+    XNN_ALIGN(16) float multiplier;
+    XNN_ALIGN(16) float output_max;
+    XNN_ALIGN(16) float output_min;
+  } neon;
+#endif /* CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 */
+};
+
+union xnn_f32_gavgpool_params {
+  struct {
+    float multiplier;
+    float output_min;
+    float output_max;
+  } scalar;
+#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+  struct {
+    XNN_ALIGN(16) float multiplier[4];
+    XNN_ALIGN(16) float output_max[4];
+    XNN_ALIGN(16) float output_min[4];
+    XNN_ALIGN(16) uint32_t mask[4];
+  } sse;
+#endif /* CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 */
+#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+  struct {
+    XNN_ALIGN(16) float multiplier;
+    XNN_ALIGN(16) float output_max;
+    XNN_ALIGN(16) float output_min;
+    XNN_ALIGN(16) uint32_t mask[4];
+  } neon;
+#endif /* CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 */
+};
+
+union xnn_f32_hswish_params {
+  struct {
+    float sixth;
+    float half;
+    float one;
+  } scalar;
+#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+  struct {
+    XNN_ALIGN(16) float sixth[4];
+    XNN_ALIGN(16) float half[4];
+    XNN_ALIGN(16) float one[4];
+  } sse;
+#endif /* CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 */
+};
+
+union xnn_q8_gemm_params {
+  struct {
+    int32_t kernel_zero_point;
+    int32_t input_zero_point;
+    int32_t multiplier;
+    int32_t remainder_mask;
+    int32_t remainder_threshold;
+    uint32_t shift;
+    int32_t output_min_less_zero_point;
+    int32_t output_max_less_zero_point;
+    int32_t output_zero_point;
+  } scalar;
+#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+  struct {
+    int16_t kernel_zero_point;
+    int16_t input_zero_point;
+    int32_t multiplier;
+    int32_t right_shift;
+    int16_t output_zero_point;
+    uint8_t output_max;
+    uint8_t output_min;
+  } neon;
+#endif /* CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 */
+#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+  struct {
+    XNN_ALIGN(16) int16_t kernel_zero_point[8];
+    XNN_ALIGN(16) int16_t input_zero_point[8];
+    XNN_ALIGN(16) uint32_t multiplier[4];
+    XNN_ALIGN(16) uint64_t rounding[2];
+    XNN_ALIGN(16) int32_t remainder_mask[4];
+    XNN_ALIGN(16) int32_t remainder_threshold[4];
+    XNN_ALIGN(16) uint64_t shift[2];
+    XNN_ALIGN(16) int16_t output_zero_point[8];
+    XNN_ALIGN(16) uint8_t output_max[16];
+    XNN_ALIGN(16) uint8_t output_min[16];
+  } sse2;
+#endif /* CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 */
+};
+
+union xnn_q8_add_params {
+  struct {
+    int32_t zero_point_product;
+    uint32_t a_multiplier;
+    uint32_t b_multiplier;
+    uint32_t shift;
+    int32_t remainder_mask;
+    int32_t remainder_threshold;
+    int32_t y_zero_point;
+    int32_t y_max;
+    int32_t y_min;
+  } scalar;
+#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+  struct {
+    uint8_t a_zero_point;
+    uint8_t b_zero_point;
+    int16_t y_zero_point;
+    int32_t a_multiplier;
+    int32_t b_multiplier;
+    int32_t right_shift;
+    uint8_t y_max;
+    uint8_t y_min;
+  } neon;
+#endif
+#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+  struct {
+    XNN_ALIGN(16) int32_t zero_point_product[4];
+    XNN_ALIGN(16) uint16_t a_multiplier_lo[8];
+    XNN_ALIGN(16) uint16_t a_multiplier_hi[8];
+    XNN_ALIGN(16) uint16_t b_multiplier_lo[8];
+    XNN_ALIGN(16) uint16_t b_multiplier_hi[8];
+    XNN_ALIGN(16) int32_t remainder_mask[4];
+    XNN_ALIGN(16) int32_t remainder_threshold[4];
+    XNN_ALIGN(16) int16_t y_zero_point[8];
+    XNN_ALIGN(16) uint8_t y_max[16];
+    XNN_ALIGN(16) uint8_t y_min[16];
+    uint32_t shift;
+    uint32_t a_multiplier;
+    uint32_t b_multiplier;
+  } sse2;
+#endif
+};
+
+union xnn_q8_avgpool_params {
+  struct {
+    int32_t bias;
+    int32_t multiplier;
+    int64_t rounding;
+    uint32_t right_shift;
+    int32_t output_min_less_zero_point;
+    int32_t output_max_less_zero_point;
+    int32_t output_zero_point;
+  } scalar;
+#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+  struct {
+    int32_t bias;
+    int32_t multiplier;
+    int64_t left_shift;
+    int16_t output_zero_point;
+    uint8_t output_max;
+    uint8_t output_min;
+  } neon;
+#endif /* CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 */
+#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+  struct {
+    XNN_ALIGN(16) int32_t bias[4];
+    XNN_ALIGN(16) uint32_t multiplier[4];
+    XNN_ALIGN(16) uint64_t rounding[2];
+    XNN_ALIGN(16) uint64_t right_shift[2];
+    XNN_ALIGN(16) int16_t output_zero_point[8];
+    XNN_ALIGN(16) uint8_t output_max[16];
+    XNN_ALIGN(16) uint8_t output_min[16];
+  } sse2;
+#endif
+};
+
+union xnn_fp32_requantization_params {
+  struct {
+    float scale;
+    float min_less_zero_point;
+    float max_less_zero_point;
+    float magic;
+    int32_t magic_less_zero_point;
+  } scalar;
+  struct {
+    float scale;
+    float max;
+    float min;
+    float magic;
+    int32_t magic_less_zero_point;
+  } neon;
+  struct {
+    float scale;
+    int16_t zero_point;
+    uint8_t max;
+    uint8_t min;
+  } neonv8;
+  struct {
+    XNN_ALIGN(16) float scale[4];
+    XNN_ALIGN(16) int16_t zero_point[8];
+    XNN_ALIGN(16) uint8_t max[16];
+    XNN_ALIGN(16) uint8_t min[16];
+  } sse2;
+  struct {
+    XNN_ALIGN(16) float scale[4];
+    XNN_ALIGN(16) float min_less_zero_point[4];
+    XNN_ALIGN(16) float max_less_zero_point[4];
+    XNN_ALIGN(16) float magic[4];
+    XNN_ALIGN(16) int32_t magic_less_zero_point[4];
+  } psimd;
+};
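+
+// Illustrative note: the `magic` fields presumably implement the classic
+// float-to-integer rounding trick, where adding a large constant (on the
+// order of 2^23) to a float in a known range pins its exponent so that the
+// rounded integer can be read from the mantissa bits; `magic_less_zero_point`
+// would then fold the output zero point into the final integer subtraction.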
+
+union xnn_precise_requantization_params {
+  struct {
+    uint32_t multiplier;
+    uint32_t rounding_lo;
+    uint32_t rounding_hi;
+    uint32_t shift_less_32;
+    int32_t min_less_zero_point;
+    int32_t max_less_zero_point;
+    int32_t zero_point;
+  } scalar;
+  struct {
+    int32_t multiplier;
+    int32_t right_shift;
+    int16_t zero_point;
+    uint8_t max;
+    uint8_t min;
+  } neon;
+  struct {
+    XNN_ALIGN(16) uint32_t multiplier[4];
+    XNN_ALIGN(16) uint64_t rounding[2];
+    XNN_ALIGN(16) uint32_t shift[4];
+    XNN_ALIGN(16) int16_t zero_point[8];
+    XNN_ALIGN(16) uint8_t max[16];
+    XNN_ALIGN(16) uint8_t min[16];
+  } sse2;
+};
+
+union xnn_q31_requantization_params {
+  struct {
+    int32_t multiplier;
+    int32_t remainder_mask;
+    int32_t remainder_threshold;
+    uint32_t shift;
+    int32_t min_less_zero_point;
+    int32_t max_less_zero_point;
+    int32_t zero_point;
+  } scalar;
+#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+  struct {
+    int32_t multiplier;
+    int32_t right_shift;
+    int16_t zero_point;
+    uint8_t max;
+    uint8_t min;
+  } neon;
+#endif /* CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 */
+#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+  struct {
+    XNN_ALIGN(16) uint32_t multiplier[4];
+    XNN_ALIGN(16) uint64_t rounding[2];
+    XNN_ALIGN(16) int32_t remainder_mask[4];
+    XNN_ALIGN(16) int32_t remainder_threshold[4];
+    XNN_ALIGN(16) uint64_t shift[2];
+    XNN_ALIGN(16) int16_t zero_point[8];
+    XNN_ALIGN(16) uint8_t max[16];
+    XNN_ALIGN(16) uint8_t min[16];
+  } sse2;
+#endif /* CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 */
+};
+
+union xnn_requantization_params {
+  union xnn_precise_requantization_params precise;
+  union xnn_fp32_requantization_params fp32;
+  union xnn_q31_requantization_params q31;
+};
+
+typedef void (*xnn_ppmm_ukernel_function)(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const void* a,
+    const void* w,
+    void* c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const void* params);
+
+typedef void (*xnn_f32_ppmm_ukernel_function)(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const float* a,
+    const float* w,
+    float* c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_f32_output_params* params);
+
+typedef void (*xnn_f16_ppmm_ukernel_function)(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const void* a,
+    const void* w,
+    void* c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const struct xnn_f16_output_params* params);
+
+typedef void (*xnn_gemm_ukernel_function)(
+    size_t mr,
+    size_t nr,
+    size_t k,
+    const void* a,
+    size_t a_stride,
+    const void* w,
+    void* c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const void* params);
+
+typedef void (*xnn_f32_gemm_ukernel_function)(
+    size_t mr,
+    size_t nr,
+    size_t k,
+    const float* a,
+    size_t a_stride,
+    const float* w,
+    float* c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_f32_output_params* params);
+
+typedef void (*xnn_f32_gemminc_ukernel_function)(
+    size_t mr,
+    size_t nr,
+    size_t k,
+    const float* a,
+    size_t a_stride,
+    const float* w,
+    float* c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const float* acc,
+    const union xnn_f32_output_params* params);
+
+typedef void (*xnn_f16_gemm_ukernel_function)(
+    size_t mr,
+    size_t nr,
+    size_t k,
+    const void* a,
+    size_t a_stride,
+    const void* w,
+    void* c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const struct xnn_f16_output_params* params);
+
+typedef void (*xnn_q8_gemm_ukernel_function)(
+    size_t mr,
+    size_t nr,
+    size_t k,
+    const uint8_t* a,
+    size_t a_stride,
+    const void* w,
+    uint8_t* c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_q8_gemm_params* params);
+
+typedef void (*xnn_igemm_ukernel_function)(
+    size_t mr,
+    size_t nr,
+    size_t kc,
+    size_t ks,
+    const void** a,
+    const void* w,
+    void* c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const void* zero,
+    const void* params);
+
+typedef void (*xnn_f32_igemm_ukernel_function)(
+    size_t mr,
+    size_t nr,
+    size_t kc,
+    size_t ks,
+    const float** a,
+    const float* w,
+    float* c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const float* zero,
+    const union xnn_f32_output_params* params);
+
+typedef void (*xnn_q8_igemm_ukernel_function)(
+    size_t mr,
+    size_t nr,
+    size_t kc,
+    size_t ks,
+    const uint8_t** a,
+    const void* w,
+    uint8_t* c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const uint8_t* zero,
+    const union xnn_q8_gemm_params* params);
+
+typedef void (*xnn_conv_hwc_ukernel_function)(
+    size_t input_height,
+    size_t input_width,
+    size_t output_y_start,
+    size_t output_y_end,
+    const void* input,
+    const void* zero,
+    const void* weights,
+    void* output,
+    size_t input_padding_top,
+    size_t output_channels,
+    size_t output_height_stride,
+    size_t output_width_stride,
+    const void* params);
+
+typedef void (*xnn_f32_conv_hwc_ukernel_function)(
+    size_t input_height,
+    size_t input_width,
+    size_t output_y_start,
+    size_t output_y_end,
+    const float* input,
+    const float* zero,
+    const float* weights,
+    float* output,
+    size_t input_padding_top,
+    size_t output_channels,
+    size_t output_height_stride,
+    size_t output_width_stride,
+    const union xnn_f32_output_params* params);
+
+typedef void (*xnn_conv_hwc2spchw_ukernel_function)(
+    size_t input_height,
+    size_t input_width,
+    size_t output_y_start,
+    size_t output_y_end,
+    const void* input,
+    const void* zero,
+    const void* weights,
+    void* output,
+    size_t input_padding_top,
+    size_t output_channels,
+    size_t output_height_stride,
+    size_t output_channel_stride,
+    const void* params);
+
+typedef void (*xnn_f32_conv_hwc2spchw_ukernel_function)(
+    size_t input_height,
+    size_t input_width,
+    size_t output_y_start,
+    size_t output_y_end,
+    const float* input,
+    const float* zero,
+    const float* weights,
+    float* output,
+    size_t input_padding_top,
+    size_t output_channels,
+    size_t output_height_stride,
+    size_t output_channel_stride,
+    const union xnn_f32_output_params* params);
+
+typedef void (*xnn_spmm_ukernel_function)(
+    uint32_t m,
+    uint32_t n,
+    const void* a,
+    const void* w,
+    const int32_t* dmap,
+    const uint32_t* nmap,
+    void* c,
+    const void* params);
+
+typedef void (*xnn_f32_spmm_ukernel_function)(
+    uint32_t m,
+    uint32_t n,
+    const float* a,
+    const float* w,
+    const int32_t* dmap,
+    const uint32_t* nmap,
+    float* c,
+    const union xnn_f32_output_params* params);
+
+typedef void (*xnn_packx_ukernel_function)(
+    size_t m,
+    size_t k,
+    const void* x,
+    size_t x_stride,
+    void* y);
+
+typedef void (*xnn_x32_packx_ukernel_function)(
+    size_t m,
+    size_t k,
+    const uint32_t* x,
+    size_t x_stride,
+    uint32_t* y);
+
+typedef void (*xnn_pad_ukernel_function)(
+    size_t m,
+    size_t n,
+    size_t l,
+    size_t r,
+    uint32_t c,
+    const void* x,
+    size_t x_stride,
+    void* y,
+    size_t y_stride);
+
+typedef void (*xnn_unpool_ukernel_function)(
+    size_t p,
+    size_t c,
+    uint32_t f,
+    const void* input,
+    const uint32_t* index,
+    void** output);
+
+typedef void (*xnn_x32_unpool_ukernel_function)(
+    size_t p,
+    size_t c,
+    uint32_t f,
+    const uint32_t* input,
+    const uint32_t* index,
+    uint32_t** output);
+
+typedef void (*xnn_zipc_ukernel_function)(
+    size_t n,
+    const void* x,
+    void* y);
+
+typedef void (*xnn_x8_zipc_ukernel_function)(
+    size_t n,
+    const uint8_t* x,
+    uint8_t* y);
+
+typedef void (*xnn_x32_zipc_ukernel_function)(
+    size_t n,
+    const uint32_t* x,
+    uint32_t* y);
+
+typedef void (*xnn_zipv_ukernel_function)(
+    size_t n,
+    size_t m,
+    const void* x,
+    void* y);
+
+typedef void (*xnn_x8_zipv_ukernel_function)(
+    size_t n,
+    size_t m,
+    const uint8_t* x,
+    uint8_t* y);
+
+typedef void (*xnn_x32_zipv_ukernel_function)(
+    size_t n,
+    size_t m,
+    const uint32_t* x,
+    uint32_t* y);
+
+typedef void (*xnn_x8_lut_ukernel_function)(
+    size_t n,
+    const uint8_t* x,
+    const uint8_t* t,
+    uint8_t* y);
+
+typedef void (*xnn_dwconv_spchw_ukernel_function)(
+    size_t output_height,
+    size_t input_width,
+    const void* input,
+    const void* weights,
+    void* output,
+    size_t input_tuple_stride,
+    size_t output_tuple_stride,
+    size_t input_height_stride,
+    size_t output_height_stride,
+    const void* params);
+
+typedef void (*xnn_f32_dwconv_spchw_ukernel_function)(
+    size_t output_height,
+    size_t input_width,
+    const float* input,
+    const float* weights,
+    float* output,
+    size_t input_tuple_stride,
+    size_t output_tuple_stride,
+    size_t input_height_stride,
+    size_t output_height_stride,
+    const union xnn_f32_spchw_params* params);
+
+typedef void (*xnn_dwconv_up_ukernel_function)(
+    size_t channels,
+    size_t output_width,
+    const void** input,
+    const void* weights,
+    void* output,
+    size_t input_stride,
+    size_t output_increment,
+    const void* params);
+
+typedef void (*xnn_f32_dwconv_up_ukernel_function)(
+    size_t channels,
+    size_t output_width,
+    const float** input,
+    const float* weights,
+    float* output,
+    size_t input_stride,
+    size_t output_increment,
+    const union xnn_f32_output_params* params);
+
+typedef void (*xnn_q8_dwconv_up_ukernel_function)(
+    size_t channels,
+    size_t output_width,
+    const uint8_t** input,
+    const void* weights,
+    uint8_t* output,
+    size_t input_stride,
+    size_t output_increment,
+    const union xnn_q8_gemm_params* params);
+
+typedef void (*xnn_dwconv_mp_ukernel_function)(
+    size_t channels,
+    size_t output_width,
+    const void** input,
+    const void* weights,
+    void* buffer,
+    void* output,
+    size_t input_stride,
+    size_t output_increment,
+    const void* params);
+
+typedef void (*xnn_gavgpool_up_ukernel_function)(
+    size_t m,
+    size_t n,
+    const void* x,
+    size_t x_stride,
+    const void* zero,
+    void* y,
+    const void* params);
+
+typedef void (*xnn_f32_gavgpool_up_ukernel_function)(
+    size_t m,
+    size_t n,
+    const float* x,
+    size_t x_stride,
+    const float* zero,
+    float* y,
+    const union xnn_f32_avgpool_params* params);
+
+typedef void (*xnn_gavgpool_spchw_ukernel_function)(
+    size_t elements,
+    size_t channels,
+    const float* input,
+    float* output,
+    const void* params);
+
+typedef void (*xnn_f32_gavgpool_spchw_ukernel_function)(
+    size_t elements,
+    size_t channels,
+    const float* input,
+    float* output,
+    const union xnn_f32_gavgpool_params* params);
+
+typedef void (*xnn_q8_gavgpool_up_ukernel_function)(
+    size_t m,
+    size_t n,
+    const uint8_t* x,
+    size_t x_stride,
+    const uint8_t* zero,
+    uint8_t* y,
+    const union xnn_q8_avgpool_params* params);
+
+typedef void (*xnn_gavgpool_mp_ukernel_function)(
+    size_t m,
+    size_t n,
+    const void* x,
+    size_t x_stride,
+    const void* zero,
+    void* buffer,
+    void* y,
+    const void* params);
+
+typedef void (*xnn_f32_gavgpool_mp_ukernel_function)(
+    size_t m,
+    size_t n,
+    const float* x,
+    size_t x_stride,
+    const float* zero,
+    float* buffer,
+    float* y,
+    const union xnn_f32_avgpool_params* params);
+
+typedef void (*xnn_q8_gavgpool_mp_ukernel_function)(
+    size_t m,
+    size_t n,
+    const uint8_t* x,
+    size_t x_stride,
+    const uint8_t* zero,
+    int32_t* buffer,
+    uint8_t* y,
+    const union xnn_q8_avgpool_params* params);
+
+typedef void (*xnn_avgpool_up_ukernel_function)(
+    size_t n,
+    size_t ks,
+    size_t kc,
+    const void** x,
+    const void* zero,
+    void* y,
+    size_t x_increment,
+    size_t y_increment,
+    const void* params);
+
+typedef void (*xnn_f32_avgpool_up_ukernel_function)(
+    size_t n,
+    size_t ks,
+    size_t kc,
+    const float** x,
+    const float* zero,
+    float* y,
+    size_t x_increment,
+    size_t y_increment,
+    const union xnn_f32_avgpool_params* params);
+
+typedef void (*xnn_q8_avgpool_up_ukernel_function)(
+    size_t n,
+    size_t ks,
+    size_t kc,
+    const uint8_t** x,
+    const uint8_t* zero,
+    uint8_t* y,
+    size_t x_increment,
+    size_t y_increment,
+    const union xnn_q8_avgpool_params* params);
+
+typedef void (*xnn_avgpool_mp_ukernel_function)(
+    size_t n,
+    size_t ks,
+    size_t kc,
+    const void** x,
+    const void* zero,
+    void* buffer,
+    void* y,
+    size_t x_increment,
+    size_t y_increment,
+    const void* params);
+
+typedef void (*xnn_f32_avgpool_mp_ukernel_function)(
+    size_t n,
+    size_t ks,
+    size_t kc,
+    const float** x,
+    const float* zero,
+    float* buffer,
+    float* y,
+    size_t x_increment,
+    size_t y_increment,
+    const union xnn_f32_avgpool_params* params);
+
+typedef void (*xnn_q8_avgpool_mp_ukernel_function)(
+    size_t n,
+    size_t ks,
+    size_t kc,
+    const uint8_t** x,
+    const uint8_t* zero,
+    int32_t* buffer,
+    uint8_t* y,
+    size_t x_increment,
+    size_t y_increment,
+    const union xnn_q8_avgpool_params* params);
+
+typedef void (*xnn_pavgpool_up_ukernel_function)(
+    size_t n,
+    size_t ks,
+    size_t kc,
+    const void** x,
+    const void* zero,
+    const void* multiplier,
+    void* y,
+    size_t x_increment,
+    size_t y_increment,
+    const void* params);
+
+typedef void (*xnn_f32_pavgpool_up_ukernel_function)(
+    size_t n,
+    size_t ks,
+    size_t kc,
+    const float** x,
+    const float* zero,
+    const float* multiplier,
+    float* y,
+    size_t x_increment,
+    size_t y_increment,
+    const union xnn_f32_output_params* params);
+
+typedef void (*xnn_pavgpool_mp_ukernel_function)(
+    size_t n,
+    size_t ks,
+    size_t kc,
+    const void** x,
+    const void* zero,
+    const void* multiplier,
+    void* buffer,
+    void* y,
+    size_t x_increment,
+    size_t y_increment,
+    const void* params);
+
+typedef void (*xnn_f32_pavgpool_mp_ukernel_function)(
+    size_t n,
+    size_t ks,
+    size_t kc,
+    const float** x,
+    const float* zero,
+    const float* multiplier,
+    float* buffer,
+    float* y,
+    size_t x_increment,
+    size_t y_increment,
+    const union xnn_f32_output_params* params);
+
+typedef void (*xnn_maxpool_ukernel_function)(
+    size_t n,
+    size_t ks,
+    size_t kc,
+    const void** x,
+    void* y,
+    size_t x_increment,
+    size_t y_increment,
+    const void* params);
+
+typedef void (*xnn_f32_maxpool_ukernel_function)(
+    size_t n,
+    size_t ks,
+    size_t kc,
+    const float** x,
+    float* y,
+    size_t x_increment,
+    size_t y_increment,
+    const union xnn_f32_output_params* params);
+
+typedef void (*xnn_u8_maxpool_ukernel_function)(
+    size_t n,
+    size_t ks,
+    size_t kc,
+    const uint8_t** x,
+    uint8_t* y,
+    size_t x_increment,
+    size_t y_increment,
+    const union xnn_u8_output_params* params);
+
+typedef void (*xnn_argmaxpool_up_ukernel_function)(
+    size_t n,
+    size_t ks,
+    size_t kc,
+    const void** x,
+    void* y,
+    uint32_t* i,
+    size_t x_increment,
+    size_t y_increment,
+    const void* params);
+
+typedef void (*xnn_f32_argmaxpool_up_ukernel_function)(
+    size_t n,
+    size_t ks,
+    size_t kc,
+    const float** x,
+    float* y,
+    uint32_t* i,
+    size_t x_increment,
+    size_t y_increment,
+    const union xnn_f32_output_params* params);
+
+typedef void (*xnn_argmaxpool_mp_ukernel_function)(
+    size_t n,
+    size_t ks,
+    size_t kc,
+    const void** x,
+    void* ab,
+    uint32_t* ib,
+    void* y,
+    uint32_t* i,
+    size_t x_increment,
+    size_t y_increment,
+    const void* params);
+
+typedef void (*xnn_f32_argmaxpool_mp_ukernel_function)(
+    size_t n,
+    size_t ks,
+    size_t kc,
+    const float** x,
+    float* ab,
+    uint32_t* ib,
+    float* y,
+    uint32_t* i,
+    size_t x_increment,
+    size_t y_increment,
+    const union xnn_f32_output_params* params);
+
+typedef void (*xnn_univector_ukernel_function)(
+    size_t n,
+    const void* x,
+    void* y,
+    const void* params);
+
+typedef void (*xnn_f32_clamp_ukernel_function)(
+    size_t n,
+    const float* x,
+    float* y,
+    const union xnn_f32_output_params* params);
+
+typedef void (*xnn_u8_clamp_ukernel_function)(
+    size_t n,
+    const uint8_t* x,
+    uint8_t* y,
+    const union xnn_u8_output_params* params);
+
+typedef void (*xnn_f32_hswish_ukernel_function)(
+    size_t n,
+    const float* x,
+    float* y,
+    const union xnn_f32_hswish_params* params);
+
+typedef void (*xnn_rmax_ukernel_function)(
+    size_t n,
+    const void* x,
+    void* y);
+
+typedef void (*xnn_u8_rmax_ukernel_function)(
+    size_t n,
+    const uint8_t* x,
+    uint8_t* y);
+
+typedef void (*xnn_f32_rmax_ukernel_function)(
+    size_t n,
+    const float* x,
+    float* y);
+
+typedef void (*xnn_u8_lut32norm_ukernel_function)(
+    size_t n,
+    const uint8_t* x,
+    const uint32_t* t,
+    uint8_t* y);
+
+typedef void (*xnn_vadd_ukernel_function)(
+    size_t n,
+    const void* a,
+    const void* b,
+    void* y,
+    const void* params);
+
+typedef void (*xnn_f32_vadd_ukernel_function)(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_output_params* params);
+
+typedef void (*xnn_q8_vadd_ukernel_function)(
+    size_t n,
+    const uint8_t* a,
+    const uint8_t* b,
+    uint8_t* y,
+    const union xnn_q8_add_params* params);
+
+typedef void (*xnn_vmul_ukernel_function)(
+    size_t n,
+    const void* a,
+    const void* b,
+    void* y,
+    const void* params);
+
+typedef void (*xnn_f32_vmul_ukernel_function)(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_output_params* params);
+
+typedef void (*xnn_vsub_ukernel_function)(
+    size_t n,
+    const void* a,
+    const void* b,
+    void* y,
+    const void* params);
+
+typedef void (*xnn_f32_vsub_ukernel_function)(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_output_params* params);
+
+typedef void (*xnn_vmulcaddc_ukernel_function)(
+    size_t m,
+    size_t c,
+    const void* x,
+    size_t x_stride,
+    const void* w,
+    void* y,
+    size_t y_stride,
+    const void* params);
+
+typedef void (*xnn_f32_vmulcaddc_ukernel_function)(
+    size_t m,
+    size_t c,
+    const float* x,
+    size_t x_stride,
+    const float* w,
+    float* y,
+    size_t y_stride,
+    const union xnn_f32_output_params* params);
+
+typedef void (*xnn_prelu_ukernel_function)(
+    size_t mr,
+    size_t n,
+    const void* x,
+    size_t x_stride,
+    const void* w,
+    void* y,
+    size_t y_stride,
+    const void* params);
+
+typedef void (*xnn_f32_prelu_ukernel_function)(
+    size_t mr,
+    size_t n,
+    const float* x,
+    size_t x_stride,
+    const float* w,
+    float* y,
+    size_t y_stride,
+    const union xnn_f32_output_params* params);
+
+
+struct gemm_parameters {
+  xnn_gemm_ukernel_function gemm;
+  xnn_igemm_ukernel_function igemm;
+  /* Optional GEMM and IGEMM micro-kernels with MR=1 and the same NR and KR parameters */
+  xnn_gemm_ukernel_function gemm1;
+  xnn_igemm_ukernel_function igemm1;
+  uint8_t mr;
+  uint8_t nr;
+  uint8_t log2_kr;
+  uint8_t log2_sr;
+};
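+
+// Illustrative note: mr and nr presumably describe the output tile handled by
+// one micro-kernel call, so an M x N output takes divide_round_up(M, mr) x
+// divide_round_up(N, nr) calls, with the optional gemm1/igemm1 variants used
+// when only a single output row remains.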
+
+struct spmm_parameters {
+  xnn_spmm_ukernel_function ukernel;
+  // Number of M-dimension elements in a tile.
+  // Corresponds to a block of pixels in a 1x1 Convolution and to a block of the batch dimension in a Fully Connected operator.
+  uint8_t mr;
+  // Number of N-dimension elements in a tile.
+  // Corresponds to a block of output channels/features in a 1x1 Convolution or Fully Connected operator.
+  uint8_t nr;
+};
+
+struct hwc2spchw_dconv_parameters {
+  xnn_conv_hwc2spchw_ukernel_function ukernel_with_symm_padding;
+  // Number of output channels in a tile.
+  // This parameter must be passed as-is to the weight packing function.
+  uint8_t output_channel_tile;
+  // Number of output height pixels in a tile.
+  // For best efficiency, the micro-kernel must produce a multiple of this number of rows in each call.
+  uint8_t output_height_tile;
+  // Number of output width pixels in a tile.
+  uint8_t output_width_tile;
+};
+
+struct spchw_dwconv_parameters {
+  xnn_dwconv_spchw_ukernel_function ukernel;
+  // Number of input width pixels in a tile.
+  uint8_t input_width_tile;
+  // Number of output width pixels in a tile.
+  uint8_t output_width_tile;
+  // Number of output height pixels in a tile.
+  // For best efficiency, the micro-kernel must produce a multiple of this number of rows in each call.
+  uint8_t output_height_tile;
+};
+
+struct spchw_gavgpool_parameters {
+  xnn_gavgpool_spchw_ukernel_function ukernel;
+  // Number of channels in a tile.
+  // For best efficiency, the micro-kernel must process a multiple of this number of channels in each call.
+  uint8_t channel_tile;
+};
+
+struct dwconv_parameters {
+  union {
+    xnn_dwconv_up_ukernel_function up;
+    xnn_dwconv_mp_ukernel_function mp;
+  };
+  uint8_t cr;
+  uint8_t mr;
+  uint8_t qr;
+};
+
+struct gavgpool_parameters {
+  xnn_gavgpool_up_ukernel_function up;
+  xnn_gavgpool_mp_ukernel_function mp;
+  uint8_t mr;
+};
+
+struct avgpool_parameters {
+  xnn_avgpool_up_ukernel_function up;
+  xnn_avgpool_mp_ukernel_function mp;
+  uint8_t mr;
+  uint8_t qr;
+};
+
+struct pavgpool_parameters {
+  xnn_pavgpool_up_ukernel_function up;
+  xnn_pavgpool_mp_ukernel_function mp;
+  uint8_t mr;
+  uint8_t qr;
+};
+
+struct argmaxpool_parameters {
+  union {
+    xnn_argmaxpool_up_ukernel_function up;
+    xnn_argmaxpool_mp_ukernel_function mp;
+  };
+  uint8_t mr;
+  uint8_t qr;
+};
+
+struct maxpool_parameters {
+  xnn_maxpool_ukernel_function ukernel;
+  uint8_t mr;
+  uint8_t qr;
+};
+
+struct zip_parameters {
+  xnn_zipc_ukernel_function x2;
+  xnn_zipc_ukernel_function x3;
+  xnn_zipc_ukernel_function x4;
+  xnn_zipv_ukernel_function xm;
+};
+
+struct prelu_parameters {
+  xnn_prelu_ukernel_function ukernel;
+  uint8_t mr;
+};
+
+struct pad_parameters {
+  xnn_pad_ukernel_function ukernel;
+  uint8_t mr;
+};
+
+struct vmulcaddc_parameters {
+  xnn_vmulcaddc_ukernel_function ukernel;
+  uint8_t cr;
+  uint8_t mr;
+};
+
+#define XNN_MAX_Q8_DWCONV_UKERNELS 1
+#define XNN_MAX_F32_DWCONV_UKERNELS 3
+#define XNN_MAX_F32_ARGMAXPOOL_UKERNELS 3
+
+struct xnn_parameters {
+  bool initialized;
+  struct {
+    struct gemm_parameters gemm;
+    struct dwconv_parameters dwconv[XNN_MAX_Q8_DWCONV_UKERNELS];
+    struct avgpool_parameters avgpool;
+    struct gavgpool_parameters gavgpool;
+    xnn_vadd_ukernel_function vadd;
+  } q8;
+  struct {
+    struct maxpool_parameters maxpool;
+    xnn_univector_ukernel_function clamp;
+    xnn_u8_lut32norm_ukernel_function lut32norm;
+    xnn_u8_rmax_ukernel_function rmax;
+  } u8;
+  struct {
+    xnn_x8_lut_ukernel_function lut;
+    struct zip_parameters zip;
+  } x8;
+  struct {
+    struct gemm_parameters gemm;
+    struct gemm_parameters gemm2;
+    struct dwconv_parameters dwconv[XNN_MAX_F32_DWCONV_UKERNELS];
+    struct avgpool_parameters avgpool;
+    struct pavgpool_parameters pavgpool;
+    struct gavgpool_parameters gavgpool;
+    struct maxpool_parameters maxpool;
+    struct argmaxpool_parameters argmaxpool[XNN_MAX_F32_ARGMAXPOOL_UKERNELS];
+    xnn_univector_ukernel_function clamp;
+    xnn_univector_ukernel_function hswish;
+    struct prelu_parameters prelu;
+    xnn_vadd_ukernel_function vadd;
+    struct vmulcaddc_parameters vmulcaddc;
+    // Sparse Matrix-Dense Matrix Multiplication (NR=1 block).
+    struct spmm_parameters spmm;
+    // Sparse Matrix-Dense Matrix Multiplication (NR=2 block).
+    struct spmm_parameters spmm2;
+    // Sparse Matrix-Dense Matrix Multiplication (NR=4 block).
+    struct spmm_parameters spmm4;
+    // Direct 3x3 stride-2 Convolution with 3 input channels and HWC->SpCHW layout conversion.
+    struct hwc2spchw_dconv_parameters hwc2spchw_dconv3x3c3s2;
+    // Direct 3x3 stride-1 depthwise Convolution with padding 1 on left and right in SpCHW layout.
+    struct spchw_dwconv_parameters spchw_dwconv3x3;
+    // Direct 3x3 stride-2 depthwise Convolution with padding 1 on left and right in SpCHW layout.
+    struct spchw_dwconv_parameters spchw_dwconv3x3s2;
+    // Global Average Pooling in SpCHW layout.
+    struct spchw_gavgpool_parameters spchw_gavgpool;
+  } f32;
+  struct {
+    struct pad_parameters pad;
+    xnn_unpool_ukernel_function unpool;
+    struct zip_parameters zip;
+  } x32;
+};
+
+extern XNN_INTERNAL struct xnn_parameters xnn_params;
diff --git a/src/xnnpack/pavgpool.h b/src/xnnpack/pavgpool.h
new file mode 100644
index 0000000..f124519
--- /dev/null
+++ b/src/xnnpack/pavgpool.h
@@ -0,0 +1,60 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <xnnpack/params.h>
+#include <xnnpack/common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#define DECLARE_F32_PAVGPOOL_MULTIPASS_UKERNEL_FUNCTION(fn_name) \
+  XNN_INTERNAL void fn_name(                                    \
+      size_t n,                                                 \
+      size_t ks,                                                \
+      size_t kc,                                                \
+      const float** x,                                          \
+      const float* zero,                                        \
+      const float* multiplier,                                  \
+      float* buffer,                                            \
+      float* y,                                                 \
+      size_t x_increment,                                       \
+      size_t y_increment,                                       \
+      const union xnn_f32_output_params* params);
+
+DECLARE_F32_PAVGPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_pavgpool_ukernel_mp9p8q__neon)
+DECLARE_F32_PAVGPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_pavgpool_ukernel_mp9p8q__psimd)
+DECLARE_F32_PAVGPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_pavgpool_ukernel_mp9p8q__scalar)
+DECLARE_F32_PAVGPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_pavgpool_ukernel_mp9p8q__sse)
+
+
+#define DECLARE_F32_PAVGPOOL_UNIPASS_UKERNEL_FUNCTION(fn_name) \
+  XNN_INTERNAL void fn_name(                                  \
+      size_t n,                                               \
+      size_t ks,                                              \
+      size_t kc,                                              \
+      const float** x,                                        \
+      const float* zero,                                      \
+      const float* multiplier,                                \
+      float* y,                                               \
+      size_t x_increment,                                     \
+      size_t y_increment,                                     \
+      const union xnn_f32_output_params* params);
+
+DECLARE_F32_PAVGPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_pavgpool_ukernel_up9__neon)
+DECLARE_F32_PAVGPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_pavgpool_ukernel_up9__psimd)
+DECLARE_F32_PAVGPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_pavgpool_ukernel_up9__scalar)
+DECLARE_F32_PAVGPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_pavgpool_ukernel_up9__sse)
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
diff --git a/src/xnnpack/ppmm.h b/src/xnnpack/ppmm.h
new file mode 100644
index 0000000..1bf6941
--- /dev/null
+++ b/src/xnnpack/ppmm.h
@@ -0,0 +1,45 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <xnnpack/params.h>
+#include <xnnpack/common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#define DECLARE_F32_PPMM_UKERNEL_FUNCTION(fn_name) \
+  XNN_INTERNAL void fn_name(                       \
+      size_t mr,                                   \
+      size_t nc,                                   \
+      size_t kc,                                   \
+      const float* a,                              \
+      const float* w,                              \
+      float* c,                                    \
+      size_t cm_stride,                            \
+      size_t cn_stride,                            \
+      const union xnn_f32_output_params* params);
+
+DECLARE_F32_PPMM_UKERNEL_FUNCTION(xnn_f32_ppmm_ukernel_2x4__scalar)
+DECLARE_F32_PPMM_UKERNEL_FUNCTION(xnn_f32_ppmm_ukernel_3x3__scalar)
+DECLARE_F32_PPMM_UKERNEL_FUNCTION(xnn_f32_ppmm_ukernel_4x2__scalar)
+DECLARE_F32_PPMM_UKERNEL_FUNCTION(xnn_f32_ppmm_ukernel_4x4__scalar)
+DECLARE_F32_PPMM_UKERNEL_FUNCTION(xnn_f32_ppmm_ukernel_4x8__neon)
+DECLARE_F32_PPMM_UKERNEL_FUNCTION(xnn_f32_ppmm_ukernel_4x8__neonfma)
+DECLARE_F32_PPMM_UKERNEL_FUNCTION(xnn_f32_ppmm_ukernel_4x8__psimd)
+DECLARE_F32_PPMM_UKERNEL_FUNCTION(xnn_f32_ppmm_ukernel_4x8__sse)
+DECLARE_F32_PPMM_UKERNEL_FUNCTION(xnn_f32_ppmm_ukernel_8x8__neon)
+DECLARE_F32_PPMM_UKERNEL_FUNCTION(xnn_f32_ppmm_ukernel_8x8__neonfma)
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
diff --git a/src/xnnpack/prelu.h b/src/xnnpack/prelu.h
new file mode 100644
index 0000000..2a882a7
--- /dev/null
+++ b/src/xnnpack/prelu.h
@@ -0,0 +1,38 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <xnnpack/params.h>
+#include <xnnpack/common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#define DECLARE_F32_PRELU_UKERNEL_FUNCTION(fn_name)            \
+  XNN_INTERNAL void fn_name(                                   \
+      size_t mr,                                               \
+      size_t n,                                                \
+      const float* x,                                          \
+      size_t x_stride,                                         \
+      const float* w,                                          \
+      float* y,                                                \
+      size_t y_stride,                                         \
+      const union xnn_f32_output_params* clamping_params);
+
+DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel_x4__psimd)
+DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel_x4__scalar)
+DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel_x4__sse)
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
diff --git a/src/xnnpack/requantization-stubs.h b/src/xnnpack/requantization-stubs.h
new file mode 100644
index 0000000..ee6e86d
--- /dev/null
+++ b/src/xnnpack/requantization-stubs.h
@@ -0,0 +1,69 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stdint.h>
+#include <stddef.h>
+
+#include <xnnpack/params.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef void (*requantization_function)(
+    size_t n,
+    const int32_t* input,
+    float scale,
+    uint8_t zero_point,
+    uint8_t qmin,
+    uint8_t qmax,
+    uint8_t* output);
+
+#define DECLARE_REQUANTIZATION_FUNCTION(fn_name) \
+    void fn_name( \
+        size_t n, \
+        const int32_t* input, \
+        float scale, \
+        uint8_t zero_point, \
+        uint8_t qmin, \
+        uint8_t qmax, \
+        uint8_t* output);
+
+DECLARE_REQUANTIZATION_FUNCTION(xnn_requantize_precise__scalar_unsigned32)
+DECLARE_REQUANTIZATION_FUNCTION(xnn_requantize_precise__scalar_unsigned64)
+DECLARE_REQUANTIZATION_FUNCTION(xnn_requantize_precise__scalar_signed64)
+DECLARE_REQUANTIZATION_FUNCTION(xnn_requantize_precise__sse2)
+DECLARE_REQUANTIZATION_FUNCTION(xnn_requantize_precise__ssse3)
+DECLARE_REQUANTIZATION_FUNCTION(xnn_requantize_precise__sse4)
+DECLARE_REQUANTIZATION_FUNCTION(xnn_requantize_precise__neon)
+DECLARE_REQUANTIZATION_FUNCTION(xnn_requantize_precise__psimd)
+
+DECLARE_REQUANTIZATION_FUNCTION(xnn_requantize_fp32__scalar_lrintf)
+DECLARE_REQUANTIZATION_FUNCTION(xnn_requantize_fp32__scalar_magic)
+DECLARE_REQUANTIZATION_FUNCTION(xnn_requantize_fp32__sse2)
+DECLARE_REQUANTIZATION_FUNCTION(xnn_requantize_fp32__neon)
+DECLARE_REQUANTIZATION_FUNCTION(xnn_requantize_fp32__psimd)
+
+DECLARE_REQUANTIZATION_FUNCTION(xnn_requantize_q31__scalar)
+DECLARE_REQUANTIZATION_FUNCTION(xnn_requantize_q31__sse2)
+DECLARE_REQUANTIZATION_FUNCTION(xnn_requantize_q31__ssse3)
+DECLARE_REQUANTIZATION_FUNCTION(xnn_requantize_q31__sse4)
+DECLARE_REQUANTIZATION_FUNCTION(xnn_requantize_q31__neon)
+DECLARE_REQUANTIZATION_FUNCTION(xnn_requantize_q31__psimd)
+
+DECLARE_REQUANTIZATION_FUNCTION(xnn_requantize_gemmlowp__scalar)
+DECLARE_REQUANTIZATION_FUNCTION(xnn_requantize_gemmlowp__sse2)
+DECLARE_REQUANTIZATION_FUNCTION(xnn_requantize_gemmlowp__ssse3)
+DECLARE_REQUANTIZATION_FUNCTION(xnn_requantize_gemmlowp__sse4)
+DECLARE_REQUANTIZATION_FUNCTION(xnn_requantize_gemmlowp__neon)
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
diff --git a/src/xnnpack/requantization.h b/src/xnnpack/requantization.h
new file mode 100644
index 0000000..bf3e100
--- /dev/null
+++ b/src/xnnpack/requantization.h
@@ -0,0 +1,1307 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+  #include <cstdint>
+  #include <cstddef>
+  #include <cassert>
+  #include <cmath>
+#else
+  #include <stdint.h>
+  #include <stddef.h>
+  #include <assert.h>
+  #include <math.h>
+#endif
+
+#include <fp16.h>
+
+#include <xnnpack/params.h>
+#include <xnnpack/scalar-utils.h>
+
+
+static inline union xnn_q8_gemm_params xnn_compute_scalar_q8_gemm_params(
+  uint8_t input_zero_point,
+  uint8_t kernel_zero_point,
+  float scale,
+  uint8_t output_zero_point,
+  uint8_t output_min,
+  uint8_t output_max)
+{
+  /* Compute requantization parameters */
+  const uint32_t scale_bits = fp32_to_bits(scale);
+
+  /* Multiplier is in [0x40000000, 0x7FFFFF80] range */
+  const int32_t multiplier = (int32_t)(((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
+  assert(multiplier >= INT32_C(0x40000000));
+  assert(multiplier <= INT32_C(0x7FFFFF80));
+
+  /* Shift is in [0, 31] range */
+  const int32_t shift = 127 + 31 - 32 - (scale_bits >> 23);
+  assert(shift >= 0);
+  assert(shift < 32);
+
+  const uint32_t remainder_mask = (UINT32_C(1) << shift) - UINT32_C(1);
+  const uint32_t remainder_threshold = remainder_mask >> 1;
+
+  union xnn_q8_gemm_params params;
+  params.scalar.input_zero_point = (int32_t) (uint32_t) input_zero_point;
+  params.scalar.kernel_zero_point = (int32_t) (uint32_t) kernel_zero_point;
+  params.scalar.multiplier = multiplier;
+  params.scalar.remainder_mask = (int32_t) remainder_mask;
+  params.scalar.remainder_threshold = (int32_t) remainder_threshold;
+  params.scalar.shift = (uint32_t) shift;
+  params.scalar.output_min_less_zero_point =
+    (int32_t) (uint32_t) output_min - (int32_t) (uint32_t) output_zero_point;
+  params.scalar.output_max_less_zero_point =
+    (int32_t) (uint32_t) output_max - (int32_t) (uint32_t) output_zero_point;
+  params.scalar.output_zero_point = (int32_t) (uint32_t) output_zero_point;
+  return params;
+}
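+
+// Worked example (illustrative): for scale = 0.5f, scale_bits = 0x3F000000,
+// the mantissa bits are all zero, so multiplier = 0x00800000 << 7 =
+// 0x40000000 and shift = 127 + 31 - 32 - 126 = 0. The micro-kernel then
+// recovers multiplier * 2^-(31 + shift) = 2^30 / 2^31 = 0.5, i.e. the
+// requested scale exactly, since 0.5 is a power of two.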
+
+static inline union xnn_q8_gemm_params xnn_compute_q8_gemm_params(
+  uint8_t input_zero_point,
+  uint8_t kernel_zero_point,
+  float scale,
+  uint8_t output_zero_point,
+  uint8_t output_min,
+  uint8_t output_max)
+{
+  /* Compute requantization parameters */
+  const uint32_t scale_bits = fp32_to_bits(scale);
+
+  /* Multiplier is in [0x40000000, 0x7FFFFF80] range */
+  const int32_t multiplier = (int32_t)(((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
+  assert(multiplier >= INT32_C(0x40000000));
+  assert(multiplier <= INT32_C(0x7FFFFF80));
+
+  /* Shift is in [0, 31] range */
+  const int32_t shift = 127 + 31 - 32 - (scale_bits >> 23);
+  assert(shift >= 0);
+  assert(shift < 32);
+
+  union xnn_q8_gemm_params params;
+  #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+    const uint32_t remainder_mask = (UINT32_C(1) << shift) - UINT32_C(1);
+    const uint32_t remainder_threshold = remainder_mask >> 1;
+    for (uint32_t i = 0; i < 8; i++) {
+      params.sse2.input_zero_point[i] = (int16_t) (uint16_t) input_zero_point;
+      params.sse2.kernel_zero_point[i] = (int16_t) (uint16_t) kernel_zero_point;
+    }
+    params.sse2.multiplier[0] = multiplier;
+    params.sse2.multiplier[1] = multiplier;
+    params.sse2.multiplier[2] = multiplier;
+    params.sse2.multiplier[3] = multiplier;
+    params.sse2.rounding[0] = UINT64_C(0x40000000);
+    params.sse2.rounding[1] = UINT64_C(0x40000000);
+    params.sse2.remainder_mask[0] = (int32_t) remainder_mask;
+    params.sse2.remainder_mask[1] = (int32_t) remainder_mask;
+    params.sse2.remainder_mask[2] = (int32_t) remainder_mask;
+    params.sse2.remainder_mask[3] = (int32_t) remainder_mask;
+    params.sse2.remainder_threshold[0] = (int32_t) remainder_threshold;
+    params.sse2.remainder_threshold[1] = (int32_t) remainder_threshold;
+    params.sse2.remainder_threshold[2] = (int32_t) remainder_threshold;
+    params.sse2.remainder_threshold[3] = (int32_t) remainder_threshold;
+    params.sse2.shift[0] = (uint64_t) (uint32_t) shift;
+    params.sse2.shift[1] = (uint64_t) (uint32_t) shift;
+    for (uint32_t i = 0; i < 8; i++) {
+      params.sse2.output_zero_point[i] = (int16_t) (uint16_t) output_zero_point;
+    }
+    for (uint32_t i = 0; i < 16; i++) {
+      params.sse2.output_max[i] = output_max;
+      params.sse2.output_min[i] = output_min;
+    }
+  #elif CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+    params.neon.input_zero_point = (int16_t) (uint16_t) input_zero_point;
+    params.neon.kernel_zero_point = (int16_t) (uint16_t) kernel_zero_point;
+    params.neon.multiplier = multiplier;
+    params.neon.right_shift = -shift;
+    params.neon.output_zero_point = (int16_t) (uint16_t) output_zero_point;
+    params.neon.output_max = output_max;
+    params.neon.output_min = output_min;
+  #else
+    const uint32_t remainder_mask = (UINT32_C(1) << shift) - UINT32_C(1);
+    const uint32_t remainder_threshold = remainder_mask >> 1;
+    params.scalar.input_zero_point = (int32_t) (uint32_t) input_zero_point;
+    params.scalar.kernel_zero_point = (int32_t) (uint32_t) kernel_zero_point;
+    params.scalar.multiplier = multiplier;
+    params.scalar.remainder_mask = (int32_t) remainder_mask;
+    params.scalar.remainder_threshold = (int32_t) remainder_threshold;
+    params.scalar.shift = (uint32_t) shift;
+    params.scalar.output_min_less_zero_point =
+      (int32_t) (uint32_t) output_min - (int32_t) (uint32_t) output_zero_point;
+    params.scalar.output_max_less_zero_point =
+      (int32_t) (uint32_t) output_max - (int32_t) (uint32_t) output_zero_point;
+    params.scalar.output_zero_point = (int32_t) (uint32_t) output_zero_point;
+  #endif
+  return params;
+}
+
+static inline union xnn_q8_avgpool_params xnn_compute_q8_avgpool_params(
+  int32_t bias,
+  float scale,
+  uint8_t output_zero_point,
+  uint8_t output_min,
+  uint8_t output_max)
+{
+  /* Compute requantization parameters */
+  assert(scale >= 0x1.0p-32f);
+  assert(scale < 256.0f);
+  const uint32_t scale_bits = fp32_to_bits(scale);
+
+  /* Multiplier is in [0x00800000, 0x00FFFFFF] range */
+  const int32_t multiplier = ((int32_t) scale_bits & INT32_C(0x007FFFFF)) | INT32_C(0x00800000);
+  assert(multiplier >= INT32_C(0x00800000));
+  assert(multiplier <= INT32_C(0x00FFFFFF));
+
+  /* Shift is in [16, 55] range */
+  const int32_t shift = 127 + 23 - (scale_bits >> 23);
+  assert(shift >= 16);
+  assert(shift < 64);
+
+  union xnn_q8_avgpool_params params;
+  #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+    const uint32_t right_shift = (uint32_t) shift;
+    const uint64_t rounding = UINT64_C(1) << (right_shift - 1);
+    params.sse2.bias[0] = bias;
+    params.sse2.bias[1] = bias;
+    params.sse2.bias[2] = bias;
+    params.sse2.bias[3] = bias;
+    params.sse2.multiplier[0] = (uint32_t) multiplier;
+    params.sse2.multiplier[1] = (uint32_t) multiplier;
+    params.sse2.multiplier[2] = (uint32_t) multiplier;
+    params.sse2.multiplier[3] = (uint32_t) multiplier;
+    params.sse2.rounding[0] = rounding;
+    params.sse2.rounding[1] = rounding;
+    params.sse2.right_shift[0] = (uint64_t) right_shift;
+    params.sse2.right_shift[1] = (uint64_t) right_shift;
+    for (uint32_t i = 0; i < 8; i++) {
+      params.sse2.output_zero_point[i] = (int16_t) (uint16_t) output_zero_point;
+    }
+    for (uint32_t i = 0; i < 16; i++) {
+      params.sse2.output_max[i] = output_max;
+      params.sse2.output_min[i] = output_min;
+    }
+  #elif CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+    params.neon.bias = bias;
+    params.neon.multiplier = multiplier;
+    params.neon.left_shift = (int64_t) -shift;
+    params.neon.output_zero_point = (int16_t) (uint16_t) output_zero_point;
+    params.neon.output_max = output_max;
+    params.neon.output_min = output_min;
+  #else
+    const uint32_t right_shift = (uint32_t) shift;
+    const int64_t rounding = INT64_C(1) << (right_shift - 1);
+    params.scalar.bias = bias;
+    params.scalar.multiplier = multiplier;
+    params.scalar.rounding = rounding;
+    params.scalar.right_shift = right_shift;
+    params.scalar.output_min_less_zero_point =
+      (int32_t) (uint32_t) output_min - (int32_t) (uint32_t) output_zero_point;
+    params.scalar.output_max_less_zero_point =
+      (int32_t) (uint32_t) output_max - (int32_t) (uint32_t) output_zero_point;
+    params.scalar.output_zero_point = (int32_t) (uint32_t) output_zero_point;
+  #endif
+  return params;
+}
+
+static inline union xnn_q8_avgpool_params xnn_compute_scalar_q8_avgpool_params(
+  int32_t bias,
+  float scale,
+  uint8_t output_zero_point,
+  uint8_t output_min,
+  uint8_t output_max)
+{
+  /* Compute requantization parameters */
+  assert(scale >= 0x1.0p-32f);
+  assert(scale < 256.0f);
+  const uint32_t scale_bits = fp32_to_bits(scale);
+
+  /* Multiplier is in [0x00800000, 0x00FFFFFF] range */
+  const int32_t multiplier = ((int32_t) scale_bits & INT32_C(0x007FFFFF)) | INT32_C(0x00800000);
+  assert(multiplier >= INT32_C(0x00800000));
+  assert(multiplier <= INT32_C(0x00FFFFFF));
+
+  /* Shift is in [16, 55] range */
+  const int32_t shift = 127 + 23 - (scale_bits >> 23);
+  assert(shift >= 16);
+  assert(shift < 64);
+
+  union xnn_q8_avgpool_params params;
+  const uint32_t right_shift = (uint32_t) shift;
+  const int64_t rounding = INT64_C(1) << (right_shift - 1);
+  params.scalar.bias = bias;
+  params.scalar.rounding = rounding;
+  params.scalar.multiplier = multiplier;
+  params.scalar.right_shift = right_shift;
+  params.scalar.output_min_less_zero_point =
+    (int32_t) (uint32_t) output_min - (int32_t) (uint32_t) output_zero_point;
+  params.scalar.output_max_less_zero_point =
+    (int32_t) (uint32_t) output_max - (int32_t) (uint32_t) output_zero_point;
+  params.scalar.output_zero_point = (int32_t) (uint32_t) output_zero_point;
+  return params;
+}
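+
+// Worked example (illustrative): for scale = 0.0625f = 2^-4,
+// scale_bits = 0x3D800000, so multiplier = 0x00800000 and
+// shift = 127 + 23 - 123 = 27; multiplier * 2^-shift = 2^23 / 2^27 = 2^-4
+// reproduces the requested scale, and rounding = 2^26 makes the subsequent
+// right shift round to nearest.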
+
+static inline void xnn_update_f32_avgpool_params(
+  union xnn_f32_avgpool_params* params,
+  float multiplier)
+{
+  #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+    for (uint32_t i = 0; i < 4; i++) {
+      params->sse2.multiplier[i] = multiplier;
+    }
+  #else
+    params->scalar.multiplier = multiplier;
+  #endif
+}
+
+static inline union xnn_f32_avgpool_params xnn_compute_f32_avgpool_params(
+  float multiplier,
+  float output_min,
+  float output_max)
+{
+  union xnn_f32_avgpool_params params;
+#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+  for (uint32_t i = 0; i < 4; i++) {
+    params.sse2.multiplier[i] = multiplier;
+    params.sse2.output_min[i] = output_min;
+    params.sse2.output_max[i] = output_max;
+  }
+#else
+  params.scalar.multiplier = multiplier;
+  params.scalar.output_min = output_min;
+  params.scalar.output_max = output_max;
+#endif
+  return params;
+}
+
+static inline union xnn_f32_gavgpool_params xnn_compute_f32_gavgpool_params(
+  float multiplier,
+  float output_min,
+  float output_max,
+  uint32_t width)
+{
+  union xnn_f32_gavgpool_params params;
+  #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+    for (uint32_t i = 0; i < 4; i++) {
+      params.sse.multiplier[i] = multiplier;
+      params.sse.output_min[i] = output_min;
+      params.sse.output_max[i] = output_max;
+    }
+    switch (width % 4) {
+      case 0:
+        params.sse.mask[0] = UINT32_C(0xFFFFFFFF);
+        params.sse.mask[1] = UINT32_C(0xFFFFFFFF);
+        params.sse.mask[2] = UINT32_C(0xFFFFFFFF);
+        params.sse.mask[3] = UINT32_C(0xFFFFFFFF);
+        break;
+      case 1:
+        params.sse.mask[0] = UINT32_C(0xFFFFFFFF);
+        params.sse.mask[1] = 0;
+        params.sse.mask[2] = 0;
+        params.sse.mask[3] = 0;
+        break;
+      case 2:
+        params.sse.mask[0] = UINT32_C(0xFFFFFFFF);
+        params.sse.mask[1] = UINT32_C(0xFFFFFFFF);
+        params.sse.mask[2] = 0;
+        params.sse.mask[3] = 0;
+        break;
+      case 3:
+        params.sse.mask[0] = UINT32_C(0xFFFFFFFF);
+        params.sse.mask[1] = UINT32_C(0xFFFFFFFF);
+        params.sse.mask[2] = UINT32_C(0xFFFFFFFF);
+        params.sse.mask[3] = 0;
+        break;
+    }
+  #elif CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+    switch (width % 4) {
+      case 0:
+        params.neon.mask[0] = UINT32_C(0xFFFFFFFF);
+        params.neon.mask[1] = UINT32_C(0xFFFFFFFF);
+        params.neon.mask[2] = UINT32_C(0xFFFFFFFF);
+        params.neon.mask[3] = UINT32_C(0xFFFFFFFF);
+        break;
+      case 1:
+        params.neon.mask[0] = UINT32_C(0xFFFFFFFF);
+        params.neon.mask[1] = 0;
+        params.neon.mask[2] = 0;
+        params.neon.mask[3] = 0;
+        break;
+      case 2:
+        params.neon.mask[0] = UINT32_C(0xFFFFFFFF);
+        params.neon.mask[1] = UINT32_C(0xFFFFFFFF);
+        params.neon.mask[2] = 0;
+        params.neon.mask[3] = 0;
+        break;
+      case 3:
+        params.neon.mask[0] = UINT32_C(0xFFFFFFFF);
+        params.neon.mask[1] = UINT32_C(0xFFFFFFFF);
+        params.neon.mask[2] = UINT32_C(0xFFFFFFFF);
+        params.neon.mask[3] = 0;
+        break;
+    }
+    params.neon.multiplier = multiplier;
+    params.neon.output_min = output_min;
+    params.neon.output_max = output_max;
+  #else
+    params.scalar.multiplier = multiplier;
+    params.scalar.output_min = output_min;
+    params.scalar.output_max = output_max;
+  #endif
+  return params;
+}
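+
+// Example (illustrative): for width = 6, width % 4 == 2, so the mask is
+// {0xFFFFFFFF, 0xFFFFFFFF, 0, 0}; SIMD micro-kernels can use it to zero the
+// two lanes that extend past the end of each row in the last column group.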
+
+static inline void xnn_update_f32_gavgpool_params(
+  union xnn_f32_gavgpool_params* params,
+  float multiplier,
+  uint32_t width)
+{
+  #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+    for (uint32_t i = 0; i < 4; i++) {
+      params->sse.multiplier[i] = multiplier;
+    }
+    switch (width % 4) {
+      case 0:
+        params->sse.mask[0] = UINT32_C(0xFFFFFFFF);
+        params->sse.mask[1] = UINT32_C(0xFFFFFFFF);
+        params->sse.mask[2] = UINT32_C(0xFFFFFFFF);
+        params->sse.mask[3] = UINT32_C(0xFFFFFFFF);
+        break;
+      case 1:
+        params->sse.mask[0] = UINT32_C(0xFFFFFFFF);
+        params->sse.mask[1] = 0;
+        params->sse.mask[2] = 0;
+        params->sse.mask[3] = 0;
+        break;
+      case 2:
+        params->sse.mask[0] = UINT32_C(0xFFFFFFFF);
+        params->sse.mask[1] = UINT32_C(0xFFFFFFFF);
+        params->sse.mask[2] = 0;
+        params->sse.mask[3] = 0;
+        break;
+      case 3:
+        params->sse.mask[0] = UINT32_C(0xFFFFFFFF);
+        params->sse.mask[1] = UINT32_C(0xFFFFFFFF);
+        params->sse.mask[2] = UINT32_C(0xFFFFFFFF);
+        params->sse.mask[3] = 0;
+        break;
+    }
+  #elif CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+    params->neon.multiplier = multiplier;
+    switch (width % 4) {
+      case 0:
+        params->neon.mask[0] = UINT32_C(0xFFFFFFFF);
+        params->neon.mask[1] = UINT32_C(0xFFFFFFFF);
+        params->neon.mask[2] = UINT32_C(0xFFFFFFFF);
+        params->neon.mask[3] = UINT32_C(0xFFFFFFFF);
+        break;
+      case 1:
+        params->neon.mask[0] = UINT32_C(0xFFFFFFFF);
+        params->neon.mask[1] = 0;
+        params->neon.mask[2] = 0;
+        params->neon.mask[3] = 0;
+        break;
+      case 2:
+        params->neon.mask[0] = UINT32_C(0xFFFFFFFF);
+        params->neon.mask[1] = UINT32_C(0xFFFFFFFF);
+        params->neon.mask[2] = 0;
+        params->neon.mask[3] = 0;
+        break;
+      case 3:
+        params->neon.mask[0] = UINT32_C(0xFFFFFFFF);
+        params->neon.mask[1] = UINT32_C(0xFFFFFFFF);
+        params->neon.mask[2] = UINT32_C(0xFFFFFFFF);
+        params->neon.mask[3] = 0;
+        break;
+    }
+  #endif
+}
+
+static inline union xnn_f32_avgpool_params xnn_compute_scalar_f32_avgpool_params(
+  float multiplier,
+  float output_min,
+  float output_max)
+{
+  union xnn_f32_avgpool_params params;
+  params.scalar.multiplier = multiplier;
+  params.scalar.output_min = output_min;
+  params.scalar.output_max = output_max;
+  return params;
+}
+
+static inline union xnn_f32_gavgpool_params xnn_compute_scalar_f32_gavgpool_params(
+  float multiplier,
+  float output_min,
+  float output_max,
+  uint32_t width)
+{
+  union xnn_f32_gavgpool_params params;
+  params.scalar.multiplier = multiplier;
+  params.scalar.output_min = output_min;
+  params.scalar.output_max = output_max;
+  return params;
+}
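+
+// Note: width is unused in the scalar variant, which needs no lane mask; it
+// is presumably kept for signature parity with xnn_compute_f32_gavgpool_params.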
+
+static inline union xnn_f32_output_params xnn_compute_f32_output_params(
+  float output_min,
+  float output_max)
+{
+  union xnn_f32_output_params params;
+#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+  for (uint32_t i = 0; i < 4; i++) {
+    params.sse.min[i] = output_min;
+    params.sse.max[i] = output_max;
+  }
+#else
+  params.scalar.min = output_min;
+  params.scalar.max = output_max;
+#endif
+  return params;
+}
+
+static inline union xnn_f32_output_params xnn_compute_scalar_f32_output_params(
+  float output_min,
+  float output_max)
+{
+  union xnn_f32_output_params params;
+  params.scalar.min = output_min;
+  params.scalar.max = output_max;
+  return params;
+}
+
+static inline union xnn_f32_hswish_params xnn_compute_f32_hswish_params(void)
+{
+  union xnn_f32_hswish_params params;
+#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+  for (uint32_t i = 0; i < 4; i++) {
+    params.sse.sixth[i] = 0x1.555556p-3f;
+    params.sse.half[i] = 0.5f;
+    params.sse.one[i] = 1.0f;
+  }
+#else
+  params.scalar.sixth = 0x1.555556p-3f;
+  params.scalar.half = 0.5f;
+  params.scalar.one = 1.0f;
+#endif
+  return params;
+}
+
+static inline union xnn_f32_hswish_params xnn_compute_scalar_f32_hswish_params(void)
+{
+  union xnn_f32_hswish_params params;
+  params.scalar.sixth = 0x1.555556p-3f;
+  params.scalar.half = 0.5f;
+  params.scalar.one = 1.0f;
+  return params;
+}
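+
+/*
+ * The hswish parameters encode h-swish(x) = x * clamp(x/6 + 1/2, 0, 1):
+ * sixth is 1/6 rounded to float (0x1.555556p-3f), half and one are the
+ * clamp offset and upper bound. A scalar kernel would evaluate it roughly
+ * as the following sketch:
+ *
+ *   float acc = x * params.scalar.sixth + params.scalar.half;
+ *   acc = acc < 0.0f ? 0.0f : acc;
+ *   acc = acc > params.scalar.one ? params.scalar.one : acc;
+ *   y = x * acc;
+ */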
+
+static inline union xnn_f32_spchw_params xnn_compute_f32_spchw_params(
+  uint32_t width,
+  float output_min,
+  float output_max)
+{
+  union xnn_f32_spchw_params params;
+#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+  switch (width % 4) {
+    case 0:
+      params.sse.mask[0] = UINT32_C(0xFFFFFFFF);
+      params.sse.mask[1] = UINT32_C(0xFFFFFFFF);
+      params.sse.mask[2] = UINT32_C(0xFFFFFFFF);
+      params.sse.mask[3] = UINT32_C(0xFFFFFFFF);
+      break;
+    case 1:
+      params.sse.mask[0] = UINT32_C(0xFFFFFFFF);
+      params.sse.mask[1] = 0;
+      params.sse.mask[2] = 0;
+      params.sse.mask[3] = 0;
+      break;
+    case 2:
+      params.sse.mask[0] = UINT32_C(0xFFFFFFFF);
+      params.sse.mask[1] = UINT32_C(0xFFFFFFFF);
+      params.sse.mask[2] = 0;
+      params.sse.mask[3] = 0;
+      break;
+    case 3:
+      params.sse.mask[0] = UINT32_C(0xFFFFFFFF);
+      params.sse.mask[1] = UINT32_C(0xFFFFFFFF);
+      params.sse.mask[2] = UINT32_C(0xFFFFFFFF);
+      params.sse.mask[3] = 0;
+      break;
+  }
+  switch (width % 8) {
+    case 0:
+      params.sse.mask_even[0] = UINT32_C(0xFFFFFFFF);
+      params.sse.mask_even[1] = UINT32_C(0xFFFFFFFF);
+      params.sse.mask_even[2] = UINT32_C(0xFFFFFFFF);
+      params.sse.mask_even[3] = UINT32_C(0xFFFFFFFF);
+      params.sse.mask_odd[0] = UINT32_C(0xFFFFFFFF);
+      params.sse.mask_odd[1] = UINT32_C(0xFFFFFFFF);
+      params.sse.mask_odd[2] = UINT32_C(0xFFFFFFFF);
+      params.sse.mask_odd[3] = UINT32_C(0xFFFFFFFF);
+      break;
+    case 1:
+      params.sse.mask_even[0] = UINT32_C(0xFFFFFFFF);
+      params.sse.mask_even[1] = 0;
+      params.sse.mask_even[2] = 0;
+      params.sse.mask_even[3] = 0;
+      params.sse.mask_odd[0] = 0;
+      params.sse.mask_odd[1] = 0;
+      params.sse.mask_odd[2] = 0;
+      params.sse.mask_odd[3] = 0;
+      break;
+    case 2:
+      params.sse.mask_even[0] = UINT32_C(0xFFFFFFFF);
+      params.sse.mask_even[1] = 0;
+      params.sse.mask_even[2] = 0;
+      params.sse.mask_even[3] = 0;
+      params.sse.mask_odd[0] = UINT32_C(0xFFFFFFFF);
+      params.sse.mask_odd[1] = 0;
+      params.sse.mask_odd[2] = 0;
+      params.sse.mask_odd[3] = 0;
+      break;
+    case 3:
+      params.sse.mask_even[0] = UINT32_C(0xFFFFFFFF);
+      params.sse.mask_even[1] = UINT32_C(0xFFFFFFFF);
+      params.sse.mask_even[2] = 0;
+      params.sse.mask_even[3] = 0;
+      params.sse.mask_odd[0] = UINT32_C(0xFFFFFFFF);
+      params.sse.mask_odd[1] = 0;
+      params.sse.mask_odd[2] = 0;
+      params.sse.mask_odd[3] = 0;
+      break;
+    case 4:
+      params.sse.mask_even[0] = UINT32_C(0xFFFFFFFF);
+      params.sse.mask_even[1] = UINT32_C(0xFFFFFFFF);
+      params.sse.mask_even[2] = 0;
+      params.sse.mask_even[3] = 0;
+      params.sse.mask_odd[0] = UINT32_C(0xFFFFFFFF);
+      params.sse.mask_odd[1] = UINT32_C(0xFFFFFFFF);
+      params.sse.mask_odd[2] = 0;
+      params.sse.mask_odd[3] = 0;
+      break;
+    case 5:
+      params.sse.mask_even[0] = UINT32_C(0xFFFFFFFF);
+      params.sse.mask_even[1] = UINT32_C(0xFFFFFFFF);
+      params.sse.mask_even[2] = UINT32_C(0xFFFFFFFF);
+      params.sse.mask_even[3] = 0;
+      params.sse.mask_odd[0] = UINT32_C(0xFFFFFFFF);
+      params.sse.mask_odd[1] = UINT32_C(0xFFFFFFFF);
+      params.sse.mask_odd[2] = 0;
+      params.sse.mask_odd[3] = 0;
+      break;
+    case 6:
+      params.sse.mask_even[0] = UINT32_C(0xFFFFFFFF);
+      params.sse.mask_even[1] = UINT32_C(0xFFFFFFFF);
+      params.sse.mask_even[2] = UINT32_C(0xFFFFFFFF);
+      params.sse.mask_even[3] = 0;
+      params.sse.mask_odd[0] = UINT32_C(0xFFFFFFFF);
+      params.sse.mask_odd[1] = UINT32_C(0xFFFFFFFF);
+      params.sse.mask_odd[2] = UINT32_C(0xFFFFFFFF);
+      params.sse.mask_odd[3] = 0;
+      break;
+    case 7:
+      params.sse.mask_even[0] = UINT32_C(0xFFFFFFFF);
+      params.sse.mask_even[1] = UINT32_C(0xFFFFFFFF);
+      params.sse.mask_even[2] = UINT32_C(0xFFFFFFFF);
+      params.sse.mask_even[3] = UINT32_C(0xFFFFFFFF);
+      params.sse.mask_odd[0] = UINT32_C(0xFFFFFFFF);
+      params.sse.mask_odd[1] = UINT32_C(0xFFFFFFFF);
+      params.sse.mask_odd[2] = UINT32_C(0xFFFFFFFF);
+      params.sse.mask_odd[3] = 0;
+      break;
+  }
+  for (uint32_t i = 0; i < 4; i++) {
+    params.sse.max[i] = output_max;
+    params.sse.min[i] = output_min;
+  }
+#elif CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+  switch (width % 4) {
+    case 0:
+      params.neon.mask[0] = UINT32_C(0xFFFFFFFF);
+      params.neon.mask[1] = UINT32_C(0xFFFFFFFF);
+      params.neon.mask[2] = UINT32_C(0xFFFFFFFF);
+      params.neon.mask[3] = UINT32_C(0xFFFFFFFF);
+      break;
+    case 1:
+      params.neon.mask[0] = UINT32_C(0xFFFFFFFF);
+      params.neon.mask[1] = 0;
+      params.neon.mask[2] = 0;
+      params.neon.mask[3] = 0;
+      break;
+    case 2:
+      params.neon.mask[0] = UINT32_C(0xFFFFFFFF);
+      params.neon.mask[1] = UINT32_C(0xFFFFFFFF);
+      params.neon.mask[2] = 0;
+      params.neon.mask[3] = 0;
+      break;
+    case 3:
+      params.neon.mask[0] = UINT32_C(0xFFFFFFFF);
+      params.neon.mask[1] = UINT32_C(0xFFFFFFFF);
+      params.neon.mask[2] = UINT32_C(0xFFFFFFFF);
+      params.neon.mask[3] = 0;
+      break;
+  }
+  switch (width % 8) {
+    case 0:
+      params.neon.mask_even[0] = UINT32_C(0xFFFFFFFF);
+      params.neon.mask_even[1] = UINT32_C(0xFFFFFFFF);
+      params.neon.mask_even[2] = UINT32_C(0xFFFFFFFF);
+      params.neon.mask_even[3] = UINT32_C(0xFFFFFFFF);
+      params.neon.mask_odd[0] = UINT32_C(0xFFFFFFFF);
+      params.neon.mask_odd[1] = UINT32_C(0xFFFFFFFF);
+      params.neon.mask_odd[2] = UINT32_C(0xFFFFFFFF);
+      params.neon.mask_odd[3] = UINT32_C(0xFFFFFFFF);
+      break;
+    case 1:
+      params.neon.mask_even[0] = UINT32_C(0xFFFFFFFF);
+      params.neon.mask_even[1] = 0;
+      params.neon.mask_even[2] = 0;
+      params.neon.mask_even[3] = 0;
+      params.neon.mask_odd[0] = 0;
+      params.neon.mask_odd[1] = 0;
+      params.neon.mask_odd[2] = 0;
+      params.neon.mask_odd[3] = 0;
+      break;
+    case 2:
+      params.neon.mask_even[0] = UINT32_C(0xFFFFFFFF);
+      params.neon.mask_even[1] = 0;
+      params.neon.mask_even[2] = 0;
+      params.neon.mask_even[3] = 0;
+      params.neon.mask_odd[0] = UINT32_C(0xFFFFFFFF);
+      params.neon.mask_odd[1] = 0;
+      params.neon.mask_odd[2] = 0;
+      params.neon.mask_odd[3] = 0;
+      break;
+    case 3:
+      params.neon.mask_even[0] = UINT32_C(0xFFFFFFFF);
+      params.neon.mask_even[1] = UINT32_C(0xFFFFFFFF);
+      params.neon.mask_even[2] = 0;
+      params.neon.mask_even[3] = 0;
+      params.neon.mask_odd[0] = UINT32_C(0xFFFFFFFF);
+      params.neon.mask_odd[1] = 0;
+      params.neon.mask_odd[2] = 0;
+      params.neon.mask_odd[3] = 0;
+      break;
+    case 4:
+      params.neon.mask_even[0] = UINT32_C(0xFFFFFFFF);
+      params.neon.mask_even[1] = UINT32_C(0xFFFFFFFF);
+      params.neon.mask_even[2] = 0;
+      params.neon.mask_even[3] = 0;
+      params.neon.mask_odd[0] = UINT32_C(0xFFFFFFFF);
+      params.neon.mask_odd[1] = UINT32_C(0xFFFFFFFF);
+      params.neon.mask_odd[2] = 0;
+      params.neon.mask_odd[3] = 0;
+      break;
+    case 5:
+      params.neon.mask_even[0] = UINT32_C(0xFFFFFFFF);
+      params.neon.mask_even[1] = UINT32_C(0xFFFFFFFF);
+      params.neon.mask_even[2] = UINT32_C(0xFFFFFFFF);
+      params.neon.mask_even[3] = 0;
+      params.neon.mask_odd[0] = UINT32_C(0xFFFFFFFF);
+      params.neon.mask_odd[1] = UINT32_C(0xFFFFFFFF);
+      params.neon.mask_odd[2] = 0;
+      params.neon.mask_odd[3] = 0;
+      break;
+    case 6:
+      params.neon.mask_even[0] = UINT32_C(0xFFFFFFFF);
+      params.neon.mask_even[1] = UINT32_C(0xFFFFFFFF);
+      params.neon.mask_even[2] = UINT32_C(0xFFFFFFFF);
+      params.neon.mask_even[3] = 0;
+      params.neon.mask_odd[0] = UINT32_C(0xFFFFFFFF);
+      params.neon.mask_odd[1] = UINT32_C(0xFFFFFFFF);
+      params.neon.mask_odd[2] = UINT32_C(0xFFFFFFFF);
+      params.neon.mask_odd[3] = 0;
+      break;
+    case 7:
+      params.neon.mask_even[0] = UINT32_C(0xFFFFFFFF);
+      params.neon.mask_even[1] = UINT32_C(0xFFFFFFFF);
+      params.neon.mask_even[2] = UINT32_C(0xFFFFFFFF);
+      params.neon.mask_even[3] = UINT32_C(0xFFFFFFFF);
+      params.neon.mask_odd[0] = UINT32_C(0xFFFFFFFF);
+      params.neon.mask_odd[1] = UINT32_C(0xFFFFFFFF);
+      params.neon.mask_odd[2] = UINT32_C(0xFFFFFFFF);
+      params.neon.mask_odd[3] = 0;
+      break;
+  }
+  params.neon.max = output_max;
+  params.neon.min = output_min;
+#else
+  params.scalar.max = output_max;
+  params.scalar.min = output_min;
+#endif
+  return params;
+}
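+
+/*
+ * Note on mask_even/mask_odd above: for a remainder r = width % 8, mask_even
+ * keeps the first ceil(r/2) even-indexed columns and mask_odd the first
+ * floor(r/2) odd-indexed columns (all lanes when r == 0). These are
+ * presumably consumed by stride-2 CHW kernels that split even and odd
+ * columns into separate vectors, while mask (width % 4) serves the
+ * stride-1 case.
+ */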
+
+static inline void xnn_update_f32_spchw_params(
+  union xnn_f32_spchw_params* params,
+  uint32_t width)
+{
+  #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+    switch (width % 4) {
+      case 0:
+        params->sse.mask[0] = UINT32_C(0xFFFFFFFF);
+        params->sse.mask[1] = UINT32_C(0xFFFFFFFF);
+        params->sse.mask[2] = UINT32_C(0xFFFFFFFF);
+        params->sse.mask[3] = UINT32_C(0xFFFFFFFF);
+        break;
+      case 1:
+        params->sse.mask[0] = UINT32_C(0xFFFFFFFF);
+        params->sse.mask[1] = 0;
+        params->sse.mask[2] = 0;
+        params->sse.mask[3] = 0;
+        break;
+      case 2:
+        params->sse.mask[0] = UINT32_C(0xFFFFFFFF);
+        params->sse.mask[1] = UINT32_C(0xFFFFFFFF);
+        params->sse.mask[2] = 0;
+        params->sse.mask[3] = 0;
+        break;
+      case 3:
+        params->sse.mask[0] = UINT32_C(0xFFFFFFFF);
+        params->sse.mask[1] = UINT32_C(0xFFFFFFFF);
+        params->sse.mask[2] = UINT32_C(0xFFFFFFFF);
+        params->sse.mask[3] = 0;
+        break;
+    }
+    switch (width % 8) {
+      case 0:
+        params->sse.mask_even[0] = UINT32_C(0xFFFFFFFF);
+        params->sse.mask_even[1] = UINT32_C(0xFFFFFFFF);
+        params->sse.mask_even[2] = UINT32_C(0xFFFFFFFF);
+        params->sse.mask_even[3] = UINT32_C(0xFFFFFFFF);
+        params->sse.mask_odd[0] = UINT32_C(0xFFFFFFFF);
+        params->sse.mask_odd[1] = UINT32_C(0xFFFFFFFF);
+        params->sse.mask_odd[2] = UINT32_C(0xFFFFFFFF);
+        params->sse.mask_odd[3] = UINT32_C(0xFFFFFFFF);
+        break;
+      case 1:
+        params->sse.mask_even[0] = UINT32_C(0xFFFFFFFF);
+        params->sse.mask_even[1] = 0;
+        params->sse.mask_even[2] = 0;
+        params->sse.mask_even[3] = 0;
+        params->sse.mask_odd[0] = 0;
+        params->sse.mask_odd[1] = 0;
+        params->sse.mask_odd[2] = 0;
+        params->sse.mask_odd[3] = 0;
+        break;
+      case 2:
+        params->sse.mask_even[0] = UINT32_C(0xFFFFFFFF);
+        params->sse.mask_even[1] = 0;
+        params->sse.mask_even[2] = 0;
+        params->sse.mask_even[3] = 0;
+        params->sse.mask_odd[0] = UINT32_C(0xFFFFFFFF);
+        params->sse.mask_odd[1] = 0;
+        params->sse.mask_odd[2] = 0;
+        params->sse.mask_odd[3] = 0;
+        break;
+      case 3:
+        params->sse.mask_even[0] = UINT32_C(0xFFFFFFFF);
+        params->sse.mask_even[1] = UINT32_C(0xFFFFFFFF);
+        params->sse.mask_even[2] = 0;
+        params->sse.mask_even[3] = 0;
+        params->sse.mask_odd[0] = UINT32_C(0xFFFFFFFF);
+        params->sse.mask_odd[1] = 0;
+        params->sse.mask_odd[2] = 0;
+        params->sse.mask_odd[3] = 0;
+        break;
+      case 4:
+        params->sse.mask_even[0] = UINT32_C(0xFFFFFFFF);
+        params->sse.mask_even[1] = UINT32_C(0xFFFFFFFF);
+        params->sse.mask_even[2] = 0;
+        params->sse.mask_even[3] = 0;
+        params->sse.mask_odd[0] = UINT32_C(0xFFFFFFFF);
+        params->sse.mask_odd[1] = UINT32_C(0xFFFFFFFF);
+        params->sse.mask_odd[2] = 0;
+        params->sse.mask_odd[3] = 0;
+        break;
+      case 5:
+        params->sse.mask_even[0] = UINT32_C(0xFFFFFFFF);
+        params->sse.mask_even[1] = UINT32_C(0xFFFFFFFF);
+        params->sse.mask_even[2] = UINT32_C(0xFFFFFFFF);
+        params->sse.mask_even[3] = 0;
+        params->sse.mask_odd[0] = UINT32_C(0xFFFFFFFF);
+        params->sse.mask_odd[1] = UINT32_C(0xFFFFFFFF);
+        params->sse.mask_odd[2] = 0;
+        params->sse.mask_odd[3] = 0;
+        break;
+      case 6:
+        params->sse.mask_even[0] = UINT32_C(0xFFFFFFFF);
+        params->sse.mask_even[1] = UINT32_C(0xFFFFFFFF);
+        params->sse.mask_even[2] = UINT32_C(0xFFFFFFFF);
+        params->sse.mask_even[3] = 0;
+        params->sse.mask_odd[0] = UINT32_C(0xFFFFFFFF);
+        params->sse.mask_odd[1] = UINT32_C(0xFFFFFFFF);
+        params->sse.mask_odd[2] = UINT32_C(0xFFFFFFFF);
+        params->sse.mask_odd[3] = 0;
+        break;
+      case 7:
+        params->sse.mask_even[0] = UINT32_C(0xFFFFFFFF);
+        params->sse.mask_even[1] = UINT32_C(0xFFFFFFFF);
+        params->sse.mask_even[2] = UINT32_C(0xFFFFFFFF);
+        params->sse.mask_even[3] = UINT32_C(0xFFFFFFFF);
+        params->sse.mask_odd[0] = UINT32_C(0xFFFFFFFF);
+        params->sse.mask_odd[1] = UINT32_C(0xFFFFFFFF);
+        params->sse.mask_odd[2] = UINT32_C(0xFFFFFFFF);
+        params->sse.mask_odd[3] = 0;
+        break;
+    }
+  #elif CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+    switch (width % 4) {
+      case 0:
+        params->neon.mask[0] = UINT32_C(0xFFFFFFFF);
+        params->neon.mask[1] = UINT32_C(0xFFFFFFFF);
+        params->neon.mask[2] = UINT32_C(0xFFFFFFFF);
+        params->neon.mask[3] = UINT32_C(0xFFFFFFFF);
+        break;
+      case 1:
+        params->neon.mask[0] = UINT32_C(0xFFFFFFFF);
+        params->neon.mask[1] = 0;
+        params->neon.mask[2] = 0;
+        params->neon.mask[3] = 0;
+        break;
+      case 2:
+        params->neon.mask[0] = UINT32_C(0xFFFFFFFF);
+        params->neon.mask[1] = UINT32_C(0xFFFFFFFF);
+        params->neon.mask[2] = 0;
+        params->neon.mask[3] = 0;
+        break;
+      case 3:
+        params->neon.mask[0] = UINT32_C(0xFFFFFFFF);
+        params->neon.mask[1] = UINT32_C(0xFFFFFFFF);
+        params->neon.mask[2] = UINT32_C(0xFFFFFFFF);
+        params->neon.mask[3] = 0;
+        break;
+    }
+    switch (width % 8) {
+      case 0:
+        params->neon.mask_even[0] = UINT32_C(0xFFFFFFFF);
+        params->neon.mask_even[1] = UINT32_C(0xFFFFFFFF);
+        params->neon.mask_even[2] = UINT32_C(0xFFFFFFFF);
+        params->neon.mask_even[3] = UINT32_C(0xFFFFFFFF);
+        params->neon.mask_odd[0] = UINT32_C(0xFFFFFFFF);
+        params->neon.mask_odd[1] = UINT32_C(0xFFFFFFFF);
+        params->neon.mask_odd[2] = UINT32_C(0xFFFFFFFF);
+        params->neon.mask_odd[3] = UINT32_C(0xFFFFFFFF);
+        break;
+      case 1:
+        params->neon.mask_even[0] = UINT32_C(0xFFFFFFFF);
+        params->neon.mask_even[1] = 0;
+        params->neon.mask_even[2] = 0;
+        params->neon.mask_even[3] = 0;
+        params->neon.mask_odd[0] = 0;
+        params->neon.mask_odd[1] = 0;
+        params->neon.mask_odd[2] = 0;
+        params->neon.mask_odd[3] = 0;
+        break;
+      case 2:
+        params->neon.mask_even[0] = UINT32_C(0xFFFFFFFF);
+        params->neon.mask_even[1] = 0;
+        params->neon.mask_even[2] = 0;
+        params->neon.mask_even[3] = 0;
+        params->neon.mask_odd[0] = UINT32_C(0xFFFFFFFF);
+        params->neon.mask_odd[1] = 0;
+        params->neon.mask_odd[2] = 0;
+        params->neon.mask_odd[3] = 0;
+        break;
+      case 3:
+        params->neon.mask_even[0] = UINT32_C(0xFFFFFFFF);
+        params->neon.mask_even[1] = UINT32_C(0xFFFFFFFF);
+        params->neon.mask_even[2] = 0;
+        params->neon.mask_even[3] = 0;
+        params->neon.mask_odd[0] = UINT32_C(0xFFFFFFFF);
+        params->neon.mask_odd[1] = 0;
+        params->neon.mask_odd[2] = 0;
+        params->neon.mask_odd[3] = 0;
+        break;
+      case 4:
+        params->neon.mask_even[0] = UINT32_C(0xFFFFFFFF);
+        params->neon.mask_even[1] = UINT32_C(0xFFFFFFFF);
+        params->neon.mask_even[2] = 0;
+        params->neon.mask_even[3] = 0;
+        params->neon.mask_odd[0] = UINT32_C(0xFFFFFFFF);
+        params->neon.mask_odd[1] = UINT32_C(0xFFFFFFFF);
+        params->neon.mask_odd[2] = 0;
+        params->neon.mask_odd[3] = 0;
+        break;
+      case 5:
+        params->neon.mask_even[0] = UINT32_C(0xFFFFFFFF);
+        params->neon.mask_even[1] = UINT32_C(0xFFFFFFFF);
+        params->neon.mask_even[2] = UINT32_C(0xFFFFFFFF);
+        params->neon.mask_even[3] = 0;
+        params->neon.mask_odd[0] = UINT32_C(0xFFFFFFFF);
+        params->neon.mask_odd[1] = UINT32_C(0xFFFFFFFF);
+        params->neon.mask_odd[2] = 0;
+        params->neon.mask_odd[3] = 0;
+        break;
+      case 6:
+        params->neon.mask_even[0] = UINT32_C(0xFFFFFFFF);
+        params->neon.mask_even[1] = UINT32_C(0xFFFFFFFF);
+        params->neon.mask_even[2] = UINT32_C(0xFFFFFFFF);
+        params->neon.mask_even[3] = 0;
+        params->neon.mask_odd[0] = UINT32_C(0xFFFFFFFF);
+        params->neon.mask_odd[1] = UINT32_C(0xFFFFFFFF);
+        params->neon.mask_odd[2] = UINT32_C(0xFFFFFFFF);
+        params->neon.mask_odd[3] = 0;
+        break;
+      case 7:
+        params->neon.mask_even[0] = UINT32_C(0xFFFFFFFF);
+        params->neon.mask_even[1] = UINT32_C(0xFFFFFFFF);
+        params->neon.mask_even[2] = UINT32_C(0xFFFFFFFF);
+        params->neon.mask_even[3] = UINT32_C(0xFFFFFFFF);
+        params->neon.mask_odd[0] = UINT32_C(0xFFFFFFFF);
+        params->neon.mask_odd[1] = UINT32_C(0xFFFFFFFF);
+        params->neon.mask_odd[2] = UINT32_C(0xFFFFFFFF);
+        params->neon.mask_odd[3] = 0;
+        break;
+    }
+  #endif
+}
+
+static inline union xnn_f32_spchw_params xnn_compute_scalar_f32_spchw_params(
+  uint32_t width,
+  float output_min,
+  float output_max)
+{
+  union xnn_f32_spchw_params params;
+  /* Note: width only drives the SIMD mask setup; the scalar variant ignores it. */
+  params.scalar.max = output_max;
+  params.scalar.min = output_min;
+  return params;
+}
+
+static inline union xnn_u8_output_params xnn_compute_u8_output_params(
+  uint8_t output_min,
+  uint8_t output_max)
+{
+  assert(output_min < output_max);
+
+  union xnn_u8_output_params params;
+  #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+    for (uint32_t i = 0; i < 16; i++) {
+      params.sse2.max[i] = output_max;
+      params.sse2.min[i] = output_min;
+    }
+  #elif CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+    params.neon.max = output_max;
+    params.neon.min = output_min;
+  #else
+    params.scalar.min = (int32_t) (uint32_t) output_min;
+    params.scalar.max = (int32_t) (uint32_t) output_max;
+  #endif
+  return params;
+}
+
+static inline union xnn_u8_output_params xnn_compute_scalar_u8_output_params(
+  uint8_t output_min,
+  uint8_t output_max)
+{
+  assert(output_min < output_max);
+
+  union xnn_u8_output_params params;
+  params.scalar.min = (int32_t) (uint32_t) output_min;
+  params.scalar.max = (int32_t) (uint32_t) output_max;
+  return params;
+}
+
+static inline union xnn_q8_add_params xnn_compute_q8_add_params(
+  uint8_t a_zero_point,
+  uint8_t b_zero_point,
+  uint8_t output_zero_point,
+  float a_output_scale,
+  float b_output_scale,
+  uint8_t output_min,
+  uint8_t output_max)
+{
+  assert(a_output_scale >= 0x1.0p-14f);
+  assert(b_output_scale >= 0x1.0p-14f);
+  assert(a_output_scale < 0x1.0p+8f);
+  assert(b_output_scale < 0x1.0p+8f);
+
+  /* Compute requantization parameters */
+  const float max_output_scale = a_output_scale > b_output_scale ? a_output_scale : b_output_scale;
+  assert(max_output_scale >= 0x1.0p-14f);
+  assert(max_output_scale < 0x1.0p+8f);
+  const uint32_t max_scale_bits = fp32_to_bits(max_output_scale);
+  const int32_t max_scale_exponent = (int32_t) (max_scale_bits >> 23) - 127;
+  /* Shift is in [13, 31] range */
+  const uint32_t shift = (uint32_t) (21 - max_scale_exponent);
+  assert(shift < 32);
+  assert(shift >= 13);
+
+  const float scale_multiplier = fp32_from_bits((uint32_t) (21 - max_scale_exponent + 127) << 23);
+
+  /* Multipliers are in [0, 2**22) range, largest multiplier is in [2**21, 2**22) range */
+  const uint32_t a_multiplier = (uint32_t) (int32_t) __builtin_lrintf(a_output_scale * scale_multiplier);
+  const uint32_t b_multiplier = (uint32_t) (int32_t) __builtin_lrintf(b_output_scale * scale_multiplier);
+  assert((a_multiplier > b_multiplier ? a_multiplier : b_multiplier) >= UINT32_C(0x00200000));
+  assert(a_multiplier < UINT32_C(0x00400000));
+  assert(b_multiplier < UINT32_C(0x00400000));
+
+  union xnn_q8_add_params params;
+  #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+    const uint32_t remainder_mask = (UINT32_C(1) << shift) - UINT32_C(1);
+    const uint32_t remainder_threshold = remainder_mask >> 1;
+    const int32_t zero_point_product =
+      (int32_t) -(a_multiplier * (uint32_t) a_zero_point + b_multiplier * (uint32_t) b_zero_point);
+    for (uint32_t i = 0; i < 4; i++) {
+      params.sse2.zero_point_product[i] = zero_point_product;
+    }
+    for (uint32_t i = 0; i < 8; i++) {
+      params.sse2.y_zero_point[i] = (int16_t) (uint16_t) output_zero_point;
+    }
+    for (uint32_t i = 0; i < 8; i++) {
+      params.sse2.a_multiplier_lo[i] = (uint16_t) (uint32_t) a_multiplier;
+      params.sse2.a_multiplier_hi[i] = (uint16_t) ((uint32_t) a_multiplier >> 16);
+      params.sse2.b_multiplier_lo[i] = (uint16_t) (uint32_t) b_multiplier;
+      params.sse2.b_multiplier_hi[i] = (uint16_t) ((uint32_t) b_multiplier >> 16);
+    }
+    params.sse2.a_multiplier = a_multiplier;
+    params.sse2.b_multiplier = b_multiplier;
+    for (uint32_t i = 0; i < 4; i++) {
+      params.sse2.remainder_mask[i] = remainder_mask;
+      params.sse2.remainder_threshold[i] = remainder_threshold;
+    }
+    params.sse2.shift = shift;
+    for (uint32_t i = 0; i < 16; i++) {
+      params.sse2.y_max[i] = output_max;
+      params.sse2.y_min[i] = output_min;
+    }
+  #elif CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+    params.neon.a_zero_point = a_zero_point;
+    params.neon.b_zero_point = b_zero_point;
+    params.neon.y_zero_point = (int16_t) (uint16_t) output_zero_point;
+    params.neon.a_multiplier = (int32_t) a_multiplier;
+    params.neon.b_multiplier = (int32_t) b_multiplier;
+    params.neon.right_shift = (int32_t) -shift;
+    params.neon.y_max = output_max;
+    params.neon.y_min = output_min;
+  #else
+    const uint32_t remainder_mask = (UINT32_C(1) << shift) - UINT32_C(1);
+    const uint32_t remainder_threshold = remainder_mask >> 1;
+    params.scalar.zero_point_product =
+      (int32_t) -(a_multiplier * (uint32_t) a_zero_point + b_multiplier * (uint32_t) b_zero_point);
+    params.scalar.a_multiplier = a_multiplier;
+    params.scalar.b_multiplier = b_multiplier;
+    params.scalar.remainder_mask = (int32_t) remainder_mask;
+    params.scalar.remainder_threshold = (int32_t) remainder_threshold;
+    params.scalar.shift = shift;
+    params.scalar.y_zero_point = (int32_t) (uint32_t) output_zero_point;
+    params.scalar.y_max = (int32_t) (uint32_t) output_max;
+    params.scalar.y_min = (int32_t) (uint32_t) output_min;
+  #endif
+  return params;
+}
+
+static inline union xnn_q8_add_params xnn_compute_scalar_q8_add_params(
+  uint8_t a_zero_point,
+  uint8_t b_zero_point,
+  uint8_t output_zero_point,
+  float a_output_scale,
+  float b_output_scale,
+  uint8_t output_min,
+  uint8_t output_max)
+{
+  assert(a_output_scale >= 0x1.0p-10f);
+  assert(b_output_scale >= 0x1.0p-10f);
+  assert(a_output_scale < 0x1.0p+8f);
+  assert(b_output_scale < 0x1.0p+8f);
+
+  /* Compute requantization parameters */
+  const float max_output_scale = a_output_scale > b_output_scale ? a_output_scale : b_output_scale;
+  assert(max_output_scale >= 0x1.0p-10f);
+  assert(max_output_scale < 0x1.0p+8f);
+  const uint32_t max_scale_bits = fp32_to_bits(max_output_scale);
+  const int32_t max_scale_exponent = (int32_t) (max_scale_bits >> 23) - 127;
+  /* Shift is in [13, 31] range */
+  const uint32_t shift = (uint32_t) (21 - max_scale_exponent);
+  assert(shift < 32);
+  assert(shift >= 13);
+
+  /* Multipliers are in [0, 2**22) range, largest multiplier is in [2**21, 2**22) range */
+  const uint32_t a_multiplier = (uint32_t) (int32_t) __builtin_lrintf(fp32_from_bits(fp32_to_bits(a_output_scale) + (shift << 23)));
+  const uint32_t b_multiplier = (uint32_t) (int32_t) __builtin_lrintf(fp32_from_bits(fp32_to_bits(b_output_scale) + (shift << 23)));
+  assert((a_multiplier > b_multiplier ? a_multiplier : b_multiplier) >= UINT32_C(0x00200000));
+  assert(a_multiplier < UINT32_C(0x00400000));
+  assert(b_multiplier < UINT32_C(0x00400000));
+
+  union xnn_q8_add_params params;
+  const uint32_t remainder_mask = (UINT32_C(1) << shift) - UINT32_C(1);
+  const uint32_t remainder_threshold = remainder_mask >> 1;
+  params.scalar.zero_point_product =
+    (int32_t) -(a_multiplier * (uint32_t) a_zero_point + b_multiplier * (uint32_t) b_zero_point);
+  params.scalar.a_multiplier = a_multiplier;
+  params.scalar.b_multiplier = b_multiplier;
+  params.scalar.remainder_mask = (int32_t) remainder_mask;
+  params.scalar.remainder_threshold = (int32_t) remainder_threshold;
+  params.scalar.shift = shift;
+  params.scalar.y_zero_point = (int32_t) (uint32_t) output_zero_point;
+  params.scalar.y_max = (int32_t) (uint32_t) output_max;
+  params.scalar.y_min = (int32_t) (uint32_t) output_min;
+  return params;
+}
+
+static inline union xnn_q31_requantization_params xnn_compute_scalar_requantization_params(
+  float scale,
+  uint8_t zero_point,
+  uint8_t min,
+  uint8_t max)
+{
+  /* Compute requantization parameters */
+  assert(scale < 1.0f);
+  assert(scale >= 0x1.0p-32f);
+  const uint32_t scale_bits = fp32_to_bits(scale);
+
+  /* Multiplier is in [0x40000000, 0x7FFFFF80] range */
+  const int32_t multiplier = (int32_t)(((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
+  assert(multiplier >= INT32_C(0x40000000));
+  assert(multiplier <= INT32_C(0x7FFFFF80));
+
+  /* Shift is in [0, 31] range */
+  const int32_t shift = 127 + 31 - 32 - (fp32_to_bits(scale) >> 23);
+  assert(shift >= 0);
+  assert(shift < 32);
+
+  union xnn_q31_requantization_params params;
+  const uint32_t remainder_mask = (UINT32_C(1) << shift) - UINT32_C(1);
+  const uint32_t remainder_threshold = remainder_mask >> 1;
+  params.scalar.multiplier = multiplier;
+  params.scalar.remainder_mask = (int32_t) remainder_mask;
+  params.scalar.remainder_threshold = (int32_t) remainder_threshold;
+  params.scalar.shift = (uint32_t) shift;
+  params.scalar.min_less_zero_point = (int32_t) (uint32_t) min - (int32_t) (uint32_t) zero_point;
+  params.scalar.max_less_zero_point = (int32_t) (uint32_t) max - (int32_t) (uint32_t) zero_point;
+  params.scalar.zero_point = (int32_t) (uint32_t) zero_point;
+  return params;
+}
+
+static inline union xnn_q31_requantization_params xnn_compute_requantization_params(
+  float scale,
+  uint8_t zero_point,
+  uint8_t min,
+  uint8_t max)
+{
+  /* Compute requantization parameters */
+  assert(scale < 1.0f);
+  assert(scale >= 0x1.0p-32f);
+  const uint32_t scale_bits = fp32_to_bits(scale);
+
+  /* Multiplier is in [0x40000000, 0x7FFFFF80] range */
+  const int32_t multiplier = (int32_t)(((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
+  assert(multiplier >= INT32_C(0x40000000));
+  assert(multiplier <= INT32_C(0x7FFFFF80));
+
+  /* Shift is in [0, 31] range */
+  const int32_t shift = 127 + 31 - 32 - (fp32_to_bits(scale) >> 23);
+  assert(shift >= 0);
+  assert(shift < 32);
+
+  union xnn_q31_requantization_params params;
+  #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+    const uint32_t remainder_mask = (UINT32_C(1) << shift) - UINT32_C(1);
+    const uint32_t remainder_threshold = remainder_mask >> 1;
+    params.sse2.multiplier[0] = multiplier;
+    params.sse2.multiplier[1] = multiplier;
+    params.sse2.multiplier[2] = multiplier;
+    params.sse2.multiplier[3] = multiplier;
+    params.sse2.rounding[0] = UINT64_C(0x40000000);
+    params.sse2.rounding[1] = UINT64_C(0x40000000);
+    params.sse2.remainder_mask[0] = (int32_t) remainder_mask;
+    params.sse2.remainder_mask[1] = (int32_t) remainder_mask;
+    params.sse2.remainder_mask[2] = (int32_t) remainder_mask;
+    params.sse2.remainder_mask[3] = (int32_t) remainder_mask;
+    params.sse2.remainder_threshold[0] = (int32_t) remainder_threshold;
+    params.sse2.remainder_threshold[1] = (int32_t) remainder_threshold;
+    params.sse2.remainder_threshold[2] = (int32_t) remainder_threshold;
+    params.sse2.remainder_threshold[3] = (int32_t) remainder_threshold;
+    params.sse2.shift[0] = (uint64_t) (uint32_t) shift;
+    params.sse2.shift[1] = (uint64_t) (uint32_t) shift;
+    for (uint32_t i = 0; i < 8; i++) {
+      params.sse2.zero_point[i] = (int16_t) (uint16_t) zero_point;
+    }
+    for (uint32_t i = 0; i < 16; i++) {
+      params.sse2.max[i] = max;
+      params.sse2.min[i] = min;
+    }
+  #elif CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+    params.neon.multiplier = multiplier;
+    params.neon.right_shift = -shift;
+    params.neon.zero_point = (int16_t) (uint16_t) zero_point;
+    params.neon.max = max;
+    params.neon.min = min;
+  #else
+    const uint32_t remainder_mask = (UINT32_C(1) << shift) - UINT32_C(1);
+    const uint32_t remainder_threshold = remainder_mask >> 1;
+    params.scalar.multiplier = multiplier;
+    params.scalar.remainder_mask = (int32_t) remainder_mask;
+    params.scalar.remainder_threshold = (int32_t) remainder_threshold;
+    params.scalar.shift = (uint32_t) shift;
+    params.scalar.min_less_zero_point = (int32_t) (uint32_t) min - (int32_t) (uint32_t) zero_point;
+    params.scalar.max_less_zero_point = (int32_t) (uint32_t) max - (int32_t) (uint32_t) zero_point;
+    params.scalar.zero_point = (int32_t) (uint32_t) zero_point;
+  #endif
+  return params;
+}
+
+static inline uint8_t xnn_q31_requantize(
+  int32_t n,
+  union xnn_q31_requantization_params params)
+{
+  const int64_t product = (int64_t) n * (int64_t) params.scalar.multiplier;
+  const int32_t q31product = (int32_t) (uint32_t) ((uint64_t) (product + INT64_C(0x40000000)) >> 31);
+  const int32_t remainder = (q31product & params.scalar.remainder_mask) - (int32_t) (n < 0);
+  n = asr_s32(q31product, params.scalar.shift) + (int32_t) (remainder > params.scalar.remainder_threshold);
+  if (n < params.scalar.min_less_zero_point) {
+    n = params.scalar.min_less_zero_point;
+  }
+  if (n > params.scalar.max_less_zero_point) {
+    n = params.scalar.max_less_zero_point;
+  }
+
+  return (uint8_t) (n + params.scalar.zero_point);
+}
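+
+/*
+ * Illustrative use of the Q31 requantization path (hand-checked values,
+ * not taken from the library's tests): with scale 0x1.0p-10f the multiplier
+ * is exactly 0x40000000 and the shift is 9, so an accumulator of 12345 maps
+ * to round(12345 * 2**-10) + zero_point = 12 + 128 = 140:
+ *
+ *   const union xnn_q31_requantization_params params =
+ *     xnn_compute_scalar_requantization_params(0x1.0p-10f, 128, 0, 255);
+ *   const uint8_t y = xnn_q31_requantize(12345, params);  // y == 140
+ */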
+
+static inline uint8_t xnn_avgpool_quantize(
+  int32_t n,
+  union xnn_q8_avgpool_params params)
+{
+  const int64_t product = (int64_t) n * (int64_t) params.scalar.multiplier;
+  const int64_t adjusted_product = product - (int64_t) (n < 0);
+
+  n = (int32_t) asr_s64(adjusted_product + params.scalar.rounding, params.scalar.right_shift);
+  if (n < params.scalar.output_min_less_zero_point) {
+    n = params.scalar.output_min_less_zero_point;
+  }
+  if (n > params.scalar.output_max_less_zero_point) {
+    n = params.scalar.output_max_less_zero_point;
+  }
+
+  return (uint8_t) (n + params.scalar.output_zero_point);
+}
+
+static inline uint8_t xnn_add_quantize(
+  uint8_t a, uint8_t b,
+  union xnn_q8_add_params params)
+{
+  /* Multiply by factors and accumulate products */
+  int32_t acc = params.scalar.zero_point_product +
+    (int32_t) ((uint32_t) a * params.scalar.a_multiplier) +
+    (int32_t) ((uint32_t) b * params.scalar.b_multiplier);
+
+  /* Shift right and round */
+  const int32_t rem = (acc & params.scalar.remainder_mask) - (int32_t) (acc < 0);
+  acc = asr_s32(acc, params.scalar.shift) + (int32_t) (rem > params.scalar.remainder_threshold);
+
+  /* Clamp and add output zero point */
+  int32_t y = acc + params.scalar.y_zero_point;
+  if (y >= params.scalar.y_max) {
+    y = params.scalar.y_max;
+  }
+  if (y <= params.scalar.y_min) {
+    y = params.scalar.y_min;
+  }
+  return (uint8_t) y;
+}
diff --git a/src/xnnpack/rmax.h b/src/xnnpack/rmax.h
new file mode 100644
index 0000000..25f6e32
--- /dev/null
+++ b/src/xnnpack/rmax.h
@@ -0,0 +1,47 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <xnnpack/common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#define DECLARE_F32_RMAX_UKERNEL_FUNCTION(fn_name) \
+  XNN_INTERNAL void fn_name(                       \
+      size_t n,                                    \
+      const float* x,                              \
+      float* y);
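+
+/* An rmax ukernel reduces its input to a single running maximum, storing
+   max(x[0], ..., x[k-1]) to *y; as with the other ukernels in this library,
+   n is presumably the input size in bytes rather than in elements. */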
+
+DECLARE_F32_RMAX_UKERNEL_FUNCTION(xnn_f32_rmax_ukernel__avx)
+DECLARE_F32_RMAX_UKERNEL_FUNCTION(xnn_f32_rmax_ukernel__avx512f)
+DECLARE_F32_RMAX_UKERNEL_FUNCTION(xnn_f32_rmax_ukernel__neon)
+DECLARE_F32_RMAX_UKERNEL_FUNCTION(xnn_f32_rmax_ukernel__scalar)
+DECLARE_F32_RMAX_UKERNEL_FUNCTION(xnn_f32_rmax_ukernel__sse)
+
+
+#define DECLARE_U8_RMAX_UKERNEL_FUNCTION(fn_name) \
+  XNN_INTERNAL void fn_name(                      \
+      size_t n,                                   \
+      const uint8_t* x,                           \
+      uint8_t* y);
+
+DECLARE_U8_RMAX_UKERNEL_FUNCTION(xnn_u8_rmax_ukernel__neon)
+DECLARE_U8_RMAX_UKERNEL_FUNCTION(xnn_u8_rmax_ukernel__scalar)
+DECLARE_U8_RMAX_UKERNEL_FUNCTION(xnn_u8_rmax_ukernel__sse2)
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
diff --git a/src/xnnpack/scalar-utils.h b/src/xnnpack/scalar-utils.h
new file mode 100644
index 0000000..88d30c8
--- /dev/null
+++ b/src/xnnpack/scalar-utils.h
@@ -0,0 +1,121 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+  #include <climits>
+  #include <cstdint>
+  #include <cstdbool>
+  #include <cassert>
+#else
+  #include <limits.h>
+  #include <stdint.h>
+  #include <stdbool.h>
+  #include <assert.h>
+#endif
+
+#include <fp16.h>
+
+#if defined(__clang__) && !defined(__pnacl__)
+  #if (__clang_major__ == 3 && __clang_minor__ >= 7) || __clang_major__ > 3
+    #define XNN_IGNORE_SHIFT_BASE_UB __attribute__((__no_sanitize__("shift-base")))
+  #else
+    #define XNN_IGNORE_SHIFT_BASE_UB
+  #endif
+#elif defined(__GNUC__)
+  #if __GNUC__ >= 8
+    #define XNN_IGNORE_SHIFT_BASE_UB __attribute__((__no_sanitize__("shift-base")))
+  #elif (__GNUC__ == 4 && __GNUC_MINOR__ >= 9) || __GNUC__ > 4
+    /* 4.9 <= gcc < 8 supports UBSan, but doesn't support the no_sanitize attribute */
+    #define XNN_IGNORE_SHIFT_BASE_UB
+    #ifndef XNN_USE_SHIFT_BASE_UB_WORKAROUND
+      #define XNN_USE_SHIFT_BASE_UB_WORKAROUND 1
+    #endif
+  #else
+    #define XNN_IGNORE_SHIFT_BASE_UB
+  #endif
+#else
+  #define XNN_IGNORE_SHIFT_BASE_UB
+#endif
+
+XNN_IGNORE_SHIFT_BASE_UB
+inline static int32_t asr_s32(int32_t x, uint32_t n) {
+  #ifdef XNN_USE_SHIFT_BASE_UB_WORKAROUND
+    #if defined(__x86_64__) || defined(__aarch64__)
+      return (int32_t) ((uint64_t) (int64_t) x >> n);
+    #else
+      return x >= 0 ? x >> n : ~(~x >> n);
+    #endif
+  #else
+    return x >> n;
+  #endif
+}
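+
+/* For example, asr_s32(-5, 1) == -3 on every path above: the portable branch
+   computes ~(~(-5) >> 1) == ~(4 >> 1) == -3, matching an arithmetic right
+   shift that rounds toward negative infinity. */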
+
+XNN_IGNORE_SHIFT_BASE_UB
+inline static int64_t asr_s64(int64_t x, uint32_t n) {
+  #ifdef XNN_USE_SHIFT_BASE_UB_WORKAROUND
+    return x >= 0 ? x >> n : ~(~x >> n);
+  #else
+    return x >> n;
+  #endif
+}
+
+inline static uint8_t scalar_requantize_precise(
+  int32_t value,
+  float scale,
+  uint8_t zero_point,
+  uint8_t qmin,
+  uint8_t qmax)
+{
+  assert(scale < 1.0f);
+  assert(scale >= 0x1.0p-32f);
+
+  const uint32_t scale_bits = fp32_to_bits(scale);
+  const uint32_t multiplier = (scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000);
+  const uint32_t shift = 127 + 23 - (scale_bits >> 23);
+  assert(shift >= 24);
+  assert(shift < 56);
+
+  /*
+   * Compute absolute value of input as unsigned 32-bit int.
+   * All further computations will work with unsigned values to avoid undefined behaviour on signed operations.
+   */
+  const uint32_t abs_value = (value >= 0) ? (uint32_t) value : -(uint32_t) value;
+
+  /* Compute full 64-bit product of 32-bit factors */
+  const uint64_t product = (uint64_t) abs_value * (uint64_t) multiplier;
+
+  /*
+   * Shift the full 64-bit product right with rounding.
+   * Rounding is performed towards closest integer, with midpoints rounded up (same as away from zero).
+   */
+  const uint64_t rounding = UINT64_C(1) << (shift - 1);
+  const uint32_t abs_scaled_value = (uint32_t) ((product + rounding) >> shift);
+
+  /*
+   * Copy the sign of input to scaled absolute input value.
+   */
+  const int32_t scaled_value = (int32_t) (value >= 0 ? abs_scaled_value : -abs_scaled_value);
+
+  /* Clamp scaled value with zero point between smin and smax */
+  int32_t clamped_value = scaled_value;
+  const int32_t smin = (int32_t) (uint32_t) qmin - (int32_t) (uint32_t) zero_point;
+  if (clamped_value < smin) {
+    clamped_value = smin;
+  }
+  const int32_t smax = (int32_t) (uint32_t) qmax - (int32_t) (uint32_t) zero_point;
+  if (clamped_value > smax) {
+    clamped_value = smax;
+  }
+
+  /* Add zero point to clamped value */
+  const int32_t biased_value = clamped_value + (int32_t) (uint32_t) zero_point;
+
+  return (uint8_t) biased_value;
+}
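+
+/* Illustrative check (hand-computed, not from the library's tests): with
+   scale 0.25f the multiplier is 0x800000 and the shift is 25, so
+   scalar_requantize_precise(-100, 0.25f, 128, 0, 255) computes
+   round(-100 * 0.25) + 128 == -25 + 128 == 103. */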
diff --git a/src/xnnpack/spmm.h b/src/xnnpack/spmm.h
new file mode 100644
index 0000000..7ea16bf
--- /dev/null
+++ b/src/xnnpack/spmm.h
@@ -0,0 +1,66 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <xnnpack/params.h>
+#include <xnnpack/common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#define DECLARE_F32_SPMM_UKERNEL_FUNCTION(fn_name) \
+  XNN_INTERNAL void fn_name(                       \
+    uint32_t m,                                    \
+    uint32_t n,                                    \
+    const float* a,                                \
+    const float* w,                                \
+    const int32_t* dmap,                           \
+    const uint32_t* nmap,                          \
+    float* c,                                      \
+    const union xnn_f32_output_params* params);
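+
+/*
+ * The names appear to follow the MrxNr tile convention used elsewhere in
+ * XNNPACK, e.g. xnn_f32_spmm_ukernel_8x2__neonfma produces an 8x2 output
+ * tile per iteration using NEON FMA. The sparse encoding is assumed to be:
+ * w holds the nonzero weights plus per-channel bias, nmap the nonzero count
+ * per output channel, and dmap the corresponding input offsets.
+ */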
+
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_12x1__neonfma)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_12x2__neonfma)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_12x4__neonfma)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_16x1__neonfma)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_16x2__neonfma)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_16x4__neonfma)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_16x1__neonfma_pipelined)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_16x1__neonfma_unroll2)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_1x1__scalar)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_1x1__scalar_pipelined)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_1x1__scalar_unroll2)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_2x1__scalar)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_2x1__scalar_pipelined)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_2x1__scalar_unroll2)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_4x1__neonfma)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_4x2__neonfma)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_4x4__neonfma)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_4x1__neonfma_pipelined)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_4x1__neonfma_unroll2)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_4x1__scalar)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_4x1__scalar_pipelined)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_4x1__scalar_unroll2)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_4x1__sse)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_8x1__neonfma)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_8x2__neonfma)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_8x4__neonfma)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_8x1__neonfma_pipelined)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_8x1__neonfma_unroll2)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_8x1__scalar)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_8x1__scalar_pipelined)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_8x1__scalar_unroll2)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_8x1__sse)
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
diff --git a/src/xnnpack/unpool.h b/src/xnnpack/unpool.h
new file mode 100644
index 0000000..c02457a
--- /dev/null
+++ b/src/xnnpack/unpool.h
@@ -0,0 +1,34 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <xnnpack/params.h>
+#include <xnnpack/common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#define DECLARE_X32_UNPOOL_UKERNEL_FUNCTION(fn_name) \
+  XNN_INTERNAL void fn_name(                         \
+    size_t p,                                        \
+    size_t c,                                        \
+    uint32_t f,                                      \
+    const uint32_t* input,                           \
+    const uint32_t* index,                           \
+    uint32_t** output);
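+
+/* Sketch of the expected unpooling semantics: the p output rows are first
+   filled with the value f, then each of the c input values is scattered to
+   the row chosen by its index, i.e. output[index[j]][j] = input[j]. */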
+
+DECLARE_X32_UNPOOL_UKERNEL_FUNCTION(xnn_x32_unpool_ukernel__psimd)
+DECLARE_X32_UNPOOL_UKERNEL_FUNCTION(xnn_x32_unpool_ukernel__scalar)
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
diff --git a/src/xnnpack/vadd.h b/src/xnnpack/vadd.h
new file mode 100644
index 0000000..a66d171
--- /dev/null
+++ b/src/xnnpack/vadd.h
@@ -0,0 +1,51 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <xnnpack/params.h>
+#include <xnnpack/common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#define DECLARE_F32_VADD_UKERNEL_FUNCTION(fn_name) \
+  XNN_INTERNAL void fn_name(                       \
+      size_t n,                                    \
+      const float* a,                              \
+      const float* b,                              \
+      float* y,                                    \
+      const union xnn_f32_output_params* params);
+
+DECLARE_F32_VADD_UKERNEL_FUNCTION(xnn_f32_vadd_ukernel__neon)
+DECLARE_F32_VADD_UKERNEL_FUNCTION(xnn_f32_vadd_ukernel__psimd)
+DECLARE_F32_VADD_UKERNEL_FUNCTION(xnn_f32_vadd_ukernel__scalar)
+DECLARE_F32_VADD_UKERNEL_FUNCTION(xnn_f32_vadd_ukernel__sse)
+
+
+#define DECLARE_Q8_VADD_UKERNEL_FUNCTION(fn_name) \
+  XNN_INTERNAL void fn_name(                      \
+      size_t n,                                   \
+      const uint8_t* a,                           \
+      const uint8_t* b,                           \
+      uint8_t* y,                                 \
+      const union xnn_q8_add_params* params);
+
+DECLARE_Q8_VADD_UKERNEL_FUNCTION(xnn_q8_vadd_ukernel__neon)
+DECLARE_Q8_VADD_UKERNEL_FUNCTION(xnn_q8_vadd_ukernel__scalar)
+DECLARE_Q8_VADD_UKERNEL_FUNCTION(xnn_q8_vadd_ukernel__sse2)
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
diff --git a/src/xnnpack/vmul.h b/src/xnnpack/vmul.h
new file mode 100644
index 0000000..9747de8
--- /dev/null
+++ b/src/xnnpack/vmul.h
@@ -0,0 +1,35 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <xnnpack/params.h>
+#include <xnnpack/common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#define DECLARE_F32_VMUL_UKERNEL_FUNCTION(fn_name) \
+  XNN_INTERNAL void fn_name(                       \
+      size_t n,                                    \
+      const float* a,                              \
+      const float* b,                              \
+      float* y,                                    \
+      const union xnn_f32_output_params* params);
+
+DECLARE_F32_VMUL_UKERNEL_FUNCTION(xnn_f32_vmul_ukernel__neon)
+DECLARE_F32_VMUL_UKERNEL_FUNCTION(xnn_f32_vmul_ukernel__psimd)
+DECLARE_F32_VMUL_UKERNEL_FUNCTION(xnn_f32_vmul_ukernel__scalar)
+DECLARE_F32_VMUL_UKERNEL_FUNCTION(xnn_f32_vmul_ukernel__sse)
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
diff --git a/src/xnnpack/vmulcaddc.h b/src/xnnpack/vmulcaddc.h
new file mode 100644
index 0000000..a37e747
--- /dev/null
+++ b/src/xnnpack/vmulcaddc.h
@@ -0,0 +1,39 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <xnnpack/params.h>
+#include <xnnpack/common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#define DECLARE_F32_VMULCADDC_UKERNEL_FUNCTION(fn_name) \
+  XNN_INTERNAL void fn_name(                            \
+      size_t m,                                         \
+      size_t c,                                         \
+      const float* x,                                   \
+      size_t x_stride,                                  \
+      const float* w,                                   \
+      float* y,                                         \
+      size_t y_stride,                                  \
+      const union xnn_f32_output_params* params);
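+
+/* VMULCADDC applies a per-channel multiply-add, y = x * scale + bias, over an
+   m-row by c-channel matrix with the given row strides; w is assumed to pack
+   the per-channel scale and bias in a kernel-specific layout. */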
+
+DECLARE_F32_VMULCADDC_UKERNEL_FUNCTION(xnn_f32_vmulcaddc_ukernel_c1__scalar_x2)
+DECLARE_F32_VMULCADDC_UKERNEL_FUNCTION(xnn_f32_vmulcaddc_ukernel_c4__neon_x2)
+DECLARE_F32_VMULCADDC_UKERNEL_FUNCTION(xnn_f32_vmulcaddc_ukernel_c4__neonfma_x2)
+DECLARE_F32_VMULCADDC_UKERNEL_FUNCTION(xnn_f32_vmulcaddc_ukernel_c4__psimd_x2)
+DECLARE_F32_VMULCADDC_UKERNEL_FUNCTION(xnn_f32_vmulcaddc_ukernel_c4__sse_x2)
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
diff --git a/src/xnnpack/vsub.h b/src/xnnpack/vsub.h
new file mode 100644
index 0000000..e444eb6
--- /dev/null
+++ b/src/xnnpack/vsub.h
@@ -0,0 +1,35 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <xnnpack/params.h>
+#include <xnnpack/common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#define DECLARE_F32_VSUB_UKERNEL_FUNCTION(fn_name) \
+  XNN_INTERNAL void fn_name(                       \
+      size_t n,                                    \
+      const float* a,                              \
+      const float* b,                              \
+      float* y,                                    \
+      const union xnn_f32_output_params* params);
+
+DECLARE_F32_VSUB_UKERNEL_FUNCTION(xnn_f32_vsub_ukernel__neon)
+DECLARE_F32_VSUB_UKERNEL_FUNCTION(xnn_f32_vsub_ukernel__psimd)
+DECLARE_F32_VSUB_UKERNEL_FUNCTION(xnn_f32_vsub_ukernel__scalar)
+DECLARE_F32_VSUB_UKERNEL_FUNCTION(xnn_f32_vsub_ukernel__sse)
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
diff --git a/src/xnnpack/zip.h b/src/xnnpack/zip.h
new file mode 100644
index 0000000..48b164e
--- /dev/null
+++ b/src/xnnpack/zip.h
@@ -0,0 +1,86 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <xnnpack/params.h>
+#include <xnnpack/common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#define DECLARE_X8_ZIPC_UKERNEL_FUNCTION(fn_name) \
+  XNN_INTERNAL void fn_name(                      \
+      size_t n,                                   \
+      const uint8_t* x,                           \
+      uint8_t* y);
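+
+/* A zip-x2 ukernel interleaves two length-n planes stored back-to-back in x:
+   y[2*i] = x[i] and y[2*i+1] = x[i+n]; the x3/x4 variants generalize this to
+   three and four planes. These presumably back the channel-shuffle operator. */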
+
+DECLARE_X8_ZIPC_UKERNEL_FUNCTION(xnn_x8_zip_x2_ukernel__neon)
+DECLARE_X8_ZIPC_UKERNEL_FUNCTION(xnn_x8_zip_x2_ukernel__sse2)
+DECLARE_X8_ZIPC_UKERNEL_FUNCTION(xnn_x8_zip_x2_ukernel__scalar)
+DECLARE_X8_ZIPC_UKERNEL_FUNCTION(xnn_x8_zip_x3_ukernel__neon)
+DECLARE_X8_ZIPC_UKERNEL_FUNCTION(xnn_x8_zip_x3_ukernel__sse2)
+DECLARE_X8_ZIPC_UKERNEL_FUNCTION(xnn_x8_zip_x3_ukernel__scalar)
+DECLARE_X8_ZIPC_UKERNEL_FUNCTION(xnn_x8_zip_x4_ukernel__neon)
+DECLARE_X8_ZIPC_UKERNEL_FUNCTION(xnn_x8_zip_x4_ukernel__sse2)
+DECLARE_X8_ZIPC_UKERNEL_FUNCTION(xnn_x8_zip_x4_ukernel__scalar)
+
+
+#define DECLARE_X32_ZIPC_UKERNEL_FUNCTION(fn_name) \
+  XNN_INTERNAL void fn_name(                       \
+      size_t n,                                    \
+      const uint32_t* x,                           \
+      uint32_t* y);
+
+DECLARE_X32_ZIPC_UKERNEL_FUNCTION(xnn_x32_zip_x2_ukernel__neon)
+DECLARE_X32_ZIPC_UKERNEL_FUNCTION(xnn_x32_zip_x2_ukernel__psimd)
+DECLARE_X32_ZIPC_UKERNEL_FUNCTION(xnn_x32_zip_x2_ukernel__scalar)
+DECLARE_X32_ZIPC_UKERNEL_FUNCTION(xnn_x32_zip_x2_ukernel__sse2)
+DECLARE_X32_ZIPC_UKERNEL_FUNCTION(xnn_x32_zip_x3_ukernel__neon)
+DECLARE_X32_ZIPC_UKERNEL_FUNCTION(xnn_x32_zip_x3_ukernel__psimd)
+DECLARE_X32_ZIPC_UKERNEL_FUNCTION(xnn_x32_zip_x3_ukernel__scalar)
+DECLARE_X32_ZIPC_UKERNEL_FUNCTION(xnn_x32_zip_x3_ukernel__sse2)
+DECLARE_X32_ZIPC_UKERNEL_FUNCTION(xnn_x32_zip_x4_ukernel__neon)
+DECLARE_X32_ZIPC_UKERNEL_FUNCTION(xnn_x32_zip_x4_ukernel__psimd)
+DECLARE_X32_ZIPC_UKERNEL_FUNCTION(xnn_x32_zip_x4_ukernel__scalar)
+DECLARE_X32_ZIPC_UKERNEL_FUNCTION(xnn_x32_zip_x4_ukernel__sse2)
+
+
+#define DECLARE_X8_ZIPV_UKERNEL_FUNCTION(fn_name) \
+  XNN_INTERNAL void fn_name(                      \
+      size_t n,                                   \
+      size_t m,                                   \
+      const uint8_t* x,                           \
+      uint8_t* y);
+
+DECLARE_X8_ZIPV_UKERNEL_FUNCTION(xnn_x8_zip_xm_ukernel__neon)
+DECLARE_X8_ZIPV_UKERNEL_FUNCTION(xnn_x8_zip_xm_ukernel__sse2)
+DECLARE_X8_ZIPV_UKERNEL_FUNCTION(xnn_x8_zip_xm_ukernel__scalar)
+
+
+#define DECLARE_X32_ZIPV_UKERNEL_FUNCTION(fn_name) \
+  XNN_INTERNAL void fn_name(                       \
+      size_t n,                                    \
+      size_t m,                                    \
+      const uint32_t* x,                           \
+      uint32_t* y);
+
+DECLARE_X32_ZIPV_UKERNEL_FUNCTION(xnn_x32_zip_xm_ukernel__neon)
+DECLARE_X32_ZIPV_UKERNEL_FUNCTION(xnn_x32_zip_xm_ukernel__psimd)
+DECLARE_X32_ZIPV_UKERNEL_FUNCTION(xnn_x32_zip_xm_ukernel__scalar)
+DECLARE_X32_ZIPV_UKERNEL_FUNCTION(xnn_x32_zip_xm_ukernel__sse2)
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif