Initial open-source release
PiperOrigin-RevId: 271685289
diff --git a/src/xnnpack/AlignedAllocator.h b/src/xnnpack/AlignedAllocator.h
new file mode 100644
index 0000000..ee12481
--- /dev/null
+++ b/src/xnnpack/AlignedAllocator.h
@@ -0,0 +1,104 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <cstddef>
+#include <limits>
+#include <type_traits>
+#include <utility>
+
+#include <stdlib.h>
+
+template <typename T, size_t Alignment>
+class AlignedAllocator;
+
+template <size_t Alignment>
+class AlignedAllocator<void, Alignment> {
+ public:
+ typedef void* pointer;
+ typedef const void* const_pointer;
+ typedef void value_type;
+
+ template <class U>
+ struct rebind {
+ typedef AlignedAllocator<U, Alignment> other;
+ };
+};
+
+template <typename T, size_t Alignment>
+class AlignedAllocator {
+ public:
+ typedef T value_type;
+ typedef T* pointer;
+ typedef const T* const_pointer;
+ typedef T& reference;
+ typedef const T& const_reference;
+ typedef size_t size_type;
+ typedef ptrdiff_t difference_type;
+
+#if __cplusplus >= 201402L
+ typedef std::true_type propagate_on_container_move_assignment;
+#endif
+
+ template <class U>
+ struct rebind {
+ typedef AlignedAllocator<U, Alignment> other;
+ };
+
+ public:
+ inline AlignedAllocator() noexcept {}
+
+ template <class U>
+ inline AlignedAllocator(
+ const AlignedAllocator<U, Alignment>& other) noexcept {}
+
+ inline size_type max_size() const noexcept {
+ return (std::numeric_limits<size_type>::max() - size_type(Alignment)) /
+ sizeof(T);
+ }
+
+ inline pointer address(reference x) const noexcept {
+ return std::addressof(x);
+ }
+
+ inline const_pointer address(const_reference x) const noexcept {
+ return std::addressof(x);
+ }
+
+ inline pointer allocate(
+ size_type n,
+ typename AlignedAllocator<void, Alignment>::const_pointer hint = 0) {
+#if defined(__ANDROID__)
+ void* memory = memalign(Alignment, n * sizeof(T));
+ if (memory == 0) {
+#if !defined(__GNUC__) || defined(__EXCEPTIONS)
+ throw std::bad_alloc();
+#endif
+ }
+#else
+ void* memory = nullptr;
+ if (posix_memalign(&memory, Alignment, n * sizeof(T)) != 0) {
+#if !defined(__GNUC__) || defined(__EXCEPTIONS)
+ throw std::bad_alloc();
+#endif
+ }
+#endif
+ return static_cast<pointer>(memory);
+ }
+
+ inline void deallocate(pointer p, size_type n) noexcept {
+ free(static_cast<void*>(p));
+ }
+
+ template <class U, class... Args>
+ inline void construct(U* p, Args&&... args) {
+ ::new (static_cast<void*>(p)) U(std::forward<Args>(args)...);
+ }
+
+ template <class U>
+ inline void destroy(U* p) {
+ p->~U();
+ }
+};
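AlignedAllocator models the pre-C++17 Allocator requirements, so it drops into standard containers wherever over-aligned storage is needed (e.g. for SIMD loads in tests and benchmarks). A minimal usage sketch, assuming nothing beyond the header above; the 64-byte alignment is purely illustrative:

#include <vector>

// A vector whose backing storage is 64-byte aligned; the alignment value is
// purely illustrative.
std::vector<float, AlignedAllocator<float, 64>> make_aligned_buffer() {
  std::vector<float, AlignedAllocator<float, 64>> buffer(1024);
  // buffer.data() is suitably aligned for aligned SIMD loads and stores.
  return buffer;
}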
diff --git a/src/xnnpack/allocator.h b/src/xnnpack/allocator.h
new file mode 100644
index 0000000..303aa37
--- /dev/null
+++ b/src/xnnpack/allocator.h
@@ -0,0 +1,47 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#ifdef __ANDROID__
+ #include <malloc.h>
+#endif
+
+#include <cpuinfo.h>
+
+// Declared explicitly: some libc headers expose posix_memalign only under feature-test macros.
+extern int posix_memalign(void **memptr, size_t alignment, size_t size);
+
+
+#define XNN_ALLOCATION_ALIGNMENT 16
+
+
+inline static void* xnn_allocate_memory(size_t memory_size) {
+ void* memory_ptr = NULL;
+#if CPUINFO_ARCH_ASMJS || CPUINFO_ARCH_WASM
+ memory_ptr = malloc(memory_size);
+#elif defined(__ANDROID__)
+ memory_ptr = memalign(XNN_ALLOCATION_ALIGNMENT, memory_size);
+#else
+ if (posix_memalign(&memory_ptr, XNN_ALLOCATION_ALIGNMENT, memory_size) != 0) {
+ return NULL;
+ }
+#endif
+ return memory_ptr;
+}
+
+inline static void* xnn_allocate_zero_memory(size_t memory_size) {
+ void* memory_ptr = xnn_allocate_memory(memory_size);
+ if (memory_ptr != NULL) {
+ memset(memory_ptr, 0, memory_size);
+ }
+ return memory_ptr;
+}
+
+inline static void xnn_release_memory(void* memory_ptr) {
+ free(memory_ptr);
+}
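These helpers pair up in the obvious way; a minimal sketch of the intended call pattern (the 4096-byte size is illustrative):

static int example(void) {
  // Allocate a zero-initialized, 16-byte-aligned scratch buffer, then release it.
  void* workspace = xnn_allocate_zero_memory(4096);
  if (workspace == NULL) {
    return -1;  // allocation failure
  }
  /* ... use workspace ... */
  xnn_release_memory(workspace);
  return 0;
}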
diff --git a/src/xnnpack/argmaxpool.h b/src/xnnpack/argmaxpool.h
new file mode 100644
index 0000000..5b9776d
--- /dev/null
+++ b/src/xnnpack/argmaxpool.h
@@ -0,0 +1,60 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <xnnpack/params.h>
+#include <xnnpack/common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#define DECLARE_F32_ARGMAXPOOL_UNIPASS_UKERNEL_FUNCTION(fn_name) \
+ XNN_INTERNAL void fn_name( \
+ size_t n, \
+ size_t ks, \
+ size_t kc, \
+ const float** x, \
+ float* y, \
+ uint32_t* i, \
+ size_t x_increment, \
+ size_t y_increment, \
+ const union xnn_f32_output_params* params);
+
+DECLARE_F32_ARGMAXPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_argmaxpool_ukernel_up4__psimd)
+DECLARE_F32_ARGMAXPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_argmaxpool_ukernel_up4__scalar)
+DECLARE_F32_ARGMAXPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_argmaxpool_ukernel_up4__sse2)
+DECLARE_F32_ARGMAXPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_argmaxpool_ukernel_up9__psimd)
+DECLARE_F32_ARGMAXPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_argmaxpool_ukernel_up9__scalar)
+DECLARE_F32_ARGMAXPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_argmaxpool_ukernel_up9__sse2)
+
+
+#define DECLARE_F32_ARGMAXPOOL_MULTIPASS_UKERNEL_FUNCTION(fn_name) \
+ XNN_INTERNAL void fn_name( \
+ size_t n, \
+ size_t ks, \
+ size_t kc, \
+ const float** x, \
+ float* ab, \
+ uint32_t* ib, \
+ float* y, \
+ uint32_t* i, \
+ size_t x_increment, \
+ size_t y_increment, \
+ const union xnn_f32_output_params* params);
+
+DECLARE_F32_ARGMAXPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd)
+DECLARE_F32_ARGMAXPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_argmaxpool_ukernel_mp9p8q__scalar)
+DECLARE_F32_ARGMAXPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2)
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
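Reading the signatures: for each of n output positions the kernel scans ks input pointers across kc channels, writing the running maximum to y and its window position to i; the multipass variants additionally spill partial maxima and indices to the ab/ib accumulation buffers between passes. A scalar sketch of one unipass output position, under that assumed interpretation of the parameters:

// Hedged reference for a single output position; parameter semantics are
// inferred from the signature, not from the kernel sources.
static void f32_argmaxpool_position_reference(size_t ks, size_t kc,
                                              const float** x, float* y,
                                              uint32_t* i) {
  for (size_t c = 0; c < kc; c++) {
    float max_value = x[0][c];   // ks is assumed to be at least 1
    uint32_t max_index = 0;
    for (size_t k = 1; k < ks; k++) {
      if (x[k][c] > max_value) {
        max_value = x[k][c];
        max_index = (uint32_t) k;
      }
    }
    y[c] = max_value;  // output clamping via params omitted
    i[c] = max_index;
  }
  // The real kernels then advance x and y by x_increment/y_increment bytes.
}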
diff --git a/src/xnnpack/assembly.h b/src/xnnpack/assembly.h
new file mode 100644
index 0000000..4ed7270
--- /dev/null
+++ b/src/xnnpack/assembly.h
@@ -0,0 +1,32 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#ifdef __ELF__
+ .macro BEGIN_FUNCTION name
+ .text
+ .p2align 4
+ .global \name
+ .type \name, %function
+ \name:
+ .endm
+
+ .macro END_FUNCTION name
+ .size \name, .-\name
+ .endm
+#elif defined(__MACH__)
+ .macro BEGIN_FUNCTION name
+ .text
+ .p2align 4
+ .global _\name
+ .private_extern _\name
+ _\name:
+ .endm
+
+ .macro END_FUNCTION name
+ .endm
+#endif
diff --git a/src/xnnpack/avgpool.h b/src/xnnpack/avgpool.h
new file mode 100644
index 0000000..5fd51b9
--- /dev/null
+++ b/src/xnnpack/avgpool.h
@@ -0,0 +1,96 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <xnnpack/params.h>
+#include <xnnpack/common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#define DECLARE_F32_AVGPOOL_MULTIPASS_UKERNEL_FUNCTION(fn_name) \
+ XNN_INTERNAL void fn_name( \
+ size_t n, \
+ size_t ks, \
+ size_t kc, \
+ const float** x, \
+ const float* zero, \
+ float* buffer, \
+ float* y, \
+ size_t x_increment, \
+ size_t y_increment, \
+ const union xnn_f32_avgpool_params* params);
+
+DECLARE_F32_AVGPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_avgpool_ukernel_mp9p8q__neon)
+DECLARE_F32_AVGPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_avgpool_ukernel_mp9p8q__psimd)
+DECLARE_F32_AVGPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_avgpool_ukernel_mp9p8q__scalar)
+DECLARE_F32_AVGPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_avgpool_ukernel_mp9p8q__sse)
+
+
+#define DECLARE_F32_AVGPOOL_UNIPASS_UKERNEL_FUNCTION(fn_name) \
+ XNN_INTERNAL void fn_name( \
+ size_t n, \
+ size_t ks, \
+ size_t kc, \
+ const float** x, \
+ const float* zero, \
+ float* y, \
+ size_t x_increment, \
+ size_t y_increment, \
+ const union xnn_f32_avgpool_params* params);
+
+DECLARE_F32_AVGPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_avgpool_ukernel_up9__neon)
+DECLARE_F32_AVGPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_avgpool_ukernel_up9__psimd)
+DECLARE_F32_AVGPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_avgpool_ukernel_up9__scalar)
+DECLARE_F32_AVGPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_avgpool_ukernel_up9__sse)
+
+
+#define DECLARE_Q8_AVGPOOL_MULTIPASS_UKERNEL_FUNCTION(fn_name) \
+ XNN_INTERNAL void fn_name( \
+ size_t n, \
+ size_t ks, \
+ size_t kc, \
+ const uint8_t** x, \
+ const uint8_t* zero, \
+ int32_t* buffer, \
+ uint8_t* y, \
+ size_t x_increment, \
+ size_t y_increment, \
+ const union xnn_q8_avgpool_params* params);
+
+DECLARE_Q8_AVGPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_q8_avgpool_ukernel_mp9p8q__neon)
+DECLARE_Q8_AVGPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_q8_avgpool_ukernel_mp9p8q__scalar)
+DECLARE_Q8_AVGPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_q8_avgpool_ukernel_mp9p8q__sse2)
+
+
+#define DECLARE_Q8_AVGPOOL_UNIPASS_UKERNEL_FUNCTION(fn_name) \
+ XNN_INTERNAL void fn_name( \
+ size_t n, \
+ size_t ks, \
+ size_t kc, \
+ const uint8_t** x, \
+ const uint8_t* zero, \
+ uint8_t* y, \
+ size_t x_increment, \
+ size_t y_increment, \
+ const union xnn_q8_avgpool_params* params);
+
+DECLARE_Q8_AVGPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_q8_avgpool_ukernel_up9__neon)
+DECLARE_Q8_AVGPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_q8_avgpool_ukernel_up9__scalar)
+DECLARE_Q8_AVGPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_q8_avgpool_ukernel_up9__sse2)
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
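The names appear to encode the pass structure: up9 handles pooling windows of up to 9 elements in one pass, while mp9p8q consumes 9 elements on the first pass and 8 per subsequent pass, carrying partial sums in the buffer argument. A scalar sketch of the per-channel work at one unipass output position; the multiplier (e.g. 1/pooling_size) and the output clamps would come from params in the real kernels:

static void f32_avgpool_position_reference(size_t ks, size_t kc,
                                           const float** x, float* y,
                                           float multiplier) {
  // Window taps that fall into padding point at the shared `zero` vector,
  // so the inner loop needs no bounds checks.
  for (size_t c = 0; c < kc; c++) {
    float sum = 0.0f;
    for (size_t k = 0; k < ks; k++) {
      sum += x[k][c];
    }
    y[c] = sum * multiplier;  // clamping via params omitted
  }
}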
diff --git a/src/xnnpack/clamp.h b/src/xnnpack/clamp.h
new file mode 100644
index 0000000..db19d28
--- /dev/null
+++ b/src/xnnpack/clamp.h
@@ -0,0 +1,49 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <xnnpack/params.h>
+#include <xnnpack/common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#define DECLARE_F32_CLAMP_UKERNEL_FUNCTION(fn_name) \
+ XNN_INTERNAL void fn_name( \
+ size_t n, \
+ const float* x, \
+ float* y, \
+ const union xnn_f32_output_params* params);
+
+DECLARE_F32_CLAMP_UKERNEL_FUNCTION(xnn_f32_clamp_ukernel__psimd)
+DECLARE_F32_CLAMP_UKERNEL_FUNCTION(xnn_f32_clamp_ukernel__neon)
+DECLARE_F32_CLAMP_UKERNEL_FUNCTION(xnn_f32_clamp_ukernel__sse)
+DECLARE_F32_CLAMP_UKERNEL_FUNCTION(xnn_f32_clamp_ukernel__scalar)
+
+
+#define DECLARE_U8_CLAMP_UKERNEL_FUNCTION(fn_name) \
+ XNN_INTERNAL void fn_name( \
+ size_t n, \
+ const uint8_t* x, \
+ uint8_t* y, \
+ const union xnn_u8_output_params* params);
+
+DECLARE_U8_CLAMP_UKERNEL_FUNCTION(xnn_u8_clamp_ukernel__neon)
+DECLARE_U8_CLAMP_UKERNEL_FUNCTION(xnn_u8_clamp_ukernel__sse2)
+DECLARE_U8_CLAMP_UKERNEL_FUNCTION(xnn_u8_clamp_ukernel__scalar)
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
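A clamp kernel is an element-wise min/max. A scalar model of the contract; whether n counts elements or bytes, and the exact params layout, are assumptions here:

static void f32_clamp_reference(size_t n, const float* x, float* y,
                                float output_min, float output_max) {
  for (size_t j = 0; j < n; j++) {  // n assumed to count elements here
    float v = x[j];
    v = v < output_min ? output_min : v;
    v = v > output_max ? output_max : v;
    y[j] = v;
  }
}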
diff --git a/src/xnnpack/common.h b/src/xnnpack/common.h
new file mode 100644
index 0000000..0fc7011
--- /dev/null
+++ b/src/xnnpack/common.h
@@ -0,0 +1,67 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+
+#if defined(__GNUC__)
+ #if defined(__clang__) || (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5))
+ #define XNN_UNREACHABLE do { __builtin_unreachable(); } while (0)
+ #else
+ #define XNN_UNREACHABLE do { __builtin_trap(); } while (0)
+ #endif
+#elif defined(_MSC_VER)
+ #define XNN_UNREACHABLE __assume(0)
+#else
+ #define XNN_UNREACHABLE do { } while (0)
+#endif
+
+#define XNN_ALIGN(alignment) __attribute__((__aligned__(alignment)))
+
+#define XNN_COUNT_OF(array) (sizeof(array) / sizeof(0[array]))
+
+#if defined(__GNUC__)
+ #define XNN_LIKELY(condition) (__builtin_expect(!!(condition), 1))
+ #define XNN_UNLIKELY(condition) (__builtin_expect(!!(condition), 0))
+#else
+ #define XNN_LIKELY(condition) (!!(condition))
+ #define XNN_UNLIKELY(condition) (!!(condition))
+#endif
+
+// TODO: use __builtin_expect_with_probability on GCC 9+
+#if defined(__clang__) && (__has_builtin(__builtin_unpredictable))
+ #define XNN_UNPREDICTABLE(condition) (__builtin_unpredictable(!!(condition)))
+#else
+ #define XNN_UNPREDICTABLE(condition) (!!(condition))
+#endif
+
+#if defined(__GNUC__)
+ #define XNN_INLINE inline __attribute__((__always_inline__))
+#else
+ #define XNN_INLINE inline
+#endif
+
+#ifndef XNN_INTERNAL
+ #if defined(__ELF__)
+ #define XNN_INTERNAL __attribute__((__visibility__("internal")))
+ #elif defined(__MACH__)
+ #define XNN_INTERNAL __attribute__((__visibility__("hidden")))
+ #else
+ #define XNN_INTERNAL
+ #endif
+#endif
+
+#ifndef XNN_PRIVATE
+ #if defined(__ELF__)
+ #define XNN_PRIVATE __attribute__((__visibility__("hidden")))
+ #elif defined(__MACH__)
+ #define XNN_PRIVATE __attribute__((__visibility__("hidden")))
+ #else
+ #define XNN_PRIVATE
+ #endif
+#endif
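Typical uses of these macros, sketched for illustration:

static bool example(int variant, const void* pointer) {
  switch (variant) {
    case 0: /* ... */ break;
    case 1: /* ... */ break;
    default: XNN_UNREACHABLE;  // documents (and lets the compiler exploit) impossibility
  }
  if (XNN_UNLIKELY(pointer == NULL)) {  // hint: the failure path is cold
    return false;
  }
  static const float table[4] XNN_ALIGN(16) = { 0.0f, 1.0f, 2.0f, 3.0f };
  return XNN_COUNT_OF(table) == 4;      // element count, not byte count
}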
diff --git a/src/xnnpack/compute.h b/src/xnnpack/compute.h
new file mode 100644
index 0000000..fc8693a
--- /dev/null
+++ b/src/xnnpack/compute.h
@@ -0,0 +1,709 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <xnnpack.h>
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/params.h>
+
+
+enum xnn_parallelization_type {
+ xnn_parallelization_type_invalid = 0,
+ xnn_parallelization_type_1d,
+ xnn_parallelization_type_1d_tile_1d,
+ xnn_parallelization_type_2d,
+ xnn_parallelization_type_2d_tile_1d,
+ xnn_parallelization_type_2d_tile_2d,
+ xnn_parallelization_type_3d_tile_2d,
+ xnn_parallelization_type_4d_tile_2d,
+ xnn_parallelization_type_5d_tile_2d,
+ xnn_parallelization_type_6d_tile_2d,
+};
+
+struct compute_parameters {
+ enum xnn_parallelization_type type;
+ union {
+ pthreadpool_task_1d_t task_1d;
+ pthreadpool_task_1d_tile_1d_t task_1d_tile_1d;
+ pthreadpool_task_2d_t task_2d;
+ pthreadpool_task_2d_tile_1d_t task_2d_tile_1d;
+ pthreadpool_task_2d_tile_2d_t task_2d_tile_2d;
+ pthreadpool_task_3d_tile_2d_t task_3d_tile_2d;
+ pthreadpool_task_4d_tile_2d_t task_4d_tile_2d;
+ pthreadpool_task_5d_tile_2d_t task_5d_tile_2d;
+ pthreadpool_task_6d_tile_2d_t task_6d_tile_2d;
+ };
+ size_t range[6];
+ size_t tile[2];
+};
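compute_parameters is presumably consumed by the operator-run path as a dispatch table: the parallelization type selects the matching pthreadpool entry point, with range and tile supplying its bounds. A hedged sketch of two of the cases:

static void run_compute(const struct compute_parameters* compute,
                        void* context, pthreadpool_t threadpool) {
  switch (compute->type) {
    case xnn_parallelization_type_1d:
      pthreadpool_parallelize_1d(threadpool, compute->task_1d, context,
                                 compute->range[0], 0 /* flags */);
      break;
    case xnn_parallelization_type_2d_tile_2d:
      pthreadpool_parallelize_2d_tile_2d(threadpool, compute->task_2d_tile_2d,
                                         context,
                                         compute->range[0], compute->range[1],
                                         compute->tile[0], compute->tile[1],
                                         0 /* flags */);
      break;
    /* ... the remaining parallelization types follow the same pattern ... */
    default:
      XNN_UNREACHABLE;
  }
}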
+
+struct gemm_context {
+ size_t k_scaled;
+ const void* a;
+ size_t a_stride;
+ const void* packed_w;
+ size_t w_stride;
+ size_t wg_stride;
+ void* c;
+ size_t cm_stride;
+ size_t cn_stride;
+ size_t cg_stride;
+ uint32_t log2_csize;
+ xnn_gemm_ukernel_function ukernel;
+ union {
+ union xnn_q8_gemm_params q8;
+ union xnn_f32_output_params f32;
+ } params;
+};
+
+#ifndef __cplusplus
+ XNN_PRIVATE void xnn_compute_ggemm(
+ const struct gemm_context context[restrict static 1],
+ size_t group_index,
+ size_t mr_block_start,
+ size_t nr_block_start,
+ size_t mr_block_size,
+ size_t nr_block_size);
+
+ XNN_PRIVATE void xnn_compute_gemm(
+ const struct gemm_context context[restrict static 1],
+ size_t mr_block_start,
+ size_t nr_block_start,
+ size_t mr_block_size,
+ size_t nr_block_size);
+#endif
+
+// Context for Sparse Matrix-Dense Matrix Multiplication.
+// C [MxN] := A [MxK] * B [KxN] + bias [N]
+// A and C are dense matrices with row-major storage, B is a sparse matrix.
+struct spmm_context {
+ // N dimension of the B and C matrices.
+ // Corresponds to number of output channels in 1x1 convolution.
+ size_t n;
+ // Input matrix A.
+ const void* a;
+ // Packed bias elements and non-zero filter elements.
+ const void* packed_weights;
+ // Input pointer increments, in bytes, after each processed non-zero weight.
+ const int32_t* input_increments;
+ // Number of non-zero filter elements per each N (output channel) dimension.
+ const uint32_t* output_channel_nonzeros;
+ // Output matrix C.
+ void* c;
+ // Stride, in bytes, between matrices A corresponding to different images in batched 1x1 convolution.
+ size_t batched_a_stride;
+ // Stride, in bytes, between matrices C corresponding to different images in batched 1x1 convolution.
+ size_t batched_c_stride;
+ // Micro-kernel function pointer.
+ xnn_spmm_ukernel_function ukernel;
+ // Output activation parameters.
+ union {
+ union xnn_f32_output_params f32;
+ } params;
+};
+
+#ifndef __cplusplus
+ XNN_PRIVATE void xnn_compute_spmm(
+ const struct spmm_context context[restrict static 1],
+ size_t batch_index,
+ size_t mr_block_start,
+ size_t mr_block_size);
+#endif
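The field comments above imply the following traversal for the f32 case. This sketch processes a single input pixel and assumes the packing layout (per output channel: one bias, then that channel's non-zero weights); the real kernels vectorize across the M dimension:

static void f32_spmm_pixel_reference(const struct spmm_context* context) {
  const float* w = (const float*) context->packed_weights;
  const int32_t* increments = context->input_increments;
  const uint32_t* nnz_per_channel = context->output_channel_nonzeros;
  const float* a = (const float*) context->a;  // one input pixel's features
  float* c = (float*) context->c;              // dense output layout assumed
  for (size_t channel = 0; channel < context->n; channel++) {
    float acc = *w++;                          // bias
    for (uint32_t nnz = nnz_per_channel[channel]; nnz != 0; nnz--) {
      acc += (*a) * (*w++);                    // non-zero weight
      a = (const float*) ((const char*) a + *increments++);  // hop to next input element
    }
    *c++ = acc;                                // activation clamping omitted
  }
}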
+
+struct igemm_context {
+ size_t ks;
+ size_t ks_scaled;
+ size_t kc;
+ size_t w_stride;
+ const void** indirect_a;
+ size_t a_offset;
+ void* zero;
+ const void* packed_w;
+ void* c;
+ size_t cm_stride;
+ size_t cn_stride;
+ size_t ga_stride;
+ size_t gw_stride;
+ size_t gc_stride;
+ size_t ba_stride;
+ size_t bc_stride;
+ uint32_t log2_csize;
+ xnn_igemm_ukernel_function ukernel;
+ union {
+ union xnn_q8_gemm_params q8;
+ union xnn_f32_output_params f32;
+ } params;
+};
+
+#ifndef __cplusplus
+ XNN_PRIVATE void xnn_compute_gigemm(
+ const struct igemm_context context[restrict static 1],
+ size_t batch_index,
+ size_t group_index,
+ size_t mr_block_start,
+ size_t nr_block_start,
+ size_t mr_block_size,
+ size_t nr_block_size);
+
+ XNN_PRIVATE void xnn_compute_igemm(
+ const struct igemm_context context[restrict static 1],
+ size_t batch_index,
+ size_t mr_block_start,
+ size_t nr_block_start,
+ size_t mr_block_size,
+ size_t nr_block_size);
+#endif
+
+struct subconv_context {
+ const struct subconvolution_params* subconvolution_params;
+ size_t kc;
+ size_t a_offset;
+ void* zero;
+ size_t cx_stride;
+ size_t cy_stride;
+ size_t cn_stride;
+ size_t ga_stride;
+ size_t gw_stride;
+ size_t gc_stride;
+ size_t ba_stride;
+ size_t bc_stride;
+ uint32_t log2_csize;
+ xnn_igemm_ukernel_function ukernel;
+ union {
+ union xnn_q8_gemm_params q8;
+ union xnn_f32_output_params f32;
+ } params;
+};
+
+#ifndef __cplusplus
+ XNN_PRIVATE void xnn_compute_gsubconv2d(
+ const struct subconv_context context[restrict static 1],
+ size_t batch_index,
+ size_t group_index,
+ size_t subkernel_index,
+ size_t slice_y,
+ size_t slice_x_start,
+ size_t nr_block_start,
+ size_t slice_x_max,
+ size_t nr_block_size);
+
+ XNN_PRIVATE void xnn_compute_subconv2d(
+ const struct subconv_context context[restrict static 1],
+ size_t batch_index,
+ size_t subkernel_index,
+ size_t slice_y,
+ size_t slice_x_start,
+ size_t nr_block_start,
+ size_t slice_x_max,
+ size_t nr_block_size);
+#endif
+
+struct dconv2d_context {
+ size_t input_height;
+ size_t input_width;
+ const void* input;
+ size_t input_batch_stride;
+ const void* zero;
+ const void* packed_weights;
+ void* output;
+ size_t output_batch_stride;
+ size_t input_padding_top;
+ size_t output_channels;
+ size_t output_height_stride;
+ size_t output_channel_stride;
+ union {
+ xnn_conv_hwc2spchw_ukernel_function hwc2spchw_ukernel;
+ };
+ union {
+ union xnn_f32_output_params f32;
+ } params;
+};
+
+#ifndef __cplusplus
+ XNN_PRIVATE void xnn_compute_dconv2d_hwc2spchw(
+ const struct dconv2d_context context[restrict static 1],
+ size_t batch_index,
+ size_t output_y_start,
+ size_t output_y_slice);
+#endif
+
+struct dwconv_context {
+ size_t groups;
+ const void** indirection_buffer;
+ size_t indirection_buffer_row_stride;
+ size_t indirection_buffer_col_stride;
+ const void* packed_weights;
+ void* output;
+ size_t output_width;
+ size_t output_row_stride;
+ size_t output_col_increment;
+ union {
+ union xnn_q8_gemm_params q8;
+ union xnn_f32_output_params f32;
+ } params;
+ union {
+ xnn_dwconv_up_ukernel_function unipass_ukernel;
+ };
+};
+
+#ifndef __cplusplus
+ XNN_PRIVATE void xnn_compute_dwconv_unipass(
+ const struct dwconv_context context[restrict static 1],
+ size_t output_y);
+#endif
+
+struct dwconv2d_context {
+ size_t output_height;
+ size_t input_width;
+ const void* input;
+ size_t input_channel_stride;
+ size_t input_batch_stride;
+ const void* packed_weights;
+ size_t weights_channel_stride;
+ void* output;
+ size_t output_channel_stride;
+ size_t output_batch_stride;
+ size_t input_tuple_stride;
+ size_t output_tuple_stride;
+ size_t input_pixel_stride;
+ size_t output_pixel_stride;
+ union {
+ union xnn_f32_spchw_params f32;
+ } params;
+ union {
+ xnn_dwconv_spchw_ukernel_function spchw_ukernel;
+ };
+};
+
+#ifndef __cplusplus
+ XNN_PRIVATE void xnn_compute_dwconv2d_spchw(
+ const struct dwconv2d_context context[restrict static 1],
+ size_t batch_index,
+ size_t channel);
+#endif
+
+struct max_pooling_context {
+ const void** indirect_input;
+ size_t indirect_input_batch_stride;
+ size_t indirect_input_height_stride;
+ void* output;
+ size_t output_batch_stride;
+ size_t output_height_stride;
+ size_t output_width;
+ size_t pooling_size;
+ size_t channels;
+ size_t input_increment;
+ size_t output_increment;
+ union {
+ union xnn_u8_output_params u8;
+ union xnn_f32_output_params f32;
+ } params;
+ xnn_maxpool_ukernel_function ukernel;
+};
+
+#ifndef __cplusplus
+ XNN_PRIVATE void xnn_compute_max_pooling(
+ const struct max_pooling_context context[restrict static 1],
+ size_t batch_index,
+ size_t output_y);
+#endif
+
+struct unpooling_context {
+ const void* input;
+ size_t input_height_stride;
+ size_t input_width_stride;
+ const uint32_t* index;
+ size_t index_height_stride;
+ size_t index_width_stride;
+ void** indirect_output;
+ size_t indirect_output_height_stride;
+ size_t indirect_output_width_stride;
+ size_t pooling_size;
+ size_t channels;
+ uint32_t fill_value;
+ xnn_unpool_ukernel_function ukernel;
+};
+
+#ifndef __cplusplus
+ XNN_PRIVATE void xnn_compute_unpooling(
+ const struct unpooling_context context[restrict static 1],
+ size_t input_y,
+ size_t input_x);
+#endif
+
+struct argmax_pooling_context {
+ const void** indirect_input;
+ size_t indirect_input_batch_stride;
+ size_t indirect_input_height_stride;
+ void* output;
+ size_t output_batch_stride;
+ size_t output_height_stride;
+ size_t output_width;
+ uint32_t* index;
+ size_t index_batch_stride;
+ size_t index_height_stride;
+ size_t pooling_size;
+ size_t channels;
+ size_t input_increment;
+ size_t output_increment;
+ union {
+ union xnn_f32_output_params f32;
+ } params;
+ union {
+ xnn_argmaxpool_up_ukernel_function unipass_ukernel;
+ xnn_argmaxpool_mp_ukernel_function multipass_ukernel;
+ };
+};
+
+#ifndef __cplusplus
+ XNN_PRIVATE void xnn_compute_argmax_pooling_unipass(
+ const struct argmax_pooling_context context[restrict static 1],
+ size_t batch_index,
+ size_t output_y);
+
+ XNN_PRIVATE void xnn_compute_argmax_pooling_multipass(
+ const struct argmax_pooling_context context[restrict static 1],
+ size_t batch_index,
+ size_t output_y);
+#endif
+
+struct average_pooling_context {
+ const void** indirect_input;
+ size_t indirect_input_batch_stride;
+ size_t indirect_input_height_stride;
+ void* output;
+ size_t output_batch_stride;
+ size_t output_height_stride;
+ size_t output_width;
+ size_t pooling_size;
+ size_t channels;
+ const void* zero;
+ size_t input_increment;
+ size_t output_increment;
+ union {
+ union xnn_q8_avgpool_params q8;
+ union xnn_f32_avgpool_params f32;
+ } params;
+ union {
+ xnn_avgpool_up_ukernel_function unipass_ukernel;
+ xnn_avgpool_mp_ukernel_function multipass_ukernel;
+ };
+};
+
+#ifndef __cplusplus
+ XNN_PRIVATE void xnn_compute_average_pooling_unipass(
+ const struct average_pooling_context context[restrict static 1],
+ size_t batch_index,
+ size_t output_y);
+
+ XNN_PRIVATE void xnn_compute_average_pooling_multipass(
+ const struct average_pooling_context context[restrict static 1],
+ size_t batch_index,
+ size_t output_y);
+#endif
+
+struct pixelwise_average_pooling_context {
+ const void** indirect_input;
+ size_t indirect_input_batch_stride;
+ size_t indirect_input_height_stride;
+ const void* pixelwise_buffer;
+ size_t pixelwise_buffer_height_stride;
+ void* output;
+ size_t output_batch_stride;
+ size_t output_height_stride;
+ size_t output_width;
+ size_t pooling_size;
+ size_t channels;
+ const void* zero;
+ size_t input_increment;
+ size_t output_increment;
+ union {
+ union xnn_u8_output_params u8;
+ union xnn_f32_output_params f32;
+ } params;
+ union {
+ xnn_pavgpool_up_ukernel_function unipass_ukernel;
+ xnn_pavgpool_mp_ukernel_function multipass_ukernel;
+ };
+};
+
+#ifndef __cplusplus
+ XNN_PRIVATE void xnn_compute_pixelwise_average_pooling_unipass(
+ const struct pixelwise_average_pooling_context context[restrict static 1],
+ size_t batch_index,
+ size_t output_y);
+
+ XNN_PRIVATE void xnn_compute_pixelwise_average_pooling_multipass(
+ const struct pixelwise_average_pooling_context context[restrict static 1],
+ size_t batch_index,
+ size_t output_y);
+#endif
+
+struct global_average_pooling_context {
+ const void* input;
+ const void* zero;
+ size_t input_pixel_stride;
+ size_t input_batch_stride;
+ size_t input_elements;
+ size_t channels;
+ void* output;
+ size_t output_batch_stride;
+ union {
+ union xnn_q8_avgpool_params q8;
+ union xnn_f32_avgpool_params f32;
+ } params;
+ union {
+ xnn_gavgpool_up_ukernel_function unipass_ukernel;
+ xnn_gavgpool_mp_ukernel_function multipass_ukernel;
+ };
+};
+
+#ifndef __cplusplus
+ XNN_PRIVATE void xnn_compute_global_average_pooling_unipass(
+ const struct global_average_pooling_context context[restrict static 1],
+ size_t batch_index);
+
+ XNN_PRIVATE void xnn_compute_global_average_pooling_multipass(
+ const struct global_average_pooling_context context[restrict static 1],
+ size_t batch_index);
+#endif
+
+struct global_average_pooling_spnchw_context {
+ size_t input_elements;
+ const void* input;
+ size_t input_channel_stride;
+ size_t input_batch_stride;
+ void* output;
+ size_t output_channel_stride;
+ size_t output_batch_stride;
+ xnn_gavgpool_spchw_ukernel_function ukernel;
+ union {
+ union xnn_f32_gavgpool_params f32;
+ } params;
+};
+
+#ifndef __cplusplus
+ XNN_PRIVATE void xnn_compute_global_average_pooling_spnchw(
+ const struct global_average_pooling_spnchw_context context[restrict static 1],
+ size_t batch_index,
+ size_t channels_start,
+ size_t channels_slice);
+#endif
+
+struct add_strided_context {
+ size_t n;
+ const void* a;
+ size_t a_stride;
+ const void* b;
+ size_t b_stride;
+ void* y;
+ size_t y_stride;
+ union {
+ union xnn_q8_add_params q8;
+ union xnn_f32_output_params f32;
+ } params;
+ xnn_vadd_ukernel_function ukernel;
+};
+
+#ifndef __cplusplus
+ XNN_PRIVATE void xnn_compute_add_strided(
+ const struct add_strided_context context[restrict static 1],
+ size_t batch_index,
+ size_t batch_range);
+#endif
+
+struct add_contiguous_context {
+ const void* a;
+ const void* b;
+ void* y;
+ union {
+ union xnn_q8_add_params q8;
+ union xnn_f32_output_params f32;
+ } params;
+ xnn_vadd_ukernel_function ukernel;
+};
+
+#ifndef __cplusplus
+ XNN_PRIVATE void xnn_compute_add_contiguous(
+ const struct add_contiguous_context context[restrict static 1],
+ size_t offset,
+ size_t size);
+#endif
+
+struct channel_shuffle_context {
+ const void* x;
+ size_t x_stride;
+ void* y;
+ size_t y_stride;
+ size_t n;
+ size_t m;
+ union {
+ xnn_zipc_ukernel_function fixed_ukernel;
+ xnn_zipv_ukernel_function variable_ukernel;
+ };
+};
+
+#ifndef __cplusplus
+ XNN_PRIVATE void xnn_compute_channel_shuffle_fixed(
+ const struct channel_shuffle_context context[restrict static 1],
+ size_t index);
+
+ XNN_PRIVATE void xnn_compute_channel_shuffle_variable(
+ const struct channel_shuffle_context context[restrict static 1],
+ size_t index);
+#endif
+
+struct lut_strided_context {
+ size_t n;
+ const void* x;
+ size_t x_stride;
+ const void* t;
+ void* y;
+ size_t y_stride;
+ xnn_x8_lut_ukernel_function ukernel;
+};
+
+#ifndef __cplusplus
+ XNN_PRIVATE void xnn_compute_lut_strided(
+ const struct lut_strided_context context[restrict static 1],
+ size_t batch_index);
+#endif
+
+struct lut_contiguous_context {
+ const void* x;
+ size_t x_stride;
+ const void* t;
+ void* y;
+ size_t y_stride;
+ xnn_x8_lut_ukernel_function ukernel;
+};
+
+#ifndef __cplusplus
+ XNN_PRIVATE void xnn_compute_lut_contiguous(
+ const struct lut_contiguous_context context[restrict static 1],
+ size_t offset,
+ size_t size);
+#endif
+
+struct univector_strided_context {
+ size_t n;
+ const void* x;
+ size_t x_stride;
+ void* y;
+ size_t y_stride;
+ xnn_univector_ukernel_function ukernel;
+ union {
+ union xnn_u8_output_params u8_output;
+ union xnn_f32_output_params f32_output;
+ union xnn_f32_hswish_params f32_hswish;
+ } params;
+};
+
+#ifndef __cplusplus
+ XNN_PRIVATE void xnn_compute_univector_strided(
+ const struct univector_strided_context context[restrict static 1],
+ size_t batch_index,
+ size_t batch_range);
+#endif
+
+struct univector_contiguous_context {
+ const void* x;
+ size_t x_stride;
+ void* y;
+ size_t y_stride;
+ xnn_univector_ukernel_function ukernel;
+ union {
+ union xnn_u8_output_params u8_output;
+ union xnn_f32_output_params f32_output;
+ union xnn_f32_hswish_params f32_hswish;
+ } params;
+};
+
+#ifndef __cplusplus
+ XNN_PRIVATE void xnn_compute_univector_contiguous(
+ const struct univector_contiguous_context context[restrict static 1],
+ size_t offset,
+ size_t size);
+#endif
+
+struct prelu_context {
+ size_t n;
+ const void* x;
+ size_t x_stride;
+ const void* w;
+ void* y;
+ size_t y_stride;
+ xnn_prelu_ukernel_function ukernel;
+ union xnn_f32_output_params params;
+};
+
+#ifndef __cplusplus
+ XNN_PRIVATE void xnn_compute_prelu(
+ const struct prelu_context context[restrict static 1],
+ size_t batch_start,
+ size_t batch_range);
+#endif
+
+struct vmulcaddc_context {
+ size_t n;
+ const void* x;
+ size_t x_stride;
+ const void* w;
+ void* y;
+ size_t y_stride;
+ xnn_vmulcaddc_ukernel_function ukernel;
+ union {
+ union xnn_f32_output_params f32;
+ } params;
+};
+
+#ifndef __cplusplus
+ XNN_PRIVATE void xnn_compute_vmulcaddc(
+ const struct vmulcaddc_context context[restrict static 1],
+ size_t batch_start,
+ size_t batch_size);
+#endif
+
+struct channel_pad_context {
+ size_t n;
+ size_t l;
+ size_t r;
+ uint32_t c;
+ const void* x;
+ size_t x_stride;
+ void* y;
+ size_t y_stride;
+ xnn_pad_ukernel_function ukernel;
+};
+
+#ifndef __cplusplus
+ XNN_PRIVATE void xnn_compute_channel_pad(
+ const struct channel_pad_context context[restrict static 1],
+ size_t batch_start,
+ size_t batch_range);
+#endif
+
+struct u8_softargmax_context {
+ size_t n;
+ const uint8_t* x;
+ size_t x_stride;
+ const uint32_t* t;
+ uint8_t* y;
+ size_t y_stride;
+ xnn_u8_rmax_ukernel_function rmax_ukernel;
+ xnn_u8_lut32norm_ukernel_function lut_norm_ukernel;
+};
+
+#ifndef __cplusplus
+ XNN_PRIVATE void xnn_compute_u8_softargmax(
+ const struct u8_softargmax_context context[restrict static 1],
+ size_t batch_index);
+#endif
diff --git a/src/xnnpack/conv.h b/src/xnnpack/conv.h
new file mode 100644
index 0000000..c1bdec3
--- /dev/null
+++ b/src/xnnpack/conv.h
@@ -0,0 +1,63 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <xnnpack/params.h>
+#include <xnnpack/common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#define DECLARE_F32_CONV_HWC_UKERNEL_FUNCTION(fn_name) \
+ XNN_INTERNAL void fn_name( \
+ size_t input_height, \
+ size_t input_width, \
+ size_t output_y_start, \
+ size_t output_y_end, \
+ const float* input, \
+ const float* zero, \
+ const float* weights, \
+ float* output, \
+ size_t input_padding_top, \
+ size_t output_channels, \
+ size_t output_height_stride, \
+ size_t output_width_stride, \
+ const union xnn_f32_output_params* params);
+
+DECLARE_F32_CONV_HWC_UKERNEL_FUNCTION(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x2)
+DECLARE_F32_CONV_HWC_UKERNEL_FUNCTION(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neonfma_2x2)
+
+
+#define DECLARE_F32_CONV_HWC2SPCHW_UKERNEL_FUNCTION(fn_name) \
+ XNN_INTERNAL void fn_name( \
+ size_t input_height, \
+ size_t input_width, \
+ size_t output_y_start, \
+ size_t output_y_end, \
+ const float* input, \
+ const float* zero, \
+ const float* weights, \
+ float* output, \
+ size_t input_padding_top, \
+ size_t output_channels, \
+ size_t output_height_stride, \
+ size_t output_channel_stride, \
+ const union xnn_f32_output_params* params);
+
+DECLARE_F32_CONV_HWC2SPCHW_UKERNEL_FUNCTION(xnn_f32_conv_hwc2spchw_ukernel_3x3s2p1c3x4__neonfma_2x2)
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
diff --git a/src/xnnpack/dwconv.h b/src/xnnpack/dwconv.h
new file mode 100644
index 0000000..dc52a61
--- /dev/null
+++ b/src/xnnpack/dwconv.h
@@ -0,0 +1,88 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <xnnpack/params.h>
+#include <xnnpack/common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(fn_name) \
+ XNN_INTERNAL void fn_name( \
+ size_t channels, \
+ size_t output_width, \
+ const float** input, \
+ const float* weights, \
+ float* output, \
+ size_t input_stride, \
+ size_t output_increment, \
+ const union xnn_f32_output_params* params);
+
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up1x25__scalar)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up1x4__scalar)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up1x9__scalar)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up4x25__psimd)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up4x25__sse)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up4x4__psimd)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up4x4__sse)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up4x9__aarch64_neonfma)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up4x9__aarch64_neonfma_cortex_a55)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up4x9__neon)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up4x9__neonfma)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up4x9__psimd)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up4x9__sse)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up8x9__neonfma)
+
+
+#define DECLARE_Q8_DWCONV_UNIPASS_UKERNEL_FUNCTION(fn_name) \
+ XNN_INTERNAL void fn_name( \
+ size_t channels, \
+ size_t output_width, \
+ const uint8_t** input, \
+ const void* weights, \
+ uint8_t* output, \
+ size_t input_stride, \
+ size_t output_increment, \
+ const union xnn_q8_gemm_params* params);
+
+DECLARE_Q8_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_q8_dwconv_ukernel_up1x9__scalar)
+DECLARE_Q8_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_q8_dwconv_ukernel_up8x9__aarch32_neon)
+DECLARE_Q8_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_q8_dwconv_ukernel_up8x9__neon)
+DECLARE_Q8_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_q8_dwconv_ukernel_up8x9__sse2)
+
+
+#define DECLARE_F32_DWCONV_SPCHW_UKERNEL_FUNCTION(fn_name) \
+ XNN_INTERNAL void fn_name( \
+ size_t m, \
+ size_t n, \
+ const float* input, \
+ const float* weights, \
+ float* output, \
+ size_t input_tuple_stride, \
+ size_t output_tuple_stride, \
+ size_t input_height_stride, \
+ size_t output_height_stride, \
+ const union xnn_f32_spchw_params* params);
+
+DECLARE_F32_DWCONV_SPCHW_UKERNEL_FUNCTION(xnn_f32_dwconv_spchw_ukernel_3x3p1__neonfma)
+DECLARE_F32_DWCONV_SPCHW_UKERNEL_FUNCTION(xnn_f32_dwconv_spchw_ukernel_5x5p2__neonfma)
+DECLARE_F32_DWCONV_SPCHW_UKERNEL_FUNCTION(xnn_f32_dwconv_spchw_ukernel_3x3p1__sse)
+DECLARE_F32_DWCONV_SPCHW_UKERNEL_FUNCTION(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__neonfma)
+DECLARE_F32_DWCONV_SPCHW_UKERNEL_FUNCTION(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__neonfma)
+DECLARE_F32_DWCONV_SPCHW_UKERNEL_FUNCTION(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__sse)
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
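The up1x9 name reads as unipass, 1 channel per tile, 9 kernel taps (a 3x3 window). A scalar model of the output loop, assuming the packing layout (per channel: a bias followed by its 9 taps) and the pointer-advancement semantics suggested by the signature:

static void f32_dwconv_up1x9_reference(size_t channels, size_t output_width,
                                       const float** input, const float* weights,
                                       float* output, size_t input_stride,
                                       size_t output_increment) {
  do {
    const float* w = weights;              // packed weights are reused every pixel
    for (size_t c = 0; c < channels; c++) {
      float acc = *w++;                    // per-channel bias
      for (size_t k = 0; k < 9; k++) {     // 9 taps of the 3x3 window
        acc += input[k][c] * (*w++);
      }
      *output++ = acc;                     // min/max clamping via params omitted
    }
    // Padded taps in `input` point at a shared zero row, so no bounds checks.
    input = (const float**) ((uintptr_t) input + input_stride);
    output = (float*) ((uintptr_t) output + output_increment);
  } while (--output_width != 0);
}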
diff --git a/src/xnnpack/gavgpool.h b/src/xnnpack/gavgpool.h
new file mode 100644
index 0000000..b567196
--- /dev/null
+++ b/src/xnnpack/gavgpool.h
@@ -0,0 +1,99 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <xnnpack/params.h>
+#include <xnnpack/common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#define DECLARE_F32_GAVGPOOL_MULTIPASS_UKERNEL_FUNCTION(fn_name) \
+ XNN_INTERNAL void fn_name( \
+ size_t m, \
+ size_t n, \
+ const float* x, \
+ size_t x_stride, \
+ const float* zero, \
+ float* buffer, \
+ float* y, \
+ const union xnn_f32_avgpool_params* params);
+
+DECLARE_F32_GAVGPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_gavgpool_ukernel_mp7p7q__neon)
+DECLARE_F32_GAVGPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_gavgpool_ukernel_mp7p7q__psimd)
+DECLARE_F32_GAVGPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_gavgpool_ukernel_mp7p7q__scalar)
+DECLARE_F32_GAVGPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_gavgpool_ukernel_mp7p7q__sse)
+
+
+#define DECLARE_F32_GAVGPOOL_UNIPASS_UKERNEL_FUNCTION(fn_name) \
+ XNN_INTERNAL void fn_name( \
+ size_t m, \
+ size_t n, \
+ const float* x, \
+ size_t x_stride, \
+ const float* zero, \
+ float* y, \
+ const union xnn_f32_avgpool_params* params);
+
+DECLARE_F32_GAVGPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_gavgpool_ukernel_up7__neon)
+DECLARE_F32_GAVGPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_gavgpool_ukernel_up7__psimd)
+DECLARE_F32_GAVGPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_gavgpool_ukernel_up7__scalar)
+DECLARE_F32_GAVGPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_gavgpool_ukernel_up7__sse)
+
+#define DECLARE_Q8_GAVGPOOL_MULTIPASS_UKERNEL_FUNCTION(fn_name) \
+ XNN_INTERNAL void fn_name( \
+ size_t m, \
+ size_t n, \
+ const uint8_t* x, \
+ size_t x_stride, \
+ const uint8_t* zero, \
+ int32_t* buffer, \
+ uint8_t* y, \
+ const union xnn_q8_avgpool_params* params);
+
+DECLARE_Q8_GAVGPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_q8_gavgpool_ukernel_mp7p7q__neon)
+DECLARE_Q8_GAVGPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_q8_gavgpool_ukernel_mp7p7q__scalar)
+DECLARE_Q8_GAVGPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_q8_gavgpool_ukernel_mp7p7q__sse2)
+
+
+#define DECLARE_Q8_GAVGPOOL_UNIPASS_UKERNEL_FUNCTION(fn_name) \
+ XNN_INTERNAL void fn_name( \
+ size_t m, \
+ size_t n, \
+ const uint8_t* x, \
+ size_t x_stride, \
+ const uint8_t* zero, \
+ uint8_t* y, \
+ const union xnn_q8_avgpool_params* params);
+
+DECLARE_Q8_GAVGPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_q8_gavgpool_ukernel_up7__neon)
+DECLARE_Q8_GAVGPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_q8_gavgpool_ukernel_up7__scalar)
+DECLARE_Q8_GAVGPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_q8_gavgpool_ukernel_up7__sse2)
+
+
+#define DECLARE_F32_GAVGPOOL_SPCHW_UKERNEL_FUNCTION(fn_name) \
+ XNN_INTERNAL void fn_name( \
+ size_t elements, \
+ size_t channels, \
+ const float* input, \
+ float* output, \
+ const union xnn_f32_gavgpool_params* params);
+
+DECLARE_F32_GAVGPOOL_SPCHW_UKERNEL_FUNCTION(xnn_f32_gavgpool_spchw_ukernel__neon_x4)
+DECLARE_F32_GAVGPOOL_SPCHW_UKERNEL_FUNCTION(xnn_f32_gavgpool_spchw_ukernel__sse_x4)
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
diff --git a/src/xnnpack/gemm.h b/src/xnnpack/gemm.h
new file mode 100644
index 0000000..27f591d
--- /dev/null
+++ b/src/xnnpack/gemm.h
@@ -0,0 +1,189 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <xnnpack/params.h>
+#include <xnnpack/common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#define DECLARE_F32_GEMM_UKERNEL_FUNCTION(fn_name) \
+ XNN_INTERNAL void fn_name( \
+ size_t mr, \
+ size_t nr, \
+ size_t k, \
+ const float* a, \
+ size_t a_stride, \
+ const float* w, \
+ float* c, \
+ size_t cm_stride, \
+ size_t cn_stride, \
+ const union xnn_f32_output_params* params);
+
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x4__scalar)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__neon_ld64)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__neonfma_ld64)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__psimd_loadsplat)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__psimd_splat)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__sse_dup)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__sse_load1)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8s4__psimd)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8s4__sse)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_2x4__scalar)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x12__neon_ld64)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x12__neonfma_ld64)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x2__neon_ld64)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x2__neonfma_ld64)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x2__scalar)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x4__scalar)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__neon_ld128)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__neon_ld64)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__neonfma_ld128)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__neonfma_ld64)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__psimd_loadsplat)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__psimd_splat)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__sse_dup)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__sse_load1)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8s4__psimd)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8s4__sse)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_5x8__neon_ld64)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_5x8__neonfma_ld64)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__neon_ld64)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__neonfma_ld64)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__psimd_loadsplat)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__psimd_splat)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8s4__psimd)
+
+#define DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(fn_name) \
+ XNN_INTERNAL void fn_name( \
+ size_t mr, \
+ size_t nr, \
+ size_t k, \
+ const float* a, \
+ size_t a_stride, \
+ const float* w, \
+ float* c, \
+ size_t cm_stride, \
+ size_t cn_stride, \
+ const float* acc, \
+ const union xnn_f32_output_params* params);
+
+
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x4__scalar)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__neon_ld64)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__neonfma_ld64)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__psimd_loadsplat)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__psimd_splat)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__sse_dup)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__sse_load1)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8s4__psimd)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8s4__sse)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_2x4__scalar)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x12__neon_ld64)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x12__neonfma_ld64)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x4__scalar)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__neon_ld128)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__neon_ld64)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__neonfma_ld128)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__neonfma_ld64)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__psimd_loadsplat)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__psimd_splat)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__sse_dup)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__sse_load1)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8s4__psimd)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8s4__sse)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_5x8__neon_ld64)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_5x8__neonfma_ld64)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__neon_ld64)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__neonfma_ld64)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__psimd_loadsplat)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__psimd_splat)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8s4__psimd)
+
+
+#define DECLARE_F16_GEMM_UKERNEL_FUNCTION(fn_name) \
+ XNN_INTERNAL void fn_name( \
+ size_t mr, \
+ size_t nr, \
+ size_t k, \
+ const void* a, \
+ size_t a_stride, \
+ const void* w, \
+ void* c, \
+ size_t cm_stride, \
+ size_t cn_stride, \
+ const struct xnn_f16_output_params* params);
+
+DECLARE_F16_GEMM_UKERNEL_FUNCTION(xnn_f16_gemm_ukernel_4x8__neonfp16arith_ld64)
+DECLARE_F16_GEMM_UKERNEL_FUNCTION(xnn_f16_gemm_ukernel_6x8__neonfp16arith_ld64)
+DECLARE_F16_GEMM_UKERNEL_FUNCTION(xnn_f16_gemm_ukernel_8x8__neonfp16arith_ld64)
+
+
+#define DECLARE_Q8_GEMM_UKERNEL_FUNCTION(fn_name) \
+ XNN_INTERNAL void fn_name( \
+ size_t mr, \
+ size_t nr, \
+ size_t k, \
+ const uint8_t* a, \
+ size_t a_stride, \
+ const void* w, \
+ uint8_t* c, \
+ size_t cm_stride, \
+ size_t cn_stride, \
+ const union xnn_q8_gemm_params* params);
+
+DECLARE_Q8_GEMM_UKERNEL_FUNCTION(xnn_q8_gemm_ukernel_2x2__scalar)
+DECLARE_Q8_GEMM_UKERNEL_FUNCTION(xnn_q8_gemm_ukernel_2x4c8__neon)
+DECLARE_Q8_GEMM_UKERNEL_FUNCTION(xnn_q8_gemm_ukernel_2x4c8__sse2)
+DECLARE_Q8_GEMM_UKERNEL_FUNCTION(xnn_q8_gemm_ukernel_3x3c8__neon)
+DECLARE_Q8_GEMM_UKERNEL_FUNCTION(xnn_q8_gemm_ukernel_4x4c2__sse2)
+DECLARE_Q8_GEMM_UKERNEL_FUNCTION(xnn_q8_gemm_ukernel_4x8__aarch32_neon)
+DECLARE_Q8_GEMM_UKERNEL_FUNCTION(xnn_q8_gemm_ukernel_4x8__neon)
+DECLARE_Q8_GEMM_UKERNEL_FUNCTION(xnn_q8_gemm_ukernel_6x4__neon)
+DECLARE_Q8_GEMM_UKERNEL_FUNCTION(xnn_q8_gemm_ukernel_8x8__aarch64_neon)
+DECLARE_Q8_GEMM_UKERNEL_FUNCTION(xnn_q8_gemm_ukernel_8x8__neon)
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
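The names encode the output tile and the target: e.g. xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75 produces a 4x8 block of C per call using NEON FMA, scheduled for Cortex-A75, and the gemminc variants start from the acc buffer of initial accumulators instead of packed biases. A hedged scalar model of the contract; that k is in bytes and that the packed weights hold nr biases followed by nr weights per reduction step are assumptions:

static void f32_gemm_reference(size_t mr, size_t nr, size_t k,
                               const float* a, size_t a_stride,
                               const float* w, float* c,
                               size_t cm_stride, size_t cn_stride) {
  const size_t kc = k / sizeof(float);     // assuming k is given in bytes
  for (size_t m = 0; m < mr; m++) {
    const float* a_row = (const float*) ((uintptr_t) a + m * a_stride);
    float* c_row = (float*) ((uintptr_t) c + m * cm_stride);
    for (size_t col = 0; col < nr; col++) {
      float acc = w[col];                  // bias
      for (size_t kk = 0; kk < kc; kk++) {
        acc += a_row[kk] * w[nr + kk * nr + col];
      }
      c_row[col] = acc;                    // output clamping via params omitted
    }
  }
  (void) cn_stride;  // the real kernels use it in their tiled store path
}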
diff --git a/src/xnnpack/hswish.h b/src/xnnpack/hswish.h
new file mode 100644
index 0000000..8d0ab93
--- /dev/null
+++ b/src/xnnpack/hswish.h
@@ -0,0 +1,35 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <xnnpack/params.h>
+#include <xnnpack/common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#define DECLARE_F32_HSWISH_UKERNEL_FUNCTION(fn_name) \
+ XNN_INTERNAL void fn_name( \
+ size_t n, \
+ const float* x, \
+ float* y, \
+ const union xnn_f32_hswish_params* params);
+
+DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__psimd)
+DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__neon)
+DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__neonfma)
+DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__sse)
+DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__scalar)
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
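Hard-swish is y = x * max(0, min(6, x + 3)) / 6; the params presumably cache constants such as 1/6 in SIMD-friendly form. A scalar reference of the math (n is assumed to count elements here):

static void f32_hswish_reference(size_t n, const float* x, float* y) {
  for (size_t j = 0; j < n; j++) {
    const float v = x[j];
    float t = v + 3.0f;
    t = t < 0.0f ? 0.0f : t;
    t = t > 6.0f ? 6.0f : t;
    y[j] = v * t * (1.0f / 6.0f);
  }
}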
diff --git a/src/xnnpack/igemm.h b/src/xnnpack/igemm.h
new file mode 100644
index 0000000..4d30c6f
--- /dev/null
+++ b/src/xnnpack/igemm.h
@@ -0,0 +1,105 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <xnnpack/params.h>
+#include <xnnpack/common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#define DECLARE_F32_IGEMM_UKERNEL_FUNCTION(fn_name) \
+ XNN_INTERNAL void fn_name( \
+ size_t mr, \
+ size_t nr, \
+ size_t kc, \
+ size_t ks, \
+ const float** a, \
+ const float* w, \
+ float* c, \
+ size_t cm_stride, \
+ size_t cn_stride, \
+ size_t a_offset, \
+ const float* zero, \
+ const union xnn_f32_output_params* params);
+
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x4__scalar)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__neon_ld64)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__psimd_loadsplat)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__psimd_splat)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__sse_dup)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__sse_load1)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8s4__psimd)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8s4__sse)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_2x4__scalar)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x12__neon_ld64)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x12__neonfma_ld64)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x2__neon_ld64)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x2__neonfma_ld64)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x2__scalar)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x2c4__psimd)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x2c4__sse)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x4__neon_ld64)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x4__neonfma_ld64)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x4__scalar)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__neon_ld128)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__neon_ld64)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__neonfma_ld128)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__neonfma_ld64)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__psimd_loadsplat)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__psimd_splat)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__sse_dup)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__sse_load1)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8s4__psimd)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8s4__sse)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__neon_ld64)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__neonfma_ld64)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__psimd_loadsplat)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__psimd_splat)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8s4__psimd)
+
+
+#define DECLARE_Q8_IGEMM_UKERNEL_FUNCTION(fn_name) \
+ XNN_INTERNAL void fn_name( \
+ size_t mr, \
+ size_t nr, \
+ size_t kc, \
+ size_t ks, \
+ const uint8_t** a, \
+ const void* w, \
+ uint8_t* c, \
+ size_t cm_stride, \
+ size_t cn_stride, \
+ size_t a_offset, \
+ const uint8_t* zero, \
+ const union xnn_q8_gemm_params* params);
+
+DECLARE_Q8_IGEMM_UKERNEL_FUNCTION(xnn_q8_igemm_ukernel_2x2__scalar)
+DECLARE_Q8_IGEMM_UKERNEL_FUNCTION(xnn_q8_igemm_ukernel_4x4c2__sse2)
+DECLARE_Q8_IGEMM_UKERNEL_FUNCTION(xnn_q8_igemm_ukernel_4x8__neon)
+DECLARE_Q8_IGEMM_UKERNEL_FUNCTION(xnn_q8_igemm_ukernel_8x8__neon)
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
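Indirect GEMM differs from GEMM only in how the left-hand side is read: instead of a dense matrix with a row stride, each of the mr rows is assembled from ks pointers, with zero substituted for padded taps. A sketch of that pointer walk, assuming mr pointers per ks step:

static void igemm_lhs_walk(size_t mr, size_t ks, const float** a,
                           size_t a_offset, const float* zero) {
  for (size_t s = 0; s < ks; s++) {
    for (size_t m = 0; m < mr; m++) {
      const float* a_ptr = a[s * mr + m];
      if (a_ptr != zero) {
        // a_offset relocates pointers captured against a different base buffer.
        a_ptr = (const float*) ((uintptr_t) a_ptr + a_offset);
      }
      /* ... accumulate kc elements from a_ptr against the packed weights ... */
      (void) a_ptr;
    }
  }
}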
diff --git a/src/xnnpack/im2col.h b/src/xnnpack/im2col.h
new file mode 100644
index 0000000..07323e3
--- /dev/null
+++ b/src/xnnpack/im2col.h
@@ -0,0 +1,37 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <xnnpack.h>
+#include <xnnpack/common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+XNN_INTERNAL void xnn_im2col_conv2d(
+ size_t output_height,
+ size_t output_width,
+ size_t kernel_height,
+ size_t kernel_width,
+ size_t subsampling_height,
+ size_t subsampling_width,
+ size_t dilation_height,
+ size_t dilation_width,
+ size_t input_width,
+ size_t input_padding_top,
+ size_t input_padding_left,
+ size_t group_input_channels_in_bytes,
+ size_t input_pixel_stride_in_bytes,
+ const void* input,
+ void* output);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
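
Annotation: xnn_im2col_conv2d materializes convolution patches, so each output pixel (oy, ox) and kernel tap (ky, kx) reads the input at iy = oy * subsampling_height + ky * dilation_height - input_padding_top (and analogously for x), with out-of-range taps supplying zeros. A sketch of that coordinate mapping follows; the helper name and the explicit input_height bound are illustrative assumptions (the declaration above leaves the height bound implicit).

#include <stdbool.h>
#include <stddef.h>

/* Returns true and the flattened input pixel index when the tap lands on a
 * real pixel; returns false when it falls into the implicit zero padding. */
static bool im2col_source_pixel(
    size_t oy, size_t ox, size_t ky, size_t kx,
    size_t subsampling_height, size_t subsampling_width,
    size_t dilation_height, size_t dilation_width,
    size_t input_padding_top, size_t input_padding_left,
    size_t input_height, size_t input_width,
    size_t* index)
{
  const size_t iy = oy * subsampling_height + ky * dilation_height;
  const size_t ix = ox * subsampling_width + kx * dilation_width;
  if (iy < input_padding_top || ix < input_padding_left) {
    return false;
  }
  const size_t y = iy - input_padding_top;
  const size_t x = ix - input_padding_left;
  if (y >= input_height || x >= input_width) {
    return false;
  }
  *index = y * input_width + x;
  return true;
}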
diff --git a/src/xnnpack/indirection.h b/src/xnnpack/indirection.h
new file mode 100644
index 0000000..60be1f6
--- /dev/null
+++ b/src/xnnpack/indirection.h
@@ -0,0 +1,57 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <xnnpack.h>
+#include <xnnpack/common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+XNN_INTERNAL void xnn_indirection_init_conv2d(
+ xnn_operator_t op,
+ size_t output_tile_size,
+ uint32_t log2_element_size);
+
+XNN_INTERNAL void xnn_indirection_init_dwconv2d(
+ xnn_operator_t op,
+ size_t batch_start,
+ size_t step_height,
+ size_t step_width,
+ uint32_t log2_element_size);
+
+XNN_INTERNAL void xnn_indirection_init_deconv2d(
+ xnn_operator_t op,
+ size_t output_tile_size,
+ uint32_t log2_element_size);
+
+XNN_INTERNAL void xnn_indirection_init_subconv2d(
+ xnn_operator_t op,
+ size_t output_tile_size,
+ uint32_t log2_element_size);
+
+XNN_INTERNAL void xnn_indirection_init_maxpool2d(
+ xnn_operator_t op,
+ size_t batch_start,
+ size_t step_height,
+ size_t step_width,
+ uint32_t log2_element_size);
+
+XNN_INTERNAL void xnn_indirection_init_unpool2d(
+ xnn_operator_t op,
+ size_t batch_start,
+ uint32_t log2_element_size);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
diff --git a/src/xnnpack/isa-checks.h b/src/xnnpack/isa-checks.h
new file mode 100644
index 0000000..0bdf97c
--- /dev/null
+++ b/src/xnnpack/isa-checks.h
@@ -0,0 +1,79 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <cpuinfo.h>
+
+
+#if CPUINFO_ARCH_PNACL || CPUINFO_ARCH_WASMSIMD
+ #define TEST_REQUIRES_PSIMD
+#else
+ #define TEST_REQUIRES_PSIMD \
+ do { \
+ if (!cpuinfo_initialize() || !(cpuinfo_has_arm_neon() || cpuinfo_has_x86_sse2())) { \
+ GTEST_SKIP(); \
+ } \
+ } while (0)
+#endif
+
+#define TEST_REQUIRES_X86_SSE \
+ do { \
+ if (!cpuinfo_initialize() || !cpuinfo_has_x86_sse()) { \
+ GTEST_SKIP(); \
+ } \
+ } while (0)
+
+#define TEST_REQUIRES_X86_SSE2 \
+ do { \
+ if (!cpuinfo_initialize() || !cpuinfo_has_x86_sse2()) { \
+ GTEST_SKIP(); \
+ } \
+ } while (0)
+
+#define TEST_REQUIRES_X86_AVX \
+ do { \
+ if (!cpuinfo_initialize() || !cpuinfo_has_x86_avx()) { \
+ GTEST_SKIP(); \
+ } \
+ } while (0)
+
+#define TEST_REQUIRES_X86_AVX2 \
+ do { \
+ if (!cpuinfo_initialize() || !cpuinfo_has_x86_avx2()) { \
+ GTEST_SKIP(); \
+ } \
+ } while (0)
+
+#define TEST_REQUIRES_X86_AVX512F \
+ do { \
+ if (!cpuinfo_initialize() || !cpuinfo_has_x86_avx512f()) { \
+ GTEST_SKIP(); \
+ } \
+ } while (0)
+
+#define TEST_REQUIRES_ARM_NEON \
+ do { \
+ if (!cpuinfo_initialize() || !cpuinfo_has_arm_neon()) { \
+ GTEST_SKIP(); \
+ } \
+ } while (0)
+
+#define TEST_REQUIRES_ARM_NEON_FMA \
+ do { \
+ if (!cpuinfo_initialize() || !cpuinfo_has_arm_neon_fma()) { \
+ GTEST_SKIP(); \
+ } \
+ } while (0)
+
+#define TEST_REQUIRES_ARM_NEON_FP16_ARITH \
+ do { \
+ if (!cpuinfo_initialize() || !cpuinfo_has_arm_neon_fp16_arith()) { \
+ GTEST_SKIP(); \
+ } \
+ } while (0)
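
Annotation: each TEST_REQUIRES_* macro is intended to open a googletest test body; when cpuinfo reports the feature missing (or fails to initialize), GTEST_SKIP() ends the test as skipped rather than failed. A sketch with a hypothetical test name:

TEST(F32_GEMM_4X8__NEONFMA_LD64, k_eq_2) {
  TEST_REQUIRES_ARM_NEON_FMA;
  /* body runs only on hardware with NEON-FMA; reported as skipped elsewhere */
}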
diff --git a/src/xnnpack/log.h b/src/xnnpack/log.h
new file mode 100644
index 0000000..9eb5abf
--- /dev/null
+++ b/src/xnnpack/log.h
@@ -0,0 +1,23 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <inttypes.h>
+
+#include <clog.h>
+
+#ifndef XNN_LOG_LEVEL
+#define XNN_LOG_LEVEL CLOG_DEBUG
+#endif
+
+CLOG_DEFINE_LOG_DEBUG(xnn_log_debug, "XNNPACK", XNN_LOG_LEVEL);
+CLOG_DEFINE_LOG_INFO(xnn_log_info, "XNNPACK", XNN_LOG_LEVEL);
+CLOG_DEFINE_LOG_WARNING(xnn_log_warning, "XNNPACK", XNN_LOG_LEVEL);
+CLOG_DEFINE_LOG_ERROR(xnn_log_error, "XNNPACK", XNN_LOG_LEVEL);
+CLOG_DEFINE_LOG_FATAL(xnn_log_fatal, "XNNPACK", XNN_LOG_LEVEL);
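
Annotation: each CLOG_DEFINE_LOG_* invocation expands to a printf-style logging function tagged with the "XNNPACK" module name and gated on XNN_LOG_LEVEL, so calls below the configured severity compile away. One plausible call site (the message and size variable are illustrative):

xnn_log_error("failed to allocate %zu bytes for packed weights", packed_weights_size);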
diff --git a/src/xnnpack/lut.h b/src/xnnpack/lut.h
new file mode 100644
index 0000000..49b0ec4
--- /dev/null
+++ b/src/xnnpack/lut.h
@@ -0,0 +1,44 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <xnnpack/params.h>
+#include <xnnpack/common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#define DECLARE_X8_LUT_UKERNEL_FUNCTION(fn_name) \
+ XNN_INTERNAL void fn_name( \
+ size_t n, \
+ const uint8_t* x, \
+ const uint8_t* t, \
+ uint8_t* y);
+
+DECLARE_X8_LUT_UKERNEL_FUNCTION(xnn_x8_lut_ukernel__scalar)
+
+
+#define DECLARE_U8_LUT32NORM_UKERNEL_FUNCTION(fn_name) \
+ XNN_INTERNAL void fn_name( \
+ size_t n, \
+ const uint8_t* x, \
+ const uint32_t* t, \
+ uint8_t* y);
+
+DECLARE_U8_LUT32NORM_UKERNEL_FUNCTION(xnn_u8_lut32norm_ukernel__scalar)
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
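
Annotation: the x8 LUT contract is a straight byte-wise table lookup, mapping n input bytes through a 256-entry table. A scalar reference:

#include <stddef.h>
#include <stdint.h>

static void x8_lut_reference(size_t n, const uint8_t* x, const uint8_t* t, uint8_t* y) {
  while (n-- != 0) {
    *y++ = t[*x++];
  }
}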
diff --git a/src/xnnpack/math.h b/src/xnnpack/math.h
new file mode 100644
index 0000000..60e46dc
--- /dev/null
+++ b/src/xnnpack/math.h
@@ -0,0 +1,64 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stddef.h>
+#include <assert.h>
+
+inline static size_t min(size_t a, size_t b) {
+ return a < b ? a : b;
+}
+
+inline static size_t max(size_t a, size_t b) {
+ return a > b ? a : b;
+}
+
+inline static size_t doz(size_t a, size_t b) {
+ return a >= b ? a - b : 0;
+}
+
+inline static size_t divide_round_up(size_t n, size_t q) {
+ return n % q == 0 ? n / q : n / q + 1;
+}
+
+inline static size_t round_up(size_t n, size_t q) {
+ return divide_round_up(n, q) * q;
+}
+
+inline static size_t round_down_po2(size_t n, size_t q) {
+ assert(q != 0);
+ assert((q & (q - 1)) == 0);
+ return n & -q;
+}
+
+inline static size_t round_up_po2(size_t n, size_t q) {
+ return round_down_po2(n + q - 1, q);
+}
+
+inline static size_t subtract_modulo(size_t a, size_t b, size_t m) {
+ assert(a < m);
+ assert(b < m);
+ return a >= b ? a - b : a - b + m;
+}
+
+inline static float math_min_f32(float a, float b) {
+ #if defined(__wasm__)
+ return __builtin_wasm_min_f32(a, b);
+ #else
+ return a < b ? a : b;
+ #endif
+}
+
+inline static float math_max_f32(float a, float b) {
+ #if defined(__wasm__)
+ return __builtin_wasm_max_f32(a, b);
+ #else
+ return a > b ? a : b;
+ #endif
+}
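
Annotation: a few worked values for the integer helpers above; round_down_po2 relies on two's complement, where -q for a power-of-2 q is a mask with the low log2(q) bits clear.

#include <assert.h>
#include <xnnpack/math.h>

int main(void) {
  assert(divide_round_up(37, 8) == 5);   /* ceil(37 / 8) */
  assert(round_up(37, 8) == 40);
  assert(round_down_po2(37, 8) == 32);   /* 37 & -8 clears the low three bits */
  assert(round_up_po2(37, 8) == 40);
  assert(doz(3, 5) == 0);                /* saturating difference */
  assert(subtract_modulo(1, 3, 5) == 3); /* (1 - 3) mod 5 */
  return 0;
}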
diff --git a/src/xnnpack/maxpool.h b/src/xnnpack/maxpool.h
new file mode 100644
index 0000000..1c134d7
--- /dev/null
+++ b/src/xnnpack/maxpool.h
@@ -0,0 +1,56 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <xnnpack/params.h>
+#include <xnnpack/common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#define DECLARE_F32_MAXPOOL_UKERNEL_FUNCTION(fn_name) \
+ XNN_INTERNAL void fn_name( \
+ size_t n, \
+ size_t ks, \
+ size_t kc, \
+ const float** x, \
+ float* y, \
+ size_t x_increment, \
+ size_t y_increment, \
+ const union xnn_f32_output_params* params);
+
+DECLARE_F32_MAXPOOL_UKERNEL_FUNCTION(xnn_f32_maxpool_ukernel_9p8q__psimd)
+DECLARE_F32_MAXPOOL_UKERNEL_FUNCTION(xnn_f32_maxpool_ukernel_9p8q__scalar)
+DECLARE_F32_MAXPOOL_UKERNEL_FUNCTION(xnn_f32_maxpool_ukernel_9p8q__sse)
+
+
+#define DECLARE_U8_MAXPOOL_UKERNEL_FUNCTION(fn_name) \
+ XNN_INTERNAL void fn_name( \
+ size_t n, \
+ size_t ks, \
+ size_t kc, \
+ const uint8_t** x, \
+ uint8_t* y, \
+ size_t x_increment, \
+ size_t y_increment, \
+ const union xnn_u8_output_params* params);
+
+DECLARE_U8_MAXPOOL_UKERNEL_FUNCTION(xnn_u8_maxpool_ukernel_9p8q__neon)
+DECLARE_U8_MAXPOOL_UKERNEL_FUNCTION(xnn_u8_maxpool_ukernel_9p8q__sse2)
+DECLARE_U8_MAXPOOL_UKERNEL_FUNCTION(xnn_u8_maxpool_ukernel_9p8q__scalar)
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
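
Annotation: the 9p8q suffix appears to encode a 9-element primary pass followed by 8-element incremental passes over the pooling window, matching the mr/qr fields of struct xnn_ukernel_dwconv in operator.h later in this diff. Under that reading (an assumption), the pass count for a window of ks elements is:

#include <xnnpack/math.h>

/* Passes needed to cover a pooling window of ks elements, assuming a
 * 9-element first pass and 8-element follow-up passes. */
static size_t maxpool_9p8q_passes(size_t ks) {
  return ks <= 9 ? 1 : 1 + divide_round_up(ks - 9, 8);
}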
diff --git a/src/xnnpack/operator.h b/src/xnnpack/operator.h
new file mode 100644
index 0000000..a34d6fd
--- /dev/null
+++ b/src/xnnpack/operator.h
@@ -0,0 +1,275 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <pthreadpool.h>
+
+#include <xnnpack/requantization.h>
+#include <xnnpack/compute.h>
+
+
+enum xnn_ukernel_type {
+ xnn_ukernel_type_none = 0,
+ xnn_ukernel_type_add,
+ xnn_ukernel_type_argmax_pooling,
+ xnn_ukernel_type_average_pooling,
+ xnn_ukernel_type_channel_shuffle,
+ xnn_ukernel_type_clamp,
+ xnn_ukernel_type_igemm,
+ xnn_ukernel_type_dconv2d_hwc2spchw,
+ xnn_ukernel_type_dwconv,
+ xnn_ukernel_type_gemm,
+ xnn_ukernel_type_global_average_pooling,
+ xnn_ukernel_type_hswish,
+ xnn_ukernel_type_lut,
+ xnn_ukernel_type_max_pooling,
+ xnn_ukernel_type_pad,
+ xnn_ukernel_type_pixelwise_average_pooling,
+ xnn_ukernel_type_prelu,
+ xnn_ukernel_type_softargmax,
+ xnn_ukernel_type_spmm,
+ xnn_ukernel_type_subconv2d,
+ xnn_ukernel_type_unpooling,
+ xnn_ukernel_type_vmulcaddc,
+};
+
+enum xnn_operator_type {
+ xnn_operator_type_none = 0,
+ xnn_operator_type_add_f32,
+ xnn_operator_type_add_q8,
+ xnn_operator_type_argmax_pooling_f32,
+ xnn_operator_type_average_pooling_f32,
+ xnn_operator_type_average_pooling_q8,
+ xnn_operator_type_channel_pad_x32,
+ xnn_operator_type_channel_shuffle_x8,
+ xnn_operator_type_channel_shuffle_x32,
+ xnn_operator_type_clamp_f32,
+ xnn_operator_type_clamp_u8,
+ xnn_operator_type_convolution_f32,
+ xnn_operator_type_convolution_spnchw_f32,
+ xnn_operator_type_convolution_q8,
+ xnn_operator_type_deconvolution_f32,
+ xnn_operator_type_deconvolution_q8,
+ xnn_operator_type_fully_connected_f32,
+ xnn_operator_type_fully_connected_q8,
+ xnn_operator_type_global_average_pooling_f32,
+ xnn_operator_type_global_average_pooling_q8,
+ xnn_operator_type_global_average_pooling_spnchw_f32,
+ xnn_operator_type_hswish_f32,
+ xnn_operator_type_leaky_relu_q8,
+ xnn_operator_type_max_pooling_f32,
+ xnn_operator_type_max_pooling_u8,
+ xnn_operator_type_prelu_f32,
+ xnn_operator_type_sigmoid_q8,
+ xnn_operator_type_softargmax_q8,
+ xnn_operator_type_unpooling_x32,
+};
+
+struct xnn_ukernel_dconv2d {
+ union {
+ xnn_conv_hwc2spchw_ukernel_function hwc2spchw_function;
+ xnn_conv_hwc_ukernel_function hwc_function;
+ };
+ uint8_t output_height_tile;
+ uint8_t output_channel_tile;
+};
+
+struct xnn_ukernel_dwconv {
+ union {
+ xnn_dwconv_up_ukernel_function unipass_function;
+ xnn_dwconv_mp_ukernel_function multipass_function;
+ };
+ uint8_t mr;
+ uint8_t qr;
+};
+
+// Direct 2D Depthwise Convolution
+struct xnn_ukernel_dwconv2d {
+ union {
+ xnn_dwconv_spchw_ukernel_function spchw_function;
+ };
+ uint8_t input_width_tile;
+ uint8_t output_width_tile;
+};
+
+struct xnn_ukernel_gemm {
+ xnn_gemm_ukernel_function default_function;
+ xnn_gemm_ukernel_function mr1_function;
+ uint8_t mr;
+ uint8_t nr;
+ uint8_t kr;
+};
+
+struct xnn_ukernel_igemm {
+ xnn_igemm_ukernel_function default_function;
+ xnn_igemm_ukernel_function mr1_function;
+ uint8_t mr;
+ uint8_t nr;
+ uint8_t kr;
+};
+
+struct xnn_ukernel_spmm {
+ xnn_spmm_ukernel_function function;
+ uint8_t mr;
+};
+
+struct xnn_ukernel_vmulcaddc {
+ xnn_vmulcaddc_ukernel_function function;
+ uint8_t mr;
+};
+
+struct xnn_ukernel {
+ enum xnn_ukernel_type type;
+ union {
+ struct xnn_ukernel_dconv2d dconv2d;
+ struct xnn_ukernel_dwconv dwconv;
+ struct xnn_ukernel_dwconv2d dwconv2d;
+ struct xnn_ukernel_gemm gemm;
+ struct xnn_ukernel_igemm igemm;
+ struct xnn_ukernel_spmm spmm;
+ struct xnn_ukernel_vmulcaddc vmulcaddc;
+ };
+};
+
+enum xnn_run_state {
+ xnn_run_state_invalid = 0,
+ xnn_run_state_ready,
+ xnn_run_state_skip,
+};
+
+struct subconvolution_params {
+ void* weights;
+ size_t w_stride;
+ const void** indirection_buffer;
+ void* output;
+ size_t slice_width;
+ size_t slice_height;
+ size_t indirection_y_stride;
+ size_t indirection_x_stride;
+ /* kernel_size * mr * sizeof(void*) */
+ size_t scaled_kernel_size;
+};
+
+struct xnn_operator {
+ size_t batch_size;
+ uint32_t padding_top;
+ uint32_t padding_right;
+ uint32_t padding_bottom;
+ uint32_t padding_left;
+ uint32_t adjustment_height;
+ uint32_t adjustment_width;
+ uint32_t kernel_height;
+ uint32_t kernel_width;
+ uint32_t stride_height;
+ uint32_t stride_width;
+ uint32_t dilation_height;
+ uint32_t dilation_width;
+ uint32_t groups;
+ size_t group_channels;
+ size_t group_input_channels;
+ size_t group_output_channels;
+ size_t channels;
+
+ size_t pad_before_channels;
+ size_t pad_after_channels;
+ uint32_t pad_value;
+
+ size_t input_height;
+ size_t input_width;
+ size_t input_pixel_stride;
+ const void* input;
+ const void** indirection_buffer;
+ void* a_sum;
+
+ size_t input2_pixel_stride;
+ const void* input2;
+
+ size_t output_height;
+ size_t output_width;
+ size_t output_pixel_stride;
+ void* output;
+
+ void* packed_weights;
+ // Total number of non-zero kernel elements when weights use sparse representation.
+ size_t num_nonzero_values;
+ // Total number of non-zero kernel blocks when weights use sparse representation.
+ size_t num_nonzero_blocks;
+ // Total number of output channel blocks when weights use sparse representation.
+ size_t num_output_channel_blocks;
+ // Input channel corresponding to the first non-zero kernel element.
+ size_t first_input_channel;
+
+ float input_scale;
+ float output_scale;
+ uint8_t input_zero_point;
+ uint8_t kernel_zero_point;
+ uint8_t output_zero_point;
+ uint8_t output_min;
+ uint8_t output_max;
+
+ size_t valid_batch_size;
+ size_t last_input_height;
+ size_t last_input_width;
+ const void* last_input;
+ void* last_output;
+
+ void* zero_buffer;
+ void* lookup_table;
+ void* pixelwise_buffer;
+ struct subconvolution_params* subconvolution_buffer;
+
+ union {
+ union xnn_f32_avgpool_params f32_avgpool_params;
+ union xnn_f32_gavgpool_params f32_gavgpool_params;
+ union xnn_f32_hswish_params f32_hswish_params;
+ union xnn_f32_output_params f32_output_params;
+ union xnn_f32_spchw_params f32_spchw_params;
+ union xnn_q8_add_params q8_add_params;
+ union xnn_q8_avgpool_params q8_avgpool_params;
+ union xnn_q8_gemm_params q8_gemm_params;
+ union xnn_u8_output_params u8_output_params;
+ };
+ enum xnn_operator_type type;
+ struct xnn_ukernel ukernel;
+
+ struct compute_parameters compute;
+ struct compute_parameters compute2;
+ union {
+ struct add_contiguous_context add_contiguous;
+ struct add_strided_context add_strided;
+ struct argmax_pooling_context argmax_pooling;
+ struct average_pooling_context average_pooling;
+ struct channel_pad_context channel_pad;
+ struct channel_shuffle_context channel_shuffle;
+ struct dconv2d_context dconv2d;
+ struct dwconv2d_context dwconv2d;
+ struct dwconv_context dwconv;
+ struct gemm_context gemm;
+ struct global_average_pooling_context global_average_pooling;
+ struct global_average_pooling_spnchw_context global_average_pooling_spnchw;
+ struct igemm_context igemm;
+ struct lut_contiguous_context lut_contiguous;
+ struct lut_strided_context lut_strided;
+ struct max_pooling_context max_pooling;
+ struct pixelwise_average_pooling_context pixelwise_average_pooling;
+ struct prelu_context prelu;
+ struct spmm_context spmm;
+ struct subconv_context subconv;
+ struct u8_softargmax_context u8_softargmax;
+ struct univector_contiguous_context univector_contiguous;
+ struct univector_strided_context univector_strided;
+ struct unpooling_context unpooling;
+ struct vmulcaddc_context vmulcaddc;
+ } context;
+
+ enum xnn_run_state state;
+};
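
Annotation: xnn_operator is a tagged union twice over; ukernel.type selects the live member of the anonymous ukernel union, and together with the operator type it determines which context member is populated at setup time. A hypothetical helper keyed off the xnn_indirection_init_* entry points declared in indirection.h above:

#include <stdbool.h>

static bool operator_uses_indirection_buffer(const struct xnn_operator* op) {
  switch (op->ukernel.type) {
    case xnn_ukernel_type_igemm:       /* conv2d via indirect GEMM */
    case xnn_ukernel_type_dwconv:
    case xnn_ukernel_type_subconv2d:   /* deconvolution subconvolutions */
    case xnn_ukernel_type_max_pooling:
    case xnn_ukernel_type_unpooling:
      return true;  /* setup populates op->indirection_buffer */
    default:
      return false;
  }
}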
diff --git a/src/xnnpack/pack.h b/src/xnnpack/pack.h
new file mode 100644
index 0000000..4bc31c2
--- /dev/null
+++ b/src/xnnpack/pack.h
@@ -0,0 +1,646 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stdint.h>
+#include <xnnpack/math.h>
+#include <xnnpack/operator.h>
+
+
+static inline void xnn_pack_q8_gemm_goi_w(
+ size_t g,
+ size_t nc,
+ size_t kc,
+ uint32_t nr,
+ uint32_t kr,
+ uint8_t izp,
+ uint8_t kzp,
+ const uint8_t* k,
+ const int32_t* b,
+ void* packed_w)
+{
+ const int32_t boff = (int32_t) kc * (int32_t) izp * (int32_t) kzp;
+ do {
+ for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
+ const size_t nr_block_size = min(nc - nr_block_start, nr);
+ int32_t* packed_b = (int32_t*) packed_w;
+ for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+ *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
+ packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+ }
+ packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
+ for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
+ const size_t kr_block_size = min(kc - kr_block_start, kr);
+ for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+ int32_t ksum = 0;
+ for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
+ const uint8_t kv = k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
+ ksum += (int32_t) kv;
+ *((uint8_t*) packed_w) = kv;
+ packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
+ }
+ packed_b[nr_block_offset] -= ksum * (int32_t) izp;
+ packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
+ }
+ packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
+ }
+ }
+ k += nc * kc;
+ b += nc;
+ } while (--g != 0);
+}
+
+static inline void xnn_pack_q8_conv_goki_w(
+ size_t g,
+ size_t nc,
+ size_t ks,
+ size_t kc,
+ uint32_t nr,
+ uint32_t kr,
+ uint8_t izp,
+ uint8_t kzp,
+ const uint8_t* k,
+ const int32_t* b,
+ void* packed_w)
+{
+ const int32_t boff = (int32_t) ks * (int32_t) kc * (int32_t) izp * (int32_t) kzp;
+ do {
+ for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
+ const size_t nr_block_size = min(nc - nr_block_start, nr);
+ int32_t* packed_b = (int32_t*) packed_w;
+ for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+ *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
+ packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+ }
+ packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
+ for (size_t ki = 0; ki < ks; ki++) {
+ for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
+ const size_t kr_block_size = min(kc - kr_block_start, kr);
+ for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+ int32_t ksum = 0;
+ for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
+ const uint8_t kv =
+ k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
+ ksum += (int32_t) kv;
+ *((uint8_t*) packed_w) = kv;
+ packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
+ }
+ packed_b[nr_block_offset] -= ksum * (int32_t) izp;
+ packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
+ }
+ packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
+ }
+ }
+ }
+ k += ks * kc * nc;
+ b += nc;
+ } while (--g != 0);
+}
+
+static inline void xnn_pack_q8_conv_kgo_w(
+ size_t g,
+ size_t nc,
+ size_t ks,
+ uint32_t nr,
+ uint32_t kr,
+ uint8_t izp,
+ uint8_t kzp,
+ const uint8_t* k,
+ const int32_t* b,
+ void* packed_w)
+{
+ const int32_t boff = (int32_t) ks * (int32_t) izp * (int32_t) kzp;
+ for (size_t i = 0; i < g; i++) {
+ for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
+ const size_t nr_block_size = min(nc - nr_block_start, nr);
+ int32_t* packed_b = (int32_t*) packed_w;
+ for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+ *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
+ packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+ }
+ packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
+ for (size_t ki = 0; ki < ks; ki++) {
+ for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+ const uint8_t kv =
+ k[ki * g * nc + (nr_block_start + nr_block_offset)];
+ *((uint8_t*) packed_w) = kv;
+ packed_b[nr_block_offset] -= (int32_t) kv * (int32_t) izp;
+ packed_w = (void*) ((uintptr_t) packed_w + kr * sizeof(uint8_t));
+ }
+ packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
+ }
+ }
+ k += nc;
+ b += nc;
+ }
+}
+
+static inline void xnn_pack_q8_deconv_goki_w(
+ size_t g,
+ size_t nc,
+ size_t kh,
+ size_t kw,
+ size_t kc,
+ size_t sh,
+ size_t sw,
+ size_t nr,
+ size_t kr,
+ uint8_t izp,
+ uint8_t kzp,
+ const uint8_t* k,
+ const int32_t* b,
+ void* packed_w,
+ struct subconvolution_params* params)
+{
+ for (size_t i = 0; i < g; i++) {
+ for (size_t oy = 0; oy < sh; oy++) {
+ for (size_t ox = 0; ox < sw; ox++) {
+ if (i == 0) {
+ (*params++).weights = packed_w;
+ }
+ const int32_t boff = (int32_t) divide_round_up(kh - oy, sh) * (int32_t) divide_round_up(kw - ox, sw) * (int32_t) kc * (int32_t) izp * (int32_t) kzp;
+ for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
+ const size_t nr_block_size = min(nc - nr_block_start, nr);
+ int32_t* packed_b = (int32_t*) packed_w;
+ for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+ *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
+ packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+ }
+ packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
+ for (size_t ky = oy; ky < kh; ky += sh) {
+ for (size_t kx = ox; kx < kw; kx += sw) {
+ for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
+ const size_t kr_block_size = min(kc - kr_block_start, kr);
+ for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+ int32_t ksum = 0;
+ for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
+ const uint8_t kv =
+ k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
+ ksum += (int32_t) kv;
+ *((uint8_t*) packed_w) = kv;
+ packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
+ }
+ packed_b[nr_block_offset] -= ksum * (int32_t) izp;
+ packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
+ }
+ packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
+ }
+ }
+ }
+ }
+ }
+ }
+ k += kh * kw * kc * nc;
+ b += nc;
+ }
+}
+
+static inline void xnn_pack_q8_dwconv_ghw_w(
+ size_t h,
+ size_t w,
+ size_t c,
+ size_t cr,
+ uint8_t izp,
+ uint8_t kzp,
+ const uint8_t* k,
+ const int32_t* b,
+ void* packed_w)
+{
+ const int32_t boff = (int32_t) h * (int32_t) w * (int32_t) izp * (int32_t) kzp;
+ for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
+ const size_t cr_block_size = min(c - cr_block_start, cr);
+ int32_t* packed_b = (int32_t*) packed_w;
+ for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
+ *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset] + boff;
+ packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+ }
+ packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
+ for (size_t x = 0; x < w; x++) {
+ for (size_t y = 0; y < h; y++) {
+ for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
+ const uint8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
+ packed_b[cr_block_offset] -= (int32_t) kv * (int32_t) izp;
+ *((uint8_t*) packed_w) = kv;
+ packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
+ }
+ packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(uint8_t));
+ }
+ }
+ }
+}
+
+static inline void xnn_pack_q8_dwconv_hwg_w(
+ size_t h,
+ size_t w,
+ size_t c,
+ size_t cr,
+ uint8_t izp,
+ uint8_t kzp,
+ const uint8_t* k,
+ const int32_t* b,
+ void* packed_w)
+{
+ const int32_t boff = (int32_t) h * (int32_t) w * (int32_t) izp * (int32_t) kzp;
+ for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
+ const size_t cr_block_size = min(c - cr_block_start, cr);
+ int32_t* packed_b = (int32_t*) packed_w;
+ for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
+ *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset] + boff;
+ packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+ }
+ packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
+ for (size_t x = 0; x < w; x++) {
+ for (size_t y = 0; y < h; y++) {
+ for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
+ const uint8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
+ packed_b[cr_block_offset] -= (int32_t) kv * (int32_t) izp;
+ *((uint8_t*) packed_w) = kv;
+ packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
+ }
+ packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(uint8_t));
+ }
+ }
+ }
+}
+
+static inline void xnn_pack_f16_gemm_goi_w(
+ size_t g,
+ size_t nc,
+ size_t kc,
+ size_t nr,
+ size_t kr,
+ const uint16_t* k,
+ const uint16_t* b,
+ uint16_t* packed_w)
+{
+ do {
+ for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
+ const size_t nr_block_size = min(nc - nr_block_start, nr);
+ for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+ *packed_w++ = b[nr_block_start + nr_block_offset];
+ }
+ packed_w += nr - nr_block_size;
+ for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
+ const size_t kr_block_size = min(kc - kr_block_start, kr);
+ for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+ for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
+ *packed_w++ =
+ k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
+ }
+ packed_w += kr - kr_block_size;
+ }
+ packed_w += (nr - nr_block_size) * kr;
+ }
+ }
+ k += nc * kc;
+ b += nc;
+ } while (--g != 0);
+}
+
+static inline void xnn_pack_f32_gemm_goi_w(
+ size_t g,
+ size_t nc,
+ size_t kc,
+ size_t nr,
+ size_t kr,
+ size_t sr,
+ const float* k,
+ const float* b,
+ float* packed_w)
+{
+ const size_t skr = sr * kr;
+ const size_t skc = round_down_po2(kc, skr);
+ const size_t sr_mask = (sr - 1) * kr;
+ do {
+ for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
+ const size_t nr_block_size = min(nc - nr_block_start, nr);
+ for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+ *packed_w++ = b[nr_block_start + nr_block_offset];
+ }
+ packed_w += nr - nr_block_size;
+
+ for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
+ for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+ for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
+ *packed_w++ =
+ k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
+ }
+ }
+ packed_w += (nr - nr_block_size) * kr;
+ }
+
+ for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
+ const size_t kr_block_size = min(kc - kr_block_start, kr);
+ for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+ for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
+ *packed_w++ =
+ k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
+ }
+ packed_w += kr - kr_block_size;
+ }
+ packed_w += (nr - nr_block_size) * kr;
+ }
+ }
+ k += nc * kc;
+ b += nc;
+ } while (--g != 0);
+}
+
+static inline void xnn_pack_f32_gemminc_goi_w(
+ size_t g,
+ size_t nc,
+ size_t kc,
+ size_t nr,
+ size_t kr,
+ size_t sr,
+ const float* k,
+ float* packed_w)
+{
+ const size_t skr = sr * kr;
+ const size_t skc = round_down_po2(kc, skr);
+ const size_t sr_mask = (sr - 1) * kr;
+ do {
+ for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
+ const size_t nr_block_size = min(nc - nr_block_start, nr);
+
+ for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
+ for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+ for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
+ *packed_w++ =
+ k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
+ }
+ }
+ packed_w += (nr - nr_block_size) * kr;
+ }
+
+ for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
+ const size_t kr_block_size = min(kc - kr_block_start, kr);
+ for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+ for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
+ *packed_w++ =
+ k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
+ }
+ packed_w += kr - kr_block_size;
+ }
+ packed_w += (nr - nr_block_size) * kr;
+ }
+ }
+ k += nc * kc;
+ } while (--g != 0);
+}
+
+static inline void xnn_pack_f32_conv_goki_w(
+ size_t g,
+ size_t nc,
+ size_t ks,
+ size_t kc,
+ size_t nr,
+ size_t kr,
+ size_t sr,
+ const float* k,
+ const float* b,
+ float* packed_w)
+{
+ const size_t skr = sr * kr;
+ const size_t skc = round_down_po2(kc, skr);
+ const size_t sr_mask = (sr - 1) * kr;
+ do {
+ for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
+ const size_t nr_block_size = min(nc - nr_block_start, nr);
+ for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+ *packed_w++ = b[nr_block_start + nr_block_offset];
+ }
+ packed_w += nr - nr_block_size;
+
+ for (size_t ki = 0; ki < ks; ki++) {
+ for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
+ for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+ for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
+ *packed_w++ =
+ k[((nr_block_start + nr_block_offset) * ks + ki) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
+ }
+ }
+ packed_w += (nr - nr_block_size) * kr;
+ }
+
+ for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
+ const size_t kr_block_size = min(kc - kr_block_start, kr);
+ for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+ for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
+ *packed_w++ =
+ k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
+ }
+ packed_w += kr - kr_block_size;
+ }
+ packed_w += (nr - nr_block_size) * kr;
+ }
+ }
+ }
+ k += ks * kc * nc;
+ b += nc;
+ } while (--g != 0);
+}
+
+static inline void xnn_pack_f32_conv_kgo_w(
+ size_t g,
+ size_t nc,
+ size_t ks,
+ size_t nr,
+ size_t kr,
+ const float* k,
+ const float* b,
+ float* packed_w)
+{
+ for (size_t i = 0; i < g; i++) {
+ for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
+ const size_t nr_block_size = min(nc - nr_block_start, nr);
+ for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+ *packed_w++ = b[nr_block_start + nr_block_offset];
+ }
+ packed_w += nr - nr_block_size;
+ for (size_t ki = 0; ki < ks; ki++) {
+ for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+ *packed_w =
+ k[ki * g * nc + (nr_block_start + nr_block_offset)];
+ packed_w += kr;
+ }
+ packed_w += (nr - nr_block_size) * kr;
+ }
+ }
+ k += nc;
+ b += nc;
+ }
+}
+
+static inline void xnn_pack_f32_dconv_oki_w(
+ size_t nc,
+ size_t kc,
+ size_t nr,
+ size_t kh,
+ size_t kw,
+ const float* k,
+ const float* b,
+ float* packed_w)
+{
+ for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
+ const size_t nr_block_size = min(nc - nr_block_start, nr);
+ for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
+ *packed_w++ = b[nr_block_start + min(nr_block_offset, nr_block_size - 1)];
+ }
+
+ for (size_t kx = 0; kx < kw; kx++) {
+ for (size_t c = 0; c < kc; c++) {
+ for (size_t ky = 0; ky < kh; ky++) {
+ for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
+ *packed_w++ = k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) * kh + ky) * kw + kx) * kc + c];
+ }
+ }
+ }
+ }
+ }
+}
+
+static inline void xnn_pack_f32_deconv_goki_w(
+ size_t g,
+ size_t nc,
+ size_t kh,
+ size_t kw,
+ size_t kc,
+ size_t sh,
+ size_t sw,
+ size_t nr,
+ size_t kr,
+ const float* k,
+ const float* b,
+ float* packed_w,
+ struct subconvolution_params* params)
+{
+ for (size_t i = 0; i < g; i++) {
+ for (size_t oy = 0; oy < sh; oy++) {
+ for (size_t ox = 0; ox < sw; ox++) {
+ if (i == 0) {
+ (*params++).weights = packed_w;
+ }
+ for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
+ const size_t nr_block_size = min(nc - nr_block_start, nr);
+ for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+ *packed_w++ = b[nr_block_start + nr_block_offset];
+ }
+ packed_w += nr - nr_block_size;
+ for (size_t ky = oy; ky < kh; ky += sh) {
+ for (size_t kx = ox; kx < kw; kx += sw) {
+ for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
+ const size_t kr_block_size = min(kc - kr_block_start, kr);
+ for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+ for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
+ *packed_w++ =
+ k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
+ }
+ packed_w += kr - kr_block_size;
+ }
+ packed_w += (nr - nr_block_size) * kr;
+ }
+ }
+ }
+ }
+ }
+ }
+ k += kh * kw * kc * nc;
+ b += nc;
+ }
+}
+
+static inline void xnn_pack_f32_dwconv_ghw_w(
+ size_t h,
+ size_t w,
+ size_t c,
+ size_t cr,
+ const float* k,
+ const float* b,
+ float* packed_w)
+{
+ for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
+ const size_t cr_block_size = min(c - cr_block_start, cr);
+ for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
+ *packed_w++ = b[cr_block_start + cr_block_offset];
+ }
+ packed_w += cr - cr_block_size;
+ for (size_t x = 0; x < w; x++) {
+ for (size_t y = 0; y < h; y++) {
+ for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
+ const float kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
+ *packed_w++ = kv;
+ }
+ packed_w += cr - cr_block_size;
+ }
+ }
+ }
+}
+
+static inline void xnn_pack_f32_dwconv_hwg_w(
+ size_t h,
+ size_t w,
+ size_t c,
+ size_t cr,
+ const float* k,
+ const float* b,
+ float* packed_w)
+{
+ for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
+ const size_t cr_block_size = min(c - cr_block_start, cr);
+ for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
+ *packed_w++ = b[cr_block_start + cr_block_offset];
+ }
+ packed_w += cr - cr_block_size;
+ for (size_t x = 0; x < w; x++) {
+ for (size_t y = 0; y < h; y++) {
+ for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
+ const float kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
+ *packed_w++ = kv;
+ }
+ packed_w += cr - cr_block_size;
+ }
+ }
+ }
+}
+
+static inline void xnn_pack_f32_spchw_dwconv_ghw_w(
+ size_t kernel_size,
+ size_t groups,
+ const float* kernel,
+ const float* bias,
+ float* packed_weights)
+{
+ for (size_t g = 0; g < groups; g++) {
+ *packed_weights++ = *bias++;
+ for (size_t i = 0; i < kernel_size; i++) {
+ *packed_weights++ = kernel[g * kernel_size + i];
+ }
+ }
+}
+
+static inline void xnn_pack_f32_vmulcaddc_w(
+ size_t c,
+ size_t cr,
+ const float* s,
+ const float* b,
+ float* packed_w)
+{
+ for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
+ const size_t cr_block_size = min(c - cr_block_start, cr);
+ for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
+ packed_w[cr_block_offset] = s[cr_block_start + cr_block_offset];
+ }
+ packed_w += cr;
+ for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
+ packed_w[cr_block_offset] = b[cr_block_start + cr_block_offset];
+ }
+ packed_w += cr;
+ }
+}
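
Annotation: the q8 packing routines above fold both zero points into the per-channel bias. With input zero point $z_a$ (izp), kernel zero point $z_k$ (kzp), and $K$ accumulated taps, the quantized dot product expands as

$$\sum_{r=0}^{K-1}(a_r - z_a)(k_r - z_k) = \sum_r a_r (k_r - z_k) - z_a \sum_r k_r + K\,z_a z_k,$$

so storing $b + K\,z_a z_k - z_a \sum_r k_r$ as the packed bias (the boff addition followed by the ksum * izp subtraction in the loops above) leaves the microkernel to accumulate $\sum_r a_r (k_r - z_k)$: only the kernel zero point remains to be applied at run time, and the input bytes are consumed raw. Here $K$ is kc for GEMM, ks * kc for convolution, and h * w for the depthwise variants.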
diff --git a/src/xnnpack/packx.h b/src/xnnpack/packx.h
new file mode 100644
index 0000000..20b3bc1
--- /dev/null
+++ b/src/xnnpack/packx.h
@@ -0,0 +1,36 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <xnnpack/common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#define DECLARE_X32_PACKX_UKERNEL_FUNCTION(fn_name) \
+ XNN_INTERNAL void fn_name( \
+ size_t m, \
+ size_t k, \
+ const uint32_t* x, \
+ size_t x_stride, \
+ uint32_t* y);
+
+DECLARE_X32_PACKX_UKERNEL_FUNCTION(xnn_x32_packx_ukernel_2x__scalar)
+DECLARE_X32_PACKX_UKERNEL_FUNCTION(xnn_x32_packx_ukernel_3x__scalar)
+DECLARE_X32_PACKX_UKERNEL_FUNCTION(xnn_x32_packx_ukernel_4x__neon_st4)
+DECLARE_X32_PACKX_UKERNEL_FUNCTION(xnn_x32_packx_ukernel_4x__psimd)
+DECLARE_X32_PACKX_UKERNEL_FUNCTION(xnn_x32_packx_ukernel_4x__scalar)
+DECLARE_X32_PACKX_UKERNEL_FUNCTION(xnn_x32_packx_ukernel_4x__sse)
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
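
Annotation: the x32 packx microkernels pre-interleave an up-to-mr-row panel of the left-hand operand so the PPMM kernels (whose typedefs appear in params.h below) can stream it contiguously. A reference sketch under two assumptions: x_stride counts uint32_t elements, and short panels replicate their last row to tile height.

#include <stddef.h>
#include <stdint.h>

static void x32_packx_reference(
    size_t mr,              /* tile height, e.g. 4 for the 4x variants */
    size_t m,               /* valid rows, 1 <= m <= mr */
    size_t k,
    const uint32_t* x,
    size_t x_stride,        /* row stride in elements (assumption) */
    uint32_t* y)
{
  for (size_t i = 0; i < k; i++) {
    for (size_t j = 0; j < mr; j++) {
      const size_t row = j < m ? j : m - 1;  /* replicate the last valid row */
      *y++ = x[row * x_stride + i];
    }
  }
}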
diff --git a/src/xnnpack/pad.h b/src/xnnpack/pad.h
new file mode 100644
index 0000000..3cb8103
--- /dev/null
+++ b/src/xnnpack/pad.h
@@ -0,0 +1,39 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <xnnpack/params.h>
+#include <xnnpack/common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#define DECLARE_PAD_UKERNEL_FUNCTION(fn_name) \
+ XNN_INTERNAL void fn_name( \
+ size_t m, \
+ size_t n, \
+ size_t l, \
+ size_t r, \
+ uint32_t c, \
+ const void* input, \
+ size_t input_stride, \
+ void* output, \
+ size_t output_stride);
+
+DECLARE_PAD_UKERNEL_FUNCTION(xnn_x32_pad_x2__neon)
+DECLARE_PAD_UKERNEL_FUNCTION(xnn_x32_pad_x2__psimd)
+DECLARE_PAD_UKERNEL_FUNCTION(xnn_x32_pad_x2__scalar)
+DECLARE_PAD_UKERNEL_FUNCTION(xnn_x32_pad_x2__sse2)
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
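
Annotation: per the declaration, each of the m output rows is n input units framed by l leading and r trailing copies of the fill value c. A reference sketch for the x32 variants, assuming n, l, and r count uint32_t elements while the strides count bytes:

#include <stddef.h>
#include <stdint.h>

static void x32_pad_reference(
    size_t m, size_t n, size_t l, size_t r, uint32_t c,
    const uint32_t* input, size_t input_stride,
    uint32_t* output, size_t output_stride)
{
  for (size_t i = 0; i < m; i++) {
    uint32_t* out = output;
    for (size_t j = 0; j < l; j++) *out++ = c;       /* left padding */
    for (size_t j = 0; j < n; j++) *out++ = input[j];
    for (size_t j = 0; j < r; j++) *out++ = c;       /* right padding */
    input = (const uint32_t*) ((uintptr_t) input + input_stride);
    output = (uint32_t*) ((uintptr_t) output + output_stride);
  }
}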
diff --git a/src/xnnpack/params.h b/src/xnnpack/params.h
new file mode 100644
index 0000000..30e8393
--- /dev/null
+++ b/src/xnnpack/params.h
@@ -0,0 +1,1304 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include <cpuinfo.h>
+
+#include <xnnpack/common.h>
+
+#define XNN_INTERNAL_EXTRA_BYTES 32
+
+struct xnn_f16_output_params {
+ uint16_t scale;
+ uint16_t max;
+ uint16_t min;
+};
+
+union xnn_f32_output_params {
+ struct {
+ float max;
+ float min;
+ } scalar;
+#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+ struct {
+ XNN_ALIGN(16) float max[4];
+ XNN_ALIGN(16) float min[4];
+ } sse;
+#endif /* CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 */
+};
+
+union xnn_f32_spchw_params {
+ struct {
+ float max;
+ float min;
+ } scalar;
+#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+ struct {
+ XNN_ALIGN(16) uint32_t mask_even[4]; // used by stride 2 kernels
+ XNN_ALIGN(16) uint32_t mask_odd[4]; // used by stride 2 kernels
+ XNN_ALIGN(16) uint32_t mask[4]; // used by stride 1 kernels
+ float min;
+ float max;
+ } neon;
+#elif CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+ struct {
+ XNN_ALIGN(16) uint32_t mask_even[4]; // used by stride 2 kernels
+ XNN_ALIGN(16) uint32_t mask_odd[4]; // used by stride 2 kernels
+ XNN_ALIGN(16) uint32_t mask[4]; // used by stride 1 kernels
+ XNN_ALIGN(16) float max[4];
+ XNN_ALIGN(16) float min[4];
+ } sse;
+#endif /* CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 || CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 */
+};
+
+union xnn_u8_output_params {
+ struct {
+ int32_t max;
+ int32_t min;
+ } scalar;
+#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+ struct {
+ uint8_t max;
+ uint8_t min;
+ } neon;
+#endif /* CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 */
+#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+ struct {
+ XNN_ALIGN(16) uint8_t max[16];
+ XNN_ALIGN(16) uint8_t min[16];
+ } sse2;
+#endif /* CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 */
+};
+
+union xnn_f32_avgpool_params {
+ struct {
+ float multiplier;
+ float output_min;
+ float output_max;
+ } scalar;
+#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+ struct {
+ XNN_ALIGN(16) float multiplier[4];
+ XNN_ALIGN(16) float output_max[4];
+ XNN_ALIGN(16) float output_min[4];
+ } sse2;
+#endif /* CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 */
+#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+ struct {
+ XNN_ALIGN(16) float multiplier;
+ XNN_ALIGN(16) float output_max;
+ XNN_ALIGN(16) float output_min;
+ } neon;
+#endif /* CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 */
+};
+
+union xnn_f32_gavgpool_params {
+ struct {
+ float multiplier;
+ float output_min;
+ float output_max;
+ } scalar;
+#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+ struct {
+ XNN_ALIGN(16) float multiplier[4];
+ XNN_ALIGN(16) float output_max[4];
+ XNN_ALIGN(16) float output_min[4];
+ XNN_ALIGN(16) uint32_t mask[4];
+ } sse;
+#endif /* CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 */
+#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+ struct {
+ XNN_ALIGN(16) float multiplier;
+ XNN_ALIGN(16) float output_max;
+ XNN_ALIGN(16) float output_min;
+ XNN_ALIGN(16) uint32_t mask[4];
+ } neon;
+#endif /* CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 */
+};
+
+union xnn_f32_hswish_params {
+ struct {
+ float sixth;
+ float half;
+ float one;
+ } scalar;
+#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+ struct {
+ XNN_ALIGN(16) float sixth[4];
+ XNN_ALIGN(16) float half[4];
+ XNN_ALIGN(16) float one[4];
+ } sse;
+#endif /* CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 */
+};
+
+union xnn_q8_gemm_params {
+ struct {
+ int32_t kernel_zero_point;
+ int32_t input_zero_point;
+ int32_t multiplier;
+ int32_t remainder_mask;
+ int32_t remainder_threshold;
+ uint32_t shift;
+ int32_t output_min_less_zero_point;
+ int32_t output_max_less_zero_point;
+ int32_t output_zero_point;
+ } scalar;
+#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+ struct {
+ int16_t kernel_zero_point;
+ int16_t input_zero_point;
+ int32_t multiplier;
+ int32_t right_shift;
+ int16_t output_zero_point;
+ uint8_t output_max;
+ uint8_t output_min;
+ } neon;
+#endif /* CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 */
+#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+ struct {
+ XNN_ALIGN(16) int16_t kernel_zero_point[8];
+ XNN_ALIGN(16) int16_t input_zero_point[8];
+ XNN_ALIGN(16) uint32_t multiplier[4];
+ XNN_ALIGN(16) uint64_t rounding[2];
+ XNN_ALIGN(16) int32_t remainder_mask[4];
+ XNN_ALIGN(16) int32_t remainder_threshold[4];
+ XNN_ALIGN(16) uint64_t shift[2];
+ XNN_ALIGN(16) int16_t output_zero_point[8];
+ XNN_ALIGN(16) uint8_t output_max[16];
+ XNN_ALIGN(16) uint8_t output_min[16];
+ } sse2;
+#endif /* CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 */
+};
+
+union xnn_q8_add_params {
+ struct {
+ int32_t zero_point_product;
+ uint32_t a_multiplier;
+ uint32_t b_multiplier;
+ uint32_t shift;
+ int32_t remainder_mask;
+ int32_t remainder_threshold;
+ int32_t y_zero_point;
+ int32_t y_max;
+ int32_t y_min;
+ } scalar;
+#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+ struct {
+ uint8_t a_zero_point;
+ uint8_t b_zero_point;
+ int16_t y_zero_point;
+ int32_t a_multiplier;
+ int32_t b_multiplier;
+ int32_t right_shift;
+ uint8_t y_max;
+ uint8_t y_min;
+ } neon;
+#endif /* CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 */
+#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+ struct {
+ XNN_ALIGN(16) int32_t zero_point_product[4];
+ XNN_ALIGN(16) uint16_t a_multiplier_lo[8];
+ XNN_ALIGN(16) uint16_t a_multiplier_hi[8];
+ XNN_ALIGN(16) uint16_t b_multiplier_lo[8];
+ XNN_ALIGN(16) uint16_t b_multiplier_hi[8];
+ XNN_ALIGN(16) int32_t remainder_mask[4];
+ XNN_ALIGN(16) int32_t remainder_threshold[4];
+ XNN_ALIGN(16) int16_t y_zero_point[8];
+ XNN_ALIGN(16) uint8_t y_max[16];
+ XNN_ALIGN(16) uint8_t y_min[16];
+ uint32_t shift;
+ uint32_t a_multiplier;
+ uint32_t b_multiplier;
+ } sse2;
+#endif /* CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 */
+};
+
+union xnn_q8_avgpool_params {
+ struct {
+ int32_t bias;
+ int32_t multiplier;
+ int64_t rounding;
+ uint32_t right_shift;
+ int32_t output_min_less_zero_point;
+ int32_t output_max_less_zero_point;
+ int32_t output_zero_point;
+ } scalar;
+#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+ struct {
+ int32_t bias;
+ int32_t multiplier;
+ int64_t left_shift;
+ int16_t output_zero_point;
+ uint8_t output_max;
+ uint8_t output_min;
+ } neon;
+#endif /* CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 */
+#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+ struct {
+ XNN_ALIGN(16) int32_t bias[4];
+ XNN_ALIGN(16) uint32_t multiplier[4];
+ XNN_ALIGN(16) uint64_t rounding[2];
+ XNN_ALIGN(16) uint64_t right_shift[2];
+ XNN_ALIGN(16) int16_t output_zero_point[8];
+ XNN_ALIGN(16) uint8_t output_max[16];
+ XNN_ALIGN(16) uint8_t output_min[16];
+ } sse2;
+#endif /* CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 */
+};
+
+union xnn_fp32_requantization_params {
+ struct {
+ float scale;
+ float min_less_zero_point;
+ float max_less_zero_point;
+ float magic;
+ int32_t magic_less_zero_point;
+ } scalar;
+ struct {
+ float scale;
+ float max;
+ float min;
+ float magic;
+ int32_t magic_less_zero_point;
+ } neon;
+ struct {
+ float scale;
+ int16_t zero_point;
+ uint8_t max;
+ uint8_t min;
+ } neonv8;
+ struct {
+ XNN_ALIGN(16) float scale[4];
+ XNN_ALIGN(16) int16_t zero_point[8];
+ XNN_ALIGN(16) uint8_t max[16];
+ XNN_ALIGN(16) uint8_t min[16];
+ } sse2;
+ struct {
+ XNN_ALIGN(16) float scale[4];
+ XNN_ALIGN(16) float min_less_zero_point[4];
+ XNN_ALIGN(16) float max_less_zero_point[4];
+ XNN_ALIGN(16) float magic[4];
+ XNN_ALIGN(16) int32_t magic_less_zero_point[4];
+ } psimd;
+};
+
+union xnn_precise_requantization_params {
+ struct {
+ uint32_t multiplier;
+ uint32_t rounding_lo;
+ uint32_t rounding_hi;
+ uint32_t shift_less_32;
+ int32_t min_less_zero_point;
+ int32_t max_less_zero_point;
+ int32_t zero_point;
+ } scalar;
+ struct {
+ int32_t multiplier;
+ int32_t right_shift;
+ int16_t zero_point;
+ uint8_t max;
+ uint8_t min;
+ } neon;
+ struct {
+ XNN_ALIGN(16) uint32_t multiplier[4];
+ XNN_ALIGN(16) uint64_t rounding[2];
+ XNN_ALIGN(16) uint32_t shift[4];
+ XNN_ALIGN(16) int16_t zero_point[8];
+ XNN_ALIGN(16) uint8_t max[16];
+ XNN_ALIGN(16) uint8_t min[16];
+ } sse2;
+};
+
+union xnn_q31_requantization_params {
+ struct {
+ int32_t multiplier;
+ int32_t remainder_mask;
+ int32_t remainder_threshold;
+ uint32_t shift;
+ int32_t min_less_zero_point;
+ int32_t max_less_zero_point;
+ int32_t zero_point;
+ } scalar;
+#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+ struct {
+ int32_t multiplier;
+ int32_t right_shift;
+ int16_t zero_point;
+ uint8_t max;
+ uint8_t min;
+ } neon;
+#endif /* CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 */
+#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+ struct {
+ XNN_ALIGN(16) uint32_t multiplier[4];
+ XNN_ALIGN(16) uint64_t rounding[2];
+ XNN_ALIGN(16) int32_t remainder_mask[4];
+ XNN_ALIGN(16) int32_t remainder_threshold[4];
+ XNN_ALIGN(16) uint64_t shift[2];
+ XNN_ALIGN(16) int16_t zero_point[8];
+ XNN_ALIGN(16) uint8_t max[16];
+ XNN_ALIGN(16) uint8_t min[16];
+ } sse2;
+#endif /* CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 */
+};
+
+union xnn_requantization_params {
+ union xnn_precise_requantization_params precise;
+ union xnn_fp32_requantization_params fp32;
+ union xnn_q31_requantization_params q31;
+};
+
+typedef void (*xnn_ppmm_ukernel_function)(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ const void* a,
+ const void* w,
+ void* c,
+ size_t cm_stride,
+ size_t cn_stride,
+ const void* params);
+
+typedef void (*xnn_f32_ppmm_ukernel_function)(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ const float* a,
+ const float* w,
+ float* c,
+ size_t cm_stride,
+ size_t cn_stride,
+ const union xnn_f32_output_params* params);
+
+typedef void (*xnn_f16_ppmm_ukernel_function)(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ const void* a,
+ const void* w,
+ void* c,
+ size_t cm_stride,
+ size_t cn_stride,
+ const struct xnn_f16_output_params* params);
+
+typedef void (*xnn_gemm_ukernel_function)(
+ size_t mr,
+ size_t nr,
+ size_t k,
+ const void* a,
+ size_t a_stride,
+ const void* w,
+ void* c,
+ size_t cm_stride,
+ size_t cn_stride,
+ const void* params);
+
+typedef void (*xnn_f32_gemm_ukernel_function)(
+ size_t mr,
+ size_t nr,
+ size_t k,
+ const float* a,
+ size_t a_stride,
+ const float* w,
+ float* c,
+ size_t cm_stride,
+ size_t cn_stride,
+ const union xnn_f32_output_params* params);
+
+typedef void (*xnn_f32_gemminc_ukernel_function)(
+ size_t mr,
+ size_t nr,
+ size_t k,
+ const float* a,
+ size_t a_stride,
+ const float* w,
+ float* c,
+ size_t cm_stride,
+ size_t cn_stride,
+ const float* acc,
+ const union xnn_f32_output_params* params);
+
+typedef void (*xnn_f16_gemm_ukernel_function)(
+ size_t mr,
+ size_t nr,
+ size_t k,
+ const void* a,
+ size_t a_stride,
+ const void* w,
+ void* c,
+ size_t cm_stride,
+ size_t cn_stride,
+ const struct xnn_f16_output_params* params);
+
+typedef void (*xnn_q8_gemm_ukernel_function)(
+ size_t mr,
+ size_t nr,
+ size_t k,
+ const uint8_t* a,
+ size_t a_stride,
+ const void* w,
+ uint8_t* c,
+ size_t cm_stride,
+ size_t cn_stride,
+ const union xnn_q8_gemm_params* params);
+
+typedef void (*xnn_igemm_ukernel_function)(
+ size_t mr,
+ size_t nr,
+ size_t kc,
+ size_t ks,
+ const void** a,
+ const void* w,
+ void* c,
+ size_t cm_stride,
+ size_t cn_stride,
+ size_t a_offset,
+ const void* zero,
+ const void* params);
+
+typedef void (*xnn_f32_igemm_ukernel_function)(
+ size_t mr,
+ size_t nr,
+ size_t kc,
+ size_t ks,
+ const float** a,
+ const float* w,
+ float* c,
+ size_t cm_stride,
+ size_t cn_stride,
+ size_t a_offset,
+ const float* zero,
+ const union xnn_f32_output_params* params);
+
+typedef void (*xnn_q8_igemm_ukernel_function)(
+ size_t mr,
+ size_t nr,
+ size_t kc,
+ size_t ks,
+ const uint8_t** a,
+ const void* w,
+ uint8_t* c,
+ size_t cm_stride,
+ size_t cn_stride,
+ size_t a_offset,
+ const uint8_t* zero,
+ const union xnn_q8_gemm_params* params);
+
+typedef void (*xnn_conv_hwc_ukernel_function)(
+ size_t input_height,
+ size_t input_width,
+ size_t output_y_start,
+ size_t output_y_end,
+ const void* input,
+ const void* zero,
+ const void* weights,
+ void* output,
+ size_t input_padding_top,
+ size_t output_channels,
+ size_t output_height_stride,
+ size_t output_width_stride,
+ const void* params);
+
+typedef void (*xnn_f32_conv_hwc_ukernel_function)(
+ size_t input_height,
+ size_t input_width,
+ size_t output_y_start,
+ size_t output_y_end,
+ const float* input,
+ const float* zero,
+ const float* weights,
+ float* output,
+ size_t input_padding_top,
+ size_t output_channels,
+ size_t output_height_stride,
+ size_t output_width_stride,
+ const union xnn_f32_output_params* params);
+
+typedef void (*xnn_conv_hwc2spchw_ukernel_function)(
+ size_t input_height,
+ size_t input_width,
+ size_t output_y_start,
+ size_t output_y_end,
+ const void* input,
+ const void* zero,
+ const void* weights,
+ void* output,
+ size_t input_padding_top,
+ size_t output_channels,
+ size_t output_height_stride,
+ size_t output_channel_stride,
+ const void* params);
+
+typedef void (*xnn_f32_conv_hwc2spchw_ukernel_function)(
+ size_t input_height,
+ size_t input_width,
+ size_t output_y_start,
+ size_t output_y_end,
+ const float* input,
+ const float* zero,
+ const float* weights,
+ float* output,
+ size_t input_padding_top,
+ size_t output_channels,
+ size_t output_height_stride,
+ size_t output_channel_stride,
+ const union xnn_f32_output_params* params);
+
+typedef void (*xnn_spmm_ukernel_function)(
+ uint32_t m,
+ uint32_t n,
+ const void* a,
+ const void* w,
+ const int32_t* dmap,
+ const uint32_t* nmap,
+ void* c,
+ const void* params);
+
+typedef void (*xnn_f32_spmm_ukernel_function)(
+ uint32_t m,
+ uint32_t n,
+ const float* a,
+ const float* w,
+ const int32_t* dmap,
+ const uint32_t* nmap,
+ float* c,
+ const union xnn_f32_output_params* params);
+
+typedef void (*xnn_packx_ukernel_function)(
+ size_t m,
+ size_t k,
+ const void* x,
+ size_t x_stride,
+ void* y);
+
+typedef void (*xnn_x32_packx_ukernel_function)(
+ size_t m,
+ size_t k,
+ const uint32_t* x,
+ size_t x_stride,
+ uint32_t* y);
+
+typedef void (*xnn_pad_ukernel_function)(
+ size_t m,
+ size_t n,
+ size_t l,
+ size_t r,
+ uint32_t c,
+ const void* x,
+ size_t x_stride,
+ void* y,
+ size_t y_stride);
+
+typedef void (*xnn_unpool_ukernel_function)(
+ size_t p,
+ size_t c,
+ uint32_t f,
+ const void* input,
+ const uint32_t* index,
+ void** output);
+
+typedef void (*xnn_x32_unpool_ukernel_function)(
+ size_t p,
+ size_t c,
+ uint32_t f,
+ const uint32_t* input,
+ const uint32_t* index,
+ uint32_t** output);
+
+typedef void (*xnn_zipc_ukernel_function)(
+ size_t n,
+ const void* x,
+ void* y);
+
+typedef void (*xnn_x8_zipc_ukernel_function)(
+ size_t n,
+ const uint8_t* x,
+ uint8_t* y);
+
+typedef void (*xnn_x32_zipc_ukernel_function)(
+ size_t n,
+ const uint32_t* x,
+ uint32_t* y);
+
+typedef void (*xnn_zipv_ukernel_function)(
+ size_t n,
+ size_t m,
+ const void* x,
+ void* y);
+
+typedef void (*xnn_x8_zipv_ukernel_function)(
+ size_t n,
+ size_t m,
+ const uint8_t* x,
+ uint8_t* y);
+
+typedef void (*xnn_x32_zipv_ukernel_function)(
+ size_t n,
+ size_t m,
+ const uint32_t* x,
+ uint32_t* y);
+
+typedef void (*xnn_x8_lut_ukernel_function)(
+ size_t n,
+ const uint8_t* x,
+ const uint8_t* t,
+ uint8_t* y);
+
+typedef void (*xnn_dwconv_spchw_ukernel_function)(
+ size_t output_height,
+ size_t input_width,
+ const void* input,
+ const void* weights,
+ void* output,
+ size_t input_tuple_stride,
+ size_t output_tuple_stride,
+ size_t input_height_stride,
+ size_t output_height_stride,
+ const void* params);
+
+typedef void (*xnn_f32_dwconv_spchw_ukernel_function)(
+ size_t output_height,
+ size_t input_width,
+ const float* input,
+ const float* weights,
+ float* output,
+ size_t input_tuple_stride,
+ size_t output_tuple_stride,
+ size_t input_height_stride,
+ size_t output_height_stride,
+ const union xnn_f32_spchw_params* params);
+
+typedef void (*xnn_dwconv_up_ukernel_function)(
+ size_t channels,
+ size_t output_width,
+ const void** input,
+ const void* weights,
+ void* output,
+ size_t input_stride,
+ size_t output_increment,
+ const void* params);
+
+typedef void (*xnn_f32_dwconv_up_ukernel_function)(
+ size_t channels,
+ size_t output_width,
+ const float** input,
+ const float* weights,
+ float* output,
+ size_t input_stride,
+ size_t output_increment,
+ const union xnn_f32_output_params* params);
+
+typedef void (*xnn_q8_dwconv_up_ukernel_function)(
+ size_t channels,
+ size_t output_width,
+ const uint8_t** input,
+ const void* weights,
+ uint8_t* output,
+ size_t input_stride,
+ size_t output_increment,
+ const union xnn_q8_gemm_params* params);
+
+typedef void (*xnn_dwconv_mp_ukernel_function)(
+ size_t channels,
+ size_t output_width,
+ const void** input,
+ const void* weights,
+ void* buffer,
+ void* output,
+ size_t input_stride,
+ size_t output_increment,
+ const void* params);
+
+typedef void (*xnn_gavgpool_up_ukernel_function)(
+ size_t m,
+ size_t n,
+ const void* x,
+ size_t x_stride,
+ const void* zero,
+ void* y,
+ const void* params);
+
+typedef void (*xnn_f32_gavgpool_up_ukernel_function)(
+ size_t m,
+ size_t n,
+ const float* x,
+ size_t x_stride,
+ const float* zero,
+ float* y,
+ const union xnn_f32_avgpool_params* params);
+
+typedef void (*xnn_gavgpool_spchw_ukernel_function)(
+ size_t elements,
+ size_t channels,
+ const float* input,
+ float* output,
+ const void* params);
+
+typedef void (*xnn_f32_gavgpool_spchw_ukernel_function)(
+ size_t elements,
+ size_t channels,
+ const float* input,
+ float* output,
+ const union xnn_f32_gavgpool_params* params);
+
+typedef void (*xnn_q8_gavgpool_up_ukernel_function)(
+ size_t m,
+ size_t n,
+ const uint8_t* x,
+ size_t x_stride,
+ const uint8_t* zero,
+ uint8_t* y,
+ const union xnn_q8_avgpool_params* params);
+
+typedef void (*xnn_gavgpool_mp_ukernel_function)(
+ size_t m,
+ size_t n,
+ const void* x,
+ size_t x_stride,
+ const void* zero,
+ void* buffer,
+ void* y,
+ const void* params);
+
+typedef void (*xnn_f32_gavgpool_mp_ukernel_function)(
+ size_t m,
+ size_t n,
+ const float* x,
+ size_t x_stride,
+ const float* zero,
+ float* buffer,
+ float* y,
+ const union xnn_f32_avgpool_params* params);
+
+typedef void (*xnn_q8_gavgpool_mp_ukernel_function)(
+ size_t m,
+ size_t n,
+ const uint8_t* x,
+ size_t x_stride,
+ const uint8_t* zero,
+ int32_t* buffer,
+ uint8_t* y,
+ const union xnn_q8_avgpool_params* params);
+
+typedef void (*xnn_avgpool_up_ukernel_function)(
+ size_t n,
+ size_t ks,
+ size_t kc,
+ const void** x,
+ const void* zero,
+ void* y,
+ size_t x_increment,
+ size_t y_increment,
+ const void* params);
+
+typedef void (*xnn_f32_avgpool_up_ukernel_function)(
+ size_t n,
+ size_t ks,
+ size_t kc,
+ const float** x,
+ const float* zero,
+ float* y,
+ size_t x_increment,
+ size_t y_increment,
+ const union xnn_f32_avgpool_params* params);
+
+typedef void (*xnn_q8_avgpool_up_ukernel_function)(
+ size_t n,
+ size_t ks,
+ size_t kc,
+ const uint8_t** x,
+ const uint8_t* zero,
+ uint8_t* y,
+ size_t x_increment,
+ size_t y_increment,
+ const union xnn_q8_avgpool_params* params);
+
+typedef void (*xnn_avgpool_mp_ukernel_function)(
+ size_t n,
+ size_t ks,
+ size_t kc,
+ const void** x,
+ const void* zero,
+ void* buffer,
+ void* y,
+ size_t x_increment,
+ size_t y_increment,
+ const void* params);
+
+typedef void (*xnn_f32_avgpool_mp_ukernel_function)(
+ size_t n,
+ size_t ks,
+ size_t kc,
+ const float** x,
+ const float* zero,
+ float* buffer,
+ float* y,
+ size_t x_increment,
+ size_t y_increment,
+ const union xnn_f32_avgpool_params* params);
+
+typedef void (*xnn_q8_avgpool_mp_ukernel_function)(
+ size_t n,
+ size_t ks,
+ size_t kc,
+ const uint8_t** x,
+ const uint8_t* zero,
+ int32_t* buffer,
+ uint8_t* y,
+ size_t x_increment,
+ size_t y_increment,
+ const union xnn_q8_avgpool_params* params);
+
+typedef void (*xnn_pavgpool_up_ukernel_function)(
+ size_t n,
+ size_t ks,
+ size_t kc,
+ const void** x,
+ const void* zero,
+ const void* multiplier,
+ void* y,
+ size_t x_increment,
+ size_t y_increment,
+ const void* params);
+
+typedef void (*xnn_f32_pavgpool_up_ukernel_function)(
+ size_t n,
+ size_t ks,
+ size_t kc,
+ const float** x,
+ const float* zero,
+ const float* multiplier,
+ float* y,
+ size_t x_increment,
+ size_t y_increment,
+ const union xnn_f32_output_params* params);
+
+typedef void (*xnn_pavgpool_mp_ukernel_function)(
+ size_t n,
+ size_t ks,
+ size_t kc,
+ const void** x,
+ const void* zero,
+ const void* multiplier,
+ void* buffer,
+ void* y,
+ size_t x_increment,
+ size_t y_increment,
+ const void* params);
+
+typedef void (*xnn_f32_pavgpool_mp_ukernel_function)(
+ size_t n,
+ size_t ks,
+ size_t kc,
+ const float** x,
+ const float* zero,
+ const float* multiplier,
+ float* buffer,
+ float* y,
+ size_t x_increment,
+ size_t y_increment,
+ const union xnn_f32_output_params* params);
+
+typedef void (*xnn_maxpool_ukernel_function)(
+ size_t n,
+ size_t ks,
+ size_t kc,
+ const void** x,
+ void* y,
+ size_t x_increment,
+ size_t y_increment,
+ const void* params);
+
+typedef void (*xnn_f32_maxpool_ukernel_function)(
+ size_t n,
+ size_t ks,
+ size_t kc,
+ const float** x,
+ float* y,
+ size_t x_increment,
+ size_t y_increment,
+ const union xnn_f32_output_params* params);
+
+typedef void (*xnn_u8_maxpool_ukernel_function)(
+ size_t n,
+ size_t ks,
+ size_t kc,
+ const uint8_t** x,
+ uint8_t* y,
+ size_t x_increment,
+ size_t y_increment,
+ const union xnn_u8_output_params* params);
+
+typedef void (*xnn_argmaxpool_up_ukernel_function)(
+ size_t n,
+ size_t ks,
+ size_t kc,
+ const void** x,
+ void* y,
+ uint32_t* i,
+ size_t x_increment,
+ size_t y_increment,
+ const void* params);
+
+typedef void (*xnn_f32_argmaxpool_up_ukernel_function)(
+ size_t n,
+ size_t ks,
+ size_t kc,
+ const float** x,
+ float* y,
+ uint32_t* i,
+ size_t x_increment,
+ size_t y_increment,
+ const union xnn_f32_output_params* params);
+
+typedef void (*xnn_argmaxpool_mp_ukernel_function)(
+ size_t n,
+ size_t ks,
+ size_t kc,
+ const void** x,
+ void* ab,
+ uint32_t* ib,
+ void* y,
+ uint32_t* i,
+ size_t x_increment,
+ size_t y_increment,
+ const void* params);
+
+typedef void (*xnn_f32_argmaxpool_mp_ukernel_function)(
+ size_t n,
+ size_t ks,
+ size_t kc,
+ const float** x,
+ float* ab,
+ uint32_t* ib,
+ float* y,
+ uint32_t* i,
+ size_t x_increment,
+ size_t y_increment,
+ const union xnn_f32_output_params* params);
+
+typedef void (*xnn_univector_ukernel_function)(
+ size_t n,
+ const void* x,
+ void* y,
+ const void* params);
+
+typedef void (*xnn_f32_clamp_ukernel_function)(
+ size_t n,
+ const float* x,
+ float* y,
+ const union xnn_f32_output_params* params);
+
+typedef void (*xnn_u8_clamp_ukernel_function)(
+ size_t n,
+ const uint8_t* x,
+ uint8_t* y,
+ const union xnn_u8_output_params* params);
+
+typedef void (*xnn_f32_hswish_ukernel_function)(
+ size_t n,
+ const float* x,
+ float* y,
+ const union xnn_f32_hswish_params* params);
+
+typedef void (*xnn_rmax_ukernel_function)(
+ size_t n,
+ const void* x,
+ void* y);
+
+typedef void (*xnn_u8_rmax_ukernel_function)(
+ size_t n,
+ const uint8_t* x,
+ uint8_t* y);
+
+typedef void (*xnn_f32_rmax_ukernel_function)(
+ size_t n,
+ const float* x,
+ float* y);
+
+typedef void (*xnn_u8_lut32norm_ukernel_function)(
+ size_t n,
+ const uint8_t* x,
+ const uint32_t* t,
+ uint8_t* y);
+
+typedef void (*xnn_vadd_ukernel_function)(
+ size_t n,
+ const void* a,
+ const void* b,
+ void* y,
+ const void* params);
+
+typedef void (*xnn_f32_vadd_ukernel_function)(
+ size_t n,
+ const float* a,
+ const float* b,
+ float* y,
+ const union xnn_f32_output_params* params);
+
+typedef void (*xnn_q8_vadd_ukernel_function)(
+ size_t n,
+ const uint8_t* a,
+ const uint8_t* b,
+ uint8_t* y,
+ const union xnn_q8_add_params* params);
+
+typedef void (*xnn_vmul_ukernel_function)(
+ size_t n,
+ const void* a,
+ const void* b,
+ void* y,
+ const void* params);
+
+typedef void (*xnn_f32_vmul_ukernel_function)(
+ size_t n,
+ const float* a,
+ const float* b,
+ float* y,
+ const union xnn_f32_output_params* params);
+
+typedef void (*xnn_vsub_ukernel_function)(
+ size_t n,
+ const void* a,
+ const void* b,
+ void* y,
+ const void* params);
+
+typedef void (*xnn_f32_vsub_ukernel_function)(
+ size_t n,
+ const float* a,
+ const float* b,
+ float* y,
+ const union xnn_f32_output_params* params);
+
+typedef void (*xnn_vmulcaddc_ukernel_function)(
+ size_t m,
+ size_t c,
+ const void* x,
+ size_t x_stride,
+ const void* w,
+ void* y,
+ size_t y_stride,
+ const void* params);
+
+typedef void (*xnn_f32_vmulcaddc_ukernel_function)(
+ size_t m,
+ size_t c,
+ const float* x,
+ size_t x_stride,
+ const float* w,
+ float* y,
+ size_t y_stride,
+ const union xnn_f32_output_params* params);
+
+typedef void (*xnn_prelu_ukernel_function)(
+ size_t mr,
+ size_t n,
+ const void* x,
+ size_t x_stride,
+ const void* w,
+ void* y,
+ size_t y_stride,
+ const void* params);
+
+typedef void (*xnn_f32_prelu_ukernel_function)(
+ size_t mr,
+ size_t n,
+ const float* x,
+ size_t x_stride,
+ const float* w,
+ float* y,
+ size_t y_stride,
+ const union xnn_f32_output_params* params);
+
+
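+// Dense GEMM/IGEMM micro-kernels with their tiling parameters: mr x nr is the
+// output tile size, while log2_kr and log2_sr are base-2 logarithms of the
+// K-dimension packing parameters (presumed semantics; the packing functions
+// that consume them are authoritative).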
+struct gemm_parameters {
+ xnn_gemm_ukernel_function gemm;
+ xnn_igemm_ukernel_function igemm;
+ /* Optional GEMM and IGEMM micro-kernels with MR=1 and the same NR and KR parameters */
+ xnn_gemm_ukernel_function gemm1;
+ xnn_igemm_ukernel_function igemm1;
+ uint8_t mr;
+ uint8_t nr;
+ uint8_t log2_kr;
+ uint8_t log2_sr;
+};
+
+struct spmm_parameters {
+ xnn_spmm_ukernel_function ukernel;
+ // Number of M-dimension elements in a tile.
+ // Corresponds to a block of pixels in 1x1 Convolution and a block of the batch dimension in the Fully Connected operator.
+ uint8_t mr;
+ // Number of N-dimension elements in a tile.
+ // Corresponds to a block of output channels/features in 1x1 Convolution and the Fully Connected operator.
+ uint8_t nr;
+};
+
+struct hwc2spchw_dconv_parameters {
+ xnn_conv_hwc2spchw_ukernel_function ukernel_with_symm_padding;
+ // Number of output channels in a tile.
+ // This parameter must be passed as-is to the weight packing function.
+ uint8_t output_channel_tile;
+ // Number of output height pixels in a tile.
+ // For best efficiency, the micro-kernel must produce a multiple of this number of rows in each call.
+ uint8_t output_height_tile;
+ // Number of output width pixels in a tile.
+ uint8_t output_width_tile;
+};
+
+struct spchw_dwconv_parameters {
+ xnn_dwconv_spchw_ukernel_function ukernel;
+ // Number of input width pixels in a tile.
+ uint8_t input_width_tile;
+ // Number of output width pixels in a tile.
+ uint8_t output_width_tile;
+ // Number of output height pixels in a tile.
+ // For best efficiency, the micro-kernel must produce a multiple of this number of rows in each call.
+ uint8_t output_height_tile;
+};
+
+struct spchw_gavgpool_parameters {
+ xnn_gavgpool_spchw_ukernel_function ukernel;
+ // Number of channels in a tile.
+ // For best efficiency, the micro-kernel must process a multiple of this number of channels in each call.
+ uint8_t channel_tile;
+};
+
+struct dwconv_parameters {
+ union {
+ xnn_dwconv_up_ukernel_function up;
+ xnn_dwconv_mp_ukernel_function mp;
+ };
+ uint8_t cr;
+ uint8_t mr;
+ uint8_t qr;
+};
+
+struct gavgpool_parameters {
+ xnn_gavgpool_up_ukernel_function up;
+ xnn_gavgpool_mp_ukernel_function mp;
+ uint8_t mr;
+};
+
+struct avgpool_parameters {
+ xnn_avgpool_up_ukernel_function up;
+ xnn_avgpool_mp_ukernel_function mp;
+ uint8_t mr;
+ uint8_t qr;
+};
+
+struct pavgpool_parameters {
+ xnn_pavgpool_up_ukernel_function up;
+ xnn_pavgpool_mp_ukernel_function mp;
+ uint8_t mr;
+ uint8_t qr;
+};
+
+struct argmaxpool_parameters {
+ union {
+ xnn_argmaxpool_up_ukernel_function up;
+ xnn_argmaxpool_mp_ukernel_function mp;
+ };
+ uint8_t mr;
+ uint8_t qr;
+};
+
+struct maxpool_parameters {
+ xnn_maxpool_ukernel_function ukernel;
+ uint8_t mr;
+ uint8_t qr;
+};
+
+struct zip_parameters {
+ xnn_zipc_ukernel_function x2;
+ xnn_zipc_ukernel_function x3;
+ xnn_zipc_ukernel_function x4;
+ xnn_zipv_ukernel_function xm;
+};
+
+struct prelu_parameters {
+ xnn_prelu_ukernel_function ukernel;
+ uint8_t mr;
+};
+
+struct pad_parameters {
+ xnn_pad_ukernel_function ukernel;
+ uint8_t mr;
+};
+
+struct vmulcaddc_parameters {
+ xnn_vmulcaddc_ukernel_function ukernel;
+ uint8_t cr;
+ uint8_t mr;
+};
+
+#define XNN_MAX_Q8_DWCONV_UKERNELS 1
+#define XNN_MAX_F32_DWCONV_UKERNELS 3
+#define XNN_MAX_F32_ARGMAXPOOL_UKERNELS 3
+
+struct xnn_parameters {
+ bool initialized;
+ struct {
+ struct gemm_parameters gemm;
+ struct dwconv_parameters dwconv[XNN_MAX_Q8_DWCONV_UKERNELS];
+ struct avgpool_parameters avgpool;
+ struct gavgpool_parameters gavgpool;
+ xnn_vadd_ukernel_function vadd;
+ } q8;
+ struct {
+ struct maxpool_parameters maxpool;
+ xnn_univector_ukernel_function clamp;
+ xnn_u8_lut32norm_ukernel_function lut32norm;
+ xnn_u8_rmax_ukernel_function rmax;
+ } u8;
+ struct {
+ xnn_x8_lut_ukernel_function lut;
+ struct zip_parameters zip;
+ } x8;
+ struct {
+ struct gemm_parameters gemm;
+ struct gemm_parameters gemm2;
+ struct dwconv_parameters dwconv[XNN_MAX_F32_DWCONV_UKERNELS];
+ struct avgpool_parameters avgpool;
+ struct pavgpool_parameters pavgpool;
+ struct gavgpool_parameters gavgpool;
+ struct maxpool_parameters maxpool;
+ struct argmaxpool_parameters argmaxpool[XNN_MAX_F32_ARGMAXPOOL_UKERNELS];
+ xnn_univector_ukernel_function clamp;
+ xnn_univector_ukernel_function hswish;
+ struct prelu_parameters prelu;
+ xnn_vadd_ukernel_function vadd;
+ struct vmulcaddc_parameters vmulcaddc;
+ // Sparse Matrix-Dense Matrix Multiplication (NR=1 block).
+ struct spmm_parameters spmm;
+ // Sparse Matrix-Dense Matrix Multiplication (NR=2 block).
+ struct spmm_parameters spmm2;
+ // Sparse Matrix-Dense Matrix Multiplication (NR=4 block).
+ struct spmm_parameters spmm4;
+ // Direct 3x3 stride-2 Convolution with 3 input channels and HWC->SpCHW layout conversion.
+ struct hwc2spchw_dconv_parameters hwc2spchw_dconv3x3c3s2;
+ // Direct 3x3 stride-1 depthwise Convolution with padding 1 on left and right in SpCHW layout.
+ struct spchw_dwconv_parameters spchw_dwconv3x3;
+ // Direct 3x3 stride-2 depthwise Convolution with padding 1 on left and right in SpCHW layout.
+ struct spchw_dwconv_parameters spchw_dwconv3x3s2;
+ // Global Average Pooling in SpCHW layout.
+ struct spchw_gavgpool_parameters spchw_gavgpool;
+ } f32;
+ struct {
+ struct pad_parameters pad;
+ xnn_unpool_ukernel_function unpool;
+ struct zip_parameters zip;
+ } x32;
+};
+
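+// Global table of micro-kernels and tiling parameters for the host CPU,
+// presumably populated once during library initialization (xnn_initialize)
+// before any operator is created.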
+extern XNN_INTERNAL struct xnn_parameters xnn_params;
diff --git a/src/xnnpack/pavgpool.h b/src/xnnpack/pavgpool.h
new file mode 100644
index 0000000..f124519
--- /dev/null
+++ b/src/xnnpack/pavgpool.h
@@ -0,0 +1,60 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <xnnpack/params.h>
+#include <xnnpack/common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#define DECLARE_F32_PAVGPOOL_MULTIPASS_UKERNEL_FUNCTION(fn_name) \
+ XNN_INTERNAL void fn_name( \
+ size_t n, \
+ size_t ks, \
+ size_t kc, \
+ const float** x, \
+ const float* zero, \
+ const float* multiplier, \
+ float* buffer, \
+ float* y, \
+ size_t x_increment, \
+ size_t y_increment, \
+ const union xnn_f32_output_params* params);
+
+DECLARE_F32_PAVGPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_pavgpool_ukernel_mp9p8q__neon)
+DECLARE_F32_PAVGPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_pavgpool_ukernel_mp9p8q__psimd)
+DECLARE_F32_PAVGPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_pavgpool_ukernel_mp9p8q__scalar)
+DECLARE_F32_PAVGPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_pavgpool_ukernel_mp9p8q__sse)
+
+
+#define DECLARE_F32_PAVGPOOL_UNIPASS_UKERNEL_FUNCTION(fn_name) \
+ XNN_INTERNAL void fn_name( \
+ size_t n, \
+ size_t ks, \
+ size_t kc, \
+ const float** x, \
+ const float* zero, \
+ const float* multiplier, \
+ float* y, \
+ size_t x_increment, \
+ size_t y_increment, \
+ const union xnn_f32_output_params* params);
+
+DECLARE_F32_PAVGPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_pavgpool_ukernel_up9__neon)
+DECLARE_F32_PAVGPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_pavgpool_ukernel_up9__psimd)
+DECLARE_F32_PAVGPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_pavgpool_ukernel_up9__scalar)
+DECLARE_F32_PAVGPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_pavgpool_ukernel_up9__sse)
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
diff --git a/src/xnnpack/ppmm.h b/src/xnnpack/ppmm.h
new file mode 100644
index 0000000..1bf6941
--- /dev/null
+++ b/src/xnnpack/ppmm.h
@@ -0,0 +1,45 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <xnnpack/params.h>
+#include <xnnpack/common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#define DECLARE_F32_PPMM_UKERNEL_FUNCTION(fn_name) \
+ XNN_INTERNAL void fn_name( \
+ size_t mr, \
+ size_t nc, \
+ size_t kc, \
+ const float* a, \
+ const float* w, \
+ float* c, \
+ size_t cm_stride, \
+ size_t cn_stride, \
+ const union xnn_f32_output_params* params);
+
+DECLARE_F32_PPMM_UKERNEL_FUNCTION(xnn_f32_ppmm_ukernel_2x4__scalar)
+DECLARE_F32_PPMM_UKERNEL_FUNCTION(xnn_f32_ppmm_ukernel_3x3__scalar)
+DECLARE_F32_PPMM_UKERNEL_FUNCTION(xnn_f32_ppmm_ukernel_4x2__scalar)
+DECLARE_F32_PPMM_UKERNEL_FUNCTION(xnn_f32_ppmm_ukernel_4x4__scalar)
+DECLARE_F32_PPMM_UKERNEL_FUNCTION(xnn_f32_ppmm_ukernel_4x8__neon)
+DECLARE_F32_PPMM_UKERNEL_FUNCTION(xnn_f32_ppmm_ukernel_4x8__neonfma)
+DECLARE_F32_PPMM_UKERNEL_FUNCTION(xnn_f32_ppmm_ukernel_4x8__psimd)
+DECLARE_F32_PPMM_UKERNEL_FUNCTION(xnn_f32_ppmm_ukernel_4x8__sse)
+DECLARE_F32_PPMM_UKERNEL_FUNCTION(xnn_f32_ppmm_ukernel_8x8__neon)
+DECLARE_F32_PPMM_UKERNEL_FUNCTION(xnn_f32_ppmm_ukernel_8x8__neonfma)
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
diff --git a/src/xnnpack/prelu.h b/src/xnnpack/prelu.h
new file mode 100644
index 0000000..2a882a7
--- /dev/null
+++ b/src/xnnpack/prelu.h
@@ -0,0 +1,38 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <xnnpack/params.h>
+#include <xnnpack/common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#define DECLARE_F32_PRELU_UKERNEL_FUNCTION(fn_name) \
+ XNN_INTERNAL void fn_name( \
+ size_t mr, \
+ size_t n, \
+ const float* x, \
+ size_t x_stride, \
+ const float* w, \
+ float* y, \
+ size_t y_stride, \
+ const union xnn_f32_output_params* clamping_params);
+
+
+DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel_x4__psimd)
+DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel_x4__scalar)
+DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel_x4__sse)
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
diff --git a/src/xnnpack/requantization-stubs.h b/src/xnnpack/requantization-stubs.h
new file mode 100644
index 0000000..ee6e86d
--- /dev/null
+++ b/src/xnnpack/requantization-stubs.h
@@ -0,0 +1,69 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stdint.h>
+#include <stddef.h>
+
+#include <xnnpack/params.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef void (*requantization_function)(
+ size_t n,
+ const int32_t* input,
+ float scale,
+ uint8_t zero_point,
+ uint8_t qmin,
+ uint8_t qmax,
+ uint8_t* output);
+
+#define DECLARE_REQUANTIZATION_FUNCTION(fn_name) \
+ void fn_name( \
+ size_t n, \
+ const int32_t* input, \
+ float scale, \
+ uint8_t zero_point, \
+ uint8_t qmin, \
+ uint8_t qmax, \
+ uint8_t* output);
+
+DECLARE_REQUANTIZATION_FUNCTION(xnn_requantize_precise__scalar_unsigned32)
+DECLARE_REQUANTIZATION_FUNCTION(xnn_requantize_precise__scalar_unsigned64)
+DECLARE_REQUANTIZATION_FUNCTION(xnn_requantize_precise__scalar_signed64)
+DECLARE_REQUANTIZATION_FUNCTION(xnn_requantize_precise__sse2)
+DECLARE_REQUANTIZATION_FUNCTION(xnn_requantize_precise__ssse3)
+DECLARE_REQUANTIZATION_FUNCTION(xnn_requantize_precise__sse4)
+DECLARE_REQUANTIZATION_FUNCTION(xnn_requantize_precise__neon)
+DECLARE_REQUANTIZATION_FUNCTION(xnn_requantize_precise__psimd)
+
+DECLARE_REQUANTIZATION_FUNCTION(xnn_requantize_fp32__scalar_lrintf)
+DECLARE_REQUANTIZATION_FUNCTION(xnn_requantize_fp32__scalar_magic)
+DECLARE_REQUANTIZATION_FUNCTION(xnn_requantize_fp32__sse2)
+DECLARE_REQUANTIZATION_FUNCTION(xnn_requantize_fp32__neon)
+DECLARE_REQUANTIZATION_FUNCTION(xnn_requantize_fp32__psimd)
+
+DECLARE_REQUANTIZATION_FUNCTION(xnn_requantize_q31__scalar)
+DECLARE_REQUANTIZATION_FUNCTION(xnn_requantize_q31__sse2)
+DECLARE_REQUANTIZATION_FUNCTION(xnn_requantize_q31__ssse3)
+DECLARE_REQUANTIZATION_FUNCTION(xnn_requantize_q31__sse4)
+DECLARE_REQUANTIZATION_FUNCTION(xnn_requantize_q31__neon)
+DECLARE_REQUANTIZATION_FUNCTION(xnn_requantize_q31__psimd)
+
+DECLARE_REQUANTIZATION_FUNCTION(xnn_requantize_gemmlowp__scalar)
+DECLARE_REQUANTIZATION_FUNCTION(xnn_requantize_gemmlowp__sse2)
+DECLARE_REQUANTIZATION_FUNCTION(xnn_requantize_gemmlowp__ssse3)
+DECLARE_REQUANTIZATION_FUNCTION(xnn_requantize_gemmlowp__sse4)
+DECLARE_REQUANTIZATION_FUNCTION(xnn_requantize_gemmlowp__neon)
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
diff --git a/src/xnnpack/requantization.h b/src/xnnpack/requantization.h
new file mode 100644
index 0000000..bf3e100
--- /dev/null
+++ b/src/xnnpack/requantization.h
@@ -0,0 +1,1307 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+ #include <cstdint>
+ #include <cstddef>
+ #include <cassert>
+ #include <cmath>
+#else
+ #include <stdint.h>
+ #include <stddef.h>
+ #include <assert.h>
+ #include <math.h>
+#endif
+
+#include <fp16.h>
+
+#include <xnnpack/params.h>
+#include <xnnpack/scalar-utils.h>
+
+
+static inline union xnn_q8_gemm_params xnn_compute_scalar_q8_gemm_params(
+ uint8_t input_zero_point,
+ uint8_t kernel_zero_point,
+ float scale,
+ uint8_t output_zero_point,
+ uint8_t output_min,
+ uint8_t output_max)
+{
+ /* Compute requantization parameters */
+ const uint32_t scale_bits = fp32_to_bits(scale);
+
+ /* Multiplier is in [0x40000000, 0x7FFFFF80] range */
+ const int32_t multiplier = (int32_t)(((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
+ assert(multiplier >= INT32_C(0x40000000));
+ assert(multiplier <= INT32_C(0x7FFFFF80));
+
+ /* Shift is in [0, 31] range */
+ const int32_t shift = 127 + 31 - 32 - (fp32_to_bits(scale) >> 23);
+ assert(shift >= 0);
+ assert(shift < 32);
+
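+ /*
+  * With biased exponent E = scale_bits >> 23, the decomposition above gives
+  * scale = multiplier * 2**-(31 + shift) with shift = 126 - E. The presumed
+  * requantization is thus q = asr_round(x * multiplier, 31 + shift); the
+  * remainder mask/threshold below implement its round-to-nearest behavior.
+  * E.g. scale = 0.5 yields multiplier = 0x40000000 and shift = 0.
+  */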
+ const uint32_t remainder_mask = (UINT32_C(1) << shift) - UINT32_C(1);
+ const uint32_t remainder_threshold = remainder_mask >> 1;
+
+ union xnn_q8_gemm_params params;
+ params.scalar.input_zero_point = (int32_t) (uint32_t) input_zero_point;
+ params.scalar.kernel_zero_point = (int32_t) (uint32_t) kernel_zero_point;
+ params.scalar.multiplier = multiplier;
+ params.scalar.remainder_mask = (int32_t) remainder_mask;
+ params.scalar.remainder_threshold = (int32_t) remainder_threshold;
+ params.scalar.shift = (uint32_t) shift;
+ params.scalar.output_min_less_zero_point =
+ (int32_t) (uint32_t) output_min - (int32_t) (uint32_t) output_zero_point;
+ params.scalar.output_max_less_zero_point =
+ (int32_t) (uint32_t) output_max - (int32_t) (uint32_t) output_zero_point;
+ params.scalar.output_zero_point = (int32_t) (uint32_t) output_zero_point;
+ return params;
+}
+
+static inline union xnn_q8_gemm_params xnn_compute_q8_gemm_params(
+ uint8_t input_zero_point,
+ uint8_t kernel_zero_point,
+ float scale,
+ uint8_t output_zero_point,
+ uint8_t output_min,
+ uint8_t output_max)
+{
+ /* Compute requantization parameters */
+ const uint32_t scale_bits = fp32_to_bits(scale);
+
+ /* Multiplier is in [0x40000000, 0x7FFFFF80] range */
+ const int32_t multiplier = (int32_t)(((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
+ assert(multiplier >= INT32_C(0x40000000));
+ assert(multiplier <= INT32_C(0x7FFFFF80));
+
+ /* Shift is in [0, 31] range */
+ const int32_t shift = 127 + 31 - 32 - (fp32_to_bits(scale) >> 23);
+ assert(shift >= 0);
+ assert(shift < 32);
+
+ union xnn_q8_gemm_params params;
+ #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+ const uint32_t remainder_mask = (UINT32_C(1) << shift) - UINT32_C(1);
+ const uint32_t remainder_threshold = remainder_mask >> 1;
+ for (uint32_t i = 0; i < 8; i++) {
+ params.sse2.input_zero_point[i] = (int16_t) (uint16_t) input_zero_point;
+ params.sse2.kernel_zero_point[i] = (int16_t) (uint16_t) kernel_zero_point;
+ }
+ params.sse2.multiplier[0] = multiplier;
+ params.sse2.multiplier[1] = multiplier;
+ params.sse2.multiplier[2] = multiplier;
+ params.sse2.multiplier[3] = multiplier;
+ params.sse2.rounding[0] = UINT64_C(0x40000000);
+ params.sse2.rounding[1] = UINT64_C(0x40000000);
+ params.sse2.remainder_mask[0] = (int32_t) remainder_mask;
+ params.sse2.remainder_mask[1] = (int32_t) remainder_mask;
+ params.sse2.remainder_mask[2] = (int32_t) remainder_mask;
+ params.sse2.remainder_mask[3] = (int32_t) remainder_mask;
+ params.sse2.remainder_threshold[0] = (int32_t) remainder_threshold;
+ params.sse2.remainder_threshold[1] = (int32_t) remainder_threshold;
+ params.sse2.remainder_threshold[2] = (int32_t) remainder_threshold;
+ params.sse2.remainder_threshold[3] = (int32_t) remainder_threshold;
+ params.sse2.shift[0] = (uint64_t) (uint32_t) shift;
+ params.sse2.shift[1] = (uint64_t) (uint32_t) shift;
+ for (uint32_t i = 0; i < 8; i++) {
+ params.sse2.output_zero_point[i] = (int16_t) (uint16_t) output_zero_point;
+ }
+ for (uint32_t i = 0; i < 16; i++) {
+ params.sse2.output_max[i] = output_max;
+ params.sse2.output_min[i] = output_min;
+ }
+ #elif CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+ params.neon.input_zero_point = (int16_t) (uint16_t) input_zero_point;
+ params.neon.kernel_zero_point = (int16_t) (uint16_t) kernel_zero_point;
+ params.neon.multiplier = multiplier;
+ params.neon.right_shift = -shift;
+ params.neon.output_zero_point = (int16_t) (uint16_t) output_zero_point;
+ params.neon.output_max = output_max;
+ params.neon.output_min = output_min;
+ #else
+ const uint32_t remainder_mask = (UINT32_C(1) << shift) - UINT32_C(1);
+ const uint32_t remainder_threshold = remainder_mask >> 1;
+ params.scalar.input_zero_point = (int32_t) (uint32_t) input_zero_point;
+ params.scalar.kernel_zero_point = (int32_t) (uint32_t) kernel_zero_point;
+ params.scalar.multiplier = multiplier;
+ params.scalar.remainder_mask = (int32_t) remainder_mask;
+ params.scalar.remainder_threshold = (int32_t) remainder_threshold;
+ params.scalar.shift = (uint32_t) shift;
+ params.scalar.output_min_less_zero_point =
+ (int32_t) (uint32_t) output_min - (int32_t) (uint32_t) output_zero_point;
+ params.scalar.output_max_less_zero_point =
+ (int32_t) (uint32_t) output_max - (int32_t) (uint32_t) output_zero_point;
+ params.scalar.output_zero_point = (int32_t) (uint32_t) output_zero_point;
+ #endif
+ return params;
+}
+
+static inline union xnn_q8_avgpool_params xnn_compute_q8_avgpool_params(
+ int32_t bias,
+ float scale,
+ uint8_t output_zero_point,
+ uint8_t output_min,
+ uint8_t output_max)
+{
+ /* Compute requantization parameters */
+ assert(scale >= 0x1.0p-32f);
+ assert(scale < 256.0f);
+ const uint32_t scale_bits = fp32_to_bits(scale);
+
+ /* Multiplier is in [0x00800000, 0x00FFFFFF] range */
+ const int32_t multiplier = ((int32_t) scale_bits & INT32_C(0x007FFFFF)) | INT32_C(0x00800000);
+ assert(multiplier >= INT32_C(0x00800000));
+ assert(multiplier <= INT32_C(0x00FFFFFF));
+
+ /* Shift is in [16, 55] range */
+ const int32_t shift = 127 + 23 - (scale_bits >> 23);
+ assert(shift >= 16);
+ assert(shift < 64);
+
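+ /*
+  * Here scale = multiplier * 2**-shift with shift = 150 - E for biased
+  * exponent E, so the presumed averaging step is
+  *   y = clamp(asr_round((bias + sum) * multiplier, shift)) + zero_point,
+  * clamping against output_min/output_max less the zero point.
+  */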
+ union xnn_q8_avgpool_params params;
+ #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+ const uint32_t right_shift = (uint32_t) shift;
+ const uint64_t rounding = UINT64_C(1) << (right_shift - 1);
+ params.sse2.bias[0] = bias;
+ params.sse2.bias[1] = bias;
+ params.sse2.bias[2] = bias;
+ params.sse2.bias[3] = bias;
+ params.sse2.multiplier[0] = (uint32_t) multiplier;
+ params.sse2.multiplier[1] = (uint32_t) multiplier;
+ params.sse2.multiplier[2] = (uint32_t) multiplier;
+ params.sse2.multiplier[3] = (uint32_t) multiplier;
+ params.sse2.rounding[0] = rounding;
+ params.sse2.rounding[1] = rounding;
+ params.sse2.right_shift[0] = (uint64_t) right_shift;
+ params.sse2.right_shift[1] = (uint64_t) right_shift;
+ for (uint32_t i = 0; i < 8; i++) {
+ params.sse2.output_zero_point[i] = (int16_t) (uint16_t) output_zero_point;
+ }
+ for (uint32_t i = 0; i < 16; i++) {
+ params.sse2.output_max[i] = output_max;
+ params.sse2.output_min[i] = output_min;
+ }
+ #elif CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+ params.neon.bias = bias;
+ params.neon.multiplier = multiplier;
+ params.neon.left_shift = (int64_t) -shift;
+ params.neon.output_zero_point = (int16_t) (uint16_t) output_zero_point;
+ params.neon.output_max = output_max;
+ params.neon.output_min = output_min;
+ #else
+ const uint32_t right_shift = (uint32_t) shift;
+ const int64_t rounding = INT64_C(1) << (right_shift - 1);
+ params.scalar.bias = bias;
+ params.scalar.multiplier = multiplier;
+ params.scalar.rounding = rounding;
+ params.scalar.right_shift = right_shift;
+ params.scalar.output_min_less_zero_point =
+ (int32_t) (uint32_t) output_min - (int32_t) (uint32_t) output_zero_point;
+ params.scalar.output_max_less_zero_point =
+ (int32_t) (uint32_t) output_max - (int32_t) (uint32_t) output_zero_point;
+ params.scalar.output_zero_point = (int32_t) (uint32_t) output_zero_point;
+ #endif
+ return params;
+}
+
+static inline union xnn_q8_avgpool_params xnn_compute_scalar_q8_avgpool_params(
+ int32_t bias,
+ float scale,
+ uint8_t output_zero_point,
+ uint8_t output_min,
+ uint8_t output_max)
+{
+ /* Compute requantization parameters */
+ assert(scale >= 0x1.0p-32f);
+ assert(scale < 256.0f);
+ const uint32_t scale_bits = fp32_to_bits(scale);
+
+ /* Multiplier is in [0x00800000, 0x00FFFFFF] range */
+ const int32_t multiplier = ((int32_t) scale_bits & INT32_C(0x007FFFFF)) | INT32_C(0x00800000);
+ assert(multiplier >= INT32_C(0x00800000));
+ assert(multiplier <= INT32_C(0x00FFFFFF));
+
+ /* Shift is in [16, 55] range */
+ const int32_t shift = 127 + 23 - (scale_bits >> 23);
+ assert(shift >= 16);
+ assert(shift < 64);
+
+ union xnn_q8_avgpool_params params;
+ const uint32_t right_shift = (uint32_t) shift;
+ const int64_t rounding = INT64_C(1) << (right_shift - 1);
+ params.scalar.bias = bias;
+ params.scalar.rounding = rounding;
+ params.scalar.multiplier = multiplier;
+ params.scalar.right_shift = right_shift;
+ params.scalar.output_min_less_zero_point =
+ (int32_t) (uint32_t) output_min - (int32_t) (uint32_t) output_zero_point;
+ params.scalar.output_max_less_zero_point =
+ (int32_t) (uint32_t) output_max - (int32_t) (uint32_t) output_zero_point;
+ params.scalar.output_zero_point = (int32_t) (uint32_t) output_zero_point;
+ return params;
+}
+
+static inline void xnn_update_f32_avgpool_params(
+ union xnn_f32_avgpool_params* params,
+ float multiplier)
+{
+ #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+ for (uint32_t i = 0; i < 4; i++) {
+ params->sse2.multiplier[i] = multiplier;
+ }
+ #else
+ params->scalar.multiplier = multiplier;
+ #endif
+}
+
+static inline union xnn_f32_avgpool_params xnn_compute_f32_avgpool_params(
+ float multiplier,
+ float output_min,
+ float output_max)
+{
+ union xnn_f32_avgpool_params params;
+#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+ for (uint32_t i = 0; i < 4; i++) {
+ params.sse2.multiplier[i] = multiplier;
+ params.sse2.output_min[i] = output_min;
+ params.sse2.output_max[i] = output_max;
+ }
+#else
+ params.scalar.multiplier = multiplier;
+ params.scalar.output_min = output_min;
+ params.scalar.output_max = output_max;
+#endif
+ return params;
+}
+
+static inline union xnn_f32_gavgpool_params xnn_compute_f32_gavgpool_params(
+ float multiplier,
+ float output_min,
+ float output_max,
+ uint32_t width)
+{
+ union xnn_f32_gavgpool_params params;
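+ /* The lane masks below keep width % 4 lanes of the final 4-wide tile (all
+    four when width is a multiple of 4), presumably so that a row of `width`
+    floats can be reduced with full-width vector loads. */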
+ #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+ for (uint32_t i = 0; i < 4; i++) {
+ params.sse.multiplier[i] = multiplier;
+ params.sse.output_min[i] = output_min;
+ params.sse.output_max[i] = output_max;
+ }
+ switch (width % 4) {
+ case 0:
+ params.sse.mask[0] = UINT32_C(0xFFFFFFFF);
+ params.sse.mask[1] = UINT32_C(0xFFFFFFFF);
+ params.sse.mask[2] = UINT32_C(0xFFFFFFFF);
+ params.sse.mask[3] = UINT32_C(0xFFFFFFFF);
+ break;
+ case 1:
+ params.sse.mask[0] = UINT32_C(0xFFFFFFFF);
+ params.sse.mask[1] = 0;
+ params.sse.mask[2] = 0;
+ params.sse.mask[3] = 0;
+ break;
+ case 2:
+ params.sse.mask[0] = UINT32_C(0xFFFFFFFF);
+ params.sse.mask[1] = UINT32_C(0xFFFFFFFF);
+ params.sse.mask[2] = 0;
+ params.sse.mask[3] = 0;
+ break;
+ case 3:
+ params.sse.mask[0] = UINT32_C(0xFFFFFFFF);
+ params.sse.mask[1] = UINT32_C(0xFFFFFFFF);
+ params.sse.mask[2] = UINT32_C(0xFFFFFFFF);
+ params.sse.mask[3] = 0;
+ break;
+ }
+#elif CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+ switch (width % 4) {
+ case 0:
+ params.neon.mask[0] = UINT32_C(0xFFFFFFFF);
+ params.neon.mask[1] = UINT32_C(0xFFFFFFFF);
+ params.neon.mask[2] = UINT32_C(0xFFFFFFFF);
+ params.neon.mask[3] = UINT32_C(0xFFFFFFFF);
+ break;
+ case 1:
+ params.neon.mask[0] = UINT32_C(0xFFFFFFFF);
+ params.neon.mask[1] = 0;
+ params.neon.mask[2] = 0;
+ params.neon.mask[3] = 0;
+ break;
+ case 2:
+ params.neon.mask[0] = UINT32_C(0xFFFFFFFF);
+ params.neon.mask[1] = UINT32_C(0xFFFFFFFF);
+ params.neon.mask[2] = 0;
+ params.neon.mask[3] = 0;
+ break;
+ case 3:
+ params.neon.mask[0] = UINT32_C(0xFFFFFFFF);
+ params.neon.mask[1] = UINT32_C(0xFFFFFFFF);
+ params.neon.mask[2] = UINT32_C(0xFFFFFFFF);
+ params.neon.mask[3] = 0;
+ break;
+ }
+ params.neon.multiplier = multiplier;
+ params.neon.output_min = output_min;
+ params.neon.output_max = output_max;
+ #else
+ params.scalar.multiplier = multiplier;
+ params.scalar.output_min = output_min;
+ params.scalar.output_max = output_max;
+ #endif
+ return params;
+}
+
+static inline void xnn_update_f32_gavgpool_params(
+ union xnn_f32_gavgpool_params* params,
+ float multiplier,
+ uint32_t width)
+{
+ #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+ for (uint32_t i = 0; i < 4; i++) {
+ params->sse.multiplier[i] = multiplier;
+ }
+ switch (width % 4) {
+ case 0:
+ params->sse.mask[0] = UINT32_C(0xFFFFFFFF);
+ params->sse.mask[1] = UINT32_C(0xFFFFFFFF);
+ params->sse.mask[2] = UINT32_C(0xFFFFFFFF);
+ params->sse.mask[3] = UINT32_C(0xFFFFFFFF);
+ break;
+ case 1:
+ params->sse.mask[0] = UINT32_C(0xFFFFFFFF);
+ params->sse.mask[1] = 0;
+ params->sse.mask[2] = 0;
+ params->sse.mask[3] = 0;
+ break;
+ case 2:
+ params->sse.mask[0] = UINT32_C(0xFFFFFFFF);
+ params->sse.mask[1] = UINT32_C(0xFFFFFFFF);
+ params->sse.mask[2] = 0;
+ params->sse.mask[3] = 0;
+ break;
+ case 3:
+ params->sse.mask[0] = UINT32_C(0xFFFFFFFF);
+ params->sse.mask[1] = UINT32_C(0xFFFFFFFF);
+ params->sse.mask[2] = UINT32_C(0xFFFFFFFF);
+ params->sse.mask[3] = 0;
+ break;
+ }
+ #elif CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+ params->neon.multiplier = multiplier;
+ switch (width % 4) {
+ case 0:
+ params->neon.mask[0] = UINT32_C(0xFFFFFFFF);
+ params->neon.mask[1] = UINT32_C(0xFFFFFFFF);
+ params->neon.mask[2] = UINT32_C(0xFFFFFFFF);
+ params->neon.mask[3] = UINT32_C(0xFFFFFFFF);
+ break;
+ case 1:
+ params->neon.mask[0] = UINT32_C(0xFFFFFFFF);
+ params->neon.mask[1] = 0;
+ params->neon.mask[2] = 0;
+ params->neon.mask[3] = 0;
+ break;
+ case 2:
+ params->neon.mask[0] = UINT32_C(0xFFFFFFFF);
+ params->neon.mask[1] = UINT32_C(0xFFFFFFFF);
+ params->neon.mask[2] = 0;
+ params->neon.mask[3] = 0;
+ break;
+ case 3:
+ params->neon.mask[0] = UINT32_C(0xFFFFFFFF);
+ params->neon.mask[1] = UINT32_C(0xFFFFFFFF);
+ params->neon.mask[2] = UINT32_C(0xFFFFFFFF);
+ params->neon.mask[3] = 0;
+ break;
+ }
+ #endif
+}
+
+static inline union xnn_f32_avgpool_params xnn_compute_scalar_f32_avgpool_params(
+ float multiplier,
+ float output_min,
+ float output_max)
+{
+ union xnn_f32_avgpool_params params;
+ params.scalar.multiplier = multiplier;
+ params.scalar.output_min = output_min;
+ params.scalar.output_max = output_max;
+ return params;
+}
+
+static inline union xnn_f32_gavgpool_params xnn_compute_scalar_f32_gavgpool_params(
+ float multiplier,
+ float output_min,
+ float output_max,
+ uint32_t width)
+{
+ union xnn_f32_gavgpool_params params;
+ params.scalar.multiplier = multiplier;
+ params.scalar.output_min = output_min;
+ params.scalar.output_max = output_max;
+ return params;
+}
+
+static inline union xnn_f32_output_params xnn_compute_f32_output_params(
+ float output_min,
+ float output_max)
+{
+ union xnn_f32_output_params params;
+#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+ for (uint32_t i = 0; i < 4; i++) {
+ params.sse.min[i] = output_min;
+ params.sse.max[i] = output_max;
+ }
+#else
+ params.scalar.min = output_min;
+ params.scalar.max = output_max;
+#endif
+ return params;
+}
+
+static inline union xnn_f32_output_params xnn_compute_scalar_f32_output_params(
+ float output_min,
+ float output_max)
+{
+ union xnn_f32_output_params params;
+ params.scalar.min = output_min;
+ params.scalar.max = output_max;
+ return params;
+}
+
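+/* The constants below presumably evaluate h-swish as
+   y = x * max(0, min(1, x * (1/6) + 1/2));
+   0x1.555556p-3f is 1/6 rounded to the nearest single-precision value. */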
+static inline union xnn_f32_hswish_params xnn_compute_f32_hswish_params(void)
+{
+ union xnn_f32_hswish_params params;
+#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+ for (uint32_t i = 0; i < 4; i++) {
+ params.sse.sixth[i] = 0x1.555556p-3f;
+ params.sse.half[i] = 0.5f;
+ params.sse.one[i] = 1.0f;
+ }
+#else
+ params.scalar.sixth = 0x1.555556p-3f;
+ params.scalar.half = 0.5f;
+ params.scalar.one = 1.0f;
+#endif
+ return params;
+}
+
+static inline union xnn_f32_hswish_params xnn_compute_scalar_f32_hswish_params(void)
+{
+ union xnn_f32_hswish_params params;
+ params.scalar.sixth = 0x1.555556p-3f;
+ params.scalar.half = 0.5f;
+ params.scalar.one = 1.0f;
+ return params;
+}
+
+static inline union xnn_f32_spchw_params xnn_compute_f32_spchw_params(
+ uint32_t width,
+ float output_min,
+ float output_max)
+{
+ union xnn_f32_spchw_params params;
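+ /* mask keeps the valid lanes of the last partial 4-element tile of a row;
+    mask_even/mask_odd presumably cover the even and odd columns consumed by
+    stride-2 micro-kernels, hence their derivation from width % 8. */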
+#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+ switch (width % 4) {
+ case 0:
+ params.sse.mask[0] = UINT32_C(0xFFFFFFFF);
+ params.sse.mask[1] = UINT32_C(0xFFFFFFFF);
+ params.sse.mask[2] = UINT32_C(0xFFFFFFFF);
+ params.sse.mask[3] = UINT32_C(0xFFFFFFFF);
+ break;
+ case 1:
+ params.sse.mask[0] = UINT32_C(0xFFFFFFFF);
+ params.sse.mask[1] = 0;
+ params.sse.mask[2] = 0;
+ params.sse.mask[3] = 0;
+ break;
+ case 2:
+ params.sse.mask[0] = UINT32_C(0xFFFFFFFF);
+ params.sse.mask[1] = UINT32_C(0xFFFFFFFF);
+ params.sse.mask[2] = 0;
+ params.sse.mask[3] = 0;
+ break;
+ case 3:
+ params.sse.mask[0] = UINT32_C(0xFFFFFFFF);
+ params.sse.mask[1] = UINT32_C(0xFFFFFFFF);
+ params.sse.mask[2] = UINT32_C(0xFFFFFFFF);
+ params.sse.mask[3] = 0;
+ break;
+ }
+ switch (width % 8) {
+ case 0:
+ params.sse.mask_even[0] = UINT32_C(0xFFFFFFFF);
+ params.sse.mask_even[1] = UINT32_C(0xFFFFFFFF);
+ params.sse.mask_even[2] = UINT32_C(0xFFFFFFFF);
+ params.sse.mask_even[3] = UINT32_C(0xFFFFFFFF);
+ params.sse.mask_odd[0] = UINT32_C(0xFFFFFFFF);
+ params.sse.mask_odd[1] = UINT32_C(0xFFFFFFFF);
+ params.sse.mask_odd[2] = UINT32_C(0xFFFFFFFF);
+ params.sse.mask_odd[3] = UINT32_C(0xFFFFFFFF);
+ break;
+ case 1:
+ params.sse.mask_even[0] = UINT32_C(0xFFFFFFFF);
+ params.sse.mask_even[1] = 0;
+ params.sse.mask_even[2] = 0;
+ params.sse.mask_even[3] = 0;
+ params.sse.mask_odd[0] = 0;
+ params.sse.mask_odd[1] = 0;
+ params.sse.mask_odd[2] = 0;
+ params.sse.mask_odd[3] = 0;
+ break;
+ case 2:
+ params.sse.mask_even[0] = UINT32_C(0xFFFFFFFF);
+ params.sse.mask_even[1] = 0;
+ params.sse.mask_even[2] = 0;
+ params.sse.mask_even[3] = 0;
+ params.sse.mask_odd[0] = UINT32_C(0xFFFFFFFF);
+ params.sse.mask_odd[1] = 0;
+ params.sse.mask_odd[2] = 0;
+ params.sse.mask_odd[3] = 0;
+ break;
+ case 3:
+ params.sse.mask_even[0] = UINT32_C(0xFFFFFFFF);
+ params.sse.mask_even[1] = UINT32_C(0xFFFFFFFF);
+ params.sse.mask_even[2] = 0;
+ params.sse.mask_even[3] = 0;
+ params.sse.mask_odd[0] = UINT32_C(0xFFFFFFFF);
+ params.sse.mask_odd[1] = 0;
+ params.sse.mask_odd[2] = 0;
+ params.sse.mask_odd[3] = 0;
+ break;
+ case 4:
+ params.sse.mask_even[0] = UINT32_C(0xFFFFFFFF);
+ params.sse.mask_even[1] = UINT32_C(0xFFFFFFFF);
+ params.sse.mask_even[2] = 0;
+ params.sse.mask_even[3] = 0;
+ params.sse.mask_odd[0] = UINT32_C(0xFFFFFFFF);
+ params.sse.mask_odd[1] = UINT32_C(0xFFFFFFFF);
+ params.sse.mask_odd[2] = 0;
+ params.sse.mask_odd[3] = 0;
+ break;
+ case 5:
+ params.sse.mask_even[0] = UINT32_C(0xFFFFFFFF);
+ params.sse.mask_even[1] = UINT32_C(0xFFFFFFFF);
+ params.sse.mask_even[2] = UINT32_C(0xFFFFFFFF);
+ params.sse.mask_even[3] = 0;
+ params.sse.mask_odd[0] = UINT32_C(0xFFFFFFFF);
+ params.sse.mask_odd[1] = UINT32_C(0xFFFFFFFF);
+ params.sse.mask_odd[2] = 0;
+ params.sse.mask_odd[3] = 0;
+ break;
+ case 6:
+ params.sse.mask_even[0] = UINT32_C(0xFFFFFFFF);
+ params.sse.mask_even[1] = UINT32_C(0xFFFFFFFF);
+ params.sse.mask_even[2] = UINT32_C(0xFFFFFFFF);
+ params.sse.mask_even[3] = 0;
+ params.sse.mask_odd[0] = UINT32_C(0xFFFFFFFF);
+ params.sse.mask_odd[1] = UINT32_C(0xFFFFFFFF);
+ params.sse.mask_odd[2] = UINT32_C(0xFFFFFFFF);
+ params.sse.mask_odd[3] = 0;
+ break;
+ case 7:
+ params.sse.mask_even[0] = UINT32_C(0xFFFFFFFF);
+ params.sse.mask_even[1] = UINT32_C(0xFFFFFFFF);
+ params.sse.mask_even[2] = UINT32_C(0xFFFFFFFF);
+ params.sse.mask_even[3] = UINT32_C(0xFFFFFFFF);
+ params.sse.mask_odd[0] = UINT32_C(0xFFFFFFFF);
+ params.sse.mask_odd[1] = UINT32_C(0xFFFFFFFF);
+ params.sse.mask_odd[2] = UINT32_C(0xFFFFFFFF);
+ params.sse.mask_odd[3] = 0;
+ break;
+ }
+ for (uint32_t i = 0; i < 4; i++) {
+ params.sse.max[i] = output_max;
+ params.sse.min[i] = output_min;
+ }
+#elif CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+ switch (width % 4) {
+ case 0:
+ params.neon.mask[0] = UINT32_C(0xFFFFFFFF);
+ params.neon.mask[1] = UINT32_C(0xFFFFFFFF);
+ params.neon.mask[2] = UINT32_C(0xFFFFFFFF);
+ params.neon.mask[3] = UINT32_C(0xFFFFFFFF);
+ break;
+ case 1:
+ params.neon.mask[0] = UINT32_C(0xFFFFFFFF);
+ params.neon.mask[1] = 0;
+ params.neon.mask[2] = 0;
+ params.neon.mask[3] = 0;
+ break;
+ case 2:
+ params.neon.mask[0] = UINT32_C(0xFFFFFFFF);
+ params.neon.mask[1] = UINT32_C(0xFFFFFFFF);
+ params.neon.mask[2] = 0;
+ params.neon.mask[3] = 0;
+ break;
+ case 3:
+ params.neon.mask[0] = UINT32_C(0xFFFFFFFF);
+ params.neon.mask[1] = UINT32_C(0xFFFFFFFF);
+ params.neon.mask[2] = UINT32_C(0xFFFFFFFF);
+ params.neon.mask[3] = 0;
+ break;
+ }
+ switch (width % 8) {
+ case 0:
+ params.neon.mask_even[0] = UINT32_C(0xFFFFFFFF);
+ params.neon.mask_even[1] = UINT32_C(0xFFFFFFFF);
+ params.neon.mask_even[2] = UINT32_C(0xFFFFFFFF);
+ params.neon.mask_even[3] = UINT32_C(0xFFFFFFFF);
+ params.neon.mask_odd[0] = UINT32_C(0xFFFFFFFF);
+ params.neon.mask_odd[1] = UINT32_C(0xFFFFFFFF);
+ params.neon.mask_odd[2] = UINT32_C(0xFFFFFFFF);
+ params.neon.mask_odd[3] = UINT32_C(0xFFFFFFFF);
+ break;
+ case 1:
+ params.neon.mask_even[0] = UINT32_C(0xFFFFFFFF);
+ params.neon.mask_even[1] = 0;
+ params.neon.mask_even[2] = 0;
+ params.neon.mask_even[3] = 0;
+ params.neon.mask_odd[0] = 0;
+ params.neon.mask_odd[1] = 0;
+ params.neon.mask_odd[2] = 0;
+ params.neon.mask_odd[3] = 0;
+ break;
+ case 2:
+ params.neon.mask_even[0] = UINT32_C(0xFFFFFFFF);
+ params.neon.mask_even[1] = 0;
+ params.neon.mask_even[2] = 0;
+ params.neon.mask_even[3] = 0;
+ params.neon.mask_odd[0] = UINT32_C(0xFFFFFFFF);
+ params.neon.mask_odd[1] = 0;
+ params.neon.mask_odd[2] = 0;
+ params.neon.mask_odd[3] = 0;
+ break;
+ case 3:
+ params.neon.mask_even[0] = UINT32_C(0xFFFFFFFF);
+ params.neon.mask_even[1] = UINT32_C(0xFFFFFFFF);
+ params.neon.mask_even[2] = 0;
+ params.neon.mask_even[3] = 0;
+ params.neon.mask_odd[0] = UINT32_C(0xFFFFFFFF);
+ params.neon.mask_odd[1] = 0;
+ params.neon.mask_odd[2] = 0;
+ params.neon.mask_odd[3] = 0;
+ break;
+ case 4:
+ params.neon.mask_even[0] = UINT32_C(0xFFFFFFFF);
+ params.neon.mask_even[1] = UINT32_C(0xFFFFFFFF);
+ params.neon.mask_even[2] = 0;
+ params.neon.mask_even[3] = 0;
+ params.neon.mask_odd[0] = UINT32_C(0xFFFFFFFF);
+ params.neon.mask_odd[1] = UINT32_C(0xFFFFFFFF);
+ params.neon.mask_odd[2] = 0;
+ params.neon.mask_odd[3] = 0;
+ break;
+ case 5:
+ params.neon.mask_even[0] = UINT32_C(0xFFFFFFFF);
+ params.neon.mask_even[1] = UINT32_C(0xFFFFFFFF);
+ params.neon.mask_even[2] = UINT32_C(0xFFFFFFFF);
+ params.neon.mask_even[3] = 0;
+ params.neon.mask_odd[0] = UINT32_C(0xFFFFFFFF);
+ params.neon.mask_odd[1] = UINT32_C(0xFFFFFFFF);
+ params.neon.mask_odd[2] = 0;
+ params.neon.mask_odd[3] = 0;
+ break;
+ case 6:
+ params.neon.mask_even[0] = UINT32_C(0xFFFFFFFF);
+ params.neon.mask_even[1] = UINT32_C(0xFFFFFFFF);
+ params.neon.mask_even[2] = UINT32_C(0xFFFFFFFF);
+ params.neon.mask_even[3] = 0;
+ params.neon.mask_odd[0] = UINT32_C(0xFFFFFFFF);
+ params.neon.mask_odd[1] = UINT32_C(0xFFFFFFFF);
+ params.neon.mask_odd[2] = UINT32_C(0xFFFFFFFF);
+ params.neon.mask_odd[3] = 0;
+ break;
+ case 7:
+ params.neon.mask_even[0] = UINT32_C(0xFFFFFFFF);
+ params.neon.mask_even[1] = UINT32_C(0xFFFFFFFF);
+ params.neon.mask_even[2] = UINT32_C(0xFFFFFFFF);
+ params.neon.mask_even[3] = UINT32_C(0xFFFFFFFF);
+ params.neon.mask_odd[0] = UINT32_C(0xFFFFFFFF);
+ params.neon.mask_odd[1] = UINT32_C(0xFFFFFFFF);
+ params.neon.mask_odd[2] = UINT32_C(0xFFFFFFFF);
+ params.neon.mask_odd[3] = 0;
+ break;
+ }
+ params.neon.max = output_max;
+ params.neon.min = output_min;
+#else
+ params.scalar.max = output_max;
+ params.scalar.min = output_min;
+#endif
+ return params;
+}
+
+static inline void xnn_update_f32_spchw_params(
+ union xnn_f32_spchw_params* params,
+ uint32_t width)
+{
+ #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+ switch (width % 4) {
+ case 0:
+ params->sse.mask[0] = UINT32_C(0xFFFFFFFF);
+ params->sse.mask[1] = UINT32_C(0xFFFFFFFF);
+ params->sse.mask[2] = UINT32_C(0xFFFFFFFF);
+ params->sse.mask[3] = UINT32_C(0xFFFFFFFF);
+ break;
+ case 1:
+ params->sse.mask[0] = UINT32_C(0xFFFFFFFF);
+ params->sse.mask[1] = 0;
+ params->sse.mask[2] = 0;
+ params->sse.mask[3] = 0;
+ break;
+ case 2:
+ params->sse.mask[0] = UINT32_C(0xFFFFFFFF);
+ params->sse.mask[1] = UINT32_C(0xFFFFFFFF);
+ params->sse.mask[2] = 0;
+ params->sse.mask[3] = 0;
+ break;
+ case 3:
+ params->sse.mask[0] = UINT32_C(0xFFFFFFFF);
+ params->sse.mask[1] = UINT32_C(0xFFFFFFFF);
+ params->sse.mask[2] = UINT32_C(0xFFFFFFFF);
+ params->sse.mask[3] = 0;
+ break;
+ }
+ switch (width % 8) {
+ case 0:
+ params->sse.mask_even[0] = UINT32_C(0xFFFFFFFF);
+ params->sse.mask_even[1] = UINT32_C(0xFFFFFFFF);
+ params->sse.mask_even[2] = UINT32_C(0xFFFFFFFF);
+ params->sse.mask_even[3] = UINT32_C(0xFFFFFFFF);
+ params->sse.mask_odd[0] = UINT32_C(0xFFFFFFFF);
+ params->sse.mask_odd[1] = UINT32_C(0xFFFFFFFF);
+ params->sse.mask_odd[2] = UINT32_C(0xFFFFFFFF);
+ params->sse.mask_odd[3] = UINT32_C(0xFFFFFFFF);
+ break;
+ case 1:
+ params->sse.mask_even[0] = UINT32_C(0xFFFFFFFF);
+ params->sse.mask_even[1] = 0;
+ params->sse.mask_even[2] = 0;
+ params->sse.mask_even[3] = 0;
+ params->sse.mask_odd[0] = 0;
+ params->sse.mask_odd[1] = 0;
+ params->sse.mask_odd[2] = 0;
+ params->sse.mask_odd[3] = 0;
+ break;
+ case 2:
+ params->sse.mask_even[0] = UINT32_C(0xFFFFFFFF);
+ params->sse.mask_even[1] = 0;
+ params->sse.mask_even[2] = 0;
+ params->sse.mask_even[3] = 0;
+ params->sse.mask_odd[0] = UINT32_C(0xFFFFFFFF);
+ params->sse.mask_odd[1] = 0;
+ params->sse.mask_odd[2] = 0;
+ params->sse.mask_odd[3] = 0;
+ break;
+ case 3:
+ params->sse.mask_even[0] = UINT32_C(0xFFFFFFFF);
+ params->sse.mask_even[1] = UINT32_C(0xFFFFFFFF);
+ params->sse.mask_even[2] = 0;
+ params->sse.mask_even[3] = 0;
+ params->sse.mask_odd[0] = UINT32_C(0xFFFFFFFF);
+ params->sse.mask_odd[1] = 0;
+ params->sse.mask_odd[2] = 0;
+ params->sse.mask_odd[3] = 0;
+ break;
+ case 4:
+ params->sse.mask_even[0] = UINT32_C(0xFFFFFFFF);
+ params->sse.mask_even[1] = UINT32_C(0xFFFFFFFF);
+ params->sse.mask_even[2] = 0;
+ params->sse.mask_even[3] = 0;
+ params->sse.mask_odd[0] = UINT32_C(0xFFFFFFFF);
+ params->sse.mask_odd[1] = UINT32_C(0xFFFFFFFF);
+ params->sse.mask_odd[2] = 0;
+ params->sse.mask_odd[3] = 0;
+ break;
+ case 5:
+ params->sse.mask_even[0] = UINT32_C(0xFFFFFFFF);
+ params->sse.mask_even[1] = UINT32_C(0xFFFFFFFF);
+ params->sse.mask_even[2] = UINT32_C(0xFFFFFFFF);
+ params->sse.mask_even[3] = 0;
+ params->sse.mask_odd[0] = UINT32_C(0xFFFFFFFF);
+ params->sse.mask_odd[1] = UINT32_C(0xFFFFFFFF);
+ params->sse.mask_odd[2] = 0;
+ params->sse.mask_odd[3] = 0;
+ break;
+ case 6:
+ params->sse.mask_even[0] = UINT32_C(0xFFFFFFFF);
+ params->sse.mask_even[1] = UINT32_C(0xFFFFFFFF);
+ params->sse.mask_even[2] = UINT32_C(0xFFFFFFFF);
+ params->sse.mask_even[3] = 0;
+ params->sse.mask_odd[0] = UINT32_C(0xFFFFFFFF);
+ params->sse.mask_odd[1] = UINT32_C(0xFFFFFFFF);
+ params->sse.mask_odd[2] = UINT32_C(0xFFFFFFFF);
+ params->sse.mask_odd[3] = 0;
+ break;
+ case 7:
+ params->sse.mask_even[0] = UINT32_C(0xFFFFFFFF);
+ params->sse.mask_even[1] = UINT32_C(0xFFFFFFFF);
+ params->sse.mask_even[2] = UINT32_C(0xFFFFFFFF);
+ params->sse.mask_even[3] = UINT32_C(0xFFFFFFFF);
+ params->sse.mask_odd[0] = UINT32_C(0xFFFFFFFF);
+ params->sse.mask_odd[1] = UINT32_C(0xFFFFFFFF);
+ params->sse.mask_odd[2] = UINT32_C(0xFFFFFFFF);
+ params->sse.mask_odd[3] = 0;
+ break;
+ }
+ #elif CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+ switch (width % 4) {
+ case 0:
+ params->neon.mask[0] = UINT32_C(0xFFFFFFFF);
+ params->neon.mask[1] = UINT32_C(0xFFFFFFFF);
+ params->neon.mask[2] = UINT32_C(0xFFFFFFFF);
+ params->neon.mask[3] = UINT32_C(0xFFFFFFFF);
+ break;
+ case 1:
+ params->neon.mask[0] = UINT32_C(0xFFFFFFFF);
+ params->neon.mask[1] = 0;
+ params->neon.mask[2] = 0;
+ params->neon.mask[3] = 0;
+ break;
+ case 2:
+ params->neon.mask[0] = UINT32_C(0xFFFFFFFF);
+ params->neon.mask[1] = UINT32_C(0xFFFFFFFF);
+ params->neon.mask[2] = 0;
+ params->neon.mask[3] = 0;
+ break;
+ case 3:
+ params->neon.mask[0] = UINT32_C(0xFFFFFFFF);
+ params->neon.mask[1] = UINT32_C(0xFFFFFFFF);
+ params->neon.mask[2] = UINT32_C(0xFFFFFFFF);
+ params->neon.mask[3] = 0;
+ break;
+ }
+ switch (width % 8) {
+ case 0:
+ params->neon.mask_even[0] = UINT32_C(0xFFFFFFFF);
+ params->neon.mask_even[1] = UINT32_C(0xFFFFFFFF);
+ params->neon.mask_even[2] = UINT32_C(0xFFFFFFFF);
+ params->neon.mask_even[3] = UINT32_C(0xFFFFFFFF);
+ params->neon.mask_odd[0] = UINT32_C(0xFFFFFFFF);
+ params->neon.mask_odd[1] = UINT32_C(0xFFFFFFFF);
+ params->neon.mask_odd[2] = UINT32_C(0xFFFFFFFF);
+ params->neon.mask_odd[3] = UINT32_C(0xFFFFFFFF);
+ break;
+ case 1:
+ params->neon.mask_even[0] = UINT32_C(0xFFFFFFFF);
+ params->neon.mask_even[1] = 0;
+ params->neon.mask_even[2] = 0;
+ params->neon.mask_even[3] = 0;
+ params->neon.mask_odd[0] = 0;
+ params->neon.mask_odd[1] = 0;
+ params->neon.mask_odd[2] = 0;
+ params->neon.mask_odd[3] = 0;
+ break;
+ case 2:
+ params->neon.mask_even[0] = UINT32_C(0xFFFFFFFF);
+ params->neon.mask_even[1] = 0;
+ params->neon.mask_even[2] = 0;
+ params->neon.mask_even[3] = 0;
+ params->neon.mask_odd[0] = UINT32_C(0xFFFFFFFF);
+ params->neon.mask_odd[1] = 0;
+ params->neon.mask_odd[2] = 0;
+ params->neon.mask_odd[3] = 0;
+ break;
+ case 3:
+ params->neon.mask_even[0] = UINT32_C(0xFFFFFFFF);
+ params->neon.mask_even[1] = UINT32_C(0xFFFFFFFF);
+ params->neon.mask_even[2] = 0;
+ params->neon.mask_even[3] = 0;
+ params->neon.mask_odd[0] = UINT32_C(0xFFFFFFFF);
+ params->neon.mask_odd[1] = 0;
+ params->neon.mask_odd[2] = 0;
+ params->neon.mask_odd[3] = 0;
+ break;
+ case 4:
+ params->neon.mask_even[0] = UINT32_C(0xFFFFFFFF);
+ params->neon.mask_even[1] = UINT32_C(0xFFFFFFFF);
+ params->neon.mask_even[2] = 0;
+ params->neon.mask_even[3] = 0;
+ params->neon.mask_odd[0] = UINT32_C(0xFFFFFFFF);
+ params->neon.mask_odd[1] = UINT32_C(0xFFFFFFFF);
+ params->neon.mask_odd[2] = 0;
+ params->neon.mask_odd[3] = 0;
+ break;
+ case 5:
+ params->neon.mask_even[0] = UINT32_C(0xFFFFFFFF);
+ params->neon.mask_even[1] = UINT32_C(0xFFFFFFFF);
+ params->neon.mask_even[2] = UINT32_C(0xFFFFFFFF);
+ params->neon.mask_even[3] = 0;
+ params->neon.mask_odd[0] = UINT32_C(0xFFFFFFFF);
+ params->neon.mask_odd[1] = UINT32_C(0xFFFFFFFF);
+ params->neon.mask_odd[2] = 0;
+ params->neon.mask_odd[3] = 0;
+ break;
+ case 6:
+ params->neon.mask_even[0] = UINT32_C(0xFFFFFFFF);
+ params->neon.mask_even[1] = UINT32_C(0xFFFFFFFF);
+ params->neon.mask_even[2] = UINT32_C(0xFFFFFFFF);
+ params->neon.mask_even[3] = 0;
+ params->neon.mask_odd[0] = UINT32_C(0xFFFFFFFF);
+ params->neon.mask_odd[1] = UINT32_C(0xFFFFFFFF);
+ params->neon.mask_odd[2] = UINT32_C(0xFFFFFFFF);
+ params->neon.mask_odd[3] = 0;
+ break;
+ case 7:
+ params->neon.mask_even[0] = UINT32_C(0xFFFFFFFF);
+ params->neon.mask_even[1] = UINT32_C(0xFFFFFFFF);
+ params->neon.mask_even[2] = UINT32_C(0xFFFFFFFF);
+ params->neon.mask_even[3] = UINT32_C(0xFFFFFFFF);
+ params->neon.mask_odd[0] = UINT32_C(0xFFFFFFFF);
+ params->neon.mask_odd[1] = UINT32_C(0xFFFFFFFF);
+ params->neon.mask_odd[2] = UINT32_C(0xFFFFFFFF);
+ params->neon.mask_odd[3] = 0;
+ break;
+ }
+ #endif
+}
+
+static inline union xnn_f32_spchw_params xnn_compute_scalar_f32_spchw_params(
+ uint32_t width,
+ float output_min,
+ float output_max)
+{
+ union xnn_f32_spchw_params params;
+ params.scalar.max = output_max;
+ params.scalar.min = output_min;
+ return params;
+}
+
+static inline union xnn_u8_output_params xnn_compute_u8_output_params(
+ uint8_t output_min,
+ uint8_t output_max)
+{
+ assert(output_min < output_max);
+
+ union xnn_u8_output_params params;
+ #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+ for (uint32_t i = 0; i < 16; i++) {
+ params.sse2.max[i] = output_max;
+ params.sse2.min[i] = output_min;
+ }
+ #elif CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+ params.neon.max = output_max;
+ params.neon.min = output_min;
+ #else
+ params.scalar.min = (int32_t) (uint32_t) output_min;
+ params.scalar.max = (int32_t) (uint32_t) output_max;
+ #endif
+ return params;
+}
+
+static inline union xnn_u8_output_params xnn_compute_scalar_u8_output_params(
+ uint8_t output_min,
+ uint8_t output_max)
+{
+ assert(output_min < output_max);
+
+ union xnn_u8_output_params params;
+ params.scalar.min = (int32_t) (uint32_t) output_min;
+ params.scalar.max = (int32_t) (uint32_t) output_max;
+ return params;
+}
+
+static inline union xnn_q8_add_params xnn_compute_q8_add_params(
+ uint8_t a_zero_point,
+ uint8_t b_zero_point,
+ uint8_t output_zero_point,
+ float a_output_scale,
+ float b_output_scale,
+ uint8_t output_min,
+ uint8_t output_max)
+{
+ assert(a_output_scale >= 0x1.0p-14f);
+ assert(b_output_scale >= 0x1.0p-14f);
+ assert(a_output_scale < 0x1.0p+8f);
+ assert(b_output_scale < 0x1.0p+8f);
+
+ /* Compute requantization parameters */
+ const float max_output_scale = a_output_scale > b_output_scale ? a_output_scale : b_output_scale;
+ assert(max_output_scale >= 0x1.0p-14f);
+ assert(max_output_scale < 0x1.0p+8f);
+ const uint32_t max_scale_bits = fp32_to_bits(max_output_scale);
+ const int32_t max_scale_exponent = (int32_t) (max_scale_bits >> 23) - 127;
+ /* Shift is in [13, 31] range */
+ const uint32_t shift = (uint32_t) (21 - max_scale_exponent);
+ assert(shift < 32);
+ assert(shift >= 13);
+
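+ /* (21 - max_scale_exponent + 127) << 23 is the bit pattern of 2**shift, so
+    each multiplier below is round(scale * 2**shift). */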
+ const float scale_multiplier = fp32_from_bits((uint32_t) (21 - max_scale_exponent + 127) << 23);
+
+ /* Multipliers are in [0, 2**22) range, largest multiplier is in [2**21, 2**22) range */
+ const uint32_t a_multiplier = (uint32_t) (int32_t) __builtin_lrintf(a_output_scale * scale_multiplier);
+ const uint32_t b_multiplier = (uint32_t) (int32_t) __builtin_lrintf(b_output_scale * scale_multiplier);
+ assert((a_multiplier > b_multiplier ? a_multiplier : b_multiplier) >= UINT32_C(0x00200000));
+ assert(a_multiplier < UINT32_C(0x00400000));
+ assert(b_multiplier < UINT32_C(0x00400000));
+
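+ /*
+  * A sketch of the presumed fixed-point addition these parameters feed:
+  *   acc = a_multiplier * a + b_multiplier * b + zero_point_product
+  *   y   = clamp(asr_round(acc, shift) + y_zero_point, y_min, y_max)
+  * where zero_point_product pre-folds both input zero points into the bias.
+  */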
+ union xnn_q8_add_params params;
+ #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+ const uint32_t remainder_mask = (UINT32_C(1) << shift) - UINT32_C(1);
+ const uint32_t remainder_threshold = remainder_mask >> 1;
+ const int32_t zero_point_product =
+ (int32_t) -(a_multiplier * (uint32_t) a_zero_point + b_multiplier * (uint32_t) b_zero_point);
+ for (uint32_t i = 0; i < 4; i++) {
+ params.sse2.zero_point_product[i] = zero_point_product;
+ }
+ for (uint32_t i = 0; i < 8; i++) {
+ params.sse2.y_zero_point[i] = (int16_t) (uint16_t) output_zero_point;
+ }
+ for (uint32_t i = 0; i < 8; i++) {
+ params.sse2.a_multiplier_lo[i] = (uint16_t) (uint32_t) a_multiplier;
+ params.sse2.a_multiplier_hi[i] = (uint16_t) ((uint32_t) a_multiplier >> 16);
+ params.sse2.b_multiplier_lo[i] = (uint16_t) (uint32_t) b_multiplier;
+ params.sse2.b_multiplier_hi[i] = (uint16_t) ((uint32_t) b_multiplier >> 16);
+ }
+ params.sse2.a_multiplier = a_multiplier;
+ params.sse2.b_multiplier = b_multiplier;
+ for (uint32_t i = 0; i < 4; i++) {
+ params.sse2.remainder_mask[i] = remainder_mask;
+ params.sse2.remainder_threshold[i] = remainder_threshold;
+ }
+ params.sse2.shift = shift;
+ for (uint32_t i = 0; i < 16; i++) {
+ params.sse2.y_max[i] = output_max;
+ params.sse2.y_min[i] = output_min;
+ }
+ #elif CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+ params.neon.a_zero_point = a_zero_point;
+ params.neon.b_zero_point = b_zero_point;
+ params.neon.y_zero_point = (int16_t) (uint16_t) output_zero_point;
+ params.neon.a_multiplier = (int32_t) a_multiplier;
+ params.neon.b_multiplier = (int32_t) b_multiplier;
+ params.neon.right_shift = (int32_t) -shift;
+ params.neon.y_max = output_max;
+ params.neon.y_min = output_min;
+ #else
+ const uint32_t remainder_mask = (UINT32_C(1) << shift) - UINT32_C(1);
+ const uint32_t remainder_threshold = remainder_mask >> 1;
+ params.scalar.zero_point_product =
+ (int32_t) -(a_multiplier * (uint32_t) a_zero_point + b_multiplier * (uint32_t) b_zero_point);
+ params.scalar.a_multiplier = a_multiplier;
+ params.scalar.b_multiplier = b_multiplier;
+ params.scalar.remainder_mask = (int32_t) remainder_mask;
+ params.scalar.remainder_threshold = (int32_t) remainder_threshold;
+ params.scalar.shift = shift;
+ params.scalar.y_zero_point = (int32_t) (uint32_t) output_zero_point;
+ params.scalar.y_max = (int32_t) (uint32_t) output_max;
+ params.scalar.y_min = (int32_t) (uint32_t) output_min;
+ #endif
+ return params;
+}
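
The derivation above is easiest to check with concrete numbers: the larger of the two scales fixes the shift (21 minus its binary exponent), and each multiplier is then scale * 2^shift, so the larger multiplier always lands in [2^21, 2^22). Below is a minimal standalone sketch of the same arithmetic, not part of this release; float_as_bits is a memcpy stand-in for the fp32_to_bits helper used above, and ldexpf stands in for the bit-level scaling.

#include <assert.h>
#include <math.h>
#include <stdint.h>
#include <string.h>

/* Stand-in for fp32_to_bits from the fp16 dependency (assumption: same semantics). */
static uint32_t float_as_bits(float f) {
  uint32_t bits;
  memcpy(&bits, &f, sizeof(bits));
  return bits;
}

int main(void) {
  const float a_scale = 0.5f, b_scale = 0.125f;
  const float max_scale = a_scale > b_scale ? a_scale : b_scale;
  const int32_t max_exp = (int32_t) (float_as_bits(max_scale) >> 23) - 127;  /* -1 */
  const uint32_t shift = (uint32_t) (21 - max_exp);                          /* 22 */
  /* Each multiplier is scale * 2^shift; the larger one lands in [2^21, 2^22). */
  const uint32_t a_mult = (uint32_t) lrintf(ldexpf(a_scale, (int) shift));
  const uint32_t b_mult = (uint32_t) lrintf(ldexpf(b_scale, (int) shift));
  assert(a_mult == UINT32_C(0x00200000));  /* 0.5   * 2^22 = 2^21 */
  assert(b_mult == UINT32_C(0x00080000));  /* 0.125 * 2^22 = 2^19 */
  return 0;
}
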
+
+static inline union xnn_q8_add_params xnn_compute_scalar_q8_add_params(
+ uint8_t a_zero_point,
+ uint8_t b_zero_point,
+ uint8_t output_zero_point,
+ float a_output_scale,
+ float b_output_scale,
+ uint8_t output_min,
+ uint8_t output_max)
+{
+ assert(a_output_scale >= 0x1.0p-10f);
+ assert(b_output_scale >= 0x1.0p-10f);
+ assert(a_output_scale < 0x1.0p+8f);
+ assert(b_output_scale < 0x1.0p+8f);
+
+ /* Compute requantization parameters */
+ const float max_output_scale = a_output_scale > b_output_scale ? a_output_scale : b_output_scale;
+ assert(max_output_scale >= 0x1.0p-10f);
+ assert(max_output_scale < 0x1.0p+8f);
+ const uint32_t max_scale_bits = fp32_to_bits(max_output_scale);
+ const int32_t max_scale_exponent = (int32_t) (max_scale_bits >> 23) - 127;
+ /* Shift is in [13, 31] range */
+ const uint32_t shift = (uint32_t) (21 - max_scale_exponent);
+ assert(shift < 32);
+ assert(shift >= 13);
+
+ /* Multipliers are in [0, 2**22) range, largest multiplier is in [2**21, 2**22) range */
+ const uint32_t a_multiplier = (uint32_t) (int32_t) __builtin_lrintf(fp32_from_bits(fp32_to_bits(a_output_scale) + (shift << 23)));
+ const uint32_t b_multiplier = (uint32_t) (int32_t) __builtin_lrintf(fp32_from_bits(fp32_to_bits(b_output_scale) + (shift << 23)));
+ assert((a_multiplier > b_multiplier ? a_multiplier : b_multiplier) >= UINT32_C(0x00200000));
+ assert(a_multiplier < UINT32_C(0x00400000));
+ assert(b_multiplier < UINT32_C(0x00400000));
+
+ union xnn_q8_add_params params;
+ const uint32_t remainder_mask = (UINT32_C(1) << shift) - UINT32_C(1);
+ const uint32_t remainder_threshold = remainder_mask >> 1;
+ params.scalar.zero_point_product =
+ (int32_t) -(a_multiplier * (uint32_t) a_zero_point + b_multiplier * (uint32_t) b_zero_point);
+ params.scalar.a_multiplier = a_multiplier;
+ params.scalar.b_multiplier = b_multiplier;
+ params.scalar.remainder_mask = (int32_t) remainder_mask;
+ params.scalar.remainder_threshold = (int32_t) remainder_threshold;
+ params.scalar.shift = shift;
+ params.scalar.y_zero_point = (int32_t) (uint32_t) output_zero_point;
+ params.scalar.y_max = (int32_t) (uint32_t) output_max;
+ params.scalar.y_min = (int32_t) (uint32_t) output_min;
+ return params;
+}
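
One difference from the vectorized variant above: this scalar variant forms scale * 2^shift without a floating-point multiply, by adding shift << 23 directly to the scale's bit pattern, i.e. bumping the exponent field. A standalone sketch of that trick (valid while the input is a normal float and the result stays finite):

#include <assert.h>
#include <math.h>
#include <stdint.h>
#include <string.h>

int main(void) {
  const float scale = 0.3f;
  const uint32_t shift = 22;
  uint32_t bits;
  memcpy(&bits, &scale, sizeof(bits));
  bits += shift << 23;  /* add shift to the exponent field: multiply by 2^shift */
  float scaled;
  memcpy(&scaled, &bits, sizeof(scaled));
  assert(scaled == ldexpf(scale, (int) shift));  /* power-of-two scaling is exact */
  return 0;
}
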
+
+static inline union xnn_q31_requantization_params xnn_compute_scalar_requantization_params(
+ float scale,
+ uint8_t zero_point,
+ uint8_t min,
+ uint8_t max)
+{
+ /* Compute requantization parameters */
+ assert(scale < 1.0f);
+ assert(scale >= 0x1.0p-32f);
+ const uint32_t scale_bits = fp32_to_bits(scale);
+
+ /* Multiplier is in [0x40000000, 0x7FFFFF80] range */
+ const int32_t multiplier = (int32_t)(((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
+ assert(multiplier >= INT32_C(0x40000000));
+ assert(multiplier <= INT32_C(0x7FFFFF80));
+
+ /* Shift is in [0, 31] range */
+ const int32_t shift = 127 + 31 - 32 - (fp32_to_bits(scale) >> 23);
+ assert(shift >= 0);
+ assert(shift < 32);
+
+ union xnn_q31_requantization_params params;
+ const uint32_t remainder_mask = (UINT32_C(1) << shift) - UINT32_C(1);
+ const uint32_t remainder_threshold = remainder_mask >> 1;
+ params.scalar.multiplier = multiplier;
+ params.scalar.remainder_mask = (int32_t) remainder_mask;
+ params.scalar.remainder_threshold = (int32_t) remainder_threshold;
+ params.scalar.shift = (uint32_t) shift;
+ params.scalar.min_less_zero_point = (int32_t) (uint32_t) min - (int32_t) (uint32_t) zero_point;
+ params.scalar.max_less_zero_point = (int32_t) (uint32_t) max - (int32_t) (uint32_t) zero_point;
+ params.scalar.zero_point = (int32_t) (uint32_t) zero_point;
+ return params;
+}
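
The encoding here satisfies scale == multiplier * 2^-(31 + shift): the multiplier is the 24-bit significand (implicit leading one restored) shifted into [2^30, 2^31), and the shift absorbs the rest of the exponent. A standalone round-trip check of that identity; the bit extraction mirrors the fp32_to_bits call above, and the value is hypothetical:

#include <assert.h>
#include <math.h>
#include <stdint.h>
#include <string.h>

int main(void) {
  const float scale = 0x1.8p-9f;  /* 1.5 * 2^-9 */
  uint32_t bits;
  memcpy(&bits, &scale, sizeof(bits));
  const int32_t multiplier =
      (int32_t) (((bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
  const int32_t shift = 127 + 31 - 32 - (int32_t) (bits >> 23);
  /* The multiplier is exact in double, so the reconstruction is exact. */
  assert((double) multiplier * ldexp(1.0, -(31 + shift)) == (double) scale);
  return 0;
}
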
+
+static inline union xnn_q31_requantization_params xnn_compute_requantization_params(
+ float scale,
+ uint8_t zero_point,
+ uint8_t min,
+ uint8_t max)
+{
+ /* Compute requantization parameters */
+ const uint32_t scale_bits = fp32_to_bits(scale);
+
+ /* Multiplier is in [0x40000000, 0x7FFFFF80] range */
+ const int32_t multiplier = (int32_t)(((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
+ assert(multiplier >= INT32_C(0x40000000));
+ assert(multiplier <= INT32_C(0x7FFFFF80));
+
+ /* Shift is in [0, 31] range */
+ const int32_t shift = 127 + 31 - 32 - (fp32_to_bits(scale) >> 23);
+ assert(shift >= 0);
+ assert(shift < 32);
+
+ union xnn_q31_requantization_params params;
+ #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+ const uint32_t remainder_mask = (UINT32_C(1) << shift) - UINT32_C(1);
+ const uint32_t remainder_threshold = remainder_mask >> 1;
+ params.sse2.multiplier[0] = multiplier;
+ params.sse2.multiplier[1] = multiplier;
+ params.sse2.multiplier[2] = multiplier;
+ params.sse2.multiplier[3] = multiplier;
+ params.sse2.rounding[0] = UINT64_C(0x40000000);
+ params.sse2.rounding[1] = UINT64_C(0x40000000);
+ params.sse2.remainder_mask[0] = (int32_t) remainder_mask;
+ params.sse2.remainder_mask[1] = (int32_t) remainder_mask;
+ params.sse2.remainder_mask[2] = (int32_t) remainder_mask;
+ params.sse2.remainder_mask[3] = (int32_t) remainder_mask;
+ params.sse2.remainder_threshold[0] = (int32_t) remainder_threshold;
+ params.sse2.remainder_threshold[1] = (int32_t) remainder_threshold;
+ params.sse2.remainder_threshold[2] = (int32_t) remainder_threshold;
+ params.sse2.remainder_threshold[3] = (int32_t) remainder_threshold;
+ params.sse2.shift[0] = (uint64_t) (uint32_t) shift;
+ params.sse2.shift[1] = (uint64_t) (uint32_t) shift;
+ for (uint32_t i = 0; i < 8; i++) {
+ params.sse2.zero_point[i] = (int16_t) (uint16_t) zero_point;
+ }
+ for (uint32_t i = 0; i < 16; i++) {
+ params.sse2.max[i] = max;
+ params.sse2.min[i] = min;
+ }
+ #elif CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+ params.neon.multiplier = multiplier;
+ params.neon.right_shift = -shift;
+ params.neon.zero_point = (int16_t) (uint16_t) zero_point;
+ params.neon.max = max;
+ params.neon.min = min;
+ #else
+ const uint32_t remainder_mask = (UINT32_C(1) << shift) - UINT32_C(1);
+ const uint32_t remainder_threshold = remainder_mask >> 1;
+ params.scalar.multiplier = multiplier;
+ params.scalar.remainder_mask = (int32_t) remainder_mask;
+ params.scalar.remainder_threshold = (int32_t) remainder_threshold;
+ params.scalar.shift = (uint32_t) shift;
+ params.scalar.min_less_zero_point = (int32_t) (uint32_t) min - (int32_t) (uint32_t) zero_point;
+ params.scalar.max_less_zero_point = (int32_t) (uint32_t) max - (int32_t) (uint32_t) zero_point;
+ params.scalar.zero_point = (int32_t) (uint32_t) zero_point;
+ #endif
+ return params;
+}
+
+static inline uint8_t xnn_q31_requantize(
+ int32_t n,
+ union xnn_q31_requantization_params params)
+{
+ const int64_t product = (int64_t) n * (int64_t) params.scalar.multiplier;
+ const int32_t q31product = (int32_t) (uint32_t) ((uint64_t) (product + INT64_C(0x40000000)) >> 31);
+ const int32_t remainder = (q31product & params.scalar.remainder_mask) - (int32_t) (n < 0);
+ n = asr_s32(q31product, params.scalar.shift) + (int32_t) (remainder > params.scalar.remainder_threshold);
+ if (n < params.scalar.min_less_zero_point) {
+ n = params.scalar.min_less_zero_point;
+ }
+ if (n > params.scalar.max_less_zero_point) {
+ n = params.scalar.max_less_zero_point;
+ }
+
+ return (uint8_t) (n + params.scalar.zero_point);
+}
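
Taken together, this computes round(n * scale) with ties away from zero, clamps to the representable range, and re-biases by the zero point. A usage sketch under that reading, with hypothetical values (assumes this header and <assert.h> are included):

static void q31_requantize_example(void) {
  const union xnn_q31_requantization_params rq_params =
      xnn_compute_scalar_requantization_params(0x1.0p-8f, 128, 0, 255);
  /* round(1000 * 2^-8) + 128 = round(3.90625) + 128 = 4 + 128 = 132 */
  assert(xnn_q31_requantize(1000, rq_params) == 132);
}
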
+
+static inline uint8_t xnn_avgpool_quantize(
+ int32_t n,
+ union xnn_q8_avgpool_params params)
+{
+ const int64_t product = (int64_t) n * (int64_t) params.scalar.multiplier;
+ const int64_t adjusted_product = product - (int64_t) (n < 0);
+
+ n = (int32_t) asr_s64(adjusted_product + params.scalar.rounding, params.scalar.right_shift);
+ if (n < params.scalar.output_min_less_zero_point) {
+ n = params.scalar.output_min_less_zero_point;
+ }
+ if (n > params.scalar.output_max_less_zero_point) {
+ n = params.scalar.output_max_less_zero_point;
+ }
+
+ return (uint8_t) (n + params.scalar.output_zero_point);
+}
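
Reading off the field uses above, the result is (n * multiplier + rounding) >> right_shift with rounding = 2^(right_shift - 1), then clamp and re-bias. The params fill below exists only to make that arithmetic concrete; in the library these fields are produced by a params helper, any fields this function does not read are left untouched, and all values are hypothetical:

static void avgpool_quantize_example(void) {
  union xnn_q8_avgpool_params ap_params;
  ap_params.scalar.multiplier = 58254;           /* round((1/9) * 2^19) */
  ap_params.scalar.rounding = INT64_C(1) << 18;  /* half of 2^19 */
  ap_params.scalar.right_shift = 19;
  ap_params.scalar.output_min_less_zero_point = 0;
  ap_params.scalar.output_max_less_zero_point = 255;
  ap_params.scalar.output_zero_point = 0;
  /* 9 pixels of value 90 accumulate to 810; 810 / 9 = 90 */
  assert(xnn_avgpool_quantize(9 * 90, ap_params) == 90);
}
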
+
+static inline uint8_t xnn_add_quantize(
+ uint8_t a, uint8_t b,
+ union xnn_q8_add_params params)
+{
+ /* Multiply by factors and accumulate products */
+ int32_t acc = params.scalar.zero_point_product +
+ (int32_t) ((uint32_t) a * params.scalar.a_multiplier) +
+ (int32_t) ((uint32_t) b * params.scalar.b_multiplier);
+
+ /* Shift right and round */
+ const int32_t rem = (acc & params.scalar.remainder_mask) - (int32_t) (acc < 0);
+ acc = asr_s32(acc, params.scalar.shift) + (int32_t) (rem > params.scalar.remainder_threshold);
+
+ /* Clamp and add output zero point */
+ int32_t y = acc + params.scalar.y_zero_point;
+ if (y >= params.scalar.y_max) {
+ y = params.scalar.y_max;
+ }
+ if (y <= params.scalar.y_min) {
+ y = params.scalar.y_min;
+ }
+ return (uint8_t) y;
+}
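
An end-to-end sketch of the scalar add path: compute params for two hypothetical input quantizations, requantize one element pair, and compare against the float reference (assumes this header and <assert.h> are included):

static void add_quantize_example(void) {
  /* a and b were quantized with zero points 10 and 20; the output uses
     zero point 128 over the full [0, 255] range (hypothetical values). */
  const union xnn_q8_add_params add_params = xnn_compute_scalar_q8_add_params(
      10, 20, 128, 0.5f, 0.25f, 0, 255);
  /* Float reference: (30 - 10) * 0.5 + (24 - 20) * 0.25 + 128 = 139 */
  assert(xnn_add_quantize(30, 24, add_params) == 139);
}
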
diff --git a/src/xnnpack/rmax.h b/src/xnnpack/rmax.h
new file mode 100644
index 0000000..25f6e32
--- /dev/null
+++ b/src/xnnpack/rmax.h
@@ -0,0 +1,47 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <xnnpack/common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#define DECLARE_F32_RMAX_UKERNEL_FUNCTION(fn_name) \
+ XNN_INTERNAL void fn_name( \
+ size_t n, \
+ const float* x, \
+ float* y);
+
+DECLARE_F32_RMAX_UKERNEL_FUNCTION(xnn_f32_rmax_ukernel__avx)
+DECLARE_F32_RMAX_UKERNEL_FUNCTION(xnn_f32_rmax_ukernel__avx512f)
+DECLARE_F32_RMAX_UKERNEL_FUNCTION(xnn_f32_rmax_ukernel__neon)
+DECLARE_F32_RMAX_UKERNEL_FUNCTION(xnn_f32_rmax_ukernel__scalar)
+DECLARE_F32_RMAX_UKERNEL_FUNCTION(xnn_f32_rmax_ukernel__sse)
+
+
+#define DECLARE_U8_RMAX_UKERNEL_FUNCTION(fn_name) \
+ XNN_INTERNAL void fn_name( \
+ size_t n, \
+ const uint8_t* x, \
+ uint8_t* y);
+
+DECLARE_U8_RMAX_UKERNEL_FUNCTION(xnn_u8_rmax_ukernel__neon)
+DECLARE_U8_RMAX_UKERNEL_FUNCTION(xnn_u8_rmax_ukernel__scalar)
+DECLARE_U8_RMAX_UKERNEL_FUNCTION(xnn_u8_rmax_ukernel__sse2)
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
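
The rmax microkernels reduce a contiguous array to its maximum, written to a single output element. Below is a reference sketch of the u8 contract, under the assumption (not stated in this header) that n is a non-zero byte count; the shipped implementations live elsewhere in this release.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* Reference semantics sketch for the u8 rmax contract (not the shipped
   kernel). Assumption: n != 0 and n counts bytes, which for u8 equals
   the element count. */
static void u8_rmax_reference(size_t n, const uint8_t* x, uint8_t* y) {
  assert(n != 0);
  uint8_t vmax = 0;
  do {
    const uint8_t vx = *x++;
    vmax = vx > vmax ? vx : vmax;
  } while (--n != 0);
  *y = vmax;
}
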
diff --git a/src/xnnpack/scalar-utils.h b/src/xnnpack/scalar-utils.h
new file mode 100644
index 0000000..88d30c8
--- /dev/null
+++ b/src/xnnpack/scalar-utils.h
@@ -0,0 +1,121 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+ #include <climits>
+ #include <cstdint>
+ #include <cstdbool>
+ #include <cassert>
+#else
+ #include <limits.h>
+ #include <stdint.h>
+ #include <stdbool.h>
+ #include <assert.h>
+#endif
+
+#include <fp16.h>
+
+#if defined(__clang__) && !defined(__pnacl__)
+ #if __clang_major__ == 3 && __clang_minor__ >= 7 || __clang_major__ > 3
+ #define XNN_IGNORE_SHIFT_BASE_UB __attribute__((__no_sanitize__("shift-base")))
+ #else
+ #define XNN_IGNORE_SHIFT_BASE_UB
+ #endif
+#elif defined(__GNUC__)
+ #if __GNUC__ >= 8
+ #define XNN_IGNORE_SHIFT_BASE_UB __attribute__((__no_sanitize__("shift-base")))
+ #elif __GNUC__ == 4 && __GNUC_MINOR__ >= 9 || __GNUC__ > 4
+    /* gcc 4.9 through 7 supports UBSan, but does not support the no_sanitize attribute */
+ #define XNN_IGNORE_SHIFT_BASE_UB
+ #ifndef XNN_USE_SHIFT_BASE_UB_WORKAROUND
+ #define XNN_USE_SHIFT_BASE_UB_WORKAROUND 1
+ #endif
+ #else
+ #define XNN_IGNORE_SHIFT_BASE_UB
+ #endif
+#else
+ #define XNN_IGNORE_SHIFT_BASE_UB
+#endif
+
+XNN_IGNORE_SHIFT_BASE_UB
+inline static int32_t asr_s32(int32_t x, uint32_t n) {
+ #ifdef XNN_USE_SHIFT_BASE_UB_WORKAROUND
+ #if defined(__x86_64__) || defined(__aarch64__)
+ return (int32_t) ((uint64_t) (int64_t) x >> n);
+ #else
+ return x >= 0 ? x >> n : ~(~x >> n);
+ #endif
+ #else
+ return x >> n;
+ #endif
+}
+
+XNN_IGNORE_SHIFT_BASE_UB
+inline static int64_t asr_s64(int64_t x, uint32_t n) {
+ #ifdef XNN_USE_SHIFT_BASE_UB_WORKAROUND
+ return x >= 0 ? x >> n : ~(~x >> n);
+ #else
+ return x >> n;
+ #endif
+}
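
Both helpers fall back to ~(~x >> n) where shifting a negative base would be flagged: for negative x this reproduces the arithmetic shift (flooring towards negative infinity) while only ever shifting non-negative values. A standalone demonstration of the identity:

#include <assert.h>
#include <stdint.h>

int main(void) {
  const int32_t x = INT32_C(-7);
  /* Arithmetic shift right floors towards negative infinity: -7 >> 1 == -4 ... */
  assert(~(~x >> 1) == INT32_C(-4));
  /* ... whereas integer division truncates towards zero: -7 / 2 == -3. */
  assert(x / 2 == INT32_C(-3));
  return 0;
}
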
+
+inline static uint8_t scalar_requantize_precise(
+ int32_t value,
+ float scale,
+ uint8_t zero_point,
+ uint8_t qmin,
+ uint8_t qmax)
+{
+ assert(scale < 1.0f);
+ assert(scale >= 0x1.0p-32f);
+
+ const uint32_t scale_bits = fp32_to_bits(scale);
+ const uint32_t multiplier = (scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000);
+ const uint32_t shift = 127 + 23 - (scale_bits >> 23);
+ assert(shift >= 24);
+ assert(shift < 56);
+
+ /*
+ * Compute absolute value of input as unsigned 32-bit int.
+ * All further computations will work with unsigned values to avoid undefined behaviour on signed operations.
+ */
+ const uint32_t abs_value = (value >= 0) ? (uint32_t) value : -(uint32_t) value;
+
+ /* Compute full 64-bit product of 32-bit factors */
+ const uint64_t product = (uint64_t) abs_value * (uint64_t) multiplier;
+
+ /*
+   * Shift the full 64-bit product right with rounding.
+   * Rounding is performed towards the closest integer, with midpoints rounded up.
+   * Because the product here is non-negative, rounding up equals rounding away from zero.
+ */
+ const uint64_t rounding = UINT64_C(1) << (shift - 1);
+ const uint32_t abs_scaled_value = (uint32_t) ((product + rounding) >> shift);
+
+ /*
+ * Copy the sign of input to scaled absolute input value.
+ */
+ const int32_t scaled_value = (int32_t) (value >= 0 ? abs_scaled_value : -abs_scaled_value);
+
+ /* Clamp scaled value with zero point between smin and smax */
+ int32_t clamped_value = scaled_value;
+ const int32_t smin = (int32_t) (uint32_t) qmin - (int32_t) (uint32_t) zero_point;
+ if (clamped_value < smin) {
+ clamped_value = smin;
+ }
+ const int32_t smax = (int32_t) (uint32_t) qmax - (int32_t) (uint32_t) zero_point;
+ if (clamped_value > smax) {
+ clamped_value = smax;
+ }
+
+ /* Add zero point to clamped value */
+ const int32_t biased_value = clamped_value + (int32_t) (uint32_t) zero_point;
+
+ return biased_value;
+}
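
A usage sketch of the rounding behaviour, with hypothetical values (assumes this header and <assert.h> are included):

static void requantize_precise_example(void) {
  /* 1000 * 2^-8 = 3.90625 rounds to 4; biased by zero point 128 -> 132. */
  assert(scalar_requantize_precise(1000, 0x1.0p-8f, 128, 0, 255) == 132);
  /* Midpoints round away from zero: -896 * 2^-8 = -3.5 -> -4 -> 124. */
  assert(scalar_requantize_precise(-896, 0x1.0p-8f, 128, 0, 255) == 124);
}
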
diff --git a/src/xnnpack/spmm.h b/src/xnnpack/spmm.h
new file mode 100644
index 0000000..7ea16bf
--- /dev/null
+++ b/src/xnnpack/spmm.h
@@ -0,0 +1,66 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <xnnpack/params.h>
+#include <xnnpack/common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#define DECLARE_F32_SPMM_UKERNEL_FUNCTION(fn_name) \
+ XNN_INTERNAL void fn_name( \
+ uint32_t m, \
+ uint32_t n, \
+ const float* a, \
+ const float* w, \
+ const int32_t* dmap, \
+ const uint32_t* nmap, \
+ float* c, \
+ const union xnn_f32_output_params* params);
+
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_12x1__neonfma)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_12x2__neonfma)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_12x4__neonfma)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_16x1__neonfma)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_16x2__neonfma)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_16x4__neonfma)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_16x1__neonfma_pipelined)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_16x1__neonfma_unroll2)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_1x1__scalar)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_1x1__scalar_pipelined)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_1x1__scalar_unroll2)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_2x1__scalar)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_2x1__scalar_pipelined)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_2x1__scalar_unroll2)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_4x1__neonfma)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_4x2__neonfma)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_4x4__neonfma)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_4x1__neonfma_pipelined)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_4x1__neonfma_unroll2)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_4x1__scalar)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_4x1__scalar_pipelined)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_4x1__scalar_unroll2)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_4x1__sse)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_8x1__neonfma)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_8x2__neonfma)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_8x4__neonfma)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_8x1__neonfma_pipelined)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_8x1__neonfma_unroll2)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_8x1__scalar)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_8x1__scalar_pipelined)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_8x1__scalar_unroll2)
+DECLARE_F32_SPMM_UKERNEL_FUNCTION(xnn_f32_spmm_ukernel_8x1__sse)
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
diff --git a/src/xnnpack/unpool.h b/src/xnnpack/unpool.h
new file mode 100644
index 0000000..c02457a
--- /dev/null
+++ b/src/xnnpack/unpool.h
@@ -0,0 +1,34 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <xnnpack/params.h>
+#include <xnnpack/common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#define DECLARE_X32_UNPOOL_UKERNEL_FUNCTION(fn_name) \
+ XNN_INTERNAL void fn_name( \
+ size_t p, \
+ size_t c, \
+ uint32_t f, \
+ const uint32_t* input, \
+ const uint32_t* index, \
+ uint32_t** output);
+
+DECLARE_X32_UNPOOL_UKERNEL_FUNCTION(xnn_x32_unpool_ukernel__psimd)
+DECLARE_X32_UNPOOL_UKERNEL_FUNCTION(xnn_x32_unpool_ukernel__scalar)
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
diff --git a/src/xnnpack/vadd.h b/src/xnnpack/vadd.h
new file mode 100644
index 0000000..a66d171
--- /dev/null
+++ b/src/xnnpack/vadd.h
@@ -0,0 +1,51 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <xnnpack/params.h>
+#include <xnnpack/common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#define DECLARE_F32_VADD_UKERNEL_FUNCTION(fn_name) \
+ XNN_INTERNAL void fn_name( \
+ size_t n, \
+ const float* a, \
+ const float* b, \
+ float* y, \
+ const union xnn_f32_output_params* params);
+
+DECLARE_F32_VADD_UKERNEL_FUNCTION(xnn_f32_vadd_ukernel__neon)
+DECLARE_F32_VADD_UKERNEL_FUNCTION(xnn_f32_vadd_ukernel__psimd)
+DECLARE_F32_VADD_UKERNEL_FUNCTION(xnn_f32_vadd_ukernel__scalar)
+DECLARE_F32_VADD_UKERNEL_FUNCTION(xnn_f32_vadd_ukernel__sse)
+
+
+#define DECLARE_Q8_VADD_UKERNEL_FUNCTION(fn_name) \
+ XNN_INTERNAL void fn_name( \
+ size_t n, \
+ const uint8_t* a, \
+ const uint8_t* b, \
+ uint8_t* y, \
+ const union xnn_q8_add_params* params);
+
+DECLARE_Q8_VADD_UKERNEL_FUNCTION(xnn_q8_vadd_ukernel__neon)
+DECLARE_Q8_VADD_UKERNEL_FUNCTION(xnn_q8_vadd_ukernel__scalar)
+DECLARE_Q8_VADD_UKERNEL_FUNCTION(xnn_q8_vadd_ukernel__sse2)
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
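
These elementwise microkernels, like the vmul and vsub families declared in the headers that follow with identical signatures, apply the binary operation lane by lane and clamp the result with the output params. Below is a reference sketch of the f32 vadd contract under two assumptions this header does not state: n is a byte count, and the scalar variant of xnn_f32_output_params (from params.h) carries min/max bounds. The shipped kernels live elsewhere in this release.

#include <assert.h>
#include <math.h>
#include <stddef.h>

#include <xnnpack/params.h>

/* Reference semantics sketch for the f32 vadd contract (not the shipped
   kernel). Assumptions: n counts bytes, and params->scalar exposes
   min/max clamping bounds. */
static void f32_vadd_reference(
    size_t n,
    const float* a,
    const float* b,
    float* y,
    const union xnn_f32_output_params* params)
{
  assert(n != 0);
  assert(n % sizeof(float) == 0);
  const float vmin = params->scalar.min;
  const float vmax = params->scalar.max;
  for (; n != 0; n -= sizeof(float)) {
    const float vacc = *a++ + *b++;
    *y++ = fminf(fmaxf(vacc, vmin), vmax);
  }
}
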
diff --git a/src/xnnpack/vmul.h b/src/xnnpack/vmul.h
new file mode 100644
index 0000000..9747de8
--- /dev/null
+++ b/src/xnnpack/vmul.h
@@ -0,0 +1,35 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <xnnpack/params.h>
+#include <xnnpack/common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#define DECLARE_F32_VMUL_UKERNEL_FUNCTION(fn_name) \
+ XNN_INTERNAL void fn_name( \
+ size_t n, \
+ const float* a, \
+ const float* b, \
+ float* y, \
+ const union xnn_f32_output_params* params);
+
+DECLARE_F32_VMUL_UKERNEL_FUNCTION(xnn_f32_vmul_ukernel__neon)
+DECLARE_F32_VMUL_UKERNEL_FUNCTION(xnn_f32_vmul_ukernel__psimd)
+DECLARE_F32_VMUL_UKERNEL_FUNCTION(xnn_f32_vmul_ukernel__scalar)
+DECLARE_F32_VMUL_UKERNEL_FUNCTION(xnn_f32_vmul_ukernel__sse)
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
diff --git a/src/xnnpack/vmulcaddc.h b/src/xnnpack/vmulcaddc.h
new file mode 100644
index 0000000..a37e747
--- /dev/null
+++ b/src/xnnpack/vmulcaddc.h
@@ -0,0 +1,39 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <xnnpack/params.h>
+#include <xnnpack/common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#define DECLARE_F32_VMULCADDC_UKERNEL_FUNCTION(fn_name) \
+ XNN_INTERNAL void fn_name( \
+ size_t m, \
+ size_t c, \
+ const float* x, \
+ size_t x_stride, \
+ const float* w, \
+ float* y, \
+ size_t y_stride, \
+ const union xnn_f32_output_params* params);
+
+DECLARE_F32_VMULCADDC_UKERNEL_FUNCTION(xnn_f32_vmulcaddc_ukernel_c1__scalar_x2)
+DECLARE_F32_VMULCADDC_UKERNEL_FUNCTION(xnn_f32_vmulcaddc_ukernel_c4__neon_x2)
+DECLARE_F32_VMULCADDC_UKERNEL_FUNCTION(xnn_f32_vmulcaddc_ukernel_c4__neonfma_x2)
+DECLARE_F32_VMULCADDC_UKERNEL_FUNCTION(xnn_f32_vmulcaddc_ukernel_c4__psimd_x2)
+DECLARE_F32_VMULCADDC_UKERNEL_FUNCTION(xnn_f32_vmulcaddc_ukernel_c4__sse_x2)
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
diff --git a/src/xnnpack/vsub.h b/src/xnnpack/vsub.h
new file mode 100644
index 0000000..e444eb6
--- /dev/null
+++ b/src/xnnpack/vsub.h
@@ -0,0 +1,35 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <xnnpack/params.h>
+#include <xnnpack/common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#define DECLARE_F32_VSUB_UKERNEL_FUNCTION(fn_name) \
+ XNN_INTERNAL void fn_name( \
+ size_t n, \
+ const float* a, \
+ const float* b, \
+ float* y, \
+ const union xnn_f32_output_params* params);
+
+DECLARE_F32_VSUB_UKERNEL_FUNCTION(xnn_f32_vsub_ukernel__neon)
+DECLARE_F32_VSUB_UKERNEL_FUNCTION(xnn_f32_vsub_ukernel__psimd)
+DECLARE_F32_VSUB_UKERNEL_FUNCTION(xnn_f32_vsub_ukernel__scalar)
+DECLARE_F32_VSUB_UKERNEL_FUNCTION(xnn_f32_vsub_ukernel__sse)
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
diff --git a/src/xnnpack/zip.h b/src/xnnpack/zip.h
new file mode 100644
index 0000000..48b164e
--- /dev/null
+++ b/src/xnnpack/zip.h
@@ -0,0 +1,86 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <xnnpack/params.h>
+#include <xnnpack/common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#define DECLARE_X8_ZIPC_UKERNEL_FUNCTION(fn_name) \
+ XNN_INTERNAL void fn_name( \
+ size_t n, \
+ const uint8_t* x, \
+ uint8_t* y);
+
+DECLARE_X8_ZIPC_UKERNEL_FUNCTION(xnn_x8_zip_x2_ukernel__neon)
+DECLARE_X8_ZIPC_UKERNEL_FUNCTION(xnn_x8_zip_x2_ukernel__sse2)
+DECLARE_X8_ZIPC_UKERNEL_FUNCTION(xnn_x8_zip_x2_ukernel__scalar)
+DECLARE_X8_ZIPC_UKERNEL_FUNCTION(xnn_x8_zip_x3_ukernel__neon)
+DECLARE_X8_ZIPC_UKERNEL_FUNCTION(xnn_x8_zip_x3_ukernel__sse2)
+DECLARE_X8_ZIPC_UKERNEL_FUNCTION(xnn_x8_zip_x3_ukernel__scalar)
+DECLARE_X8_ZIPC_UKERNEL_FUNCTION(xnn_x8_zip_x4_ukernel__neon)
+DECLARE_X8_ZIPC_UKERNEL_FUNCTION(xnn_x8_zip_x4_ukernel__sse2)
+DECLARE_X8_ZIPC_UKERNEL_FUNCTION(xnn_x8_zip_x4_ukernel__scalar)
+
+
+#define DECLARE_X32_ZIPC_UKERNEL_FUNCTION(fn_name) \
+ XNN_INTERNAL void fn_name( \
+ size_t n, \
+ const uint32_t* x, \
+ uint32_t* y);
+
+DECLARE_X32_ZIPC_UKERNEL_FUNCTION(xnn_x32_zip_x2_ukernel__neon)
+DECLARE_X32_ZIPC_UKERNEL_FUNCTION(xnn_x32_zip_x2_ukernel__psimd)
+DECLARE_X32_ZIPC_UKERNEL_FUNCTION(xnn_x32_zip_x2_ukernel__scalar)
+DECLARE_X32_ZIPC_UKERNEL_FUNCTION(xnn_x32_zip_x2_ukernel__sse2)
+DECLARE_X32_ZIPC_UKERNEL_FUNCTION(xnn_x32_zip_x3_ukernel__neon)
+DECLARE_X32_ZIPC_UKERNEL_FUNCTION(xnn_x32_zip_x3_ukernel__psimd)
+DECLARE_X32_ZIPC_UKERNEL_FUNCTION(xnn_x32_zip_x3_ukernel__scalar)
+DECLARE_X32_ZIPC_UKERNEL_FUNCTION(xnn_x32_zip_x3_ukernel__sse2)
+DECLARE_X32_ZIPC_UKERNEL_FUNCTION(xnn_x32_zip_x4_ukernel__neon)
+DECLARE_X32_ZIPC_UKERNEL_FUNCTION(xnn_x32_zip_x4_ukernel__psimd)
+DECLARE_X32_ZIPC_UKERNEL_FUNCTION(xnn_x32_zip_x4_ukernel__scalar)
+DECLARE_X32_ZIPC_UKERNEL_FUNCTION(xnn_x32_zip_x4_ukernel__sse2)
+
+
+#define DECLARE_X8_ZIPV_UKERNEL_FUNCTION(fn_name) \
+ XNN_INTERNAL void fn_name( \
+ size_t n, \
+ size_t m, \
+ const uint8_t* x, \
+ uint8_t* y);
+
+DECLARE_X8_ZIPV_UKERNEL_FUNCTION(xnn_x8_zip_xm_ukernel__neon)
+DECLARE_X8_ZIPV_UKERNEL_FUNCTION(xnn_x8_zip_xm_ukernel__sse2)
+DECLARE_X8_ZIPV_UKERNEL_FUNCTION(xnn_x8_zip_xm_ukernel__scalar)
+
+
+#define DECLARE_X32_ZIPV_UKERNEL_FUNCTION(fn_name) \
+ XNN_INTERNAL void fn_name( \
+ size_t n, \
+ size_t m, \
+ const uint32_t* x, \
+ uint32_t* y);
+
+DECLARE_X32_ZIPV_UKERNEL_FUNCTION(xnn_x32_zip_xm_ukernel__neon)
+DECLARE_X32_ZIPV_UKERNEL_FUNCTION(xnn_x32_zip_xm_ukernel__psimd)
+DECLARE_X32_ZIPV_UKERNEL_FUNCTION(xnn_x32_zip_xm_ukernel__scalar)
+DECLARE_X32_ZIPV_UKERNEL_FUNCTION(xnn_x32_zip_xm_ukernel__sse2)
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
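
The fixed-width ZIPC kernels interleave k contiguous input streams into one output; the variable-width ZIPV kernels take the stream count m as an argument. Below is a reference sketch of the x8 zip-x2 contract, assuming (as the signature alone does not say) that x points at two back-to-back streams of n bytes each.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* Reference semantics sketch for x8 zip-x2 (not the shipped kernel).
   Assumption: x holds stream 0 in x[0..n-1] and stream 1 in x[n..2n-1];
   y receives the 2n interleaved bytes. */
static void x8_zip_x2_reference(size_t n, const uint8_t* x, uint8_t* y) {
  assert(n != 0);
  const uint8_t* x0 = x;
  const uint8_t* x1 = x + n;
  for (size_t i = 0; i < n; i++) {
    *y++ = *x0++;  /* element i of stream 0 */
    *y++ = *x1++;  /* element i of stream 1 */
  }
}
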