| // Copyright (c) Facebook, Inc. and its affiliates. |
| // All rights reserved. |
| // |
| // Copyright 2019 Google LLC |
| // |
| // This source code is licensed under the BSD-style license found in the |
| // LICENSE file in the root directory of this source tree. |
| |
| #pragma once |
| |
| #include <stdbool.h> |
| #include <stddef.h> |
| #include <stdint.h> |
| |
| #include <xnnpack.h> |
| #include <xnnpack/common.h> |
| |
| |
| union xnn_f16_default_params { /* Parameter type for f16 micro-kernels without fused activation. */ |
| // Carries no real data; serves to differentiate pointer types for micro-kernels without fused activation. |
| char _; // Dummy member variable to comply with the C standard |
| }; |
| |
| // scaleminmax is used for gemm/igemm ukernels. |
| union xnn_f16_scaleminmax_params { /* Scale/min/max parameters; one layout per target architecture (see the #if guards). */ |
| // Outside ARM/ARM64 and x86/x86-64 only the dummy member below exists; those builds add the neon / avx layouts. |
| char _; // Dummy member variable to comply with the C standard |
| #if XNN_ARCH_ARM || XNN_ARCH_ARM64 |
| struct { /* NOTE(review): uint16_t fields presumably hold raw half-precision bit patterns -- confirm against the init code. */ |
| uint16_t scale; |
| uint16_t min; |
| uint16_t max; |
| uint16_t pad; // pad to 8 bytes for neonfp16arith assembly. |
| } neon; |
| #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 |
| #if XNN_ARCH_X86 || XNN_ARCH_X86_64 |
| struct { /* Each value is stored as an 8-element float array declared with XNN_ALIGN(32). */ |
| XNN_ALIGN(32) float scale[8]; |
| XNN_ALIGN(32) float min[8]; |
| XNN_ALIGN(32) float max[8]; |
| } avx; |
| #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 |
| }; |
| |
| union xnn_f16_minmax_params { /* Min/max parameters for f16 micro-kernels; one layout per target architecture. */ |
| // Outside ARM/ARM64 and x86/x86-64 only the dummy member below exists; those builds add the neon / avx layouts. |
| char _; // Dummy member variable to comply with the C standard |
| #if XNN_ARCH_ARM || XNN_ARCH_ARM64 |
| struct { /* NOTE(review): uint16_t fields presumably hold raw half-precision bit patterns -- confirm against the init code. */ |
| uint16_t min; |
| uint16_t max; |
| } neon; |
| #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 |
| #if XNN_ARCH_X86 || XNN_ARCH_X86_64 |
| struct { /* Each bound is stored as an 8-element float array declared with XNN_ALIGN(32). */ |
| XNN_ALIGN(32) float min[8]; |
| XNN_ALIGN(32) float max[8]; |
| } avx; |
| #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 |
| }; |
| |
| union xnn_f32_default_params { /* Parameter type for f32 micro-kernels without fused activation. */ |
| // Serves to differentiate pointer types for micro-kernels without fused activation; only x86/x86-64 builds add real data (avx). |
| char _; // Dummy member variable to comply with the C standard |
| #if XNN_ARCH_X86 || XNN_ARCH_X86_64 |
| struct { /* NOTE(review): mask_table is presumably a window of lane masks for remainder elements -- confirm against the AVX kernels. */ |
| int32_t mask_table[14]; |
| } avx; |
| #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 |
| }; |
| |
| union xnn_f32_relu_params { /* Parameter type for f32 micro-kernels with fused ReLU; holds no data. */ |
| // Empty; serves to differentiate pointer types for micro-kernels with different fused activations. |
| char _; // Dummy member variable to comply with the C standard |
| }; |
| |
| union xnn_f32_minmax_params { /* Min/max parameters for f32 micro-kernels; the scalar layout is always present, SIMD layouts are arch-guarded. */ |
| struct { /* One float per bound. */ |
| float min; |
| float max; |
| } scalar; |
| #if XNN_ARCH_X86 || XNN_ARCH_X86_64 |
| struct { /* 4-element float arrays declared with XNN_ALIGN(16). */ |
| XNN_ALIGN(16) float min[4]; |
| XNN_ALIGN(16) float max[4]; |
| } sse; |
| struct { /* 8-element float arrays declared with XNN_ALIGN(32), followed by a 14-entry int32_t mask_table. */ |
| XNN_ALIGN(32) float min[8]; |
| XNN_ALIGN(32) float max[8]; |
| int32_t mask_table[14]; |
| } avx; |
| #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 |
| #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD |
| struct { /* 2-element float arrays declared with XNN_ALIGN(8). */ |
| XNN_ALIGN(8) float min[2]; |
| XNN_ALIGN(8) float max[2]; |
| } wasmsimd; |
| #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD |
| }; |
| |
| union xnn_f32_abs_params { /* Parameters for f32 abs micro-kernels: a nonsign_mask in the shape each SIMD layout declares. */ |
| char _; // Dummy member variable to comply with the C standard; the only member when no SIMD layout is compiled in |
| #if XNN_ARCH_X86 || XNN_ARCH_X86_64 |
| struct { |
| XNN_ALIGN(16) float nonsign_mask[4]; |
| } sse; |
| struct { |
| XNN_ALIGN(32) float nonsign_mask[8]; |
| int32_t mask_table[14]; |
| } avx; |
| struct { /* Unlike sse/avx, the mask is a single uint32_t here rather than a float array. */ |
| uint32_t nonsign_mask; |
| } avx512; |
| #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 |
| #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD |
| struct { |
| XNN_ALIGN(8) float nonsign_mask[2]; |
| } wasmsimd; |
| #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD |
| }; |
| |
| union xnn_f32_neg_params { /* Parameters for f32 neg micro-kernels: a sign_mask in the shape each SIMD layout declares. */ |
| char _; // Dummy member variable to comply with the C standard; the only member when no SIMD layout is compiled in |
| #if XNN_ARCH_X86 || XNN_ARCH_X86_64 |
| struct { |
| XNN_ALIGN(16) float sign_mask[4]; |
| } sse; |
| struct { |
| XNN_ALIGN(32) float sign_mask[8]; |
| int32_t mask_table[14]; |
| } avx; |
| struct { /* Unlike sse/avx, the mask is a single uint32_t here rather than a float array. */ |
| uint32_t sign_mask; |
| } avx512; |
| #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 |
| #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD |
| struct { |
| XNN_ALIGN(8) float sign_mask[2]; |
| } wasmsimd; |
| #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD |
| }; |
| |
| union xnn_f32_rnd_params { /* Parameters for f32 rounding micro-kernels; each SIMD layout carries a different set of constants. */ |
| char _; // Dummy member variable to comply with the C standard; the only member when no SIMD layout is compiled in |
| #if XNN_ARCH_X86 || XNN_ARCH_X86_64 |
| struct { /* sign_mask and one, as 4-element float arrays. */ |
| XNN_ALIGN(16) float sign_mask[4]; |
| XNN_ALIGN(16) float one[4]; |
| } sse2; |
| struct { /* Only the 14-entry mask_table; no numeric constants. */ |
| int32_t mask_table[14]; |
| } avx; |
| #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 |
| #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD |
| struct { /* Adds a magic_bias that the sse2 layout does not have. */ |
| XNN_ALIGN(8) float sign_mask[2]; |
| XNN_ALIGN(8) float magic_bias[2]; |
| XNN_ALIGN(8) float one[2]; |
| } wasmsimd; |
| #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD |
| }; |
| |
| union xnn_f32_elu_params { /* Constants for f32 ELU micro-kernels; one layout per kernel variant, named <isa>_rr<N>[_lut<M>]_p<K>. */ |
| struct { /* rr2 layouts hold a minus_ln2_hi/minus_ln2_lo pair, rr1 layouts a single minus_ln2; pK layouts hold coefficients cK..c2. */ |
| float prescale; |
| float alpha; |
| float beta; |
| float sat_cutoff; |
| float magic_bias; |
| float log2e; |
| float minus_ln2_hi; |
| float minus_ln2_lo; |
| float c3; |
| float c2; |
| float one; |
| } scalar_rr2_lut16_p3; |
| struct { /* NOTE(review): rr/lut/p presumably mean range-reduction steps, lookup-table size and polynomial degree -- confirm. */ |
| float prescale; |
| float alpha; |
| float beta; |
| float sat_cutoff; |
| float magic_bias; |
| float log2e; |
| float minus_ln2_hi; |
| float minus_ln2_lo; |
| float c6; |
| float c5; |
| float c4; |
| float c3; |
| float c2; |
| float one; |
| } scalar_rr2_p6; |
| #if XNN_ARCH_ARM || XNN_ARCH_ARM64 |
| struct { /* The four ARM layouts store plain floats and, unlike the scalar layouts, have no `one` member. */ |
| float prescale; |
| float alpha; |
| float beta; |
| float sat_cutoff; |
| float magic_bias; |
| float log2e; |
| float minus_ln2_hi; |
| float minus_ln2_lo; |
| float c6; |
| float c5; |
| float c4; |
| float c3; |
| float c2; |
| } neon_rr2_p6; |
| struct { |
| float prescale; |
| float alpha; |
| float beta; |
| float sat_cutoff; |
| float magic_bias; |
| float log2e; |
| float minus_ln2_hi; |
| float minus_ln2_lo; |
| float c3; |
| float c2; |
| } neon_rr2_lut16_p3; |
| struct { |
| float prescale; |
| float alpha; |
| float beta; |
| float sat_cutoff; |
| float magic_bias; |
| float log2e; |
| float minus_ln2; |
| float c6; |
| float c5; |
| float c4; |
| float c3; |
| float c2; |
| } neonfma_rr1_p6; |
| struct { |
| float prescale; |
| float alpha; |
| float beta; |
| float sat_cutoff; |
| float magic_bias; |
| float log2e; |
| float minus_ln2; |
| float c3; |
| float c2; |
| } neonfma_rr1_lut16_p3; |
| #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 |
| #if XNN_ARCH_X86 || XNN_ARCH_X86_64 |
| struct { /* SSE2 layouts: every constant is a 4-element array declared with XNN_ALIGN(16). */ |
| XNN_ALIGN(16) float prescale[4]; |
| XNN_ALIGN(16) float alpha[4]; |
| XNN_ALIGN(16) float beta[4]; |
| XNN_ALIGN(16) float sat_cutoff[4]; |
| XNN_ALIGN(16) float magic_bias[4]; |
| XNN_ALIGN(16) float log2e[4]; |
| XNN_ALIGN(16) uint32_t index_mask[4]; |
| XNN_ALIGN(16) float minus_ln2_hi[4]; |
| XNN_ALIGN(16) float minus_ln2_lo[4]; |
| XNN_ALIGN(16) float c3[4]; |
| XNN_ALIGN(16) float c2[4]; |
| XNN_ALIGN(16) float one[4]; |
| } sse2_rr2_lut16_p3; |
| struct { |
| XNN_ALIGN(16) float prescale[4]; |
| XNN_ALIGN(16) float alpha[4]; |
| XNN_ALIGN(16) float beta[4]; |
| XNN_ALIGN(16) float sat_cutoff[4]; |
| XNN_ALIGN(16) float magic_bias[4]; |
| XNN_ALIGN(16) float log2e[4]; |
| XNN_ALIGN(16) float minus_ln2_hi[4]; |
| XNN_ALIGN(16) float minus_ln2_lo[4]; |
| XNN_ALIGN(16) float c6[4]; |
| XNN_ALIGN(16) float c5[4]; |
| XNN_ALIGN(16) float c4[4]; |
| XNN_ALIGN(16) float c3[4]; |
| XNN_ALIGN(16) float c2[4]; |
| XNN_ALIGN(16) float one[4]; |
| } sse2_rr2_p6; |
| struct { /* AVX/AVX2 layouts: 8-element arrays declared with XNN_ALIGN(32), each ending in a 14-entry int32_t mask_table. */ |
| XNN_ALIGN(32) float prescale[8]; |
| XNN_ALIGN(32) float alpha[8]; |
| XNN_ALIGN(32) float beta[8]; |
| XNN_ALIGN(32) float sat_cutoff[8]; |
| XNN_ALIGN(32) float magic_bias[8]; |
| XNN_ALIGN(32) float log2e[8]; |
| XNN_ALIGN(32) uint32_t index_mask[8]; |
| XNN_ALIGN(32) float minus_ln2_hi[8]; |
| XNN_ALIGN(32) float minus_ln2_lo[8]; |
| XNN_ALIGN(32) float c3[8]; |
| XNN_ALIGN(32) float c2[8]; |
| XNN_ALIGN(32) float one[8]; |
| int32_t mask_table[14]; |
| } avx_rr2_lut16_p3; |
| struct { /* Carries the lookup table inline as float table[8], in addition to index_mask. */ |
| XNN_ALIGN(32) float prescale[8]; |
| XNN_ALIGN(32) float alpha[8]; |
| XNN_ALIGN(32) float beta[8]; |
| XNN_ALIGN(32) float sat_cutoff[8]; |
| XNN_ALIGN(32) float magic_bias[8]; |
| XNN_ALIGN(32) float log2e[8]; |
| XNN_ALIGN(32) uint32_t index_mask[8]; |
| XNN_ALIGN(32) float table[8]; |
| XNN_ALIGN(32) float minus_ln2_hi[8]; |
| XNN_ALIGN(32) float minus_ln2_lo[8]; |
| XNN_ALIGN(32) float c4[8]; |
| XNN_ALIGN(32) float c3[8]; |
| XNN_ALIGN(32) float c2[8]; |
| XNN_ALIGN(32) float one[8]; |
| int32_t mask_table[14]; |
| } avx_rr2_lut4_p4; |
| struct { |
| XNN_ALIGN(32) float prescale[8]; |
| XNN_ALIGN(32) float alpha[8]; |
| XNN_ALIGN(32) float beta[8]; |
| XNN_ALIGN(32) float sat_cutoff[8]; |
| XNN_ALIGN(32) float magic_bias[8]; |
| XNN_ALIGN(32) float log2e[8]; |
| XNN_ALIGN(32) float minus_ln2_hi[8]; |
| XNN_ALIGN(32) float minus_ln2_lo[8]; |
| XNN_ALIGN(32) float c6[8]; |
| XNN_ALIGN(32) float c5[8]; |
| XNN_ALIGN(32) float c4[8]; |
| XNN_ALIGN(32) float c3[8]; |
| XNN_ALIGN(32) float c2[8]; |
| XNN_ALIGN(32) float one[8]; |
| int32_t mask_table[14]; |
| } avx_rr2_p6; |
| struct { /* The AVX2 layouts have no `one` member, unlike the AVX layouts above. */ |
| XNN_ALIGN(32) float prescale[8]; |
| XNN_ALIGN(32) float alpha[8]; |
| XNN_ALIGN(32) float beta[8]; |
| XNN_ALIGN(32) float sat_cutoff[8]; |
| XNN_ALIGN(32) float magic_bias[8]; |
| XNN_ALIGN(32) float log2e[8]; |
| XNN_ALIGN(32) uint32_t index_mask[8]; |
| XNN_ALIGN(32) float minus_ln2[8]; |
| XNN_ALIGN(32) float c3[8]; |
| XNN_ALIGN(32) float c2[8]; |
| int32_t mask_table[14]; |
| } avx2_rr1_lut16_p3; |
| struct { /* Inline table is declared uint32_t[8] here, but float[8] in avx2_rr1_lut4_p4 below. */ |
| XNN_ALIGN(32) float prescale[8]; |
| XNN_ALIGN(32) float alpha[8]; |
| XNN_ALIGN(32) float beta[8]; |
| XNN_ALIGN(32) float sat_cutoff[8]; |
| XNN_ALIGN(32) float magic_bias[8]; |
| XNN_ALIGN(32) float log2e[8]; |
| XNN_ALIGN(32) uint32_t table[8]; |
| XNN_ALIGN(32) float minus_ln2[8]; |
| XNN_ALIGN(32) float c4[8]; |
| XNN_ALIGN(32) float c3[8]; |
| XNN_ALIGN(32) float c2[8]; |
| int32_t mask_table[14]; |
| } avx2_rr1_lut8_p4; |
| struct { |
| XNN_ALIGN(32) float prescale[8]; |
| XNN_ALIGN(32) float alpha[8]; |
| XNN_ALIGN(32) float beta[8]; |
| XNN_ALIGN(32) float sat_cutoff[8]; |
| XNN_ALIGN(32) float magic_bias[8]; |
| XNN_ALIGN(32) float log2e[8]; |
| XNN_ALIGN(32) float table[8]; |
| XNN_ALIGN(32) float minus_ln2[8]; |
| XNN_ALIGN(32) float c4[8]; |
| XNN_ALIGN(32) float c3[8]; |
| XNN_ALIGN(32) float c2[8]; |
| int32_t mask_table[14]; |
| } avx2_rr1_lut4_p4; |
| struct { |
| XNN_ALIGN(32) float prescale[8]; |
| XNN_ALIGN(32) float alpha[8]; |
| XNN_ALIGN(32) float beta[8]; |
| XNN_ALIGN(32) float sat_cutoff[8]; |
| XNN_ALIGN(32) float magic_bias[8]; |
| XNN_ALIGN(32) float log2e[8]; |
| XNN_ALIGN(32) float minus_ln2[8]; |
| XNN_ALIGN(32) float c6[8]; |
| XNN_ALIGN(32) float c5[8]; |
| XNN_ALIGN(32) float c4[8]; |
| XNN_ALIGN(32) float c3[8]; |
| XNN_ALIGN(32) float c2[8]; |
| int32_t mask_table[14]; |
| } avx2_rr1_p6; |
| struct { /* AVX512 layouts store each constant as a single float; only the 16-entry table is an XNN_ALIGN(64) array. */ |
| float prescale; |
| float alpha; |
| float beta; |
| float sat_cutoff; |
| float magic_bias; |
| float log2e; |
| float minus_ln2; |
| float c3; |
| float c2; |
| XNN_ALIGN(64) uint32_t table[16]; |
| } avx512_rr1_lut16_p3; |
| struct { |
| float prescale; |
| float alpha; |
| float beta; |
| float sat_cutoff; |
| float magic_bias; |
| float log2e; |
| float minus_ln2; |
| float c6; |
| float c5; |
| float c4; |
| float c3; |
| float c2; |
| } avx512_rr1_p6; |
| #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 |
| #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD |
| struct { /* WAsm SIMD layouts: every constant is a 2-element array declared with XNN_ALIGN(8). */ |
| XNN_ALIGN(8) float prescale[2]; |
| XNN_ALIGN(8) float alpha[2]; |
| XNN_ALIGN(8) float beta[2]; |
| XNN_ALIGN(8) float sat_cutoff[2]; |
| XNN_ALIGN(8) float magic_bias[2]; |
| XNN_ALIGN(8) float log2e[2]; |
| XNN_ALIGN(8) uint32_t index_mask[2]; |
| XNN_ALIGN(8) float minus_ln2_hi[2]; |
| XNN_ALIGN(8) float minus_ln2_lo[2]; |
| XNN_ALIGN(8) float c3[2]; |
| XNN_ALIGN(8) float c2[2]; |
| XNN_ALIGN(8) float one[2]; |
| } wasmsimd_rr2_lut16_p3; |
| struct { |
| XNN_ALIGN(8) float prescale[2]; |
| XNN_ALIGN(8) float alpha[2]; |
| XNN_ALIGN(8) float beta[2]; |
| XNN_ALIGN(8) float sat_cutoff[2]; |
| XNN_ALIGN(8) float magic_bias[2]; |
| XNN_ALIGN(8) float log2e[2]; |
| XNN_ALIGN(8) float minus_ln2_hi[2]; |
| XNN_ALIGN(8) float minus_ln2_lo[2]; |
| XNN_ALIGN(8) float c6[2]; |
| XNN_ALIGN(8) float c5[2]; |
| XNN_ALIGN(8) float c4[2]; |
| XNN_ALIGN(8) float c3[2]; |
| XNN_ALIGN(8) float c2[2]; |
| XNN_ALIGN(8) float one[2]; |
| } wasmsimd_rr2_p6; |
| #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD |
| }; |
| |
| union xnn_f32_expminus_params { /* Constants for f32 expminus micro-kernels; one layout per kernel variant, named <isa>_rr<N>[_lut<M>]_p<K>. */ |
| struct { /* rr2 layouts hold a minus_ln2_hi/minus_ln2_lo pair, rr1 layouts a single minus_ln2. */ |
| float log2e; |
| float magic_bias; |
| float minus_ln2_hi; |
| float minus_ln2_lo; |
| float c5; |
| float c4; |
| float c3; |
| float c2; |
| float c1; |
| float denorm_cutoff; |
| } scalar_rr2_p5; |
| struct { /* lut64_p2 layouts carry only the c2 coefficient. */ |
| float log2e; |
| float magic_bias; |
| float minus_ln2_hi; |
| float minus_ln2_lo; |
| float c2; |
| float denorm_cutoff; |
| } scalar_rr2_lut64_p2; |
| #if XNN_ARCH_ARM || XNN_ARCH_ARM64 |
| struct { /* ARM layouts store plain floats; the neon_* ones have the same members as the scalar layouts of the same suffix. */ |
| float log2e; |
| float magic_bias; |
| float minus_ln2_hi; |
| float minus_ln2_lo; |
| float c5; |
| float c4; |
| float c3; |
| float c2; |
| float c1; |
| float denorm_cutoff; |
| } neon_rr2_p5; |
| struct { |
| float log2e; |
| float magic_bias; |
| float minus_ln2_hi; |
| float minus_ln2_lo; |
| float c2; |
| float denorm_cutoff; |
| } neon_rr2_lut64_p2; |
| struct { |
| float log2e; |
| float magic_bias; |
| float minus_ln2; |
| float c5; |
| float c4; |
| float c3; |
| float c2; |
| float c1; |
| float denorm_cutoff; |
| } neonfma_rr1_p5; |
| struct { |
| float log2e; |
| float magic_bias; |
| float minus_ln2; |
| float c2; |
| float denorm_cutoff; |
| } neonfma_rr1_lut64_p2; |
| #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 |
| #if XNN_ARCH_X86 || XNN_ARCH_X86_64 |
| struct { /* 4-element arrays declared with XNN_ALIGN(16). */ |
| XNN_ALIGN(16) float log2e[4]; |
| XNN_ALIGN(16) float magic_bias[4]; |
| XNN_ALIGN(16) float minus_ln2_hi[4]; |
| XNN_ALIGN(16) float minus_ln2_lo[4]; |
| XNN_ALIGN(16) float c5[4]; |
| XNN_ALIGN(16) float c4[4]; |
| XNN_ALIGN(16) float c3[4]; |
| XNN_ALIGN(16) float c2[4]; |
| XNN_ALIGN(16) float c1[4]; |
| XNN_ALIGN(16) float denorm_cutoff[4]; |
| } sse2_rr2_p5; |
| struct { /* 8-element arrays declared with XNN_ALIGN(32), followed by a 14-entry int32_t mask_table. */ |
| XNN_ALIGN(32) float log2e[8]; |
| XNN_ALIGN(32) float magic_bias[8]; |
| XNN_ALIGN(32) float minus_ln2[8]; |
| XNN_ALIGN(32) float c5[8]; |
| XNN_ALIGN(32) float c4[8]; |
| XNN_ALIGN(32) float c3[8]; |
| XNN_ALIGN(32) float c2[8]; |
| XNN_ALIGN(32) float c1[8]; |
| XNN_ALIGN(32) float denorm_cutoff[8]; |
| int32_t mask_table[14]; |
| } avx2_rr1_p5; |
| struct { /* Plain floats; has c0 but no magic_bias or denorm_cutoff, unlike every other layout in this union. */ |
| float log2e; |
| float minus_ln2; |
| float c5; |
| float c4; |
| float c3; |
| float c2; |
| float c1; |
| float c0; |
| } avx512_rr1_p5; |
| #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 |
| #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD |
| struct { /* 2-element arrays declared with XNN_ALIGN(8). */ |
| XNN_ALIGN(8) float log2e[2]; |
| XNN_ALIGN(8) float magic_bias[2]; |
| XNN_ALIGN(8) float minus_ln2_hi[2]; |
| XNN_ALIGN(8) float minus_ln2_lo[2]; |
| XNN_ALIGN(8) float c5[2]; |
| XNN_ALIGN(8) float c4[2]; |
| XNN_ALIGN(8) float c3[2]; |
| XNN_ALIGN(8) float c2[2]; |
| XNN_ALIGN(8) float c1[2]; |
| XNN_ALIGN(8) float denorm_cutoff[2]; |
| } wasmsimd_rr2_p5; |
| #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD |
| }; |
| |
| union xnn_f32_lrelu_params { /* Parameters for f32 lrelu micro-kernels: a single slope value in the shape each layout declares. */ |
| struct { /* Always present: one float. */ |
| float slope; |
| } scalar; |
| #if XNN_ARCH_X86 || XNN_ARCH_X86_64 |
| struct { |
| XNN_ALIGN(16) float slope[4]; |
| } sse; |
| struct { /* 8-element slope array plus a 14-entry int32_t mask_table. */ |
| XNN_ALIGN(32) float slope[8]; |
| int32_t mask_table[14]; |
| } avx; |
| #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 |
| #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD |
| struct { |
| XNN_ALIGN(8) float slope[2]; |
| } wasmsimd; |
| #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD |
| }; |
| |
| union xnn_f32_sigmoid_params { /* Constants for f32 sigmoid micro-kernels; one layout per kernel variant, named <isa>_rr<N>[_lut<M>]_p<K>. */ |
| struct { /* The three scalar layouts are always present; they and the ARM/WAsm layouts store minus_log2e with positive-named ln2 constants. */ |
| float magic_bias; |
| float minus_log2e; |
| float ln2_hi; |
| float ln2_lo; |
| float c1; |
| float one; |
| float denorm_cutoff; |
| } scalar_rr2_lut2048_p1; |
| struct { |
| float magic_bias; |
| float minus_log2e; |
| float ln2_hi; |
| float ln2_lo; |
| float c2; |
| float one; |
| float denorm_cutoff; |
| } scalar_rr2_lut64_p2; |
| struct { |
| float magic_bias; |
| float minus_log2e; |
| float ln2_hi; |
| float ln2_lo; |
| float c5; |
| float c4; |
| float c3; |
| float c2; |
| float c1; |
| float one; |
| float denorm_cutoff; |
| } scalar_rr2_p5; |
| #if XNN_ARCH_ARM || XNN_ARCH_ARM64 |
| struct { /* ARM layouts store plain floats and have no `one` member; neon_rr2_* keep ln2_hi/ln2_lo, neonfma_rr1_* a single ln2. */ |
| float magic_bias; |
| float minus_log2e; |
| float ln2_hi; |
| float ln2_lo; |
| float c1; |
| float denorm_cutoff; |
| } neon_rr2_lut2048_p1; |
| struct { |
| float magic_bias; |
| float minus_log2e; |
| float ln2_hi; |
| float ln2_lo; |
| float c2; |
| float denorm_cutoff; |
| } neon_rr2_lut64_p2; |
| struct { |
| float magic_bias; |
| float minus_log2e; |
| float ln2_hi; |
| float ln2_lo; |
| float c5; |
| float c4; |
| float c3; |
| float c2; |
| float c1; |
| float denorm_cutoff; |
| } neon_rr2_p5; |
| struct { |
| float magic_bias; |
| float minus_log2e; |
| float ln2; |
| float c1; |
| float denorm_cutoff; |
| } neonfma_rr1_lut2048_p1; |
| struct { |
| float magic_bias; |
| float minus_log2e; |
| float ln2; |
| float c2; |
| float denorm_cutoff; |
| } neonfma_rr1_lut64_p2; |
| struct { |
| float magic_bias; |
| float minus_log2e; |
| float ln2; |
| float c5; |
| float c4; |
| float c3; |
| float c2; |
| float c1; |
| float denorm_cutoff; |
| } neonfma_rr1_p5; |
| #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 |
| #if XNN_ARCH_X86 || XNN_ARCH_X86_64 |
| struct { /* x86 layouts differ from the rest: they add sign_mask and store log2e with minus_ln2* (signs swapped relative to scalar). */ |
| XNN_ALIGN(16) float sign_mask[4]; |
| XNN_ALIGN(16) float magic_bias[4]; |
| XNN_ALIGN(16) float log2e[4]; |
| XNN_ALIGN(16) uint32_t index_mask[4]; |
| XNN_ALIGN(16) float minus_ln2_hi[4]; |
| XNN_ALIGN(16) float minus_ln2_lo[4]; |
| XNN_ALIGN(16) float c2[4]; |
| XNN_ALIGN(16) float one[4]; |
| XNN_ALIGN(16) float denorm_cutoff[4]; |
| } sse2_rr2_lut64_p2; |
| struct { |
| XNN_ALIGN(16) float sign_mask[4]; |
| XNN_ALIGN(16) float magic_bias[4]; |
| XNN_ALIGN(16) float log2e[4]; |
| XNN_ALIGN(16) float minus_ln2_hi[4]; |
| XNN_ALIGN(16) float minus_ln2_lo[4]; |
| XNN_ALIGN(16) float c5[4]; |
| XNN_ALIGN(16) float c4[4]; |
| XNN_ALIGN(16) float c3[4]; |
| XNN_ALIGN(16) float c2[4]; |
| XNN_ALIGN(16) float c1[4]; |
| XNN_ALIGN(16) float one[4]; |
| XNN_ALIGN(16) float denorm_cutoff[4]; |
| } sse2_rr2_p5; |
| struct { /* The only layout in this union with a `two` constant. */ |
| XNN_ALIGN(32) float sign_mask[8]; |
| XNN_ALIGN(32) float magic_bias[8]; |
| XNN_ALIGN(32) float log2e[8]; |
| XNN_ALIGN(32) float minus_ln2_hi[8]; |
| XNN_ALIGN(32) float minus_ln2_lo[8]; |
| XNN_ALIGN(32) float c5[8]; |
| XNN_ALIGN(32) float c4[8]; |
| XNN_ALIGN(32) float c3[8]; |
| XNN_ALIGN(32) float c2[8]; |
| XNN_ALIGN(32) float c1[8]; |
| XNN_ALIGN(32) float one[8]; |
| XNN_ALIGN(32) float two[8]; |
| XNN_ALIGN(32) float denorm_cutoff[8]; |
| int32_t mask_table[14]; |
| } avx_rr2_p5; |
| struct { |
| XNN_ALIGN(32) float sign_mask[8]; |
| XNN_ALIGN(32) float magic_bias[8]; |
| XNN_ALIGN(32) float log2e[8]; |
| XNN_ALIGN(32) float minus_ln2[8]; |
| XNN_ALIGN(32) float c5[8]; |
| XNN_ALIGN(32) float c4[8]; |
| XNN_ALIGN(32) float c3[8]; |
| XNN_ALIGN(32) float c2[8]; |
| XNN_ALIGN(32) float c1[8]; |
| XNN_ALIGN(32) float one[8]; |
| XNN_ALIGN(32) float denorm_cutoff[8]; |
| int32_t mask_table[14]; |
| } avx2_rr1_p5; |
| struct { /* AVX512 layouts: sign_mask is a uint32_t, constants are single floats, tables are 16-entry XNN_ALIGN(64) arrays; no denorm_cutoff. */ |
| uint32_t sign_mask; |
| float magic_bias; |
| float log2e; |
| float minus_ln2; |
| float c3; |
| float c2; |
| float one; |
| XNN_ALIGN(64) float table[16]; |
| } avx512_rr1_lut16_p3; |
| struct { /* The lookup table is split into two 16-entry halves, table_lo and table_hi. */ |
| uint32_t sign_mask; |
| float magic_bias; |
| float log2e; |
| float minus_ln2_hi; |
| float minus_ln2_lo; |
| float c2; |
| float c1; |
| float one; |
| XNN_ALIGN(64) float table_lo[16]; |
| XNN_ALIGN(64) float table_hi[16]; |
| } avx512_rr2_lut32_p2; |
| struct { /* No magic_bias in this layout. */ |
| uint32_t sign_mask; |
| float log2e; |
| float minus_ln2; |
| float c5; |
| float c4; |
| float c3; |
| float c2; |
| float c1; |
| float one; |
| } avx512_rr1_p5; |
| #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 |
| #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD |
| struct { /* 2-element arrays declared with XNN_ALIGN(8); member names follow the scalar layouts. */ |
| XNN_ALIGN(8) float magic_bias[2]; |
| XNN_ALIGN(8) float minus_log2e[2]; |
| XNN_ALIGN(8) uint32_t index_mask[2]; |
| XNN_ALIGN(8) float ln2_hi[2]; |
| XNN_ALIGN(8) float ln2_lo[2]; |
| XNN_ALIGN(8) float c2[2]; |
| XNN_ALIGN(8) float one[2]; |
| XNN_ALIGN(8) float denorm_cutoff[2]; |
| } wasmsimd_rr2_lut64_p2; |
| struct { |
| XNN_ALIGN(8) float magic_bias[2]; |
| XNN_ALIGN(8) float minus_log2e[2]; |
| XNN_ALIGN(8) float ln2_hi[2]; |
| XNN_ALIGN(8) float ln2_lo[2]; |
| XNN_ALIGN(8) float c5[2]; |
| XNN_ALIGN(8) float c4[2]; |
| XNN_ALIGN(8) float c3[2]; |
| XNN_ALIGN(8) float c2[2]; |
| XNN_ALIGN(8) float c1[2]; |
| XNN_ALIGN(8) float one[2]; |
| XNN_ALIGN(8) float denorm_cutoff[2]; |
| } wasmsimd_rr2_p5; |
| #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD |
| }; |
| |
| union xnn_f32_sqrt_params { /* Parameters for f32 sqrt micro-kernels; real data exists only in the x86/x86-64 layouts. */ |
| char _; // Dummy member variable to comply with the C standard; the only member on non-x86 builds |
| #if XNN_ARCH_X86 || XNN_ARCH_X86_64 |
| struct { /* Only the 14-entry mask_table. */ |
| int32_t mask_table[14]; |
| } avx; |
| struct { /* Adds a `half` constant as an 8-element array declared with XNN_ALIGN(32). */ |
| XNN_ALIGN(32) float half[8]; |
| int32_t mask_table[14]; |
| } fma; |
| struct { /* `half` as a single float; no mask_table. */ |
| float half; |
| } avx512; |
| #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 |
| }; |
| |
| union xnn_f32_chw_params { /* Parameters for f32 CHW micro-kernels: min/max bounds plus three 4-element masks per layout. */ |
| struct { /* Always present. Masks come first and are int32_t here; the neon and sse layouts put min/max first and use uint32_t masks. */ |
| XNN_ALIGN(16) int32_t mask_even[4]; // used by stride 2 kernels |
| XNN_ALIGN(16) int32_t mask_odd[4]; // used by stride 2 kernels |
| XNN_ALIGN(16) int32_t mask[4]; // used by stride 1 kernels |
| float min; |
| float max; |
| } scalar; |
| #if XNN_ARCH_ARM || XNN_ARCH_ARM64 |
| struct { /* min/max are single floats. */ |
| float min; |
| float max; |
| XNN_ALIGN(16) uint32_t mask_even[4]; // used by stride 2 kernels |
| XNN_ALIGN(16) uint32_t mask_odd[4]; // used by stride 2 kernels |
| XNN_ALIGN(16) uint32_t mask[4]; // used by stride 1 kernels |
| } neon; |
| #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 |
| #if XNN_ARCH_X86 || XNN_ARCH_X86_64 |
| struct { /* min/max are 4-element arrays declared with XNN_ALIGN(16). */ |
| XNN_ALIGN(16) float min[4]; |
| XNN_ALIGN(16) float max[4]; |
| XNN_ALIGN(16) uint32_t mask_even[4]; // used by stride 2 kernels |
| XNN_ALIGN(16) uint32_t mask_odd[4]; // used by stride 2 kernels |
| XNN_ALIGN(16) uint32_t mask[4]; // used by stride 1 kernels |
| } sse; |
| #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 |
| }; |
| |
| union xnn_s8_minmax_params { /* Min/max parameters for s8 micro-kernels; the scalar layout is always present, SIMD layouts are arch-guarded. */ |
| struct { /* Bounds are stored as int32_t here, wider than the 8-bit SIMD layouts. */ |
| int32_t min; |
| int32_t max; |
| } scalar; |
| #if XNN_ARCH_X86 || XNN_ARCH_X86_64 |
| struct { /* Unsigned bytes: a bias plus biased bounds. NOTE(review): presumably maps signed values into unsigned range -- confirm. */ |
| XNN_ALIGN(16) uint8_t bias[16]; |
| XNN_ALIGN(16) uint8_t min_with_bias[16]; |
| XNN_ALIGN(16) uint8_t max_with_bias[16]; |
| } sse2; |
| struct { /* Signed bounds directly, as 16-element int8_t arrays. */ |
| XNN_ALIGN(16) int8_t min[16]; |
| XNN_ALIGN(16) int8_t max[16]; |
| } sse4; |
| #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 |
| #if XNN_ARCH_ARM || XNN_ARCH_ARM64 |
| struct { /* One int8_t per bound. */ |
| int8_t min; |
| int8_t max; |
| } neon; |
| #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 |
| #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD |
| struct { /* 8-element int8_t arrays declared with XNN_ALIGN(8). */ |
| XNN_ALIGN(8) int8_t min[8]; |
| XNN_ALIGN(8) int8_t max[8]; |
| } wasmsimd; |
| #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD |
| }; |
| |
| union xnn_u8_minmax_params { /* Min/max parameters for u8 micro-kernels; the scalar layout is always present, SIMD layouts are arch-guarded. */ |
| struct { /* Bounds are stored as uint32_t here, wider than the 8-bit SIMD layouts. */ |
| uint32_t min; |
| uint32_t max; |
| } scalar; |
| #if XNN_ARCH_X86 || XNN_ARCH_X86_64 |
| struct { /* 16-element uint8_t arrays declared with XNN_ALIGN(16). */ |
| XNN_ALIGN(16) uint8_t min[16]; |
| XNN_ALIGN(16) uint8_t max[16]; |
| } sse2; |
| #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 |
| #if XNN_ARCH_ARM || XNN_ARCH_ARM64 |
| struct { /* One uint8_t per bound. */ |
| uint8_t min; |
| uint8_t max; |
| } neon; |
| #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 |
| #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD |
| struct { /* 8-element uint8_t arrays declared with XNN_ALIGN(8). */ |
| XNN_ALIGN(8) uint8_t min[8]; |
| XNN_ALIGN(8) uint8_t max[8]; |
| } wasmsimd; |
| #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD |
| }; |
| |
| union xnn_f32_scaleminmax_params { /* Scale/min/max parameters for f32 micro-kernels; scalar layout always present, sse layout on x86/x86-64. */ |
| struct { /* One float per value. */ |
| float scale; |
| float min; |
| float max; |
| } scalar; |
| #if XNN_ARCH_X86 || XNN_ARCH_X86_64 |
| struct { /* Each value as a 4-element float array declared with XNN_ALIGN(16). */ |
| XNN_ALIGN(16) float scale[4]; |
| XNN_ALIGN(16) float min[4]; |
| XNN_ALIGN(16) float max[4]; |
| } sse; |
| #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 |
| }; |
| |
| union xnn_f32_gavgpool_params { /* Parameters for f32 gavgpool micro-kernels: multiplier, output bounds and a 4-element mask per layout. */ |
| struct { /* Always present. The mask comes first and is int32_t; the sse and neon layouts put it last as uint32_t. */ |
| XNN_ALIGN(16) int32_t mask[4]; |
| float multiplier; |
| float output_min; |
| float output_max; |
| } scalar; |
| #if XNN_ARCH_X86 || XNN_ARCH_X86_64 |
| struct { /* Each value as a 4-element float array declared with XNN_ALIGN(16). */ |
| XNN_ALIGN(16) float multiplier[4]; |
| XNN_ALIGN(16) float output_min[4]; |
| XNN_ALIGN(16) float output_max[4]; |
| XNN_ALIGN(16) uint32_t mask[4]; |
| } sse; |
| #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 |
| #if XNN_ARCH_ARM || XNN_ARCH_ARM64 |
| struct { /* Values are single floats, but each is still declared with XNN_ALIGN(16). */ |
| XNN_ALIGN(16) float multiplier; |
| XNN_ALIGN(16) float output_min; |
| XNN_ALIGN(16) float output_max; |
| XNN_ALIGN(16) uint32_t mask[4]; |
| } neon; |
| #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 |
| }; |
| |
| union xnn_f16_hswish_params { /* Parameters for f16 hswish micro-kernels; one layout per target architecture. */ |
| char _; // Dummy member variable to comply with the C standard; keeps the union non-empty when neither guard below is true |
| #if XNN_ARCH_ARM || XNN_ARCH_ARM64 |
| struct { /* NOTE(review): uint16_t fields presumably hold raw half-precision bit patterns -- confirm against the init code. */ |
| uint16_t sixth; |
| uint16_t three; |
| uint16_t six; |
| uint16_t pad; // pad to 8 bytes for neonfp16arith assembly. |
| } neon; |
| #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 |
| #if XNN_ARCH_X86 || XNN_ARCH_X86_64 |
| struct { /* sixth and three are 8-element float arrays; six is an 8-element uint16_t array declared with XNN_ALIGN(16). */ |
| XNN_ALIGN(32) float sixth[8]; |
| XNN_ALIGN(32) float three[8]; |
| XNN_ALIGN(16) uint16_t six[8]; |
| } avx; |
| #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 |
| }; |
| |
| union xnn_f32_hswish_params { /* Constants for f32 hswish micro-kernels; note the layouts do not all carry the same constants. */ |
| struct { /* scalar and wasmsimd carry sixth/three/six; the x86 layouts carry sixth/half/one instead. */ |
| float sixth; |
| float three; |
| float six; |
| } scalar; |
| #if XNN_ARCH_X86 || XNN_ARCH_X86_64 |
| struct { /* 4-element float arrays declared with XNN_ALIGN(16). */ |
| XNN_ALIGN(16) float sixth[4]; |
| XNN_ALIGN(16) float half[4]; |
| XNN_ALIGN(16) float one[4]; |
| } sse; |
| struct { /* 8-element float arrays declared with XNN_ALIGN(32), followed by a 14-entry int32_t mask_table. */ |
| XNN_ALIGN(32) float sixth[8]; |
| XNN_ALIGN(32) float half[8]; |
| XNN_ALIGN(32) float one[8]; |
| int32_t mask_table[14]; |
| } avx; |
| struct { /* One float per constant. */ |
| float sixth; |
| float half; |
| float one; |
| } avx512; |
| #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 |
| #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD |
| struct { /* 2-element float arrays declared with XNN_ALIGN(8). */ |
| XNN_ALIGN(8) float sixth[2]; |
| XNN_ALIGN(8) float three[2]; |
| XNN_ALIGN(8) float six[2]; |
| } wasmsimd; |
| #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD |
| }; |
| |
| union xnn_qu8_conv_minmax_params { /* Requantization/clamping parameters for qu8 conv micro-kernels; layouts are named <scheme>_<target>. */ |
| struct { /* Three scalar layouts are always present; every layout in this union starts with kernel_zero_point. */ |
| int32_t kernel_zero_point; |
| float scale; |
| float output_min_less_zero_point; |
| float output_max_less_zero_point; |
| float magic_bias; |
| int32_t magic_bias_less_output_zero_point; |
| } fp32_scalar_fmagic; |
| struct { /* Bounds are int32_t magic_min/magic_max here rather than float output_*_less_zero_point. */ |
| int32_t kernel_zero_point; |
| float scale; |
| float magic_bias; |
| int32_t magic_min; |
| int32_t magic_max; |
| int32_t magic_bias_less_zero_point; |
| } fp32_scalar_imagic; |
| struct { /* No magic_bias; stores output_zero_point directly. */ |
| int32_t kernel_zero_point; |
| float scale; |
| float output_min_less_zero_point; |
| float output_max_less_zero_point; |
| int32_t output_zero_point; |
| } fp32_scalar_lrintf; |
| #if XNN_ARCH_ARM || XNN_ARCH_ARM64 |
| struct { /* ARM layouts keep kernel_zero_point as a 4-byte array and the output bounds as single uint8_t values. */ |
| uint8_t kernel_zero_point[4]; |
| float scale; |
| float magic_bias; |
| int32_t magic_bias_less_output_zero_point; |
| uint8_t output_min; |
| uint8_t output_max; |
| } fp32_neon; |
| struct { |
| uint8_t kernel_zero_point[4]; |
| float scale; |
| int16_t output_zero_point; |
| uint8_t output_min; |
| uint8_t output_max; |
| } fp32_neonv8; |
| struct { /* The only layout here without a float scale: uses integer pre-shift, multiplier and post-shift instead. */ |
| uint8_t kernel_zero_point[4]; |
| int32_t right_pre_shift; |
| int32_t multiplier; |
| int32_t right_post_shift; |
| int16_t output_zero_point; |
| uint8_t output_min; |
| uint8_t output_max; |
| } rndnu_neon; |
| #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 |
| #if XNN_ARCH_X86 || XNN_ARCH_X86_64 |
| struct { /* x86 layouts share one member list; only array lengths and XNN_ALIGN arguments (16/32/64) differ. */ |
| XNN_ALIGN(16) int16_t kernel_zero_point[8]; |
| XNN_ALIGN(16) float scale[4]; |
| XNN_ALIGN(16) float output_max_less_zero_point[4]; |
| XNN_ALIGN(16) int16_t output_zero_point[8]; |
| XNN_ALIGN(16) uint8_t output_min[16]; |
| } fp32_sse2; |
| struct { |
| XNN_ALIGN(32) int16_t kernel_zero_point[16]; |
| XNN_ALIGN(32) float scale[8]; |
| XNN_ALIGN(32) float output_max_less_zero_point[8]; |
| XNN_ALIGN(32) int16_t output_zero_point[16]; |
| XNN_ALIGN(32) uint8_t output_min[32]; |
| } fp32_avx2; |
| struct { |
| XNN_ALIGN(64) int16_t kernel_zero_point[32]; |
| XNN_ALIGN(64) float scale[16]; |
| XNN_ALIGN(64) float output_max_less_zero_point[16]; |
| XNN_ALIGN(64) int16_t output_zero_point[32]; |
| XNN_ALIGN(64) uint8_t output_min[64]; |
| } fp32_avx512; |
| #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 |
| #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD |
| struct { /* NOTE(review): output_max is int8_t here while every other qu8 layout uses uint8_t bounds -- confirm this is intended. */ |
| XNN_ALIGN(8) int16_t kernel_zero_point[4]; |
| XNN_ALIGN(8) float scale[2]; |
| XNN_ALIGN(8) float magic_bias[2]; |
| XNN_ALIGN(8) int32_t magic_min[2]; |
| XNN_ALIGN(8) int32_t magic_bias_less_output_zero_point[2]; |
| XNN_ALIGN(8) int8_t output_max[8]; |
| } fp32_wasmsimd; |
| #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD |
| }; |
| |
| union xnn_qs8_minmax_params { /* Output zero-point/clamping parameters for qs8 micro-kernels; no layout here carries a scale. */ |
| struct { /* Three scalar layouts are always present. */ |
| float magic_bias; |
| int32_t magic_min; |
| int32_t magic_max; |
| int32_t magic_bias_less_zero_point; |
| } scalar_imagic; |
| struct { |
| float output_min_less_zero_point; |
| float output_max_less_zero_point; |
| float magic_bias; |
| int32_t magic_bias_less_output_zero_point; |
| } scalar_fmagic; |
| struct { |
| float output_min_less_zero_point; |
| float output_max_less_zero_point; |
| int32_t output_zero_point; |
| } scalar_lrintf; |
| #if XNN_ARCH_ARM || XNN_ARCH_ARM64 |
| struct { /* Output bounds are single int8_t values. */ |
| float magic_bias; |
| int32_t magic_bias_less_output_zero_point; |
| int8_t output_min; |
| int8_t output_max; |
| } neon; |
| struct { /* NOTE(review): bounds are uint8_t here but int8_t in the neon layout above -- confirm this is intended for qs8. */ |
| int16_t output_zero_point; |
| uint8_t output_min; |
| uint8_t output_max; |
| } neonv8; |
| #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 |
| #if XNN_ARCH_X86 || XNN_ARCH_X86_64 |
| struct { /* output_min is int16_t[8] here; the sse4/avx2/avx512 layouts store it as int8_t arrays. */ |
| XNN_ALIGN(16) float output_max_less_zero_point[4]; |
| XNN_ALIGN(16) int16_t output_zero_point[8]; |
| XNN_ALIGN(16) int16_t output_min[8]; |
| } sse2; |
| struct { |
| XNN_ALIGN(16) float output_max_less_zero_point[4]; |
| XNN_ALIGN(16) int16_t output_zero_point[8]; |
| XNN_ALIGN(16) int8_t output_min[16]; |
| } sse4; |
| struct { |
| XNN_ALIGN(32) float output_max_less_zero_point[8]; |
| XNN_ALIGN(32) int16_t output_zero_point[16]; |
| XNN_ALIGN(32) int8_t output_min[32]; |
| } avx2; |
| struct { |
| XNN_ALIGN(64) float output_max_less_zero_point[16]; |
| XNN_ALIGN(64) int16_t output_zero_point[32]; |
| XNN_ALIGN(64) int8_t output_min[64]; |
| } avx512; |
| #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 |
| #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD |
| struct { /* 2-element arrays declared with XNN_ALIGN(8), plus an 8-element int8_t output_max. */ |
| XNN_ALIGN(8) float magic_bias[2]; |
| XNN_ALIGN(8) int32_t magic_min[2]; |
| XNN_ALIGN(8) int32_t magic_bias_less_output_zero_point[2]; |
| XNN_ALIGN(8) int8_t output_max[8]; |
| } wasmsimd; |
| #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD |
| }; |
| |
| union xnn_qs8_conv_minmax_params { /* Requantization/clamping parameters for qs8 conv micro-kernels; layouts are named <scheme>_<target>. */ |
| struct { /* Three scalar layouts are always present; all fp32_* layouts start with scale. */ |
| float scale; |
| float output_min_less_zero_point; |
| float output_max_less_zero_point; |
| float magic_bias; |
| int32_t magic_bias_less_output_zero_point; |
| } fp32_scalar_fmagic; |
| struct { /* Bounds are int32_t magic_min/magic_max here rather than float output_*_less_zero_point. */ |
| float scale; |
| float magic_bias; |
| int32_t magic_min; |
| int32_t magic_max; |
| int32_t magic_bias_less_zero_point; |
| } fp32_scalar_imagic; |
| struct { /* No magic_bias; stores output_zero_point directly. */ |
| float scale; |
| float output_min_less_zero_point; |
| float output_max_less_zero_point; |
| int32_t output_zero_point; |
| } fp32_scalar_lrintf; |
| #if XNN_ARCH_ARM || XNN_ARCH_ARM64 |
| struct { /* ARM layouts keep the output bounds as single int8_t values. */ |
| float scale; |
| float magic_bias; |
| int32_t magic_bias_less_output_zero_point; |
| int8_t output_min; |
| int8_t output_max; |
| } fp32_neon; |
| struct { |
| float scale; |
| int16_t output_zero_point; |
| int8_t output_min; |
| int8_t output_max; |
| } fp32_neonv8; |
| struct { /* The only layout here without a float scale: uses integer pre-shift, multiplier and post-shift instead. */ |
| int32_t right_pre_shift; |
| int32_t multiplier; |
| int32_t right_post_shift; |
| int16_t output_zero_point; |
| int8_t output_min; |
| int8_t output_max; |
| } rndnu_neon; |
| #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 |
| #if XNN_ARCH_X86 || XNN_ARCH_X86_64 |
| struct { /* output_min is int16_t[8] here; the sse4/avx2/avx512 layouts store it as int8_t arrays. */ |
| XNN_ALIGN(16) float scale[4]; |
| XNN_ALIGN(16) float output_max_less_zero_point[4]; |
| XNN_ALIGN(16) int16_t output_zero_point[8]; |
| XNN_ALIGN(16) int16_t output_min[8]; |
| } fp32_sse2; |
| struct { |
| XNN_ALIGN(16) float scale[4]; |
| XNN_ALIGN(16) float output_max_less_zero_point[4]; |
| XNN_ALIGN(16) int16_t output_zero_point[8]; |
| XNN_ALIGN(16) int8_t output_min[16]; |
| } fp32_sse4; |
| struct { |
| XNN_ALIGN(32) float scale[8]; |
| XNN_ALIGN(32) float output_max_less_zero_point[8]; |
| XNN_ALIGN(32) int16_t output_zero_point[16]; |
| XNN_ALIGN(32) int8_t output_min[32]; |
| } fp32_avx2; |
| struct { |
| XNN_ALIGN(64) float scale[16]; |
| XNN_ALIGN(64) float output_max_less_zero_point[16]; |
| XNN_ALIGN(64) int16_t output_zero_point[32]; |
| XNN_ALIGN(64) int8_t output_min[64]; |
| } fp32_avx512; |
| #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 |
| #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD |
| struct { /* 2-element arrays declared with XNN_ALIGN(8), plus an 8-element int8_t output_max. */ |
| XNN_ALIGN(8) float scale[2]; |
| XNN_ALIGN(8) float magic_bias[2]; |
| XNN_ALIGN(8) int32_t magic_min[2]; |
| XNN_ALIGN(8) int32_t magic_bias_less_output_zero_point[2]; |
| XNN_ALIGN(8) int8_t output_max[8]; |
| } fp32_wasmsimd; |
| #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD |
| }; |
| |
// Parameter union named for QU8 add/subtract min/max micro-kernels (its users
// are not visible in this part of the file). Each struct member is an
// alternative layout selected by member name; architecture-specific members
// exist only under the matching #if XNN_ARCH_* guard.
// NOTE(review): layouts are presumably read directly by the kernels — confirm
// before changing field order, types, or alignment.
| union xnn_qu8_addsub_minmax_params {
// Scalar layout (unconditional). Contains a `rounding` field that the scalar
// layout of xnn_qs8_addsub_minmax_params does not have.
| struct {
| int32_t bias;
| int32_t a_multiplier;
| int32_t b_multiplier;
| int32_t rounding;
| uint32_t shift;
| int32_t output_min_less_zero_point;
| int32_t output_max_less_zero_point;
| int32_t output_zero_point;
| } scalar;
// NEON layout: stores per-input zero points and a right_shift instead of the
// bias/shift pair used by the other layouts.
| #if XNN_ARCH_ARM || XNN_ARCH_ARM64
| struct {
| uint8_t a_zero_point;
| uint8_t b_zero_point;
| int16_t output_zero_point;
| int32_t a_multiplier;
| int32_t b_multiplier;
| int32_t right_shift;
| uint8_t output_min;
| uint8_t output_max;
| } neon;
| #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
| #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// sse2: multipliers are split into 16-bit lo/hi arrays; `shift` and
// `b_multiplier` are plain uint32_t scalars sitting between aligned arrays.
| struct {
| XNN_ALIGN(16) int32_t bias[4];
| XNN_ALIGN(16) uint16_t a_multiplier_lo[8];
| XNN_ALIGN(16) uint16_t a_multiplier_hi[8];
| XNN_ALIGN(16) uint16_t b_multiplier_lo[8];
| XNN_ALIGN(16) uint16_t b_multiplier_hi[8];
| uint32_t shift;
| uint32_t b_multiplier;
| XNN_ALIGN(16) int16_t output_zero_point[8];
| XNN_ALIGN(16) uint8_t output_min[16];
| XNN_ALIGN(16) uint8_t output_max[16];
| } sse2;
// sse4: full 32-bit multipliers and a per-lane shift array.
| struct {
| XNN_ALIGN(16) int32_t bias[4];
| XNN_ALIGN(16) int32_t a_multiplier[4];
| XNN_ALIGN(16) int32_t b_multiplier[4];
| XNN_ALIGN(16) uint32_t shift[4];
| XNN_ALIGN(16) int16_t output_zero_point[8];
| XNN_ALIGN(16) uint8_t output_min[16];
| XNN_ALIGN(16) uint8_t output_max[16];
| } sse4;
// avx2: 32-byte-aligned fields, except output_min/output_max which stay
// 16 elements wide with 16-byte alignment.
| struct {
| XNN_ALIGN(32) int32_t bias[8];
| XNN_ALIGN(32) int32_t a_multiplier[8];
| XNN_ALIGN(32) int32_t b_multiplier[8];
| XNN_ALIGN(32) uint32_t shift[8];
| XNN_ALIGN(32) int16_t output_zero_point[16];
| XNN_ALIGN(16) uint8_t output_min[16];
| XNN_ALIGN(16) uint8_t output_max[16];
| } avx2;
// avx512: 64-byte-aligned fields, except output_min/output_max which are
// 32 elements wide with 32-byte alignment.
| struct {
| XNN_ALIGN(64) int32_t bias[16];
| XNN_ALIGN(64) int32_t a_multiplier[16];
| XNN_ALIGN(64) int32_t b_multiplier[16];
| XNN_ALIGN(64) uint32_t shift[16];
| XNN_ALIGN(64) int16_t output_zero_point[32];
| XNN_ALIGN(32) uint8_t output_min[32];
| XNN_ALIGN(32) uint8_t output_max[32];
| } avx512;
| #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
// WebAssembly SIMD layout: 8-byte-aligned arrays with a single scalar `shift`.
| #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
| struct {
| XNN_ALIGN(8) int32_t bias[2];
| XNN_ALIGN(8) int32_t a_multiplier[2];
| XNN_ALIGN(8) int32_t b_multiplier[2];
| uint32_t shift;
| XNN_ALIGN(8) int16_t output_zero_point[4];
| XNN_ALIGN(8) uint8_t output_min[8];
| XNN_ALIGN(8) uint8_t output_max[8];
| } wasmsimd;
| #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
| };
| |
// Parameter union named for QS8 add/subtract min/max micro-kernels (its users
// are not visible in this part of the file). Each struct member is an
// alternative layout selected by member name; architecture-specific members
// exist only under the matching #if XNN_ARCH_* guard.
// NOTE(review): layouts are presumably read directly by the kernels — confirm
// before changing field order, types, or alignment.
| union xnn_qs8_addsub_minmax_params {
// Scalar layout (unconditional); all fields are 32-bit integers.
| struct {
| int32_t bias;
| int32_t a_multiplier;
| int32_t b_multiplier;
| uint32_t shift;
| int32_t output_min_less_zero_point;
| int32_t output_max_less_zero_point;
| int32_t output_zero_point;
| } scalar;
// NEON layout: stores per-input zero points and a right_shift instead of the
// bias/shift pair used by the other layouts.
| #if XNN_ARCH_ARM || XNN_ARCH_ARM64
| struct {
| int8_t a_zero_point;
| int8_t b_zero_point;
| int16_t output_zero_point;
| int32_t a_multiplier;
| int32_t b_multiplier;
| int32_t right_shift;
| int8_t output_min;
| int8_t output_max;
| } neon;
| #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
| #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// sse2: 16-bit lo/hi multiplier halves; output bounds are int16_t[8].
// `shift` and `b_multiplier` are plain uint32_t scalars between aligned arrays.
| struct {
| XNN_ALIGN(16) int32_t bias[4];
| XNN_ALIGN(16) uint16_t a_multiplier_lo[8];
| XNN_ALIGN(16) uint16_t a_multiplier_hi[8];
| XNN_ALIGN(16) uint16_t b_multiplier_lo[8];
| XNN_ALIGN(16) uint16_t b_multiplier_hi[8];
| uint32_t shift;
| uint32_t b_multiplier;
| XNN_ALIGN(16) int16_t output_zero_point[8];
| XNN_ALIGN(16) int16_t output_min[8];
| XNN_ALIGN(16) int16_t output_max[8];
| } sse2;
// sse4_mul16: same fields as sse2 except output bounds are int8_t[16].
| struct {
| XNN_ALIGN(16) int32_t bias[4];
| XNN_ALIGN(16) uint16_t a_multiplier_lo[8];
| XNN_ALIGN(16) uint16_t a_multiplier_hi[8];
| XNN_ALIGN(16) uint16_t b_multiplier_lo[8];
| XNN_ALIGN(16) uint16_t b_multiplier_hi[8];
| uint32_t shift;
| uint32_t b_multiplier;
| XNN_ALIGN(16) int16_t output_zero_point[8];
| XNN_ALIGN(16) int8_t output_min[16];
| XNN_ALIGN(16) int8_t output_max[16];
| } sse4_mul16;
// sse4_mul32: full 32-bit multipliers and a per-lane shift array.
| struct {
| XNN_ALIGN(16) int32_t bias[4];
| XNN_ALIGN(16) int32_t a_multiplier[4];
| XNN_ALIGN(16) int32_t b_multiplier[4];
| XNN_ALIGN(16) uint32_t shift[4];
| XNN_ALIGN(16) int16_t output_zero_point[8];
| XNN_ALIGN(16) int8_t output_min[16];
| XNN_ALIGN(16) int8_t output_max[16];
| } sse4_mul32;
// avx2: 32-byte-aligned fields, except output_min/output_max which stay
// 16 elements wide with 16-byte alignment.
| struct {
| XNN_ALIGN(32) int32_t bias[8];
| XNN_ALIGN(32) int32_t a_multiplier[8];
| XNN_ALIGN(32) int32_t b_multiplier[8];
| XNN_ALIGN(32) uint32_t shift[8];
| XNN_ALIGN(32) int16_t output_zero_point[16];
| XNN_ALIGN(16) int8_t output_min[16];
| XNN_ALIGN(16) int8_t output_max[16];
| } avx2;
// avx512: 64-byte-aligned fields, except output_min/output_max which are
// 32 elements wide with 32-byte alignment.
| struct {
| XNN_ALIGN(64) int32_t bias[16];
| XNN_ALIGN(64) int32_t a_multiplier[16];
| XNN_ALIGN(64) int32_t b_multiplier[16];
| XNN_ALIGN(64) uint32_t shift[16];
| XNN_ALIGN(64) int16_t output_zero_point[32];
| XNN_ALIGN(32) int8_t output_min[32];
| XNN_ALIGN(32) int8_t output_max[32];
| } avx512;
| #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
// WebAssembly SIMD layout: 8-byte-aligned arrays with a single scalar `shift`.
| #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
| struct {
| XNN_ALIGN(8) int32_t bias[2];
| XNN_ALIGN(8) int32_t a_multiplier[2];
| XNN_ALIGN(8) int32_t b_multiplier[2];
| uint32_t shift;
| XNN_ALIGN(8) int16_t output_zero_point[4];
| XNN_ALIGN(8) int8_t output_min[8];
| XNN_ALIGN(8) int8_t output_max[8];
| } wasmsimd;
| #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
| };
| |
// Parameter union named for QU8 multiply min/max micro-kernels (its users are
// not visible in this part of the file). Each struct member is an alternative
// layout selected by member name; architecture-specific members exist only
// under the matching #if XNN_ARCH_* guard.
// Every layout starts with the two input zero points (a_zero_point, b_zero_point).
| union xnn_qu8_mul_minmax_params {
// Scalar layout (unconditional): float scale and float clamp bounds plus
// magic-bias fields.
| struct {
| int32_t a_zero_point;
| int32_t b_zero_point;
| float scale;
| float output_min_less_zero_point;
| float output_max_less_zero_point;
| float magic_bias;
| int32_t magic_bias_less_output_zero_point;
| } fp32_scalar;
// ARM / ARM64 layouts: input zero points are two-element uint8_t arrays.
| #if XNN_ARCH_ARM || XNN_ARCH_ARM64
| struct {
| uint8_t a_zero_point[2];
| uint8_t b_zero_point[2];
| float scale;
| float magic_bias;
| int32_t magic_bias_less_output_zero_point;
| uint8_t output_min;
| uint8_t output_max;
| } fp32_neon;
| struct {
| uint8_t a_zero_point[2];
| uint8_t b_zero_point[2];
| float scale;
| int16_t output_zero_point;
| uint8_t output_min;
| uint8_t output_max;
| } fp32_neonv8;
// Integer-only layout: two shifts and a multiplier replace the float scale.
| struct {
| uint8_t a_zero_point[2];
| uint8_t b_zero_point[2];
| int32_t left_pre_shift;
| int32_t multiplier;
| int32_t left_post_shift;
| int16_t output_zero_point;
| uint8_t output_min;
| uint8_t output_max;
| } rndnu_neon;
| #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
// x86 layout: 16-byte-aligned arrays; zero points are widened to int16_t.
| #if XNN_ARCH_X86 || XNN_ARCH_X86_64
| struct {
| XNN_ALIGN(16) int16_t a_zero_point[8];
| XNN_ALIGN(16) int16_t b_zero_point[8];
| XNN_ALIGN(16) float scale[4];
| XNN_ALIGN(16) int16_t output_zero_point[8];
| XNN_ALIGN(16) uint8_t output_min[16];
| XNN_ALIGN(16) uint8_t output_max[16];
| } fp32_sse2;
| #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
// WebAssembly SIMD layout: lower bound is stored as magic_min, upper bound as
// output_max (no magic_max / output_min fields).
| #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
| struct {
| XNN_ALIGN(8) int16_t a_zero_point[4];
| XNN_ALIGN(8) int16_t b_zero_point[4];
| XNN_ALIGN(8) float scale[2];
| XNN_ALIGN(8) float magic_bias[2];
| XNN_ALIGN(8) int32_t magic_min[2];
| XNN_ALIGN(8) int32_t magic_bias_less_output_zero_point[2];
| XNN_ALIGN(8) uint8_t output_max[8];
| } fp32_wasmsimd;
| #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
| };
| |
// Parameter union named for QS8 multiply min/max micro-kernels (its users are
// not visible in this part of the file). Each struct member is an alternative
// layout selected by member name; architecture-specific members exist only
// under the matching #if XNN_ARCH_* guard.
// Every layout starts with the two input zero points (a_zero_point, b_zero_point).
| union xnn_qs8_mul_minmax_params {
// Scalar layout (unconditional): float scale and float clamp bounds plus
// magic-bias fields.
| struct {
| int32_t a_zero_point;
| int32_t b_zero_point;
| float scale;
| float output_min_less_zero_point;
| float output_max_less_zero_point;
| float magic_bias;
| int32_t magic_bias_less_output_zero_point;
| } fp32_scalar;
// ARM / ARM64 layouts: input zero points are two-element int8_t arrays.
| #if XNN_ARCH_ARM || XNN_ARCH_ARM64
| struct {
| int8_t a_zero_point[2];
| int8_t b_zero_point[2];
| float scale;
| float magic_bias;
| int32_t magic_bias_less_output_zero_point;
| int8_t output_min;
| int8_t output_max;
| } fp32_neon;
| struct {
| int8_t a_zero_point[2];
| int8_t b_zero_point[2];
| float scale;
| int16_t output_zero_point;
| int8_t output_min;
| int8_t output_max;
| } fp32_neonv8;
// Integer-only layout: two shifts and a multiplier replace the float scale.
| struct {
| int8_t a_zero_point[2];
| int8_t b_zero_point[2];
| int32_t left_pre_shift;
| int32_t multiplier;
| int32_t left_post_shift;
| int16_t output_zero_point;
| int8_t output_min;
| int8_t output_max;
| } rndnu_neon;
| #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
// x86 layouts: 16-byte-aligned arrays. fp32_sse2 keeps the output bounds as
// int16_t[8]; fp32_sse4 keeps them as int8_t[16]. Other fields are identical.
| #if XNN_ARCH_X86 || XNN_ARCH_X86_64
| struct {
| XNN_ALIGN(16) int16_t a_zero_point[8];
| XNN_ALIGN(16) int16_t b_zero_point[8];
| XNN_ALIGN(16) float scale[4];
| XNN_ALIGN(16) int16_t output_zero_point[8];
| XNN_ALIGN(16) int16_t output_min[8];
| XNN_ALIGN(16) int16_t output_max[8];
| } fp32_sse2;
| struct {
| XNN_ALIGN(16) int16_t a_zero_point[8];
| XNN_ALIGN(16) int16_t b_zero_point[8];
| XNN_ALIGN(16) float scale[4];
| XNN_ALIGN(16) int16_t output_zero_point[8];
| XNN_ALIGN(16) int8_t output_min[16];
| XNN_ALIGN(16) int8_t output_max[16];
| } fp32_sse4;
| #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
// WebAssembly SIMD layout: lower bound is stored as magic_min, upper bound as
// output_max (no magic_max / output_min fields).
| #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
| struct {
| XNN_ALIGN(8) int16_t a_zero_point[4];
| XNN_ALIGN(8) int16_t b_zero_point[4];
| XNN_ALIGN(8) float scale[2];
| XNN_ALIGN(8) float magic_bias[2];
| XNN_ALIGN(8) int32_t magic_min[2];
| XNN_ALIGN(8) int32_t magic_bias_less_output_zero_point[2];
| XNN_ALIGN(8) int8_t output_max[8];
| } fp32_wasmsimd;
| #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
| };
| |
// Parameter union named for QU8 average-pooling min/max micro-kernels (its
// users are not visible in this part of the file). It holds two families of
// layouts: the fp32_*/rndnu_* members first, then the members introduced by the
// "Legacy parameters" comment further down (scalar, neon, sse2).
// Architecture-specific members exist only under the matching #if XNN_ARCH_* guard.
// All non-legacy layouts begin with init_bias.
| union xnn_qu8_avgpool_minmax_params {
// Scalar fp32 layouts (unconditional).
| struct {
| int32_t init_bias;
| float scale;
| float output_min_less_zero_point;
| float output_max_less_zero_point;
| float magic_bias;
| int32_t magic_bias_less_output_zero_point;
| } fp32_scalar_fmagic;
// Clamp bounds are int32_t magic_min/magic_max here; the last field is named
// magic_bias_less_zero_point (no "output_").
| struct {
| int32_t init_bias;
| float scale;
| float magic_bias;
| int32_t magic_min;
| int32_t magic_max;
| int32_t magic_bias_less_zero_point;
| } fp32_scalar_imagic;
| struct {
| int32_t init_bias;
| float scale;
| float output_min_less_zero_point;
| float output_max_less_zero_point;
| int32_t output_zero_point;
| } fp32_scalar_lrintf;
// ARM / ARM64 layouts: plain (non-array) fields, uint8_t output bounds.
| #if XNN_ARCH_ARM || XNN_ARCH_ARM64
| struct {
| int32_t init_bias;
| float scale;
| float magic_bias;
| int32_t magic_bias_less_output_zero_point;
| uint8_t output_min;
| uint8_t output_max;
| } fp32_neon;
| struct {
| int32_t init_bias;
| float scale;
| int16_t output_zero_point;
| uint8_t output_min;
| uint8_t output_max;
| } fp32_neonv8;
// Integer-only layout: two shifts and a multiplier replace the float scale.
| struct {
| int32_t init_bias;
| int32_t left_pre_shift;
| int32_t multiplier;
| int32_t left_post_shift;
| int16_t output_zero_point;
| uint8_t output_min;
| uint8_t output_max;
| } rndnu_neon;
| #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
// x86 fp32 layouts: fp32_sse2 and fp32_sse4 declare identical fields. There is
// no separate output_max; only output_max_less_zero_point and output_min.
| #if XNN_ARCH_X86 || XNN_ARCH_X86_64
| struct {
| XNN_ALIGN(16) int32_t init_bias[4];
| XNN_ALIGN(16) float scale[4];
| XNN_ALIGN(16) float output_max_less_zero_point[4];
| XNN_ALIGN(16) int16_t output_zero_point[8];
| XNN_ALIGN(16) uint8_t output_min[16];
| } fp32_sse2;
| struct {
| XNN_ALIGN(16) int32_t init_bias[4];
| XNN_ALIGN(16) float scale[4];
| XNN_ALIGN(16) float output_max_less_zero_point[4];
| XNN_ALIGN(16) int16_t output_zero_point[8];
| XNN_ALIGN(16) uint8_t output_min[16];
| } fp32_sse4;
| #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
// WebAssembly SIMD layout: lower bound is stored as magic_min, upper bound as
// output_max.
| #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
| struct {
| XNN_ALIGN(8) int32_t init_bias[2];
| XNN_ALIGN(8) float scale[2];
| XNN_ALIGN(8) float magic_bias[2];
| XNN_ALIGN(8) int32_t magic_min[2];
| XNN_ALIGN(8) int32_t magic_bias_less_output_zero_point[2];
| XNN_ALIGN(8) uint8_t output_max[8];
| } fp32_wasmsimd;
| #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
| |
| // Legacy parameters used by QU8 AVGPOOL microkernels
// The legacy layouts use `bias` (not init_bias), an integer multiplier, and
// 64-bit rounding/shift fields instead of a float scale.
| struct {
| int32_t bias;
| int32_t multiplier;
| int64_t rounding;
| uint32_t right_shift;
| int32_t output_min_less_zero_point;
| int32_t output_max_less_zero_point;
| int32_t output_zero_point;
| } scalar;
// Legacy NEON layout: the shift is a signed 64-bit left_shift.
| #if XNN_ARCH_ARM || XNN_ARCH_ARM64
| struct {
| int32_t bias;
| int32_t multiplier;
| int64_t left_shift;
| int16_t output_zero_point;
| uint8_t output_min;
| uint8_t output_max;
| } neon;
| #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
// Legacy SSE2 layout: 16-byte-aligned arrays; rounding and right_shift are
// two-element uint64_t arrays.
| #if XNN_ARCH_X86 || XNN_ARCH_X86_64
| struct {
| XNN_ALIGN(16) int32_t bias[4];
| XNN_ALIGN(16) uint32_t multiplier[4];
| XNN_ALIGN(16) uint64_t rounding[2];
| XNN_ALIGN(16) uint64_t right_shift[2];
| XNN_ALIGN(16) int16_t output_zero_point[8];
| XNN_ALIGN(16) uint8_t output_min[16];
| XNN_ALIGN(16) uint8_t output_max[16];
| } sse2;
| #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
| };
| |
// Parameter union named for QS8 average-pooling min/max micro-kernels (its
// users are not visible in this part of the file). Each struct member is an
// alternative layout selected by member name; architecture-specific members
// exist only under the matching #if XNN_ARCH_* guard.
// Every layout begins with init_bias. Unlike xnn_qu8_avgpool_minmax_params,
// this union declares no legacy scalar/neon/sse2 members.
| union xnn_qs8_avgpool_minmax_params {
// Scalar fp32 layouts (unconditional).
| struct {
| int32_t init_bias;
| float scale;
| float output_min_less_zero_point;
| float output_max_less_zero_point;
| float magic_bias;
| int32_t magic_bias_less_output_zero_point;
| } fp32_scalar_fmagic;
// Clamp bounds are int32_t magic_min/magic_max here; the last field is named
// magic_bias_less_zero_point (no "output_").
| struct {
| int32_t init_bias;
| float scale;
| float magic_bias;
| int32_t magic_min;
| int32_t magic_max;
| int32_t magic_bias_less_zero_point;
| } fp32_scalar_imagic;
| struct {
| int32_t init_bias;
| float scale;
| float output_min_less_zero_point;
| float output_max_less_zero_point;
| int32_t output_zero_point;
| } fp32_scalar_lrintf;
// ARM / ARM64 layouts: plain (non-array) fields, int8_t output bounds.
| #if XNN_ARCH_ARM || XNN_ARCH_ARM64
| struct {
| int32_t init_bias;
| float scale;
| float magic_bias;
| int32_t magic_bias_less_output_zero_point;
| int8_t output_min;
| int8_t output_max;
| } fp32_neon;
| struct {
| int32_t init_bias;
| float scale;
| int16_t output_zero_point;
| int8_t output_min;
| int8_t output_max;
| } fp32_neonv8;
// Integer-only layout: two shifts and a multiplier replace the float scale.
| struct {
| int32_t init_bias;
| int32_t left_pre_shift;
| int32_t multiplier;
| int32_t left_post_shift;
| int16_t output_zero_point;
| int8_t output_min;
| int8_t output_max;
| } rndnu_neon;
| #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
// x86 layouts: fp32_sse2 keeps output_min as int16_t[8], fp32_sse4 as
// int8_t[16]. Neither has a separate output_max field.
| #if XNN_ARCH_X86 || XNN_ARCH_X86_64
| struct {
| XNN_ALIGN(16) int32_t init_bias[4];
| XNN_ALIGN(16) float scale[4];
| XNN_ALIGN(16) float output_max_less_zero_point[4];
| XNN_ALIGN(16) int16_t output_zero_point[8];
| XNN_ALIGN(16) int16_t output_min[8];
| } fp32_sse2;
| struct {
| XNN_ALIGN(16) int32_t init_bias[4];
| XNN_ALIGN(16) float scale[4];
| XNN_ALIGN(16) float output_max_less_zero_point[4];
| XNN_ALIGN(16) int16_t output_zero_point[8];
| XNN_ALIGN(16) int8_t output_min[16];
| } fp32_sse4;
| #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
// WebAssembly SIMD layout: lower bound is stored as magic_min, upper bound as
// output_max.
| #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
| struct {
| XNN_ALIGN(8) int32_t init_bias[2];
| XNN_ALIGN(8) float scale[2];
| XNN_ALIGN(8) float magic_bias[2];
| XNN_ALIGN(8) int32_t magic_min[2];
| XNN_ALIGN(8) int32_t magic_bias_less_output_zero_point[2];
| XNN_ALIGN(8) int8_t output_max[8];
| } fp32_wasmsimd;
| #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
| };
| |
// Parameter union named for F16 -> F32 conversion micro-kernels (its users are
// not visible in this part of the file). Each struct member is an alternative
// layout of conversion constants, selected by member name; architecture-specific
// members exist only under the matching #if XNN_ARCH_* guard.
// NOTE(review): the constants' values and exact roles are set elsewhere —
// the field names are the only description available here.
| union xnn_f16_f32_cvt_params {
// Scalar layout (unconditional): 32-bit masks/offsets plus two float constants.
| struct {
| uint32_t sign_mask;
| uint32_t exp_offset;
| float exp_scale;
| uint32_t magic_mask;
| float magic_bias;
| uint32_t denorm_cutoff;
| } scalar;
// NEON layout carries only exp_scale.
| #if XNN_ARCH_ARM || XNN_ARCH_ARM64
| struct {
| float exp_scale;
| } neon;
| #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
// x86 layouts, 16-byte-aligned arrays. sse_int16 stores the integer constants
// as 16-bit lanes; sse_int32 stores them as 32-bit lanes, has no magic_mask
// field, and declares magic_bias as uint32_t rather than float.
| #if XNN_ARCH_X86 || XNN_ARCH_X86_64
| struct {
| XNN_ALIGN(16) uint16_t sign_mask[8];
| XNN_ALIGN(16) uint16_t exp_offset[8];
| XNN_ALIGN(16) float exp_scale[4];
| XNN_ALIGN(16) uint16_t magic_mask[8];
| XNN_ALIGN(16) float magic_bias[4];
| XNN_ALIGN(16) int16_t denorm_cutoff[8];
| } sse_int16;
| struct {
| XNN_ALIGN(16) uint32_t sign_mask[4];
| XNN_ALIGN(16) uint32_t exp_offset[4];
| XNN_ALIGN(16) float exp_scale[4];
| XNN_ALIGN(16) uint32_t magic_bias[4];
| XNN_ALIGN(16) int32_t denorm_cutoff[4];
| } sse_int32;
| #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
// WebAssembly SIMD layouts: same field sets as sse_int16 / sse_int32, with
// half as many elements per array and 8-byte alignment.
| #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
| struct {
| XNN_ALIGN(8) uint16_t sign_mask[4];
| XNN_ALIGN(8) uint16_t exp_offset[4];
| XNN_ALIGN(8) float exp_scale[2];
| XNN_ALIGN(8) uint16_t magic_mask[4];
| XNN_ALIGN(8) float magic_bias[2];
| XNN_ALIGN(8) int16_t denorm_cutoff[4];
| } wasmsimd_int16;
| struct {
| XNN_ALIGN(8) uint32_t sign_mask[2];
| XNN_ALIGN(8) uint32_t exp_offset[2];
| XNN_ALIGN(8) float exp_scale[2];
| XNN_ALIGN(8) uint32_t magic_bias[2];
| XNN_ALIGN(8) int32_t denorm_cutoff[2];
| } wasmsimd_int32;
| #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
| };
| |
// Parameter union named for F32 -> F16 conversion micro-kernels (its users are
// not visible in this part of the file). Each struct member is an alternative
// layout of conversion constants, selected by member name; architecture-specific
// members exist only under the matching #if XNN_ARCH_* guard.
// NOTE(review): the constants' values and exact roles are set elsewhere —
// the field names are the only description available here.
| union xnn_f32_f16_cvt_params {
// Scalar layouts (unconditional). scalar_fabsf has the same fields as
// scalar_bitcast minus nonsign_mask, in a different order.
| struct {
| uint32_t nonsign_mask;
| uint32_t exp_bias;
| float scale_to_inf;
| uint32_t expw_max;
| float scale_to_zero;
| uint32_t bias_min;
| uint16_t exph_mask;
| uint16_t manth_mask;
| uint16_t nanh;
| } scalar_bitcast;
| struct {
| float scale_to_inf;
| uint32_t exp_bias;
| float scale_to_zero;
| uint32_t expw_max;
| uint32_t bias_min;
| uint16_t exph_mask;
| uint16_t manth_mask;
| uint16_t nanh;
| } scalar_fabsf;
// NEON layout: only the exponent bias/limit and the two scale constants.
| #if XNN_ARCH_ARM || XNN_ARCH_ARM64
| struct {
| uint32_t exp_bias;
| float scale_to_inf;
| uint32_t expw_max;
| float scale_to_zero;
| } neon;
| #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
| #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// sse2: 16-byte-aligned arrays. Note the element types differ from the scalar
// layouts: bias_min is int16_t, manth_mask/exph_mask are uint32_t.
| struct {
| XNN_ALIGN(16) uint32_t nonsign_mask[4];
| XNN_ALIGN(16) uint32_t exp_bias[4];
| XNN_ALIGN(16) float scale_to_inf[4];
| XNN_ALIGN(16) uint32_t expw_max[4];
| XNN_ALIGN(16) float scale_to_zero[4];
| XNN_ALIGN(16) int16_t bias_min[8];
| XNN_ALIGN(16) uint32_t manth_mask[4];
| XNN_ALIGN(16) uint32_t exph_mask[4];
| XNN_ALIGN(16) uint16_t nanh[8];
| } sse2;
// f16c: holds only a 14-entry int32_t table and no conversion constants.
// NOTE(review): presumably a lane mask table for partial vectors — confirm.
| struct {
| int32_t mask_table[14];
| } f16c;
| #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
// WebAssembly SIMD layout: same fields as sse2 except nonsign_mask is absent;
// arrays are half as long with 8-byte alignment.
| #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
| struct {
| XNN_ALIGN(8) uint32_t exp_bias[2];
| XNN_ALIGN(8) float scale_to_inf[2];
| XNN_ALIGN(8) uint32_t expw_max[2];
| XNN_ALIGN(8) float scale_to_zero[2];
| XNN_ALIGN(8) int16_t bias_min[4];
| XNN_ALIGN(8) uint32_t manth_mask[2];
| XNN_ALIGN(8) uint32_t exph_mask[2];
| XNN_ALIGN(8) uint16_t nanh[4];
| } wasmsimd;
| #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
| };
| |
// Parameter union named for F32 -> QS8 conversion micro-kernels (its users are
// not visible in this part of the file). Each struct member is an alternative
// layout selected by member name; architecture-specific members exist only
// under the matching #if XNN_ARCH_* guard. Every layout starts with `scale`.
| union xnn_f32_qs8_cvt_params {
// Scalar layouts (unconditional).
| struct {
| float scale;
| float output_min_less_zero_point;
| float output_max_less_zero_point;
| float magic_bias;
| int32_t magic_bias_less_zero_point;
| } scalar_fmagic;
// Integer clamp bounds (magic_min/magic_max) instead of float bounds.
| struct {
| float scale;
| float magic_bias;
| int32_t magic_min;
| int32_t magic_max;
| int32_t magic_bias_less_zero_point;
| } scalar_imagic;
// No magic-bias fields; the zero point is stored directly.
| struct {
| float scale;
| float output_min_less_zero_point;
| float output_max_less_zero_point;
| int32_t output_zero_point;
| } scalar_lrintf;
// ARM / ARM64 layouts: plain (non-array) fields, int8_t output bounds.
| #if XNN_ARCH_ARM || XNN_ARCH_ARM64
| struct {
| float scale;
| float magic_bias;
| int32_t magic_bias_less_zero_point;
| int8_t output_min;
| int8_t output_max;
| } neon;
| struct {
| float scale;
| int16_t output_zero_point;
| int8_t output_min;
| int8_t output_max;
| } neonv8;
| #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
// x86 layouts: aligned arrays; no separate output_max field (only
// output_max_less_zero_point and output_min).
| #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// sse2 keeps output_min as int16_t[8]; sse4 keeps it as int8_t[16].
| struct {
| XNN_ALIGN(16) float scale[4];
| XNN_ALIGN(16) float output_max_less_zero_point[4];
| XNN_ALIGN(16) int16_t output_zero_point[8];
| XNN_ALIGN(16) int16_t output_min[8];
| } sse2;
| struct {
| XNN_ALIGN(16) float scale[4];
| XNN_ALIGN(16) float output_max_less_zero_point[4];
| XNN_ALIGN(16) int16_t output_zero_point[8];
| XNN_ALIGN(16) int8_t output_min[16];
| } sse4;
// avx: float fields are 8 wide / 32-byte aligned, while the integer fields
// stay 16-byte aligned. Ends with an unaligned 14-entry int32_t mask_table.
// NOTE(review): mask_table is presumably for partial-vector handling — confirm.
| struct {
| XNN_ALIGN(32) float scale[8];
| XNN_ALIGN(32) float output_max_less_zero_point[8];
| XNN_ALIGN(16) int16_t output_zero_point[8];
| XNN_ALIGN(16) int8_t output_min[16];
| int32_t mask_table[14];
| } avx;
// avx2: all vector fields 32-byte aligned; adds shuffle_mask and keeps the
// trailing mask_table.
| struct {
| XNN_ALIGN(32) float scale[8];
| XNN_ALIGN(32) float output_max_less_zero_point[8];
| XNN_ALIGN(32) int16_t output_zero_point[16];
| XNN_ALIGN(32) uint32_t shuffle_mask[8];
| XNN_ALIGN(32) int8_t output_min[32];
| int32_t mask_table[14];
| } avx2;
// avx512: 64-byte-aligned fields plus two shuffle masks (the 256-bit one is
// 32-byte aligned); no mask_table.
| struct {
| XNN_ALIGN(64) float scale[16];
| XNN_ALIGN(64) float output_max_less_zero_point[16];
| XNN_ALIGN(64) int16_t output_zero_point[32];
| XNN_ALIGN(64) int8_t output_min[64];
| XNN_ALIGN(64) uint32_t shuffle512_mask[16];
| XNN_ALIGN(32) uint32_t shuffle256_mask[8];
| } avx512;
| #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
// WebAssembly SIMD layouts: wasmsimd_cvt stores the zero point and both int8_t
// bounds; wasmsimd_magic stores magic-bias constants, magic_min and output_max.
| #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
| struct {
| XNN_ALIGN(8) float scale[2];
| XNN_ALIGN(8) int16_t output_zero_point[4];
| XNN_ALIGN(8) int8_t output_min[8];
| XNN_ALIGN(8) int8_t output_max[8];
| } wasmsimd_cvt;
| struct {
| XNN_ALIGN(8) float scale[2];
| XNN_ALIGN(8) float magic_bias[2];
| XNN_ALIGN(8) int32_t magic_min[2];
| XNN_ALIGN(8) int32_t magic_bias_less_zero_point[2];
| XNN_ALIGN(8) int8_t output_max[8];
| } wasmsimd_magic;
| #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
| };
| |
// Parameter union named for F32 -> QU8 conversion micro-kernels (its users are
// not visible in this part of the file). Each struct member is an alternative
// layout selected by member name; architecture-specific members exist only
// under the matching #if XNN_ARCH_* guard. Every layout starts with `scale`.
// Unlike xnn_f32_qs8_cvt_params, there is no sse4 member here.
| union xnn_f32_qu8_cvt_params {
// Scalar layouts (unconditional).
| struct {
| float scale;
| float output_min_less_zero_point;
| float output_max_less_zero_point;
| float magic_bias;
| int32_t magic_bias_less_zero_point;
| } scalar_fmagic;
// Integer clamp bounds (magic_min/magic_max) instead of float bounds.
| struct {
| float scale;
| float magic_bias;
| int32_t magic_min;
| int32_t magic_max;
| int32_t magic_bias_less_zero_point;
| } scalar_imagic;
// No magic-bias fields; the zero point is stored directly.
| struct {
| float scale;
| float output_min_less_zero_point;
| float output_max_less_zero_point;
| int32_t output_zero_point;
| } scalar_lrintf;
// ARM / ARM64 layouts: plain (non-array) fields, uint8_t output bounds.
| #if XNN_ARCH_ARM || XNN_ARCH_ARM64
| struct {
| float scale;
| float magic_bias;
| int32_t magic_bias_less_zero_point;
| uint8_t output_min;
| uint8_t output_max;
| } neon;
| struct {
| float scale;
| int16_t output_zero_point;
| uint8_t output_min;
| uint8_t output_max;
| } neonv8;
| #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
// x86 layouts: aligned arrays; no separate output_max field (only
// output_max_less_zero_point and output_min).
| #if XNN_ARCH_X86 || XNN_ARCH_X86_64
| struct {
| XNN_ALIGN(16) float scale[4];
| XNN_ALIGN(16) float output_max_less_zero_point[4];
| XNN_ALIGN(16) int16_t output_zero_point[8];
| XNN_ALIGN(16) uint8_t output_min[16];
| } sse2;
// avx: float fields are 8 wide / 32-byte aligned, while the integer fields
// stay 16-byte aligned. Ends with an unaligned 14-entry int32_t mask_table.
// NOTE(review): mask_table is presumably for partial-vector handling — confirm.
| struct {
| XNN_ALIGN(32) float scale[8];
| XNN_ALIGN(32) float output_max_less_zero_point[8];
| XNN_ALIGN(16) int16_t output_zero_point[8];
| XNN_ALIGN(16) uint8_t output_min[16];
| int32_t mask_table[14];
| } avx;
// avx2: all vector fields 32-byte aligned; adds shuffle_mask and keeps the
// trailing mask_table.
| struct {
| XNN_ALIGN(32) float scale[8];
| XNN_ALIGN(32) float output_max_less_zero_point[8];
| XNN_ALIGN(32) int16_t output_zero_point[16];
| XNN_ALIGN(32) uint32_t shuffle_mask[8];
| XNN_ALIGN(32) uint8_t output_min[32];
| int32_t mask_table[14];
| } avx2;
// avx512: 64-byte-aligned fields plus two shuffle masks (the 256-bit one is
// 32-byte aligned); no mask_table.
| struct {
| XNN_ALIGN(64) float scale[16];
| XNN_ALIGN(64) float output_max_less_zero_point[16];
| XNN_ALIGN(64) int16_t output_zero_point[32];
| XNN_ALIGN(64) uint8_t output_min[64];
| XNN_ALIGN(64) uint32_t shuffle512_mask[16];
| XNN_ALIGN(32) uint32_t shuffle256_mask[8];
| } avx512;
| #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
// WebAssembly SIMD layouts: wasmsimd_cvt stores the zero point and both uint8_t
// bounds; wasmsimd_magic stores magic-bias constants, magic_min and output_max.
| #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
| struct {
| XNN_ALIGN(8) float scale[2];
| XNN_ALIGN(8) int16_t output_zero_point[4];
| XNN_ALIGN(8) uint8_t output_min[8];
| XNN_ALIGN(8) uint8_t output_max[8];
| } wasmsimd_cvt;
| struct {
| XNN_ALIGN(8) float scale[2];
| XNN_ALIGN(8) float magic_bias[2];
| XNN_ALIGN(8) int32_t magic_min[2];
| XNN_ALIGN(8) int32_t magic_bias_less_zero_point[2];
| XNN_ALIGN(8) uint8_t output_max[8];
| } wasmsimd_magic;
| #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
| };
| |
// Parameter union named for QS8 -> F32 conversion micro-kernels (its users are
// not visible in this part of the file). Each struct member is an alternative
// layout selected by member name; architecture-specific members exist only
// under the matching #if XNN_ARCH_* guard. Every layout carries a float `scale`.
| union xnn_qs8_f32_cvt_params {
// Scalar layout (unconditional): stores the zero point itself.
| struct {
| int32_t zero_point;
| float scale;
| } scalar;
// NEON layout: the field is named minus_zero_point (int16_t pair).
// NOTE(review): presumably holds the negated zero point — confirm at the init site.
| #if XNN_ARCH_ARM || XNN_ARCH_ARM64
| struct {
| int16_t minus_zero_point[2];
| float scale;
| } neon;
| #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
| #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// sse2: no zero-point field at all; carries sign_mask, magic_exp and magic_bias
// alongside scale.
| struct {
| XNN_ALIGN(16) uint8_t sign_mask[16];
| XNN_ALIGN(16) uint16_t magic_exp[8];
| XNN_ALIGN(16) float magic_bias[4];
| XNN_ALIGN(16) float scale[4];
| } sse2;
// sse4 / avx / avx512: minus_zero_point + scale, widened to 4 / 8 / 16 lanes
// with 16 / 32 / 64-byte alignment.
| struct {
| XNN_ALIGN(16) int32_t minus_zero_point[4];
| XNN_ALIGN(16) float scale[4];
| } sse4;
| struct {
| XNN_ALIGN(32) int32_t minus_zero_point[8];
| XNN_ALIGN(32) float scale[8];
| } avx;
| struct {
| XNN_ALIGN(64) int32_t minus_zero_point[16];
| XNN_ALIGN(64) float scale[16];
| } avx512;
| #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
// WebAssembly SIMD layout: minus_zero_point is int16_t here (int32_t on x86).
| #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
| struct {
| XNN_ALIGN(8) int16_t minus_zero_point[4];
| XNN_ALIGN(8) float scale[2];
| } wasmsimd;
| #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
| };
| |
// Parameter union named for QU8 -> F32 conversion micro-kernels (its users are
// not visible in this part of the file). Each struct member is an alternative
// layout selected by member name; architecture-specific members exist only
// under the matching #if XNN_ARCH_* guard. Every layout carries a float `scale`.
| union xnn_qu8_f32_cvt_params {
// Scalar layout (unconditional): stores the zero point itself.
| struct {
| int32_t zero_point;
| float scale;
| } scalar;
// NEON layout: the field is named minus_zero_point (int16_t pair).
// NOTE(review): presumably holds the negated zero point — confirm at the init site.
| #if XNN_ARCH_ARM || XNN_ARCH_ARM64
| struct {
| int16_t minus_zero_point[2];
| float scale;
| } neon;
| #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
| #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// sse2: no zero-point field; carries magic_exp and magic_bias alongside scale.
// The QS8 counterpart's sse2 layout additionally has a sign_mask field.
| struct {
| XNN_ALIGN(16) uint16_t magic_exp[8];
| XNN_ALIGN(16) float magic_bias[4];
| XNN_ALIGN(16) float scale[4];
| } sse2;
// sse4 / avx / avx512: minus_zero_point + scale, widened to 4 / 8 / 16 lanes
// with 16 / 32 / 64-byte alignment.
| struct {
| XNN_ALIGN(16) int32_t minus_zero_point[4];
| XNN_ALIGN(16) float scale[4];
| } sse4;
| struct {
| XNN_ALIGN(32) int32_t minus_zero_point[8];
| XNN_ALIGN(32) float scale[8];
| } avx;
| struct {
| XNN_ALIGN(64) int32_t minus_zero_point[16];
| XNN_ALIGN(64) float scale[16];
| } avx512;
| #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
// WebAssembly SIMD layout: minus_zero_point is int16_t here (int32_t on x86).
| #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
| struct {
| XNN_ALIGN(8) int16_t minus_zero_point[4];
| XNN_ALIGN(8) float scale[2];
| } wasmsimd;
| #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
| };
| |
// Type-erased PPMM micro-kernel pointer type: `a` and `w` are const void
// pointers, `c` is a non-const void pointer, and `params` is an untyped const
// pointer. The typed PPMM pointer types in this file share this nine-parameter
// shape with concrete element/params types. Returns nothing.
// NOTE(review): mr/nc/kc are presumably the tile row count, output column count
// and reduction size, and the stride units are not visible here — confirm with
// the kernel implementations.
| typedef void (*xnn_ppmm_ukernel_function)(
| size_t mr,
| size_t nc,
| size_t kc,
| const void* a,
| const void* w,
| void* c,
| size_t cm_stride,
| size_t cn_stride,
| const void* params);
| |
// F32 PPMM micro-kernel pointer type with min/max params: same parameter list
// as xnn_ppmm_ukernel_function, but `a`, `w`, `c` are float pointers and
// `params` is a const union xnn_f32_minmax_params pointer. Returns nothing.
// NOTE(review): stride units (bytes vs elements) are not visible here — confirm.
| typedef void (*xnn_f32_ppmm_minmax_ukernel_function)(
| size_t mr,
| size_t nc,
| size_t kc,
| const float* a,
| const float* w,
| float* c,
| size_t cm_stride,
| size_t cn_stride,
| const union xnn_f32_minmax_params* params);
| |
// F16 PPMM micro-kernel pointer type: data pointers stay void* (no half-precision
// element type is used in the signature); `params` is a const
// union xnn_f16_scaleminmax_params pointer. Returns nothing.
// NOTE(review): stride units (bytes vs elements) are not visible here — confirm.
| typedef void (*xnn_f16_ppmm_ukernel_function)(
| size_t mr,
| size_t nc,
| size_t kc,
| const void* a,
| const void* w,
| void* c,
| size_t cm_stride,
| size_t cn_stride,
| const union xnn_f16_scaleminmax_params* params);
| |
// Type-erased GEMM micro-kernel pointer type: `a` and `w` are const void
// pointers, `c` is a non-const void pointer, `params` is an untyped const
// pointer. Compared with the PPMM signature it adds `a_stride` and names the
// size arguments mr/nr/k. The typed GEMM pointer types in this file share this
// ten-parameter shape. Returns nothing.
// NOTE(review): meaning/units of k and the *_stride arguments are not visible
// here (presumably bytes) — confirm with the kernel implementations.
| typedef void (*xnn_gemm_ukernel_function)(
| size_t mr,
| size_t nr,
| size_t k,
| const void* a,
| size_t a_stride,
| const void* w,
| void* c,
| size_t cm_stride,
| size_t cn_stride,
| const void* params);
| |
// F32 GEMM micro-kernel pointer type without fused activation parameters:
// float data pointers and a const union xnn_f32_default_params `params`
// pointer. Same parameter list as xnn_gemm_ukernel_function. Returns nothing.
| typedef void (*xnn_f32_gemm_ukernel_function)(
| size_t mr,
| size_t nr,
| size_t k,
| const float* a,
| size_t a_stride,
| const float* w,
| float* c,
| size_t cm_stride,
| size_t cn_stride,
| const union xnn_f32_default_params* params);
| |
// Transpose micro-kernel pointer type for 8-bit elements: `a` is a const
// uint8_t pointer, `b` a non-const uint8_t pointer; no params argument.
// Returns nothing.
// NOTE(review): presumably `a` is the input and `b` the output, with strides in
// bytes and block sizes in elements — confirm with the kernel implementations.
| typedef void (*xnn_x8_transpose_ukernel_function)(
| const uint8_t* a,
| uint8_t* b,
| size_t input_stride,
| size_t output_stride,
| size_t block_width,
| size_t block_height);
| |
// Transpose micro-kernel pointer type for 16-bit elements: `a` is a const
// uint16_t pointer, `b` a non-const uint16_t pointer; no params argument.
// Returns nothing.
// NOTE(review): presumably `a` is the input and `b` the output, with strides in
// bytes and block sizes in elements — confirm with the kernel implementations.
| typedef void (*xnn_x16_transpose_ukernel_function)(
| const uint16_t* a,
| uint16_t* b,
| size_t input_stride,
| size_t output_stride,
| size_t block_width,
| size_t block_height);
| |
// Transpose micro-kernel pointer type for 32-bit elements: `a` is a const
// uint32_t pointer, `b` a non-const uint32_t pointer; no params argument.
// Returns nothing.
// NOTE(review): presumably `a` is the input and `b` the output, with strides in
// bytes and block sizes in elements — confirm with the kernel implementations.
| typedef void (*xnn_x32_transpose_ukernel_function)(
| const uint32_t* a,
| uint32_t* b,
| size_t input_stride,
| size_t output_stride,
| size_t block_width,
| size_t block_height);
| |
// Transpose micro-kernel pointer type for 64-bit elements: `a` is a const
// uint64_t pointer, `b` a non-const uint64_t pointer; no params argument.
// Returns nothing.
// NOTE(review): presumably `a` is the input and `b` the output, with strides in
// bytes and block sizes in elements — confirm with the kernel implementations.
| typedef void (*xnn_x64_transpose_ukernel_function)(
| const uint64_t* a,
| uint64_t* b,
| size_t input_stride,
| size_t output_stride,
| size_t block_width,
| size_t block_height);
| |
// F32 GEMM micro-kernel pointer type taking ReLU params: identical to
// xnn_f32_gemm_ukernel_function except `params` is a const
// union xnn_f32_relu_params pointer. Returns nothing.
| typedef void (*xnn_f32_gemm_relu_ukernel_function)(
| size_t mr,
| size_t nr,
| size_t k,
| const float* a,
| size_t a_stride,
| const float* w,
| float* c,
| size_t cm_stride,
| size_t cn_stride,
| const union xnn_f32_relu_params* params);
| |
// F32 GEMM micro-kernel pointer type taking min/max params: identical to
// xnn_f32_gemm_ukernel_function except `params` is a const
// union xnn_f32_minmax_params pointer. Returns nothing.
| typedef void (*xnn_f32_gemm_minmax_ukernel_function)(
| size_t mr,
| size_t nr,
| size_t k,
| const float* a,
| size_t a_stride,
| const float* w,
| float* c,
| size_t cm_stride,
| size_t cn_stride,
| const union xnn_f32_minmax_params* params);
| |
// F32 GEMMINC micro-kernel pointer type with min/max params: the
// xnn_f32_gemm_minmax_ukernel_function parameter list plus one extra const
// float pointer `acc` placed just before `params`. Returns nothing.
// NOTE(review): `acc` presumably supplies initial accumulator values — confirm
// with the kernel implementations.
| typedef void (*xnn_f32_gemminc_minmax_ukernel_function)(
| size_t mr,
| size_t nr,
| size_t k,
| const float* a,
| size_t a_stride,
| const float* w,
| float* c,
| size_t cm_stride,
| size_t cn_stride,
| const float* acc,
| const union xnn_f32_minmax_params* params);
| |
// F16 GEMM micro-kernel pointer type: data pointers stay void* and `params` is
// a const union xnn_f16_scaleminmax_params pointer (the scaleminmax union is
// documented at its definition as the one used for gemm/igemm ukernels).
// Same parameter list as xnn_gemm_ukernel_function otherwise. Returns nothing.
| typedef void (*xnn_f16_gemm_minmax_ukernel_function)(
| size_t mr,
| size_t nr,
| size_t k,
| const void* a,
| size_t a_stride,
| const void* w,
| void* c,
| size_t cm_stride,
| size_t cn_stride,
| const union xnn_f16_scaleminmax_params* params);
| |
// F16 IGEMM micro-kernel pointer type: `a` is a pointer to const void pointers
// (an array of input pointers rather than one base pointer), data pointers stay
// void*, and `params` is a const union xnn_f16_scaleminmax_params pointer.
// Returns nothing.
// NOTE(review): presumably `a_offset` is added to each entry of `a` except those
// equal to `zero` — that logic lives in the kernels; confirm there.
| typedef void (*xnn_f16_igemm_minmax_ukernel_function)(
| size_t mr,
| size_t nr,
| size_t kc,
| size_t ks,
| const void** a,
| const void* w,
| void* c,
| size_t cm_stride,
| size_t cn_stride,
| size_t a_offset,
| const void* zero,
| const union xnn_f16_scaleminmax_params* params);
| |
// QC8 GEMM micro-kernel pointer type: int8_t input `a` and output `c`, untyped
// const void* weights `w`. Note that `params` is a const
// union xnn_qs8_minmax_params pointer, not the xnn_qs8_conv_minmax_params
// union taken by xnn_qs8_gemm_minmax_ukernel_function. Returns nothing.
| typedef void (*xnn_qc8_gemm_minmax_ukernel_function)(
| size_t mr,
| size_t nr,
| size_t k,
| const int8_t* a,
| size_t a_stride,
| const void* w,
| int8_t* c,
| size_t cm_stride,
| size_t cn_stride,
| const union xnn_qs8_minmax_params* params);
| |
// QS8 GEMM micro-kernel pointer type: int8_t input `a` and output `c`, untyped
// const void* weights `w`, and a const union xnn_qs8_conv_minmax_params
// `params` pointer. Returns nothing.
| typedef void (*xnn_qs8_gemm_minmax_ukernel_function)(
| size_t mr,
| size_t nr,
| size_t k,
| const int8_t* a,
| size_t a_stride,
| const void* w,
| int8_t* c,
| size_t cm_stride,
| size_t cn_stride,
| const union xnn_qs8_conv_minmax_params* params);
| |
// QU8 GEMM micro-kernel pointer type: uint8_t input `a` and output `c`, untyped
// const void* weights `w`, and a const union xnn_qu8_conv_minmax_params
// `params` pointer. Returns nothing.
| typedef void (*xnn_qu8_gemm_minmax_ukernel_function)(
| size_t mr,
| size_t nr,
| size_t k,
| const uint8_t* a,
| size_t a_stride,
| const void* w,
| uint8_t* c,
| size_t cm_stride,
| size_t cn_stride,
| const union xnn_qu8_conv_minmax_params* params);
| |
// Type-erased IGEMM micro-kernel pointer type. Differences from the GEMM
// signature: sizes are mr/nr/kc/ks, `a` is a pointer to const void pointers
// (no a_stride), and `a_offset` plus a `zero` pointer are added before `params`.
// The typed IGEMM pointer types in this file share this twelve-parameter shape.
// Returns nothing.
// NOTE(review): presumably `a` is an indirection buffer whose entries are offset
// by `a_offset` unless they equal `zero` — confirm with the kernel implementations.
| typedef void (*xnn_igemm_ukernel_function)(
| size_t mr,
| size_t nr,
| size_t kc,
| size_t ks,
| const void** a,
| const void* w,
| void* c,
| size_t cm_stride,
| size_t cn_stride,
| size_t a_offset,
| const void* zero,
| const void* params);
| |
// F32 IGEMM micro-kernel pointer type without fused activation parameters:
// float-typed version of xnn_igemm_ukernel_function with a const
// union xnn_f32_default_params `params` pointer. Returns nothing.
| typedef void (*xnn_f32_igemm_ukernel_function)(
| size_t mr,
| size_t nr,
| size_t kc,
| size_t ks,
| const float** a,
| const float* w,
| float* c,
| size_t cm_stride,
| size_t cn_stride,
| size_t a_offset,
| const float* zero,
| const union xnn_f32_default_params* params);
| |
// F32 IGEMM micro-kernel pointer type taking ReLU params: identical to
// xnn_f32_igemm_ukernel_function except `params` is a const
// union xnn_f32_relu_params pointer. Returns nothing.
| typedef void (*xnn_f32_igemm_relu_ukernel_function)(
| size_t mr,
| size_t nr,
| size_t kc,
| size_t ks,
| const float** a,
| const float* w,
| float* c,
| size_t cm_stride,
| size_t cn_stride,
| size_t a_offset,
| const float* zero,
| const union xnn_f32_relu_params* params);
| |
// F32 IGEMM micro-kernel pointer type taking min/max params: identical to
// xnn_f32_igemm_ukernel_function except `params` is a const
// union xnn_f32_minmax_params pointer. Returns nothing.
| typedef void (*xnn_f32_igemm_minmax_ukernel_function)(
| size_t mr,
| size_t nr,
| size_t kc,
| size_t ks,
| const float** a,
| const float* w,
| float* c,
| size_t cm_stride,
| size_t cn_stride,
| size_t a_offset,
| const float* zero,
| const union xnn_f32_minmax_params* params);
| |
// QU8 IGEMM micro-kernel pointer type: uint8_t input pointers, `zero` and
// output; untyped const void* weights `w`; `params` is a const
// union xnn_qu8_conv_minmax_params pointer. Returns nothing.
| typedef void (*xnn_qu8_igemm_minmax_ukernel_function)(
| size_t mr,
| size_t nr,
| size_t kc,
| size_t ks,
| const uint8_t** a,
| const void* w,
| uint8_t* c,
| size_t cm_stride,
| size_t cn_stride,
| size_t a_offset,
| const uint8_t* zero,
| const union xnn_qu8_conv_minmax_params* params);
| |
| typedef void (*xnn_qc8_igemm_minmax_ukernel_function)(/* QC8 IGEMM min/max micro-kernel pointer: int8_t a, c and zero, untyped weights w; params is union xnn_qs8_minmax_params, not the conv_minmax union used by the QS8 form. */ |
| size_t mr, |
| size_t nr, |
| size_t kc, |
| size_t ks, |
| const int8_t** a, |
| const void* w, |
| int8_t* c, |
| size_t cm_stride, |
| size_t cn_stride, |
| size_t a_offset, |
| const int8_t* zero, |
| const union xnn_qs8_minmax_params* params); |
| |
| typedef void (*xnn_qs8_igemm_minmax_ukernel_function)(/* QS8 IGEMM min/max micro-kernel pointer: int8_t a, c and zero, untyped weights w, params of union xnn_qs8_conv_minmax_params. */ |
| size_t mr, |
| size_t nr, |
| size_t kc, |
| size_t ks, |
| const int8_t** a, |
| const void* w, |
| int8_t* c, |
| size_t cm_stride, |
| size_t cn_stride, |
| size_t a_offset, |
| const int8_t* zero, |
| const union xnn_qs8_conv_minmax_params* params); |
| |
| typedef void (*xnn_conv_hwc_ukernel_function)(/* Untyped conv_hwc micro-kernel pointer: void data pointers and params; takes output_y_start/output_y_end plus output height and width strides. */ |
| size_t input_height, |
| size_t input_width, |
| size_t output_y_start, |
| size_t output_y_end, |
| const void* input, |
| const void* zero, |
| const void* weights, |
| void* output, |
| size_t input_padding_top, |
| size_t output_channels, |
| size_t output_height_stride, |
| size_t output_width_stride, |
| const void* params); |
| |
| typedef void (*xnn_f32_conv_hwc_ukernel_function)(/* F32 conv_hwc micro-kernel pointer: float input, zero, weights and output; params of union xnn_f32_minmax_params. */ |
| size_t input_height, |
| size_t input_width, |
| size_t output_y_start, |
| size_t output_y_end, |
| const float* input, |
| const float* zero, |
| const float* weights, |
| float* output, |
| size_t input_padding_top, |
| size_t output_channels, |
| size_t output_height_stride, |
| size_t output_width_stride, |
| const union xnn_f32_minmax_params* params); |
| |
| typedef void (*xnn_conv_hwc2chw_ukernel_function)(/* Untyped conv_hwc2chw micro-kernel pointer: like the conv_hwc form except the last stride is output_channel_stride rather than output_width_stride. */ |
| size_t input_height, |
| size_t input_width, |
| size_t output_y_start, |
| size_t output_y_end, |
| const void* input, |
| const void* zero, |
| const void* weights, |
| void* output, |
| size_t input_padding_top, |
| size_t output_channels, |
| size_t output_height_stride, |
| size_t output_channel_stride, |
| const void* params); |
| |
| typedef void (*xnn_f32_conv_hwc2chw_ukernel_function)(/* F32 conv_hwc2chw micro-kernel pointer: float data, output height and channel strides, params of union xnn_f32_minmax_params. */ |
| size_t input_height, |
| size_t input_width, |
| size_t output_y_start, |
| size_t output_y_end, |
| const float* input, |
| const float* zero, |
| const float* weights, |
| float* output, |
| size_t input_padding_top, |
| size_t output_channels, |
| size_t output_height_stride, |
| size_t output_channel_stride, |
| const union xnn_f32_minmax_params* params); |
| |
| typedef void (*xnn_spmm_ukernel_function)(/* Untyped SpMM micro-kernel pointer: void input, weights, output and params; widx_dmap is an int32_t pointer and nidx_nnzmap a uint32_t pointer. */ |
| size_t batch_size, |
| size_t output_channels, |
| const void* input, |
| const void* weights, |
| const int32_t* widx_dmap, |
| const uint32_t* nidx_nnzmap, |
| void* output, |
| size_t output_stride, |
| const void* params); |
| |
| typedef void (*xnn_f16_spmm_minmax_ukernel_function)(/* F16 SpMM min/max micro-kernel pointer: data pointers stay void. NOTE(review): params is union xnn_f16_scaleminmax_params although the name says minmax; confirm this is intended. */ |
| size_t batch_size, |
| size_t output_channels, |
| const void* input, |
| const void* weights, |
| const int32_t* widx_dmap, |
| const uint32_t* nidx_nnzmap, |
| void* output, |
| size_t output_stride, |
| const union xnn_f16_scaleminmax_params* params); |
| |
| typedef void (*xnn_f32_spmm_minmax_ukernel_function)(/* F32 SpMM min/max micro-kernel pointer: float input, weights and output; params of union xnn_f32_minmax_params. */ |
| size_t batch_size, |
| size_t output_channels, |
| const float* input, |
| const float* weights, |
| const int32_t* widx_dmap, |
| const uint32_t* nidx_nnzmap, |
| float* output, |
| size_t output_stride, |
| const union xnn_f32_minmax_params* params); |
| |
| typedef void (*xnn_packx_ukernel_function)(/* Untyped packx micro-kernel pointer: reads x (with x_stride) and writes y, both as void pointers; takes no params argument. */ |
| size_t m, |
| size_t k, |
| const void* x, |
| size_t x_stride, |
| void* y); |
| |
| typedef void (*xnn_x32_packx_ukernel_function)(/* X32 packx micro-kernel pointer: same parameters as the untyped form with x and y as uint32_t pointers. */ |
| size_t m, |
| size_t k, |
| const uint32_t* x, |
| size_t x_stride, |
| uint32_t* y); |
| |
| typedef void (*xnn_fill_ukernel_function)(/* Fill micro-kernel pointer: untyped output with output_stride and a 32-bit fill_pattern passed by value; there is no input pointer. */ |
| size_t rows, |
| size_t channels, |
| void* output, |
| size_t output_stride, |
| const uint32_t fill_pattern); |
| |
| typedef void (*xnn_depthtospace2d_chw2hwc_ukernel_function)(/* Untyped depthtospace2d chw2hwc micro-kernel pointer. NOTE(review): the last parameter is spelled output_channels_stride here but output_channel_stride in the x32 form; names in a typedef do not affect callers. */ |
| size_t output_channels, |
| size_t input_height, |
| size_t input_width, |
| size_t block_size, |
| const void* input, |
| void* output, |
| size_t output_channels_stride); |
| |
| typedef void (*xnn_x32_depthtospace2d_chw2hwc_ukernel_function)(/* X32 depthtospace2d chw2hwc micro-kernel pointer: uint32_t input and output, block_size and output_channel_stride as size_t. */ |
| size_t output_channels, |
| size_t input_height, |
| size_t input_width, |
| size_t block_size, |
| const uint32_t* input, |
| uint32_t* output, |
| size_t output_channel_stride); |
| |
| typedef void (*xnn_pad_ukernel_function)(/* Pad micro-kernel pointer: untyped input and output with their strides, pre_padding and post_padding sizes, and a 32-bit fill_value passed by value. */ |
| size_t rows, |
| size_t channels, |
| size_t pre_padding, |
| size_t post_padding, |
| const void* input, |
| size_t input_stride, |
| void* output, |
| size_t output_stride, |
| const uint32_t fill_value); |
| |
| typedef void (*xnn_unpool_ukernel_function)(/* Untyped unpool micro-kernel pointer: void input, uint32_t index pointer, output as a pointer to void pointers, and a uint32_t f passed by value. */ |
| size_t p, |
| size_t c, |
| uint32_t f, |
| const void* input, |
| const uint32_t* index, |
| void** output); |
| |
| typedef void (*xnn_x32_unpool_ukernel_function)(/* X32 unpool micro-kernel pointer: same parameters as the untyped form with uint32_t input and a pointer to uint32_t output pointers. */ |
| size_t p, |
| size_t c, |
| uint32_t f, |
| const uint32_t* input, |
| const uint32_t* index, |
| uint32_t** output); |
| |
| typedef void (*xnn_zipc_ukernel_function)(/* Untyped zipc micro-kernel pointer: size n, const void input x, void output y. */ |
| size_t n, |
| const void* x, |
| void* y); |
| |
| typedef void (*xnn_x8_zipc_ukernel_function)(/* X8 zipc micro-kernel pointer: size n, uint8_t input x and output y. */ |
| size_t n, |
| const uint8_t* x, |
| uint8_t* y); |
| |
| typedef void (*xnn_x32_zipc_ukernel_function)(/* X32 zipc micro-kernel pointer: size n, uint32_t input x and output y. */ |
| size_t n, |
| const uint32_t* x, |
| uint32_t* y); |
| |
| typedef void (*xnn_zipv_ukernel_function)(/* Untyped zipv micro-kernel pointer: like zipc but with a second size argument m. */ |
| size_t n, |
| size_t m, |
| const void* x, |
| void* y); |
| |
| typedef void (*xnn_x8_zipv_ukernel_function)(/* X8 zipv micro-kernel pointer: sizes n and m, uint8_t input x and output y. */ |
| size_t n, |
| size_t m, |
| const uint8_t* x, |
| uint8_t* y); |
| |
| typedef void (*xnn_x32_zipv_ukernel_function)(/* X32 zipv micro-kernel pointer: sizes n and m, uint32_t input x and output y. */ |
| size_t n, |
| size_t m, |
| const uint32_t* x, |
| uint32_t* y); |
| |
| typedef void (*xnn_x8_lut_ukernel_function)(/* X8 LUT micro-kernel pointer: uint8_t input x, uint8_t output y, and a const uint8_t pointer t (presumably the lookup table; confirm against callers). */ |
| size_t n, |
| const uint8_t* x, |
| uint8_t* y, |
| const uint8_t* t); |
| |
| typedef void (*xnn_dwconv2d_chw_ukernel_function)(/* Untyped dwconv2d_chw micro-kernel pointer: void input, weights, zero, output and params; padding_top is a uint32_t passed by value. */ |
| size_t input_height, |
| size_t input_width, |
| const void* input, |
| const void* weights, |
| const void* zero, |
| void* output, |
| uint32_t padding_top, |
| const void* params); |
| |
| typedef void (*xnn_f32_dwconv2d_chw_ukernel_function)(/* F32 dwconv2d_chw micro-kernel pointer: float data pointers, uint32_t padding_top, params of union xnn_f32_chw_params. */ |
| size_t input_height, |
| size_t input_width, |
| const float* input, |
| const float* weights, |
| const float* zero, |
| float* output, |
| uint32_t padding_top, |
| const union xnn_f32_chw_params* params); |
| |
| typedef void (*xnn_dwconv_unipass_ukernel_function)(/* Untyped dwconv unipass micro-kernel pointer: input is a pointer to void pointers; weights, output, zero and params are void pointers. */ |
| size_t channels, |
| size_t output_width, |
| const void** input, |
| const void* weights, |
| void* output, |
| size_t input_stride, |
| size_t output_increment, |
| size_t input_offset, |
| const void* zero, |
| const void* params); |
| |
| typedef void (*xnn_f32_dwconv_unipass_ukernel_function)(/* F32 dwconv unipass micro-kernel pointer: float data, params of union xnn_f32_default_params. */ |
| size_t channels, |
| size_t output_width, |
| const float** input, |
| const float* weights, |
| float* output, |
| size_t input_stride, |
| size_t output_increment, |
| size_t input_offset, |
| const float* zero, |
| const union xnn_f32_default_params* params); |
| |
| typedef void (*xnn_f32_dwconv_minmax_unipass_ukernel_function)(/* F32 dwconv min/max unipass micro-kernel pointer: float data, params of union xnn_f32_minmax_params. */ |
| size_t channels, |
| size_t output_width, |
| const float** input, |
| const float* weights, |
| float* output, |
| size_t input_stride, |
| size_t output_increment, |
| size_t input_offset, |
| const float* zero, |
| const union xnn_f32_minmax_params* params); |
| |
| typedef void (*xnn_f16_dwconv_minmax_unipass_ukernel_function)(/* F16 dwconv min/max unipass micro-kernel pointer: data pointers stay void, params of union xnn_f16_minmax_params. */ |
| size_t channels, |
| size_t output_width, |
| const void** input, |
| const void* weights, |
| void* output, |
| size_t input_stride, |
| size_t output_increment, |
| size_t input_offset, |
| const void* zero, |
| const union xnn_f16_minmax_params* params); |
| |
| typedef void (*xnn_qc8_dwconv_minmax_unipass_ukernel_function)(/* QC8 dwconv min/max unipass micro-kernel pointer: int8_t input, output and zero, untyped weights; params is union xnn_qs8_minmax_params. */ |
| size_t channels, |
| size_t output_width, |
| const int8_t** input, |
| const void* weights, |
| int8_t* output, |
| size_t input_stride, |
| size_t output_increment, |
| size_t input_offset, |
| const int8_t* zero, |
| const union xnn_qs8_minmax_params* params); |
| |
| typedef void (*xnn_qs8_dwconv_minmax_unipass_ukernel_function)(/* QS8 dwconv min/max unipass micro-kernel pointer: int8_t input, output and zero, untyped weights, params of union xnn_qs8_conv_minmax_params. */ |
| size_t channels, |
| size_t output_width, |
| const int8_t** input, |
| const void* weights, |
| int8_t* output, |
| size_t input_stride, |
| size_t output_increment, |
| size_t input_offset, |
| const int8_t* zero, |
| const union xnn_qs8_conv_minmax_params* params); |
| |
| typedef void (*xnn_qu8_dwconv_minmax_unipass_ukernel_function)(/* QU8 dwconv min/max unipass micro-kernel pointer: uint8_t input, output and zero, untyped weights, params of union xnn_qu8_conv_minmax_params. */ |
| size_t channels, |
| size_t output_width, |
| const uint8_t** input, |
| const void* weights, |
| uint8_t* output, |
| size_t input_stride, |
| size_t output_increment, |
| size_t input_offset, |
| const uint8_t* zero, |
| const union xnn_qu8_conv_minmax_params* params); |
| |
| typedef void (*xnn_dwconv_multipass_ukernel_function)(/* Untyped dwconv multipass micro-kernel pointer: the unipass parameter list plus a writable void buffer placed before output. */ |
| size_t channels, |
| size_t output_width, |
| const void** input, |
| const void* weights, |
| void* buffer, |
| void* output, |
| size_t input_stride, |
| size_t output_increment, |
| size_t input_offset, |
| const void* zero, |
| const void* params); |
| |
| typedef void (*xnn_f32_ibilinear_ukernel_function)(/* F32 ibilinear micro-kernel pointer: pointer to float input pointers, float weights and output, output_increment; no params argument. */ |
| size_t output_pixels, |
| size_t channels, |
| const float** input, |
| size_t input_offset, |
| const float* weights, |
| float* output, |
| size_t output_increment); |
| |
| typedef void (*xnn_s8_ibilinear_ukernel_function)(/* S8 ibilinear micro-kernel pointer: int8_t input and output with int16_t weights. */ |
| size_t output_pixels, |
| size_t channels, |
| const int8_t** input, |
| size_t input_offset, |
| const int16_t* weights, |
| int8_t* output, |
| size_t output_increment); |
| |
| typedef void (*xnn_u8_ibilinear_ukernel_function)(/* U8 ibilinear micro-kernel pointer: uint8_t input and output with int16_t weights. */ |
| size_t output_pixels, |
| size_t channels, |
| const uint8_t** input, |
| size_t input_offset, |
| const int16_t* weights, |
| uint8_t* output, |
| size_t output_increment); |
| |
| typedef void (*xnn_ibilinear_ukernel_function)(/* Untyped ibilinear micro-kernel pointer: same parameter list as the f32/s8/u8 forms with void data pointers. */ |
| size_t output_pixels, |
| size_t channels, |
| const void** input, |
| size_t input_offset, |
| const void* weights, |
| void* output, |
| size_t output_increment); |
| |
| typedef void (*xnn_f32_ibilinear_chw_ukernel_function)(/* F32 ibilinear_chw micro-kernel pointer: float data; the final argument is input_increment, where the non-chw form takes output_increment. */ |
| size_t output_pixels, |
| size_t channels, |
| const float** input, |
| size_t input_offset, |
| const float* weights, |
| float* output, |
| size_t input_increment); |
| |
| typedef void (*xnn_ibilinear_chw_ukernel_function)(/* Untyped ibilinear_chw micro-kernel pointer: void data pointers, final argument input_increment. */ |
| size_t output_pixels, |
| size_t channels, |
| const void** input, |
| size_t input_offset, |
| const void* weights, |
| void* output, |
| size_t input_increment); |
| |
| typedef void (*xnn_gavgpool_unipass_ukernel_function)(/* Untyped gavgpool unipass micro-kernel pointer: void input (with input_stride), zero, output and params. */ |
| size_t rows, |
| size_t channels, |
| const void* input, |
| size_t input_stride, |
| const void* zero, |
| void* output, |
| const void* params); |
| |
| typedef void (*xnn_f16_gavgpool_minmax_unipass_ukernel_function)(/* F16 gavgpool min/max unipass micro-kernel pointer: data pointers stay void, params of union xnn_f16_scaleminmax_params. */ |
| size_t rows, |
| size_t channels, |
| const void* input, |
| size_t input_stride, |
| const void* zero, |
| void* output, |
| const union xnn_f16_scaleminmax_params* params); |
| |
| typedef void (*xnn_f32_gavgpool_minmax_unipass_ukernel_function)(/* F32 gavgpool min/max unipass micro-kernel pointer: float input, zero and output; params of union xnn_f32_scaleminmax_params. */ |
| size_t rows, |
| size_t channels, |
| const float* input, |
| size_t input_stride, |
| const float* zero, |
| float* output, |
| const union xnn_f32_scaleminmax_params* params); |
| |
| typedef void (*xnn_qu8_gavgpool_minmax_unipass_ukernel_function)(/* QU8 gavgpool min/max unipass micro-kernel pointer: uint8_t input, zero and output; params of union xnn_qu8_avgpool_minmax_params. */ |
| size_t rows, |
| size_t channels, |
| const uint8_t* input, |
| size_t input_stride, |
| const uint8_t* zero, |
| uint8_t* output, |
| const union xnn_qu8_avgpool_minmax_params* params); |
| |
| typedef void (*xnn_qs8_gavgpool_minmax_unipass_ukernel_function)(/* QS8 gavgpool min/max unipass micro-kernel pointer: int8_t input, zero and output; params of union xnn_qs8_avgpool_minmax_params. */ |
| size_t rows, |
| size_t channels, |
| const int8_t* input, |
| size_t input_stride, |
| const int8_t* zero, |
| int8_t* output, |
| const union xnn_qs8_avgpool_minmax_params* params); |
| |
| typedef void (*xnn_gavgpool_multipass_ukernel_function)(/* Untyped gavgpool multipass micro-kernel pointer: the unipass parameter list plus a writable void buffer before output. */ |
| size_t rows, |
| size_t channels, |
| const void* input, |
| size_t input_stride, |
| const void* zero, |
| void* buffer, |
| void* output, |
| const void* params); |
| |
| typedef void (*xnn_f16_gavgpool_minmax_multipass_ukernel_function)(/* F16 gavgpool min/max multipass micro-kernel pointer: void data and buffer pointers, params of union xnn_f16_scaleminmax_params. */ |
| size_t rows, |
| size_t channels, |
| const void* input, |
| size_t input_stride, |
| const void* zero, |
| void* buffer, |
| void* output, |
| const union xnn_f16_scaleminmax_params* params); |
| |
| typedef void (*xnn_f32_gavgpool_minmax_multipass_ukernel_function)(/* F32 gavgpool min/max multipass micro-kernel pointer: float input, zero, buffer and output; params of union xnn_f32_scaleminmax_params. */ |
| size_t rows, |
| size_t channels, |
| const float* input, |
| size_t input_stride, |
| const float* zero, |
| float* buffer, |
| float* output, |
| const union xnn_f32_scaleminmax_params* params); |
| |
| typedef void (*xnn_qu8_gavgpool_minmax_multipass_ukernel_function)(/* QU8 gavgpool min/max multipass micro-kernel pointer: uint8_t input, zero and output, but the buffer is int32_t; params of union xnn_qu8_avgpool_minmax_params. */ |
| size_t rows, |
| size_t channels, |
| const uint8_t* input, |
| size_t input_stride, |
| const uint8_t* zero, |
| int32_t* buffer, |
| uint8_t* output, |
| const union xnn_qu8_avgpool_minmax_params* params); |
| |
| typedef void (*xnn_qs8_gavgpool_minmax_multipass_ukernel_function)(/* QS8 gavgpool min/max multipass micro-kernel pointer: int8_t input, zero and output, but the buffer is int32_t; params of union xnn_qs8_avgpool_minmax_params. */ |
| size_t rows, |
| size_t channels, |
| const int8_t* input, |
| size_t input_stride, |
| const int8_t* zero, |
| int32_t* buffer, |
| int8_t* output, |
| const union xnn_qs8_avgpool_minmax_params* params); |
| |
| typedef void (*xnn_gavgpool_cw_ukernel_function)(/* Generic gavgpool_cw micro-kernel pointer: only params is void. NOTE(review): input and output are float pointers here, unlike the other untyped typedefs nearby; confirm this is intended. */ |
| size_t elements, |
| size_t channels, |
| const float* input, |
| float* output, |
| const void* params); |
| |
| typedef void (*xnn_f32_gavgpool_cw_ukernel_function)(/* F32 gavgpool_cw micro-kernel pointer: float input and output, params of union xnn_f32_gavgpool_params. */ |
| size_t elements, |
| size_t channels, |
| const float* input, |
| float* output, |
| const union xnn_f32_gavgpool_params* params); |
| |
| typedef void (*xnn_avgpool_unipass_ukernel_function)(/* Untyped avgpool unipass micro-kernel pointer: input is a pointer to void pointers with input_offset; zero, output and params are void pointers. */ |
| size_t output_pixels, |
| size_t kernel_elements, |
| size_t channels, |
| const void** input, |
| size_t input_offset, |
| const void* zero, |
| void* output, |
| size_t input_increment, |
| size_t output_increment, |
| const void* params); |
| |
| typedef void (*xnn_f32_avgpool_minmax_unipass_ukernel_function)(/* F32 avgpool min/max unipass micro-kernel pointer: float data, params of union xnn_f32_scaleminmax_params. */ |
| size_t output_pixels, |
| size_t kernel_elements, |
| size_t channels, |
| const float** input, |
| size_t input_offset, |
| const float* zero, |
| float* output, |
| size_t input_increment, |
| size_t output_increment, |
| const union xnn_f32_scaleminmax_params* params); |
| |
| typedef void (*xnn_qu8_avgpool_minmax_unipass_ukernel_function)(/* QU8 avgpool min/max unipass micro-kernel pointer: uint8_t data, params of union xnn_qu8_avgpool_minmax_params. */ |
| size_t output_pixels, |
| size_t kernel_elements, |
| size_t channels, |
| const uint8_t** input, |
| size_t input_offset, |
| const uint8_t* zero, |
| uint8_t* output, |
| size_t input_increment, |
| size_t output_increment, |
| const union xnn_qu8_avgpool_minmax_params* params); |
| |
| typedef void (*xnn_avgpool_multipass_ukernel_function)(/* Untyped avgpool multipass micro-kernel pointer: the unipass parameter list plus a writable void buffer before output. */ |
| size_t output_pixels, |
| size_t kernel_elements, |
| size_t channels, |
| const void** input, |
| size_t input_offset, |
| const void* zero, |
| void* buffer, |
| void* output, |
| size_t input_increment, |
| size_t output_increment, |
| const void* params); |
| |
| typedef void (*xnn_f32_avgpool_minmax_multipass_ukernel_function)(/* F32 avgpool min/max multipass micro-kernel pointer: float data and buffer, params of union xnn_f32_scaleminmax_params. */ |
| size_t output_pixels, |
| size_t kernel_elements, |
| size_t channels, |
| const float** input, |
| size_t input_offset, |
| const float* zero, |
| float* buffer, |
| float* output, |
| size_t input_increment, |
| size_t output_increment, |
| const union xnn_f32_scaleminmax_params* params); |
| |
| typedef void (*xnn_qu8_avgpool_minmax_multipass_ukernel_function)(/* QU8 avgpool min/max multipass micro-kernel pointer: uint8_t data with an int32_t buffer, params of union xnn_qu8_avgpool_minmax_params. */ |
| size_t output_pixels, |
| size_t kernel_elements, |
| size_t channels, |
| const uint8_t** input, |
| size_t input_offset, |
| const uint8_t* zero, |
| int32_t* buffer, |
| uint8_t* output, |
| size_t input_increment, |
| size_t output_increment, |
| const union xnn_qu8_avgpool_minmax_params* params); |
| |
| typedef void (*xnn_pavgpool_unipass_ukernel_function)(/* Untyped pavgpool unipass micro-kernel pointer: the avgpool unipass parameter list plus a const void multiplier pointer after zero. */ |
| size_t output_pixels, |
| size_t kernel_elements, |
| size_t channels, |
| const void** input, |
| size_t input_offset, |
| const void* zero, |
| const void* multiplier, |
| void* output, |
| size_t input_increment, |
| size_t output_increment, |
| const void* params); |
| |
| typedef void (*xnn_f32_pavgpool_minmax_unipass_ukernel_function)(/* F32 pavgpool min/max unipass micro-kernel pointer: float data and multiplier; params is union xnn_f32_minmax_params (the avgpool forms use scaleminmax). */ |
| size_t output_pixels, |
| size_t kernel_elements, |
| size_t channels, |
| const float** input, |
| size_t input_offset, |
| const float* zero, |
| const float* multiplier, |
| float* output, |
| size_t input_increment, |
| size_t output_increment, |
| const union xnn_f32_minmax_params* params); |
| |
| typedef void (*xnn_pavgpool_multipass_ukernel_function)(/* Untyped pavgpool multipass micro-kernel pointer: the pavgpool unipass parameter list plus a writable void buffer before output. */ |
| size_t output_pixels, |
| size_t kernel_elements, |
| size_t channels, |
| const void** input, |
| size_t input_offset, |
| const void* zero, |
| const void* multiplier, |
| void* buffer, |
| void* output, |
| size_t input_increment, |
| size_t output_increment, |
| const void* params); |
| |
| typedef void (*xnn_f32_pavgpool_minmax_multipass_ukernel_function)(/* F32 pavgpool min/max multipass micro-kernel pointer: float data, multiplier and buffer; params of union xnn_f32_minmax_params. */ |
| size_t output_pixels, |
| size_t kernel_elements, |
| size_t channels, |
| const float** input, |
| size_t input_offset, |
| const float* zero, |
| const float* multiplier, |
| float* buffer, |
| float* output, |
| size_t input_increment, |
| size_t output_increment, |
| const union xnn_f32_minmax_params* params); |
| |
| typedef void (*xnn_maxpool_ukernel_function)(/* Untyped maxpool micro-kernel pointer: input is a pointer to void pointers with input_offset; output and params are void pointers; no zero argument. */ |
| size_t output_pixels, |
| size_t kernel_elements, |
| size_t channels, |
| const void** input, |
| size_t input_offset, |
| void* output, |
| size_t input_increment, |
| size_t output_increment, |
| const void* params); |
| |
| typedef void (*xnn_f16_maxpool_ukernel_function)(/* F16 maxpool micro-kernel pointer: data pointers stay void, params of union xnn_f16_minmax_params. */ |
| size_t output_pixels, |
| size_t kernel_elements, |
| size_t channels, |
| const void** input, |
| size_t input_offset, |
| void* output, |
| size_t input_increment, |
| size_t output_increment, |
| const union xnn_f16_minmax_params* params); |
| |
| typedef void (*xnn_f32_maxpool_ukernel_function)(/* F32 maxpool micro-kernel pointer: float input pointers and output, params of union xnn_f32_minmax_params. */ |
| size_t output_pixels, |
| size_t kernel_elements, |
| size_t channels, |
| const float** input, |
| size_t input_offset, |
| float* output, |
| size_t input_increment, |
| size_t output_increment, |
| const union xnn_f32_minmax_params* params); |
| |
| typedef void (*xnn_s8_maxpool_ukernel_function)(/* S8 maxpool micro-kernel pointer: int8_t input pointers and output, params of union xnn_s8_minmax_params. */ |
| size_t output_pixels, |
| size_t kernel_elements, |
| size_t channels, |
| const int8_t** input, |
| size_t input_offset, |
| int8_t* output, |
| size_t input_increment, |
| size_t output_increment, |
| const union xnn_s8_minmax_params* params); |
| |
| typedef void (*xnn_u8_maxpool_ukernel_function)(/* U8 maxpool micro-kernel pointer: uint8_t input pointers and output, params of union xnn_u8_minmax_params. */ |
| size_t output_pixels, |
| size_t kernel_elements, |
| size_t channels, |
| const uint8_t** input, |
| size_t input_offset, |
| uint8_t* output, |
| size_t input_increment, |
| size_t output_increment, |
| const union xnn_u8_minmax_params* params); |
| |
| typedef void (*xnn_argmaxpool_unipass_ukernel_function)(/* Untyped argmaxpool unipass micro-kernel pointer: writes through both a void output and a uint32_t index pointer; takes no params argument. */ |
| size_t output_pixels, |
| size_t kernel_elements, |
| size_t channels, |
| const void** input, |
| size_t input_offset, |
| void* output, |
| uint32_t* index, |
| size_t input_increment, |
| size_t output_increment); |
| |
| typedef void (*xnn_f32_argmaxpool_unipass_ukernel_function)(/* F32 argmaxpool unipass micro-kernel pointer: float input pointers and output, uint32_t index output. */ |
| size_t output_pixels, |
| size_t kernel_elements, |
| size_t channels, |
| const float** input, |
| size_t input_offset, |
| float* output, |
| uint32_t* index, |
| size_t input_increment, |
| size_t output_increment); |
| |
| typedef void (*xnn_argmaxpool_multipass_ukernel_function)(/* Untyped argmaxpool multipass micro-kernel pointer: the unipass parameter list plus a void accumulation_buffer and a uint32_t index_buffer before output. */ |
| size_t output_pixels, |
| size_t kernel_elements, |
| size_t channels, |
| const void** input, |
| size_t input_offset, |
| void* accumulation_buffer, |
| uint32_t* index_buffer, |
| void* output, |
| uint32_t* index, |
| size_t input_increment, |
| size_t output_increment); |
| |
| typedef void (*xnn_f32_argmaxpool_multipass_ukernel_function)(/* F32 argmaxpool multipass micro-kernel pointer: float input, accumulation_buffer and output; uint32_t index_buffer and index. */ |
| size_t output_pixels, |
| size_t kernel_elements, |
| size_t channels, |
| const float** input, |
| size_t input_offset, |
| float* accumulation_buffer, |
| uint32_t* index_buffer, |
| float* output, |
| uint32_t* index, |
| size_t input_increment, |
| size_t output_increment); |
| |
| typedef void (*xnn_univector_ukernel_function)(/* Untyped single-input vector micro-kernel pointer: size n, void input x, void output y, void params. Same parameter list as xnn_vunary_ukernel_function. */ |
| size_t n, |
| const void* x, |
| void* y, |
| const void* params); |
| |
| typedef void (*xnn_f16_vclamp_ukernel_function)(/* F16 vclamp micro-kernel pointer: void x and y, params of union xnn_f16_minmax_params. */ |
| size_t n, |
| const void* x, |
| void* y, |
| const union xnn_f16_minmax_params* params); |
| |
| typedef void (*xnn_f32_vclamp_ukernel_function)(/* F32 vclamp micro-kernel pointer: float x and y, params of union xnn_f32_minmax_params. */ |
| size_t n, |
| const float* x, |
| float* y, |
| const union xnn_f32_minmax_params* params); |
| |
| typedef void (*xnn_s8_vclamp_ukernel_function)(/* S8 vclamp micro-kernel pointer: int8_t x and y, params of union xnn_s8_minmax_params. */ |
| size_t n, |
| const int8_t* x, |
| int8_t* y, |
| const union xnn_s8_minmax_params* params); |
| |
| typedef void (*xnn_u8_vclamp_ukernel_function)(/* U8 vclamp micro-kernel pointer: uint8_t x and y, params of union xnn_u8_minmax_params. */ |
| size_t n, |
| const uint8_t* x, |
| uint8_t* y, |
| const union xnn_u8_minmax_params* params); |
| |
| typedef void (*xnn_f32_vrelu_ukernel_function)(/* F32 vrelu micro-kernel pointer: float x and y, params of union xnn_f32_relu_params. */ |
| size_t n, |
| const float* x, |
| float* y, |
| const union xnn_f32_relu_params* params); |
| |
| typedef void (*xnn_f16_vhswish_ukernel_function)(/* F16 vhswish micro-kernel pointer: void x and y, params of union xnn_f16_hswish_params. */ |
| size_t n, |
| const void* x, |
| void* y, |
| const union xnn_f16_hswish_params* params); |
| |
| typedef void (*xnn_f32_vabs_ukernel_function)(/* F32 vabs micro-kernel pointer: float x and y, params of union xnn_f32_abs_params. */ |
| size_t n, |
| const float* x, |
| float* y, |
| const union xnn_f32_abs_params* params); |
| |
| typedef void (*xnn_f32_vhswish_ukernel_function)(/* F32 vhswish micro-kernel pointer: float x and y, params of union xnn_f32_hswish_params. */ |
| size_t n, |
| const float* x, |
| float* y, |
| const union xnn_f32_hswish_params* params); |
| |
| typedef void (*xnn_f32_vlrelu_ukernel_function)(/* F32 vlrelu micro-kernel pointer: float x and y, params of union xnn_f32_lrelu_params. */ |
| size_t n, |
| const float* x, |
| float* y, |
| const union xnn_f32_lrelu_params* params); |
| |
| typedef void (*xnn_f32_vneg_ukernel_function)(/* F32 vneg micro-kernel pointer: float x and y, params of union xnn_f32_neg_params. */ |
| size_t n, |
| const float* x, |
| float* y, |
| const union xnn_f32_neg_params* params); |
| |
| typedef void (*xnn_f32_vround_ukernel_function)(/* F32 vround micro-kernel pointer: float x and y, params of union xnn_f32_rnd_params. */ |
| size_t n, |
| const float* x, |
| float* y, |
| const union xnn_f32_rnd_params* params); |
| |
| typedef void (*xnn_f32_vsigmoid_ukernel_function)(/* F32 vsigmoid micro-kernel pointer: float x and y, params of union xnn_f32_sigmoid_params. */ |
| size_t n, |
| const float* x, |
| float* y, |
| const union xnn_f32_sigmoid_params* params); |
| |
| typedef void (*xnn_rmax_ukernel_function)(/* Untyped rmax micro-kernel pointer: size n, const void input x, writable void y; no params argument. */ |
| size_t n, |
| const void* x, |
| void* y); |
| |
| typedef void (*xnn_u8_rmax_ukernel_function)(/* U8 rmax micro-kernel pointer: size n, const uint8_t input x, writable uint8_t y. */ |
| size_t n, |
| const uint8_t* x, |
| uint8_t* y); |
| |
| typedef void (*xnn_f32_rmax_ukernel_function)(/* F32 rmax micro-kernel pointer: size n, const float input x, writable float y. */ |
| size_t n, |
| const float* x, |
| float* y); |
| |
| typedef void (*xnn_u8_lut32norm_ukernel_function)(/* U8 lut32norm micro-kernel pointer: uint8_t input x, a const uint32_t pointer t, and uint8_t output y. Note that t precedes y here, unlike xnn_x8_lut_ukernel_function. */ |
| size_t n, |
| const uint8_t* x, |
| const uint32_t* t, |
| uint8_t* y); |
| |
| typedef void (*xnn_vadd_ukernel_function)(/* Untyped vadd micro-kernel pointer: two void inputs a and b, void output y, void params. Same parameter list as xnn_vbinary_ukernel_function. */ |
| size_t n, |
| const void* a, |
| const void* b, |
| void* y, |
| const void* params); |
| |
| typedef void (*xnn_qu8_vaddsub_minmax_ukernel_function)(/* QU8 vaddsub min/max micro-kernel pointer: uint8_t input_x, input_y and output; params of union xnn_qu8_addsub_minmax_params. */ |
| size_t n, |
| const uint8_t* input_x, |
| const uint8_t* input_y, |
| uint8_t* output, |
| const union xnn_qu8_addsub_minmax_params* params); |
| |
| typedef void (*xnn_qs8_vaddsub_minmax_ukernel_function)(/* QS8 vaddsub min/max micro-kernel pointer: int8_t input_x, input_y and output; params of union xnn_qs8_addsub_minmax_params. */ |
| size_t n, |
| const int8_t* input_x, |
| const int8_t* input_y, |
| int8_t* output, |
| const union xnn_qs8_addsub_minmax_params* params); |
| |
| typedef void (*xnn_qu8_vmul_minmax_ukernel_function)(/* QU8 vmul min/max micro-kernel pointer: uint8_t input_x, input_y and output; params of union xnn_qu8_mul_minmax_params. */ |
| size_t n, |
| const uint8_t* input_x, |
| const uint8_t* input_y, |
| uint8_t* output, |
| const union xnn_qu8_mul_minmax_params* params); |
| |
| typedef void (*xnn_qs8_vmul_minmax_ukernel_function)(/* QS8 vmul min/max micro-kernel pointer: int8_t input_x, input_y and output; params of union xnn_qs8_mul_minmax_params. */ |
| size_t n, |
| const int8_t* input_x, |
| const int8_t* input_y, |
| int8_t* output, |
| const union xnn_qs8_mul_minmax_params* params); |
| |
| typedef void (*xnn_f32_velu_ukernel_function)(/* F32 velu micro-kernel pointer: float x and y, params of union xnn_f32_elu_params. */ |
| size_t n, |
| const float* x, |
| float* y, |
| const union xnn_f32_elu_params* params); |
| |
| |
| typedef void (*xnn_f32_vsqr_ukernel_function)(/* F32 vsqr micro-kernel pointer: float x and y; uses the shared union xnn_f32_default_params rather than an operation-specific params union. */ |
| size_t n, |
| const float* x, |
| float* y, |
| const union xnn_f32_default_params* params); |
| |
| typedef void (*xnn_f32_vsqrt_ukernel_function)(/* F32 vsqrt micro-kernel pointer: float x and y, params of union xnn_f32_sqrt_params. */ |
| size_t n, |
| const float* x, |
| float* y, |
| const union xnn_f32_sqrt_params* params); |
| |
| typedef void (*xnn_vbinary_ukernel_function)(/* Untyped two-input vector micro-kernel pointer: void inputs a and b, void output y, void params. */ |
| size_t n, |
| const void* a, |
| const void* b, |
| void* y, |
| const void* params); |
| |
| typedef void (*xnn_f16_vbinary_ukernel_function)(/* F16 vbinary micro-kernel pointer: data pointers stay void, params of union xnn_f16_default_params. */ |
| size_t n, |
| const void* a, |
| const void* b, |
| void* y, |
| const union xnn_f16_default_params* params); |
| |
| typedef void (*xnn_f16_vbinary_minmax_ukernel_function)(/* F16 vbinary min/max micro-kernel pointer: data pointers stay void, params of union xnn_f16_minmax_params. */ |
| size_t n, |
| const void* a, |
| const void* b, |
| void* y, |
| const union xnn_f16_minmax_params* params); |
| |
| typedef void (*xnn_f32_vbinary_ukernel_function)(/* F32 vbinary micro-kernel pointer: float a, b and y; params of union xnn_f32_default_params. */ |
| size_t n, |
| const float* a, |
| const float* b, |
| float* y, |
| const union xnn_f32_default_params* params); |
| |
| typedef void (*xnn_f32_vbinary_minmax_ukernel_function)(/* F32 vbinary min/max micro-kernel pointer: float a, b and y; params of union xnn_f32_minmax_params. */ |
| size_t n, |
| const float* a, |
| const float* b, |
| float* y, |
| const union xnn_f32_minmax_params* params); |
| |
| typedef void (*xnn_f32_vbinary_relu_ukernel_function)(/* F32 vbinary relu micro-kernel pointer: float a, b and y; params of union xnn_f32_relu_params. */ |
| size_t n, |
| const float* a, |
| const float* b, |
| float* y, |
| const union xnn_f32_relu_params* params); |
| |
| typedef void (*xnn_vunary_ukernel_function)(/* Untyped vunary micro-kernel pointer: size n, void input x, void output y, void params. */ |
| size_t n, |
| const void* x, |
| void* y, |
| const void* params); |
| |
| typedef void (*xnn_s8_vunary_ukernel_function)(/* S8 vunary micro-kernel pointer: int8_t x and y; params is left as a void pointer. */ |
| size_t n, |
| const int8_t* x, |
| int8_t* y, |
| const void* params); |
| |
| typedef void (*xnn_u8_vunary_ukernel_function)(/* U8 vunary micro-kernel pointer: uint8_t x and y; params is left as a void pointer. */ |
| size_t n, |
| const uint8_t* x, |
| uint8_t* y, |
| const void* params); |
| |
| typedef void (*xnn_f16_vunary_ukernel_function)(/* F16 vunary micro-kernel pointer: params is a void pointer. NOTE(review): x and y are uint16_t pointers here, while the other f16 typedefs nearby use void pointers; confirm the difference is intended. */ |
| size_t n, |
| const uint16_t* x, |
| uint16_t* y, |
| const void* params); |
| |
| typedef void (*xnn_f32_vunary_ukernel_function)(/* F32 vunary micro-kernel pointer: float x and y; params is left as a void pointer. */ |
| size_t n, |
| const float* x, |
| float* y, |
| const void* params); |
| |
| typedef void (*xnn_f16_f32_vcvt_ukernel_function)(/* F16-to-F32 vcvt micro-kernel pointer: void input, float output, params of union xnn_f16_f32_cvt_params. */ |
| size_t n, |
| const void* input, |
| float* output, |
| const union xnn_f16_f32_cvt_params* params); |
| |
| typedef void (*xnn_f32_f16_vcvt_ukernel_function)(/* F32-to-F16 vcvt micro-kernel pointer: float input, void output, params of union xnn_f32_f16_cvt_params. */ |
| size_t n, |
| const float* input, |
| void* output, |
| const union xnn_f32_f16_cvt_params* params); |
| |
| typedef void (*xnn_f32_qs8_vcvt_ukernel_function)(/* F32-to-QS8 vcvt micro-kernel pointer: float input, int8_t output, params of union xnn_f32_qs8_cvt_params. */ |
| size_t n, |
| const float* input, |
| int8_t* output, |
| const union xnn_f32_qs8_cvt_params* params); |
| |
| typedef void (*xnn_f32_qu8_vcvt_ukernel_function)(/* F32-to-QU8 vcvt micro-kernel pointer: float input, uint8_t output, params of union xnn_f32_qu8_cvt_params. */ |
| size_t n, |
| const float* input, |
| uint8_t* output, |
| const union xnn_f32_qu8_cvt_params* params); |
| |
| typedef void (*xnn_qs8_f32_vcvt_ukernel_function)(/* QS8-to-F32 vcvt micro-kernel pointer: int8_t input, float output, params of union xnn_qs8_f32_cvt_params. */ |
| size_t n, |
| const int8_t* input, |
| float* output, |
| const union xnn_qs8_f32_cvt_params* params); |
| |
| typedef void (*xnn_qu8_f32_vcvt_ukernel_function)(/* QU8-to-F32 vcvt micro-kernel pointer: uint8_t input, float output, params of union xnn_qu8_f32_cvt_params. */ |
| size_t n, |
| const uint8_t* input, |
| float* output, |
| const union xnn_qu8_f32_cvt_params* params); |
| |
| typedef void (*xnn_vmulcaddc_ukernel_function)(/* Untyped vmulcaddc micro-kernel pointer: void x (with x_stride), w, y (with y_stride) and params. */ |
| size_t m, |
| size_t c, |
| const void* x, |
| size_t x_stride, |
| const void* w, |
| void* y, |
| size_t y_stride, |
| const void* params); |
| |
| typedef void (*xnn_f16_vmulcaddc_ukernel_function)(/* F16 vmulcaddc micro-kernel pointer: data pointers stay void, params of union xnn_f16_minmax_params. */ |
| size_t m, |
| size_t c, |
| const void* x, |
| size_t x_stride, |
| const void* w, |
| void* y, |
| size_t y_stride, |
| const union xnn_f16_minmax_params* params); |
| |
| typedef void (*xnn_f32_vmulcaddc_ukernel_function)(/* F32 vmulcaddc micro-kernel pointer: float x, w and y; params of union xnn_f32_minmax_params. */ |
| size_t m, |
| size_t c, |
| const float* x, |
| size_t x_stride, |
| const float* w, |
| float* y, |
| size_t y_stride, |
| const union xnn_f32_minmax_params* params); |
| |
| typedef void (*xnn_prelu_ukernel_function)(/* Untyped prelu micro-kernel pointer: void x (with x_stride), w and y (with y_stride); no params argument. */ |
| size_t mr, |
| size_t n, |
| const void* x, |
| size_t x_stride, |
| const void* w, |
| void* y, |
| size_t y_stride); |
| |
| typedef void (*xnn_f16_prelu_ukernel_function)(/* F16 prelu micro-kernel pointer: parameter list is identical to the untyped xnn_prelu_ukernel_function (all data pointers are void). */ |
| size_t mr, |
| size_t n, |
| const void* x, |
| size_t x_stride, |
| const void* w, |
| void* y, |
| size_t y_stride); |
| |
| typedef void (*xnn_f32_prelu_ukernel_function)(/* F32 prelu micro-kernel pointer: float x, w and y with x_stride and y_stride; no params argument. */ |
| size_t mr, |
| size_t n, |
| const float* x, |
| size_t x_stride, |
| const float* w, |
| float* y, |
| size_t y_stride); |
| |
| typedef void (*xnn_f32_raddexpminusmax_ukernel_function)(/* F32 raddexpminusmax micro-kernel pointer: const float input, writable float sum, and max passed by value as a float. */ |
| size_t n, |
| const float* input, |
| float* sum, |
| float max); |
| |
| typedef void (*xnn_f32_raddstoreexpminusmax_ukernel_function)(/* F32 raddstoreexpminusmax micro-kernel pointer: max is passed by const float pointer here (by value in raddexpminusmax); writable output and sum; params of union xnn_f32_expminus_params. */ |
| size_t n, |
| const float* input, |
| const float* max, |
| float* output, |
| float* sum, |
| const union xnn_f32_expminus_params* params); |
| |
| typedef void (*xnn_f32_vscaleexpminusmax_ukernel_function)(/* F32 vscaleexpminusmax micro-kernel pointer: const float input, writable float output, max and scale passed by value. */ |
| size_t n, |
| const float* input, |
| float* output, |
| float max, |
| float scale); |
| |
| typedef void (*xnn_f32_vscale_ukernel_function)(/* F32 vscale micro-kernel pointer: const float x, writable float y, and a float c passed by value. */ |
| size_t n, |
| const float* x, |
| float* y, |
| float c); |
| |
| // Reduce-Add Extended ("mantissa" + "exponent") Exponentials |
| typedef void (*xnn_f32_raddextexp_ukernel_function)(/* Pointer type: size n, const float input, writable float sum. NOTE(review): sum presumably holds the mantissa/exponent pair named above; confirm its length against callers. */ |
| size_t n, |
| const float* input, |
| float* sum); |
| |
| // Vector Scale Extended ("mantissa" + "exponent") Exponentials |
| typedef void (*xnn_f32_vscaleextexp_ukernel_function)(/* Pointer type: const float input, writable float output, and the scale passed by value as two floats, scale_mantissa and scale_exponent. */ |
| size_t n, |
| const float* input, |
| float* output, |
| float scale_mantissa, |
| float scale_exponent); |
| |
| typedef void (*xnn_init_f16_f32_cvt_params_fn)(/* Pointer type for initializers of union xnn_f16_f32_cvt_params; the params array is the only argument. */ |
| union xnn_f16_f32_cvt_params params[XNN_MIN_ELEMENTS(1)]); |
| |
| typedef void (*xnn_init_f32_f16_cvt_params_fn)(/* Pointer type for initializers of union xnn_f32_f16_cvt_params; the params array is the only argument. */ |
| union xnn_f32_f16_cvt_params params[XNN_MIN_ELEMENTS(1)]); |
| |
| typedef void (*xnn_init_f32_qs8_cvt_params_fn)(/* Pointer type for initializers of union xnn_f32_qs8_cvt_params: float scale plus int8_t output_zero_point, output_min and output_max. */ |
| union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)], |
| float scale, |
| int8_t output_zero_point, |
| int8_t output_min, |
| int8_t output_max); |
| |
| typedef void (*xnn_init_f32_qu8_cvt_params_fn)(/* Pointer type for initializers of union xnn_f32_qu8_cvt_params: float scale plus uint8_t output_zero_point, output_min and output_max. */ |
| union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)], |
| float scale, |
| uint8_t output_zero_point, |
| uint8_t output_min, |
| uint8_t output_max); |
| |
| typedef void (*xnn_init_qs8_f32_cvt_params_fn)(/* Pointer type for initializers of union xnn_qs8_f32_cvt_params: float scale and int8_t zero_point. */ |
| union xnn_qs8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)], |
| float scale, |
| int8_t zero_point); |
| |
| typedef void (*xnn_init_qu8_f32_cvt_params_fn)(/* Pointer type for initializers of union xnn_qu8_f32_cvt_params: float scale and uint8_t zero_point. */ |
| union xnn_qu8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)], |
| float scale, |
| uint8_t zero_point); |
| |
| typedef void (*xnn_init_qs8_minmax_params_fn)(/* Pointer type for initializers of union xnn_qs8_minmax_params: int8_t output_zero_point, output_min and output_max; no scale argument. */ |
| union xnn_qs8_minmax_params params[XNN_MIN_ELEMENTS(1)], |
| int8_t output_zero_point, |
| int8_t output_min, |
| int8_t output_max); |
| |
| typedef void (*xnn_init_qs8_conv_minmax_params_fn)(/* Pointer type for initializers of union xnn_qs8_conv_minmax_params: float scale plus int8_t output_zero_point, output_min and output_max. */ |
| union xnn_qs8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)], |
| float scale, |
| int8_t output_zero_point, |
| int8_t output_min, |
| int8_t output_max); |
| |
| typedef void (*xnn_init_qu8_conv_minmax_params_fn)(/* Pointer type for initializers of union xnn_qu8_conv_minmax_params: takes a uint8_t kernel_zero_point (absent from the QS8 form), float scale, and uint8_t output_zero_point, output_min and output_max. */ |
| union xnn_qu8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)], |
| uint8_t kernel_zero_point, |
| float scale, |
| uint8_t output_zero_point, |
| uint8_t output_min, |
| uint8_t output_max); |
| |
| typedef void (*xnn_init_qs8_avgpool_minmax_params_fn)(/* Pointer type for initializers of union xnn_qs8_avgpool_minmax_params: int32_t bias, float scale, and int8_t output_zero_point, output_min and output_max. */ |
| union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)], |
| int32_t bias, |
| float scale, |
| int8_t output_zero_point, |
| int8_t output_min, |
| int8_t output_max); |
| |
| typedef void (*xnn_init_qu8_avgpool_minmax_params_fn)(/* Pointer type for initializers of union xnn_qu8_avgpool_minmax_params: int32_t bias, float scale, and uint8_t output_zero_point, output_min and output_max. */ |
| union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)], |
| int32_t bias, |
| float scale, |
| uint8_t output_zero_point, |
| uint8_t output_min, |
| uint8_t output_max); |
| |
| typedef void (*xnn_update_qs8_avgpool_minmax_params_fn)(/* Pointer type for update functions on union xnn_qs8_avgpool_minmax_params: takes only bias and scale, a subset of the matching init signature. */ |
| union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)], |
| int32_t bias, |
| float scale); |
| |
| typedef void (*xnn_update_qu8_avgpool_minmax_params_fn)(/* Pointer type for update functions on union xnn_qu8_avgpool_minmax_params: takes only bias and scale, a subset of the matching init signature. */ |
| union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)], |
| int32_t bias, |
| float scale); |
| |
| typedef void (*xnn_init_qs8_addsub_minmax_params_fn)(/* Pointer type for initializers of union xnn_qs8_addsub_minmax_params: int8_t zero points for a, b and output, float a_output_scale and b_output_scale, int8_t output_min and output_max. */ |
| union xnn_qs8_addsub_minmax_params params[XNN_MIN_ELEMENTS(1)], |
| int8_t a_zero_point, |
| int8_t b_zero_point, |
| int8_t output_zero_point, |
| float a_output_scale, |
| float b_output_scale, |
| int8_t output_min, |
| int8_t output_max); |
| |
| typedef void (*xnn_init_qu8_addsub_minmax_params_fn)(/* Pointer type for initializers of union xnn_qu8_addsub_minmax_params: uint8_t zero points for a, b and output, float a_output_scale and b_output_scale, uint8_t output_min and output_max. */ |
| union xnn_qu8_addsub_minmax_params params[XNN_MIN_ELEMENTS(1)], |
| uint8_t a_zero_point, |
| uint8_t b_zero_point, |
| uint8_t output_zero_point, |
| float a_output_scale, |
| float b_output_scale, |
| uint8_t output_min, |
| uint8_t output_max); |
| |
| typedef void (*xnn_init_qs8_mul_minmax_params_fn)(/* Pointer type for initializers of union xnn_qs8_mul_minmax_params: int8_t zero points for a, b and output, a single float product_output_scale, int8_t output_min and output_max. */ |
| union xnn_qs8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)], |
| int8_t a_zero_point, |
| int8_t b_zero_point, |
| int8_t output_zero_point, |
| float product_output_scale, |
| int8_t output_min, |
| int8_t output_max); |
| |
| typedef void (*xnn_init_qu8_mul_minmax_params_fn)(/* Pointer type for initializers of union xnn_qu8_mul_minmax_params: uint8_t zero points for a, b and output, a single float product_output_scale, uint8_t output_min and output_max. */ |
| union xnn_qu8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)], |
| uint8_t a_zero_point, |
| uint8_t b_zero_point, |
| uint8_t output_zero_point, |
| float product_output_scale, |
| uint8_t output_min, |
| uint8_t output_max); |
| |
| typedef void (*xnn_init_f16_hswish_params_fn)(/* Pointer type for initializers of union xnn_f16_hswish_params; the params array is the only argument. */ |
| union xnn_f16_hswish_params params[XNN_MIN_ELEMENTS(1)]); |
| |
| typedef void (*xnn_init_f16_minmax_params_fn)( |
| union xnn_f16_minmax_params params[XNN_MIN_ELEMENTS(1)], |
| uint16_t min, |
| uint16_t max); |
| |
| typedef void (*xnn_init_f16_scaleminmax_params_fn)( |
| union xnn_f16_scaleminmax_params params[XNN_MIN_ELEMENTS(1)], |
| uint16_t scale, |
| uint16_t min, |
| uint16_t max); |
| |
| typedef void (*xnn_update_f16_scaleminmax_params_fn)( |
| union xnn_f16_scaleminmax_params params[XNN_MIN_ELEMENTS(1)], |
| uint16_t scale); |
| |
| // F32 params function-pointer types. Each takes the (non-const) params union of the matching |
| // family as its first argument and returns nothing; several take no further arguments. |
| |
| // F32 abs params "init" signature: params union only. |
| typedef void (*xnn_init_f32_abs_params_fn)( |
| union xnn_f32_abs_params params[XNN_MIN_ELEMENTS(1)]); |
| |
| // F32 default params "init" signature: params union only. |
| typedef void (*xnn_init_f32_default_params_fn)( |
| union xnn_f32_default_params params[XNN_MIN_ELEMENTS(1)]); |
| |
| // F32 expminus params "init" signature: params union only. |
| typedef void (*xnn_init_f32_expminus_params_fn)( |
| union xnn_f32_expminus_params params[XNN_MIN_ELEMENTS(1)]); |
| |
| // F32 ELU params "init" signature: params union plus prescale, alpha and beta. |
| typedef void (*xnn_init_f32_elu_params_fn)( |
| union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)], |
| float prescale, |
| float alpha, |
| float beta); |
| |
| // F32 hswish params "init" signature: params union only. |
| typedef void (*xnn_init_f32_hswish_params_fn)( |
| union xnn_f32_hswish_params params[XNN_MIN_ELEMENTS(1)]); |
| |
| // F32 leaky-ReLU params "init" signature: params union plus slope. |
| typedef void (*xnn_init_f32_lrelu_params_fn)( |
| union xnn_f32_lrelu_params params[XNN_MIN_ELEMENTS(1)], |
| float slope); |
| |
| // F32 min/max params "init" signature: params union plus output_min and output_max. |
| typedef void (*xnn_init_f32_minmax_params_fn)( |
| union xnn_f32_minmax_params params[XNN_MIN_ELEMENTS(1)], |
| float output_min, |
| float output_max); |
| |
| // F32 neg params "init" signature: params union only. |
| typedef void (*xnn_init_f32_neg_params_fn)( |
| union xnn_f32_neg_params params[XNN_MIN_ELEMENTS(1)]); |
| |
| // F32 rounding params "init" signature: params union only. |
| typedef void (*xnn_init_f32_rnd_params_fn)( |
| union xnn_f32_rnd_params params[XNN_MIN_ELEMENTS(1)]); |
| |
| // F32 scale/min/max params "init" signature: params union plus scale, output_min and output_max. |
| typedef void (*xnn_init_f32_scaleminmax_params_fn)( |
| union xnn_f32_scaleminmax_params params[XNN_MIN_ELEMENTS(1)], |
| float scale, |
| float output_min, |
| float output_max); |
| |
| // F32 scale/min/max params "update" signature: params union plus scale only. |
| typedef void (*xnn_update_f32_scaleminmax_params_fn)( |
| union xnn_f32_scaleminmax_params params[XNN_MIN_ELEMENTS(1)], |
| float scale); |
| |
| // F32 sigmoid params "init" signature: params union only. |
| typedef void (*xnn_init_f32_sigmoid_params_fn)( |
| union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)]); |
| |
| // F32 sqrt params "init" signature: params union only. |
| typedef void (*xnn_init_f32_sqrt_params_fn)( |
| union xnn_f32_sqrt_params params[XNN_MIN_ELEMENTS(1)]); |
| |
| // S8 min/max params "init" signature: (non-const) params union plus int8_t output_min / output_max. |
| typedef void (*xnn_init_s8_minmax_params_fn)( |
| union xnn_s8_minmax_params params[XNN_MIN_ELEMENTS(1)], |
| int8_t output_min, |
| int8_t output_max); |
| |
| // U8 min/max params "init" signature: (non-const) params union plus uint8_t output_min / output_max. |
| typedef void (*xnn_init_u8_minmax_params_fn)( |
| union xnn_u8_minmax_params params[XNN_MIN_ELEMENTS(1)], |
| uint8_t output_min, |
| uint8_t output_max); |
| |
| // QC8 scale params "init" signature. Unlike the other init signatures it takes no params union: |
| // it receives a channel count, a channel tile, a stride, a read-only float scale array and an |
| // untyped, writable packed_w pointer. |
| // NOTE(review): presumably stores per-channel scales into the packed weights buffer — confirm |
| // against the implementations; the unit of `stride` is not visible here. |
| typedef void (*xnn_init_qc8_scale_params_fn)( |
| size_t channels, |
| size_t channels_tile, |
| size_t stride, |
| const float scale[XNN_MIN_ELEMENTS(1)], |
| void* packed_w); |
| |
| // Forward declare to avoid circular includes between this and allocator.h. |
| struct xnn_code_buffer; |
| |
| // Parameter bundle named for JIT GEMM; it currently holds only an F32 min/max pair. |
| // NOTE(review): presumably what the `const void *params` argument of the code generators |
| // declared below points to — confirm with the generator implementations. |
| struct jit_gemm_params { |
| struct { |
| float min; |
| float max; |
| } f32_minmax; |
| }; |
| |
| // JIT GEMM code-generator signature: takes a code buffer, nc, kc and an untyped params pointer, |
| // and returns an xnn_status. |
| typedef enum xnn_status (*xnn_jit_gemm_code_generator_function)( |
| struct xnn_code_buffer *code, size_t nc, size_t kc, const void *params); |
| // JIT IGEMM code-generator signature: same as the GEMM one with an additional ks argument. |
| typedef enum xnn_status (*xnn_jit_igemm_code_generator_function)( |
| struct xnn_code_buffer *code, size_t nc, size_t kc, size_t ks, const void *params); |
| |
| // Table of GEMM micro-kernel function pointers with XNN_MAX_UARCH_TYPES slots. |
| // NOTE(review): "hmp" presumably means heterogeneous multi-processing, i.e. one slot per core |
| // microarchitecture type — confirm. |
| struct xnn_hmp_gemm_ukernel { |
| xnn_gemm_ukernel_function function[XNN_MAX_UARCH_TYPES]; |
| }; |
| |
| // Builds a GEMM micro-kernel table in which every slot refers to the same `function`. |
| static inline struct xnn_hmp_gemm_ukernel xnn_init_hmp_gemm_ukernel(xnn_gemm_ukernel_function function) { |
| struct xnn_hmp_gemm_ukernel table; |
| size_t slot = 0; |
| while (slot < XNN_MAX_UARCH_TYPES) { |
| table.function[slot] = function; |
| slot += 1; |
| } |
| return table; |
| } |
| |
| // Returns true when any slot from index 1 upward holds a function different from the |
| // XNN_UARCH_DEFAULT slot. Always false when XNN_MAX_UARCH_TYPES is 1. |
| static inline bool xnn_is_hmp_gemm_ukernel(struct xnn_hmp_gemm_ukernel ukernel) { |
| #if XNN_MAX_UARCH_TYPES != 1 |
| const uintptr_t reference = (uintptr_t) ukernel.function[XNN_UARCH_DEFAULT]; |
| for (size_t slot = 1; slot < XNN_MAX_UARCH_TYPES; slot++) { |
| if ((uintptr_t) ukernel.function[slot] != reference) { |
| return true; |
| } |
| } |
| return false; |
| #else |
| return false; |
| #endif |
| } |
| |
| // Table of IGEMM micro-kernel function pointers with XNN_MAX_UARCH_TYPES slots. |
| // NOTE(review): presumably one slot per core microarchitecture type — confirm. |
| struct xnn_hmp_igemm_ukernel { |
| xnn_igemm_ukernel_function function[XNN_MAX_UARCH_TYPES]; |
| }; |
| |
| // Builds an IGEMM micro-kernel table in which every slot refers to the same `function`. |
| static inline struct xnn_hmp_igemm_ukernel xnn_init_hmp_igemm_ukernel(xnn_igemm_ukernel_function function) { |
| struct xnn_hmp_igemm_ukernel table; |
| size_t slot = 0; |
| while (slot < XNN_MAX_UARCH_TYPES) { |
| table.function[slot] = function; |
| slot += 1; |
| } |
| return table; |
| } |
| |
| // Returns true when any slot from index 1 upward holds a function different from the |
| // XNN_UARCH_DEFAULT slot. Always false when XNN_MAX_UARCH_TYPES is 1. |
| static inline bool xnn_is_hmp_igemm_ukernel(struct xnn_hmp_igemm_ukernel ukernel) { |
| #if XNN_MAX_UARCH_TYPES != 1 |
| const uintptr_t reference = (uintptr_t) ukernel.function[XNN_UARCH_DEFAULT]; |
| for (size_t slot = 1; slot < XNN_MAX_UARCH_TYPES; slot++) { |
| if ((uintptr_t) ukernel.function[slot] != reference) { |
| return true; |
| } |
| } |
| return false; |
| #else |
| return false; |
| #endif |
| } |
| |
| // Bundle of GEMM and IGEMM micro-kernel tables. struct gemm_parameters holds one such bundle |
| // for each of its minmax, relu and linear members. |
| struct gemm_fused_ukernels { |
| struct xnn_hmp_gemm_ukernel gemm; |
| struct xnn_hmp_igemm_ukernel igemm; |
| // Optional GEMM and IGEMM micro-kernels with MR=1 and the same NR and KR parameters. |
| struct xnn_hmp_gemm_ukernel gemm1; |
| struct xnn_hmp_igemm_ukernel igemm1; |
| }; |
| |
| #if XNN_PLATFORM_JIT |
| // Table of JIT GEMM code-generator function pointers with XNN_MAX_UARCH_TYPES slots. |
| // Only declared when XNN_PLATFORM_JIT is enabled. |
| struct xnn_hmp_gemm_codegen { |
| xnn_jit_gemm_code_generator_function function[XNN_MAX_UARCH_TYPES]; |
| }; |
| |
| // Builds a JIT GEMM code-generator table in which every slot refers to the same `function`. |
| static inline struct xnn_hmp_gemm_codegen xnn_init_hmp_gemm_codegen(xnn_jit_gemm_code_generator_function function) { |
| struct xnn_hmp_gemm_codegen table; |
| size_t slot = 0; |
| while (slot < XNN_MAX_UARCH_TYPES) { |
| table.function[slot] = function; |
| slot += 1; |
| } |
| return table; |
| } |
| |
| // Returns true when any slot from index 1 upward holds a generator different from the |
| // XNN_UARCH_DEFAULT slot. Always false when XNN_MAX_UARCH_TYPES is 1. |
| static inline bool xnn_is_hmp_gemm_codegen(struct xnn_hmp_gemm_codegen ukernel) { |
| #if XNN_MAX_UARCH_TYPES != 1 |
| const uintptr_t reference = (uintptr_t) ukernel.function[XNN_UARCH_DEFAULT]; |
| for (size_t slot = 1; slot < XNN_MAX_UARCH_TYPES; slot++) { |
| if ((uintptr_t) ukernel.function[slot] != reference) { |
| return true; |
| } |
| } |
| return false; |
| #else |
| return false; |
| #endif |
| } |
| |
| // Table of JIT IGEMM code-generator function pointers with XNN_MAX_UARCH_TYPES slots. |
| // Only declared when XNN_PLATFORM_JIT is enabled. |
| struct xnn_hmp_igemm_codegen { |
| xnn_jit_igemm_code_generator_function function[XNN_MAX_UARCH_TYPES]; |
| }; |
| |
| // Builds a JIT IGEMM code-generator table in which every slot refers to the same `function`. |
| static inline struct xnn_hmp_igemm_codegen xnn_init_hmp_igemm_codegen(xnn_jit_igemm_code_generator_function function) { |
| struct xnn_hmp_igemm_codegen table; |
| size_t slot = 0; |
| while (slot < XNN_MAX_UARCH_TYPES) { |
| table.function[slot] = function; |
| slot += 1; |
| } |
| return table; |
| } |
| |
| // Returns true when any slot from index 1 upward holds a generator different from the |
| // XNN_UARCH_DEFAULT slot. Always false when XNN_MAX_UARCH_TYPES is 1. |
| static inline bool xnn_is_hmp_igemm_codegen(struct xnn_hmp_igemm_codegen ukernel) { |
| #if XNN_MAX_UARCH_TYPES != 1 |
| const uintptr_t reference = (uintptr_t) ukernel.function[XNN_UARCH_DEFAULT]; |
| for (size_t slot = 1; slot < XNN_MAX_UARCH_TYPES; slot++) { |
| if ((uintptr_t) ukernel.function[slot] != reference) { |
| return true; |
| } |
| } |
| return false; |
| #else |
| return false; |
| #endif |
| } |
| |
| // Bundle of JIT GEMM and IGEMM code-generator tables; its layout parallels |
| // struct gemm_fused_ukernels (gemm, igemm, gemm1, igemm1). |
| struct gemm_codegens { |
| struct xnn_hmp_gemm_codegen gemm; |
| struct xnn_hmp_igemm_codegen igemm; |
| // Optional JIT GEMM and IGEMM micro-kernels with MR=1 and the same NR and KR parameters. |
| struct xnn_hmp_gemm_codegen gemm1; |
| struct xnn_hmp_igemm_codegen igemm1; |
| }; |
| #endif // XNN_PLATFORM_JIT |
| |
| // GEMM/IGEMM configuration: micro-kernel bundles for the minmax, relu and linear variants, |
| // JIT code generators (when XNN_PLATFORM_JIT is enabled), a params initializer, and tile sizes. |
| struct gemm_parameters { |
| struct gemm_fused_ukernels minmax; |
| struct gemm_fused_ukernels relu; |
| struct gemm_fused_ukernels linear; |
| #if XNN_PLATFORM_JIT |
| struct gemm_codegens generator; |
| #endif // XNN_PLATFORM_JIT |
| // Params initializer; members are named after datatypes. Note that the qc8 member uses the |
| // xnn_init_qs8_minmax_params_fn type and the f16 member uses the scaleminmax signature. |
| union { |
| xnn_init_qs8_minmax_params_fn qc8; |
| xnn_init_qs8_conv_minmax_params_fn qs8; |
| xnn_init_qu8_conv_minmax_params_fn qu8; |
| xnn_init_f16_scaleminmax_params_fn f16; |
| xnn_init_f32_minmax_params_fn f32; |
| } init; |
| // NOTE(review): mr / nr are presumably the M- and N-dimension tile sizes, and log2_kr / log2_sr |
| // the base-2 logarithms of the KR and SR parameters — confirm against the packing code. |
| uint8_t mr; |
| uint8_t nr; |
| uint8_t log2_kr; |
| uint8_t log2_sr; |
| }; |
| |
| // Configuration of a univector micro-kernel: the kernel pointer, a params initializer |
| // (one union member per params family), and the element tile. |
| struct vunary_parameters { |
| xnn_univector_ukernel_function ukernel; |
| // Only one member is meaningful for a given entry; which one is not recorded in this struct. |
| union { |
| xnn_init_f16_f32_cvt_params_fn f16_f32_cvt; |
| xnn_init_f16_hswish_params_fn f16_hswish; |
| xnn_init_f32_abs_params_fn f32_abs; |
| xnn_init_f32_default_params_fn f32_default; |
| xnn_init_f32_elu_params_fn f32_elu; |
| xnn_init_f32_f16_cvt_params_fn f32_f16_cvt; |
| xnn_init_f32_hswish_params_fn f32_hswish; |
| xnn_init_f32_lrelu_params_fn f32_lrelu; |
| xnn_init_f32_minmax_params_fn f32_minmax; |
| xnn_init_f32_neg_params_fn f32_neg; |
| xnn_init_f32_qs8_cvt_params_fn f32_qs8_cvt; |
| xnn_init_f32_qu8_cvt_params_fn f32_qu8_cvt; |
| xnn_init_f32_rnd_params_fn f32_rnd; |
| xnn_init_f32_sigmoid_params_fn f32_sigmoid; |
| xnn_init_f32_sqrt_params_fn f32_sqrt; |
| xnn_init_qs8_f32_cvt_params_fn qs8_f32_cvt; |
| xnn_init_qu8_f32_cvt_params_fn qu8_f32_cvt; |
| xnn_init_s8_minmax_params_fn s8_minmax; |
| xnn_init_u8_minmax_params_fn u8_minmax; |
| } init; |
| // Number of elements in a tile. |
| // For best efficiency, micro-kernel must process a multiple of this number of elements in each call. |
| uint8_t element_tile; |
| }; |
| |
| // Three vbinary micro-kernel pointers that share one function-pointer type. |
| // NOTE(review): presumably op = both operands are vectors, opc = second operand is a constant, |
| // ropc = reversed operand order with a constant — confirm against the operator code. |
| struct vbinary_fused_ukernels { |
| xnn_vbinary_ukernel_function op_ukernel; |
| xnn_vbinary_ukernel_function opc_ukernel; |
| xnn_vbinary_ukernel_function ropc_ukernel; |
| }; |
| |
| // Configuration of a vbinary operator: micro-kernel sets for the minmax and linear variants, |
| // a params initializer (one union member per params family), and the element tile. |
| struct vbinary_parameters { |
| struct vbinary_fused_ukernels minmax; |
| struct vbinary_fused_ukernels linear; |
| // Only one member is meaningful for a given entry; which one is not recorded in this struct. |
| union { |
| xnn_init_f16_minmax_params_fn f16_minmax; |
| xnn_init_f32_default_params_fn f32_default; |
| xnn_init_f32_minmax_params_fn f32_minmax; |
| xnn_init_qs8_addsub_minmax_params_fn qs8_addsub; |
| xnn_init_qs8_mul_minmax_params_fn qs8_mul; |
| xnn_init_qu8_addsub_minmax_params_fn qu8_addsub; |
| xnn_init_qu8_mul_minmax_params_fn qu8_mul; |
| } init; |
| // Number of elements in a tile. |
| // For best efficiency, micro-kernel must process a multiple of this number of elements in each call. |
| uint8_t element_tile; |
| }; |
| |
| // SPMM micro-kernel pointer together with its M- and N-dimension tile sizes. |
| struct spmm_parameters { |
| xnn_spmm_ukernel_function ukernel; |
| // Number of M-dimension elements in a tile. |
| // Corresponds to a block of pixels in 1x1 Convolution and a block of batch size in Fully Connected operator. |
| uint8_t mr; |
| // Number of N-dimension elements in a tile. |
| // Corresponds to a block of output channels/features in 1x1 Convolution and Fully Connected operator. |
| uint8_t nr; |
| }; |
| |
| // Micro-kernel pointer for HWC->CHW convolution plus its output channel/height/width tile sizes. |
| // The only kernel slot is the one named for symmetric padding. |
| struct conv_hwc2chw_parameters { |
| xnn_conv_hwc2chw_ukernel_function ukernel_with_symm_padding; |
| // Number of output channels in a tile. |
| // This parameter must be passed as is to weight packing function. |
| uint8_t output_channel_tile; |
| // Number of output height pixels in a tile. |
| // For best efficiency, micro-kernel must produce a multiple of this number of rows in each call. |
| uint8_t output_height_tile; |
| // Number of output width pixels in a tile. |
| uint8_t output_width_tile; |
| }; |
| |
| // Micro-kernel pointer for 2D depthwise convolution in CHW layout plus its output tile sizes. |
| struct dwconv2d_chw_parameters { |
| xnn_dwconv2d_chw_ukernel_function ukernel; |
| // Number of output width pixels in a tile. |
| uint8_t output_width_tile; |
| // Number of output height pixels in a tile. |
| // For best efficiency, micro-kernel must produce a multiple of this number of rows in each call. |
| uint8_t output_height_tile; |
| }; |
| |
| // Micro-kernel pointer for global average pooling in CW layout plus its channel tile. |
| struct gavgpool_cw_parameters { |
| xnn_gavgpool_cw_ukernel_function ukernel; |
| // Number of channels in a tile. |
| // For best efficiency, micro-kernel must process a multiple of this number of channels in each call. |
| uint8_t channel_tile; |
| }; |
| |
| // A single depthwise-convolution micro-kernel slot that holds either a unipass or a multipass |
| // function pointer. The union itself does not record which member is active. |
| union dwconv_fused_ukernels { |
| xnn_dwconv_unipass_ukernel_function unipass; |
| xnn_dwconv_multipass_ukernel_function multipass; |
| }; |
| |
| // Depthwise-convolution configuration: micro-kernel slots for the minmax and linear variants, |
| // a params initializer, and tile sizes. |
| struct dwconv_parameters { |
| union dwconv_fused_ukernels minmax; |
| union dwconv_fused_ukernels linear; |
| // Params initializer; members are named after datatypes. Note that the qc8 member uses the |
| // xnn_init_qs8_minmax_params_fn type. |
| union { |
| xnn_init_qs8_minmax_params_fn qc8; |
| xnn_init_qs8_conv_minmax_params_fn qs8; |
| xnn_init_qu8_conv_minmax_params_fn qu8; |
| xnn_init_f16_minmax_params_fn f16; |
| xnn_init_f32_minmax_params_fn f32; |
| } init; |
| // NOTE(review): primary_tile / incremental_tile presumably carry the same meaning as in |
| // struct avgpool_parameters (kernel elements per pass) — confirm. |
| uint8_t channel_tile; |
| uint8_t primary_tile; |
| uint8_t incremental_tile; |
| }; |
| |
| // Micro-kernel pointer for 2D depth-to-space with CHW->HWC conversion plus its pixel and channel tiles. |
| struct depthtospace2d_chw2hwc_parameters { |
| xnn_depthtospace2d_chw2hwc_ukernel_function ukernel; |
| // Number of output pixels in a tile. |
| // For best efficiency, micro-kernel must produce a multiple of this number of pixels in each call. |
| uint8_t pixel_tile; |
| // Number of channels in a tile. |
| // For best efficiency, micro-kernel must process a multiple of this number of channels in each call. |
| uint8_t channel_tile; |
| }; |
| |
| // Global average pooling configuration: unipass and multipass micro-kernels, matching |
| // "init" and "update" params functions per datatype, and tile sizes. |
| struct gavgpool_parameters { |
| xnn_gavgpool_unipass_ukernel_function unipass; |
| xnn_gavgpool_multipass_ukernel_function multipass; |
| // Params initializer; members are named after datatypes. |
| union { |
| xnn_init_f16_scaleminmax_params_fn f16; |
| xnn_init_f32_scaleminmax_params_fn f32; |
| xnn_init_qs8_avgpool_minmax_params_fn qs8; |
| xnn_init_qu8_avgpool_minmax_params_fn qu8; |
| } init; |
| // Params updater; same datatype members as `init`, using the reduced "update" signatures. |
| union { |
| xnn_update_f16_scaleminmax_params_fn f16; |
| xnn_update_f32_scaleminmax_params_fn f32; |
| xnn_update_qs8_avgpool_minmax_params_fn qs8; |
| xnn_update_qu8_avgpool_minmax_params_fn qu8; |
| } update; |
| // Number of rows in a tile. |
| // For best efficiency, micro-kernel must produce a multiple of this number of rows in each call. |
| uint16_t row_tile; |
| // Number of channels in a tile. |
| // For best efficiency, micro-kernel must process a multiple of this number of channels in each call. |
| uint16_t channel_tile; |
| }; |
| |
| // Average pooling configuration: unipass and multipass micro-kernels, a params initializer |
| // (f32 and qu8 members only), and tile sizes. |
| struct avgpool_parameters { |
| xnn_avgpool_unipass_ukernel_function unipass; |
| xnn_avgpool_multipass_ukernel_function multipass; |
| union { |
| xnn_init_f32_scaleminmax_params_fn f32; |
| xnn_init_qu8_avgpool_minmax_params_fn qu8; |
| } init; |
| // Number of rows in a primary tile. |
| // Unipass micro-kernel must be called with this number of rows, or fewer. |
| // Multipass micro-kernel must be called with more than this number of rows. |
| uint8_t primary_tile; |
| // Number of rows in an incremental tile. |
| // For best efficiency, multipass micro-kernel must process the number of rows in the primary tile plus a multiple |
| // of this number of rows in each call. This number has no meaning for the unipass micro-kernel. |
| uint8_t incremental_tile; |
| // Number of channels in a tile. |
| // For best efficiency, micro-kernel must process a multiple of this number of channels in each call. |
| uint16_t channel_tile; |
| }; |
| |
| // "pavgpool" configuration: unipass and multipass micro-kernels plus tile sizes. |
| // Unlike struct avgpool_parameters, it carries no params initializer. |
| struct pavgpool_parameters { |
| xnn_pavgpool_unipass_ukernel_function unipass; |
| xnn_pavgpool_multipass_ukernel_function multipass; |
| // Number of rows in a primary tile. |
| // Unipass micro-kernel must be called with this number of rows, or fewer. |
| // Multipass micro-kernel must be called with more than this number of rows. |
| uint8_t primary_tile; |
| // Number of rows in an incremental tile. |
| // For best efficiency, multipass micro-kernel must process the number of rows in the primary tile plus a multiple |
| // of this number of rows in each call. This number has no meaning for the unipass micro-kernel. |
| uint8_t incremental_tile; |
| // Number of channels in a tile. |
| // For best efficiency, micro-kernel must process a multiple of this number of channels in each call. |
| uint16_t channel_tile; |
| }; |
| |
| // Argmax pooling configuration: one micro-kernel slot plus two tile sizes. |
| struct argmaxpool_parameters { |
| // Anonymous union: the slot holds either a unipass (up) or a multipass (mp) micro-kernel. |
| union { |
| xnn_argmaxpool_unipass_ukernel_function up; |
| xnn_argmaxpool_multipass_ukernel_function mp; |
| }; |
| // NOTE(review): mr / qr presumably play the roles of the primary and incremental tiles of |
| // struct avgpool_parameters — confirm. |
| uint8_t mr; |
| uint8_t qr; |
| }; |
| |
| // Max pooling configuration: micro-kernel pointer, a min/max params initializer |
| // (s8, u8 or f32 member), and two tile sizes. |
| struct maxpool_parameters { |
| xnn_maxpool_ukernel_function ukernel; |
| union { |
| xnn_init_s8_minmax_params_fn s8; |
| xnn_init_u8_minmax_params_fn u8; |
| xnn_init_f32_minmax_params_fn f32; |
| } init; |
| // NOTE(review): mr / qr presumably play the roles of the primary and incremental tiles of |
| // struct avgpool_parameters — confirm. |
| uint8_t mr; |
| uint8_t qr; |
| }; |
| |
| // Micro-kernel pointer for bilinear interpolation plus its pixel and channel tiles. |
| struct ibilinear_parameters { |
| xnn_ibilinear_ukernel_function ukernel; |
| // Number of output pixels in a tile. |
| // For best efficiency, micro-kernel must produce a multiple of this number of pixels in each call. |
| uint8_t pixel_tile; |
| // Number of channels in a tile. |
| // For best efficiency, micro-kernel must process a multiple of this number of channels in each call. |
| uint8_t channel_tile; |
| }; |
| |
| // Micro-kernel pointer for bilinear interpolation in CHW layout plus its pixel and channel tiles. |
| struct ibilinear_chw_parameters { |
| xnn_ibilinear_chw_ukernel_function ukernel; |
| // Number of output pixels in a tile. |
| // For best efficiency, micro-kernel must produce a multiple of this number of pixels in each call. |
| uint8_t pixel_tile; |
| // Number of channels in a tile. |
| // For best efficiency, micro-kernel must process a multiple of this number of channels in each call. |
| uint8_t channel_tile; |
| }; |
| |
| // Zip micro-kernels: x2, x3 and x4 share the "zipc" function type, while xm uses the "zipv" type. |
| // NOTE(review): presumably x2/x3/x4 interleave a fixed number of streams and xm a variable |
| // number — confirm. |
| struct zip_parameters { |
| xnn_zipc_ukernel_function x2; |
| xnn_zipc_ukernel_function x3; |
| xnn_zipc_ukernel_function x4; |
| xnn_zipv_ukernel_function xm; |
| }; |
| |
| // PReLU micro-kernel pointer plus its row and channel tile sizes. |
| struct prelu_parameters { |
| xnn_prelu_ukernel_function ukernel; |
| uint16_t row_tile; |
| uint16_t channel_tile; |
| }; |
| |
| // F32 raddstoreexpminusmax micro-kernel pointer, its expminus params initializer, and element tile. |
| struct raddstoreexpminusmax_parameters { |
| xnn_f32_raddstoreexpminusmax_ukernel_function ukernel; |
| xnn_init_f32_expminus_params_fn init; |
| // Number of elements in a tile. |
| // For best efficiency, micro-kernel must process a multiple of this number of elements in each call. |
| uint8_t element_tile; |
| }; |
| |
| // Fill micro-kernel pointer plus its row tile. |
| struct fill_parameters { |
| xnn_fill_ukernel_function ukernel; |
| // Number of rows of inputs processed in one tile. |
| // For best efficiency, micro-kernel must produce a multiple of this number of rows in each call. |
| uint8_t row_tile; |
| }; |
| |
| // Pad micro-kernel pointer plus its row tile. |
| struct pad_parameters { |
| xnn_pad_ukernel_function ukernel; |
| // Number of rows of inputs processed in one tile. |
| // For best efficiency, micro-kernel must produce a multiple of this number of rows in each call. |
| uint8_t row_tile; |
| }; |
| |
| // vmulcaddc micro-kernel pointer, a min/max params initializer (f16 or f32 member), |
| // and its channel and row tile sizes. |
| struct vmulcaddc_parameters { |
| xnn_vmulcaddc_ukernel_function ukernel; |
| union { |
| xnn_init_f16_minmax_params_fn f16; |
| xnn_init_f32_minmax_params_fn f32; |
| } init; |
| uint8_t channel_tile; |
| uint8_t row_tile; |
| }; |
| |
| // Array capacities used by struct xnn_parameters: the number of dwconv_parameters entries |
| // per datatype and the number of F32 argmaxpool_parameters entries. |
| #define XNN_MAX_QC8_DWCONV_UKERNELS 2 |
| #define XNN_MAX_QS8_DWCONV_UKERNELS 2 |
| #define XNN_MAX_QU8_DWCONV_UKERNELS 2 |
| #define XNN_MAX_F16_DWCONV_UKERNELS 3 |
| #define XNN_MAX_F32_DWCONV_UKERNELS 4 |
| #define XNN_MAX_F32_ARGMAXPOOL_UKERNELS 3 |
| |
| // XNN_INIT_FLAG_* values: each occupies a distinct bit so that they can be OR-ed together |
| // in xnn_parameters.init_flags. |
| |
| // Indicates that XNNPACK as a whole has initialized. |
| // This does not guarantee that any particular microkernels are available. |
| #define XNN_INIT_FLAG_XNNPACK 0x00000001 |
| // Indicates that F32 XNNPACK microkernels are available for use. |
| #define XNN_INIT_FLAG_F32 0x00000002 |
| // Indicates that X32 XNNPACK microkernels are available for use. |
| #define XNN_INIT_FLAG_X32 0x00000004 |
| // Indicates that F16 XNNPACK microkernels are available for use. |
| #define XNN_INIT_FLAG_F16 0x00000008 |
| // Indicates that X16 XNNPACK microkernels are available for use. |
| #define XNN_INIT_FLAG_X16 0x00000010 |
| // Indicates that QC8 XNNPACK microkernels are available for use. |
| #define XNN_INIT_FLAG_QC8 0x00000020 |
| // Indicates that QS8 XNNPACK microkernels are available for use. |
| #define XNN_INIT_FLAG_QS8 0x00000040 |
| // Indicates that QU8 XNNPACK microkernels are available for use. |
| #define XNN_INIT_FLAG_QU8 0x00000080 |
| // Indicates that S8 XNNPACK microkernels are available for use. |
| #define XNN_INIT_FLAG_S8 0x00000100 |
| // Indicates that U8 XNNPACK microkernels are available for use. |
| #define XNN_INIT_FLAG_U8 0x00000200 |
| // Indicates that X8 XNNPACK microkernels are available for use. |
| #define XNN_INIT_FLAG_X8 0x00000400 |
| // Indicates that XX XNNPACK microkernels are available for use. |
| #define XNN_INIT_FLAG_XX 0x00000800 |
| // Indicates that VCVT XNNPACK microkernels are available for use. |
| #define XNN_INIT_FLAG_VCVT 0x00001000 |
| // Indicates that CHW XNNPACK microkernels are optimized for the host platform. |
| #define XNN_INIT_FLAG_CHW_OPT 0x00002000 |
| |
| // Top-level configuration record: initialization flags, the allocator, and one nested struct |
| // per datatype family (qc8, qs8, qu8, s8, u8, x8, f16, f32, vcvt, x32, xx) holding the |
| // micro-kernel configurations for that family. |
| struct xnn_parameters { |
| // Bitwise combination of XNN_INIT_FLAG_* flags |
| uint32_t init_flags; |
| struct xnn_allocator allocator; |
| // QC8: GEMM and depthwise convolution. |
| struct { |
| struct gemm_parameters gemm; |
| struct dwconv_parameters dwconv[XNN_MAX_QC8_DWCONV_UKERNELS]; |
| } qc8; |
| // QS8: GEMM, depthwise convolution, global average pooling, and binary add/multiply. |
| struct { |
| struct gemm_parameters gemm; |
| struct dwconv_parameters dwconv[XNN_MAX_QS8_DWCONV_UKERNELS]; |
| struct gavgpool_parameters gavgpool; |
| struct vbinary_parameters vadd; |
| struct vbinary_parameters vmul; |
| } qs8; |
| // QU8: same members as QS8 plus (non-global) average pooling. |
| struct { |
| struct gemm_parameters gemm; |
| struct dwconv_parameters dwconv[XNN_MAX_QU8_DWCONV_UKERNELS]; |
| struct avgpool_parameters avgpool; |
| struct gavgpool_parameters gavgpool; |
| struct vbinary_parameters vadd; |
| struct vbinary_parameters vmul; |
| } qu8; |
| // S8: clamp, bilinear interpolation, and max pooling. |
| struct { |
| struct vunary_parameters clamp; |
| // Bilinear interpolation (2D). |
| struct ibilinear_parameters ibilinear; |
| struct maxpool_parameters maxpool; |
| } s8; |
| // U8: same members as S8 plus the lut32norm and rmax micro-kernels. |
| struct { |
| struct vunary_parameters clamp; |
| // Bilinear interpolation (2D). |
| struct ibilinear_parameters ibilinear; |
| struct maxpool_parameters maxpool; |
| xnn_u8_lut32norm_ukernel_function lut32norm; |
| xnn_u8_rmax_ukernel_function rmax; |
| } u8; |
| // X8: lookup-table and zip micro-kernels. |
| struct { |
| xnn_x8_lut_ukernel_function lut; |
| struct zip_parameters zip; |
| } x8; |
| // F16 micro-kernel configurations. |
| struct { |
| struct gavgpool_parameters gavgpool; |
| struct gemm_parameters gemm; |
| struct gemm_parameters gemm2; |
| struct dwconv_parameters dwconv[XNN_MAX_F16_DWCONV_UKERNELS]; |
| struct vunary_parameters hswish; |
| struct prelu_parameters prelu; |
| struct vbinary_parameters vadd; |
| struct vbinary_parameters vmul; |
| struct vmulcaddc_parameters vmulcaddc; |
| } f16; |
| // F32 micro-kernel configurations. |
| struct { |
| struct gemm_parameters gemm; |
| struct gemm_parameters gemm2; |
| struct dwconv_parameters dwconv[XNN_MAX_F32_DWCONV_UKERNELS]; |
| struct avgpool_parameters avgpool; |
| struct pavgpool_parameters pavgpool; |
| struct gavgpool_parameters gavgpool; |
| struct maxpool_parameters maxpool; |
| struct argmaxpool_parameters argmaxpool[XNN_MAX_F32_ARGMAXPOOL_UKERNELS]; |
| // Bilinear interpolation (2D). |
| struct ibilinear_parameters ibilinear; |
| struct vunary_parameters abs; |
| struct vunary_parameters clamp; |
| struct vunary_parameters elu; |
| struct vunary_parameters hswish; |
| struct vunary_parameters lrelu; |
| struct vunary_parameters neg; |
| xnn_univector_ukernel_function relu; |
| struct vunary_parameters rndne; |
| struct vunary_parameters rndz; |
| struct vunary_parameters rndu; |
| struct vunary_parameters rndd; |
| struct vunary_parameters sigmoid; |
| struct vunary_parameters sqr; |
| struct vunary_parameters sqrt; |
| struct prelu_parameters prelu; |
| struct vbinary_parameters vadd; |
| struct vbinary_parameters vdiv; |
| struct vbinary_parameters vmax; |
| struct vbinary_parameters vmin; |
| struct vbinary_parameters vmul; |
| struct vbinary_parameters vsub; |
| struct vbinary_parameters vsqrdiff; |
| struct vmulcaddc_parameters vmulcaddc; |
| struct raddstoreexpminusmax_parameters raddstoreexpminusmax; |
| xnn_f32_rmax_ukernel_function rmax; |
| // Sparse Matrix-Dense Matrix Multiplication (NR=1 block). |
| struct spmm_parameters spmm; |
| // Sparse Matrix-Dense Matrix Multiplication (NR=2 block). |
| struct spmm_parameters spmm2; |
| // Sparse Matrix-Dense Matrix Multiplication (NR=4 block). |
| struct spmm_parameters spmm4; |
| // Direct 3x3 stride-2 Convolution with 3 input channels and HWC->CHW layout conversion. |
| struct conv_hwc2chw_parameters conv_hwc2chw_3x3c3s2; |
| // Direct 3x3 stride-1 Convolution with padding 1 on left and right in CHW layout. |
| struct dwconv2d_chw_parameters dwconv2d_chw_3x3; |
| // Direct 3x3 stride-2 Convolution with padding 1 on left and right in CHW layout. |
| struct dwconv2d_chw_parameters dwconv2d_chw_3x3s2; |
| // Direct 5x5 stride-1 Convolution with padding 2 on left and right in CHW layout. |
| struct dwconv2d_chw_parameters dwconv2d_chw_5x5; |
| // Direct 5x5 stride-2 Convolution with padding 2 on left and right in CHW layout. |
| struct dwconv2d_chw_parameters dwconv2d_chw_5x5s2; |
| // Global Average Pooling in CW layout. |
| struct gavgpool_cw_parameters gavgpool_cw; |
| // Bilinear interpolation (2D) in CHW layout. |
| struct ibilinear_chw_parameters ibilinear_chw; |
| } f32; |
| // Conversion micro-kernels, one entry per source/destination datatype pair. |
| struct { |
| struct vunary_parameters f16_to_f32; |
| struct vunary_parameters f32_to_f16; |
| struct vunary_parameters f32_to_qs8; |
| struct vunary_parameters f32_to_qu8; |
| struct vunary_parameters qs8_to_f32; |
| struct vunary_parameters qu8_to_f32; |
| } vcvt; |
| // X32: unpool, zip, and depth-to-space micro-kernels. |
| struct { |
| xnn_unpool_ukernel_function unpool; |
| struct zip_parameters zip; |
| // Depth To Space 2D with CHW->HWC layout conversion. |
| struct depthtospace2d_chw2hwc_parameters depthtospace2d_chw2hwc; |
| } x32; |
| // XX: copy, fill, and pad micro-kernels. |
| struct { |
| xnn_univector_ukernel_function copy; |
| struct fill_parameters fill; |
| struct pad_parameters pad; |
| } xx; |
| }; |
| |
| // Declaration of the global xnn_params object (defined elsewhere). When this header is compiled |
| // as C++, the declaration is given C linkage so that it refers to the same symbol as in C. |
| #ifdef __cplusplus |
| extern "C" XNN_INTERNAL struct xnn_parameters xnn_params; |
| #else |
| extern XNN_INTERNAL struct xnn_parameters xnn_params; |
| #endif |