blob: c172e506393a4fcc87475a79659729c9511f2e8e [file] [log] [blame]
XNNPACK Teamb455b122019-09-27 18:10:33 -07001// Copyright (c) Facebook, Inc. and its affiliates.
2// All rights reserved.
3//
4// Copyright 2019 Google LLC
5//
6// This source code is licensed under the BSD-style license found in the
7// LICENSE file in the root directory of this source tree.
8
Marat Dukhan01849012020-04-27 19:28:32 -07009#include <math.h>
XNNPACK Teamb455b122019-09-27 18:10:33 -070010#include <stdbool.h>
11#include <stddef.h>
12#include <stdint.h>
Marat Dukhan04f03be2019-11-19 12:36:47 -080013#include <string.h>
XNNPACK Teamb455b122019-09-27 18:10:33 -070014
Marat Dukhan57133c02020-04-13 00:54:59 -070015#ifdef _WIN32
16 #include <windows.h>
17#else
18 #include <pthread.h>
19#endif
XNNPACK Teamb455b122019-09-27 18:10:33 -070020
Marat Dukhan496389f2021-04-07 15:47:12 -070021#ifdef _MSC_VER
22 #include <intrin.h>
23#endif
24
Marat Dukhand343c222019-10-07 09:22:14 -070025#ifndef __EMSCRIPTEN__
26 #include <cpuinfo.h>
27#endif
XNNPACK Teamb455b122019-09-27 18:10:33 -070028
29#include <xnnpack.h>
Marat Dukhan496389f2021-04-07 15:47:12 -070030#include <xnnpack/allocator.h>
XNNPACK Teamb455b122019-09-27 18:10:33 -070031#include <xnnpack/argmaxpool.h>
32#include <xnnpack/avgpool.h>
Marat Dukhan1dadbf72019-10-01 10:46:20 -070033#include <xnnpack/common.h>
XNNPACK Teamb455b122019-09-27 18:10:33 -070034#include <xnnpack/conv.h>
35#include <xnnpack/dwconv.h>
Artsiom Ablavatskibbe85062020-11-05 14:07:37 -080036#include <xnnpack/depthtospace.h>
XNNPACK Teamb455b122019-09-27 18:10:33 -070037#include <xnnpack/gavgpool.h>
38#include <xnnpack/gemm.h>
Marat Dukhan4662b192020-05-21 15:52:03 -070039#include <xnnpack/fill.h>
Marat Dukhan660fd192020-03-10 04:55:30 -070040#include <xnnpack/ibilinear.h>
XNNPACK Teamb455b122019-09-27 18:10:33 -070041#include <xnnpack/igemm.h>
42#include <xnnpack/log.h>
43#include <xnnpack/lut.h>
44#include <xnnpack/maxpool.h>
45#include <xnnpack/pad.h>
46#include <xnnpack/params.h>
Marat Dukhanc5a7a392021-05-21 16:04:31 -070047#include <xnnpack/params-init.h>
XNNPACK Teamb455b122019-09-27 18:10:33 -070048#include <xnnpack/pavgpool.h>
49#include <xnnpack/prelu.h>
Marat Dukhan1edc4542020-01-27 12:40:13 -080050#include <xnnpack/raddstoreexpminusmax.h>
XNNPACK Teamb455b122019-09-27 18:10:33 -070051#include <xnnpack/rmax.h>
52#include <xnnpack/spmm.h>
53#include <xnnpack/unpool.h>
Marat Dukhan64287252021-09-07 16:20:03 -070054#include <xnnpack/vaddsub.h>
Marat Dukhan1e782c42019-11-21 17:02:40 -080055#include <xnnpack/vbinary.h>
Marat Dukhanaf2ba002021-10-24 14:21:41 -070056#include <xnnpack/vcvt.h>
Marat Dukhan0853b8a2021-08-03 01:01:53 -070057#include <xnnpack/vmul.h>
XNNPACK Teamb455b122019-09-27 18:10:33 -070058#include <xnnpack/vmulcaddc.h>
Marat Dukhan1e782c42019-11-21 17:02:40 -080059#include <xnnpack/vunary.h>
XNNPACK Teamb455b122019-09-27 18:10:33 -070060#include <xnnpack/zip.h>
61
// Default for the XNN_ENABLE_ASSEMBLY build switch: if the build did not
// define it, it is set to 1 here. init() tests it with #if to choose between
// a switch on cpuinfo_get_uarch(0)->uarch that picks f32 GEMM/IGEMM
// micro-kernels per detected core type, and a single fixed set of f32
// GEMM/IGEMM micro-kernels when it is 0.
62#ifndef XNN_ENABLE_ASSEMBLY
63 #define XNN_ENABLE_ASSEMBLY 1
64#endif
65
// File-local one-time-initialization guard object. It is an INIT_ONCE on
// Windows and a pthread_once_t elsewhere, each set to its static initializer.
// NOTE(review): the code that consumes the guard is outside this part of the
// file; presumably it ensures init() runs exactly once - confirm at the caller.
Marat Dukhan57133c02020-04-13 00:54:59 -070066#ifdef _WIN32
67 static INIT_ONCE init_guard = INIT_ONCE_STATIC_INIT;
68#else
69 static pthread_once_t init_guard = PTHREAD_ONCE_INIT;
70#endif
XNNPACK Teamb455b122019-09-27 18:10:33 -070071
// File-local pointer to a read-only allocator descriptor, NULL until assigned.
// The volatile qualifier follows the '*', so it applies to the pointer itself,
// not to the pointed-to struct (which is const).
// NOTE(review): the code that writes and reads this pointer is not visible in
// this part of the file - confirm its intended use there.
Marat Dukhan496389f2021-04-07 15:47:12 -070072static const struct xnn_allocator* volatile init_allocator = NULL;
73
// Global (externally visible) parameter table. Only init_flags is initialized
// explicitly, to 0; all other members are zero-initialized because the object
// has static storage duration. init() then fills in the per-datatype
// micro-kernel entries (e.g. xnn_params.qc8.gemm, xnn_params.f32.gemm).
// NOTE(review): init() accumulates feature flags in a local init_flags; where
// that value is written back to this struct is outside this view - confirm.
XNNPACK Teamb455b122019-09-27 18:10:33 -070074struct xnn_parameters xnn_params = {
Marat Dukhan854fb6b2020-06-19 12:33:44 -070075 .init_flags = 0
XNNPACK Teamb455b122019-09-27 18:10:33 -070076};
77
Marat Dukhan01849012020-04-27 19:28:32 -070078static void init(void) {
Marat Dukhana199d492020-07-24 15:01:25 -070079#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
Marat Dukhan01849012020-04-27 19:28:32 -070080 // Unlike most other architectures, on x86/x86-64 when floating-point instructions
81 // have no NaN arguments, but produce NaN output, the output NaN has sign bit set.
82 // We use it to distinguish x86/x86-64 from other architectures, by doing subtraction
83 // of two infinities (must produce NaN per IEEE 754 standard).
84 static const volatile float inf = INFINITY;
85 const bool is_wasm_x86 = signbit(inf - inf);
XNNPACK Teamb455b122019-09-27 18:10:33 -070086#endif
Marat Dukhan854fb6b2020-06-19 12:33:44 -070087 uint32_t init_flags = XNN_INIT_FLAG_XNNPACK;
XNNPACK Teamb455b122019-09-27 18:10:33 -070088
Marat Dukhan1dadbf72019-10-01 10:46:20 -070089#if XNN_ARCH_ARM
Frank Barchardbcdb1c12020-05-11 14:13:20 -070090 #if XNN_PLATFORM_MOBILE
Marat Dukhan3b745a42020-05-10 21:43:25 -070091 if (!cpuinfo_has_arm_neon()) {
92 xnn_log_error("XNNPACK initialization failed: NEON is not supported");
93 return;
94 }
95 #else
96 if (!cpuinfo_has_arm_vfpv2() && !cpuinfo_has_arm_vfpv3()) {
97 xnn_log_error("XNNPACK initialization failed: VFP is not supported");
98 return;
99 }
100 #endif
XNNPACK Teamb455b122019-09-27 18:10:33 -0700101
Marat Dukhan3b745a42020-05-10 21:43:25 -0700102 if (cpuinfo_has_arm_neon()) {
Marat Dukhan898d5852021-06-30 21:18:34 -0700103 /**************************** QC8 micro-kernels ****************************/
104 #ifndef XNN_NO_QC8_OPERATORS
105 init_flags |= XNN_INIT_FLAG_QC8;
106
107 if (!XNN_PLATFORM_IOS && cpuinfo_has_arm_neon_dot()) {
108 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__neondot);
109 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neondot);
110 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8c4__neondot);
111 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neondot);
Marat Dukhan7988a182021-12-06 22:00:33 -0800112 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Marat Dukhan898d5852021-06-30 21:18:34 -0700113 xnn_params.qc8.gemm.mr = 4;
114 xnn_params.qc8.gemm.nr = 8;
115 xnn_params.qc8.gemm.log2_kr = 2;
Frank Barchard0bc58012021-11-22 18:12:05 -0800116 } else if (cpuinfo_has_arm_v8()) {
117 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
118 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
119 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
120 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
Marat Dukhan7988a182021-12-06 22:00:33 -0800121 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Frank Barchard0bc58012021-11-22 18:12:05 -0800122 xnn_params.qc8.gemm.mr = 2;
123 xnn_params.qc8.gemm.nr = 8;
124 xnn_params.qc8.gemm.log2_kr = 1;
125 xnn_params.qc8.gemm.log2_sr = 2;
Marat Dukhan898d5852021-06-30 21:18:34 -0700126 } else {
Frank Barcharde22685a2021-11-12 11:36:58 -0800127 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal);
128 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal);
129 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal);
130 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal);
Marat Dukhan7988a182021-12-06 22:00:33 -0800131 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neon_params;
Marat Dukhan898d5852021-06-30 21:18:34 -0700132 xnn_params.qc8.gemm.mr = 2;
133 xnn_params.qc8.gemm.nr = 8;
134 xnn_params.qc8.gemm.log2_kr = 1;
Frank Barcharde7043ff2021-11-10 14:50:08 -0800135 xnn_params.qc8.gemm.log2_sr = 2;
Marat Dukhan898d5852021-06-30 21:18:34 -0700136 }
137
Frank Barchard0bc58012021-11-22 18:12:05 -0800138 if (cpuinfo_has_arm_neon_v8()) {
139 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__neonv8_mla8_ld64;
Marat Dukhan7988a182021-12-06 22:00:33 -0800140 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Frank Barchard0bc58012021-11-22 18:12:05 -0800141 xnn_params.qc8.dwconv[0].channel_tile = 16;
142 xnn_params.qc8.dwconv[0].primary_tile = 9;
143 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up8x25__neonv8_mla8_ld64;
Marat Dukhan7988a182021-12-06 22:00:33 -0800144 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Frank Barchard0bc58012021-11-22 18:12:05 -0800145 xnn_params.qc8.dwconv[1].channel_tile = 8;
146 xnn_params.qc8.dwconv[1].primary_tile = 25;
147 } else {
148 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__neon_mla8_ld64;
Marat Dukhan7988a182021-12-06 22:00:33 -0800149 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_neon_params;
Frank Barchard0bc58012021-11-22 18:12:05 -0800150 xnn_params.qc8.dwconv[0].channel_tile = 16;
151 xnn_params.qc8.dwconv[0].primary_tile = 9;
152 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up8x25__neon_mla8_ld64;
Marat Dukhan7988a182021-12-06 22:00:33 -0800153 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_neon_params;
Frank Barchard0bc58012021-11-22 18:12:05 -0800154 xnn_params.qc8.dwconv[1].channel_tile = 8;
155 xnn_params.qc8.dwconv[1].primary_tile = 25;
156 }
Marat Dukhan898d5852021-06-30 21:18:34 -0700157 #endif // XNN_NO_QC8_OPERATORS
158
Marat Dukhanf28cddf2020-08-10 14:05:02 -0700159 /**************************** QS8 micro-kernels ****************************/
160 #ifndef XNN_NO_QS8_OPERATORS
161 init_flags |= XNN_INIT_FLAG_QS8;
162
Marat Dukhanec56b7e2021-04-13 16:46:21 -0700163 if (!XNN_PLATFORM_IOS && cpuinfo_has_arm_neon_dot()) {
Marat Dukhane903dff2021-07-16 19:43:41 -0700164 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neondot);
165 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neondot);
166 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neondot);
167 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot);
168 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
Frank Barchardfb0ab0b2021-03-03 07:54:34 -0800169 xnn_params.qs8.gemm.mr = 4;
170 xnn_params.qs8.gemm.nr = 8;
171 xnn_params.qs8.gemm.log2_kr = 2;
172 } else {
Frank Barcharde22685a2021-11-12 11:36:58 -0800173 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
174 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
175 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
176 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
Marat Dukhane903dff2021-07-16 19:43:41 -0700177 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
Frank Barchardfb0ab0b2021-03-03 07:54:34 -0800178 xnn_params.qs8.gemm.mr = 2;
179 xnn_params.qs8.gemm.nr = 8;
180 xnn_params.qs8.gemm.log2_kr = 1;
Frank Barchard66ae2572021-11-02 17:36:21 -0700181 xnn_params.qs8.gemm.log2_sr = 2;
Frank Barchardfb0ab0b2021-03-03 07:54:34 -0800182 }
Marat Dukhanf28cddf2020-08-10 14:05:02 -0700183
Frank Barchard0d065732021-08-31 00:01:40 -0700184 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mla8_ld64;
Marat Dukhanbe18f5c2021-07-16 18:46:39 -0700185 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
Frank Barchard0d065732021-08-31 00:01:40 -0700186 xnn_params.qs8.dwconv[0].channel_tile = 16;
Marat Dukhanf28cddf2020-08-10 14:05:02 -0700187 xnn_params.qs8.dwconv[0].primary_tile = 9;
Marat Dukhan5f2939f2021-07-23 13:38:32 -0700188 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_rndnu_ukernel_up8x25__neon_mla8_ld64;
Marat Dukhanbe18f5c2021-07-16 18:46:39 -0700189 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
Marat Dukhan4ed14882021-05-12 17:50:40 -0700190 xnn_params.qs8.dwconv[1].channel_tile = 8;
191 xnn_params.qs8.dwconv[1].primary_tile = 25;
Marat Dukhanf28cddf2020-08-10 14:05:02 -0700192
193 xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
194 .up = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_ukernel_7x__neon_c8_acc2,
195 .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_ukernel_7p7x__neon_c8_acc2,
196 .mr = 7,
197 };
Marat Dukhanff209482020-09-03 14:26:53 -0700198
199 xnn_params.qs8.vadd = (struct vbinary_parameters) {
Marat Dukhan01debd92021-07-29 18:14:21 -0700200 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__neon_ld64_x16,
201 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x16,
202 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x16,
Marat Dukhan64287252021-09-07 16:20:03 -0700203 .init.qs8_addsub = xnn_init_qs8_add_minmax_neon_params,
Marat Dukhan01debd92021-07-29 18:14:21 -0700204 .element_tile = 16,
Marat Dukhanff209482020-09-03 14:26:53 -0700205 };
Marat Dukhan0853b8a2021-08-03 01:01:53 -0700206 if (cpuinfo_has_arm_neon_v8()) {
207 xnn_params.qs8.vmul = (struct vbinary_parameters) {
208 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_fp32_ukernel__neonv8_ld64_x16,
209 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__neonv8_ld64_x16,
210 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__neonv8_ld64_x16,
211 .init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_neonv8_params,
212 .element_tile = 16,
213 };
214 } else {
215 xnn_params.qs8.vmul = (struct vbinary_parameters) {
216 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_fp32_ukernel__neon_ld64_x16,
217 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__neon_ld64_x16,
218 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__neon_ld64_x16,
219 .init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_neon_params,
220 .element_tile = 16,
221 };
222 }
Marat Dukhanf28cddf2020-08-10 14:05:02 -0700223 #endif // XNN_NO_QS8_OPERATORS
224
Marat Dukhan08b7a972020-07-14 18:17:29 -0700225 /*************************** QU8 micro-kernels ***************************/
226 #ifndef XNN_NO_QU8_OPERATORS
227 init_flags |= XNN_INIT_FLAG_QU8;
Marat Dukhan854fb6b2020-06-19 12:33:44 -0700228
Frank Barchard20255152021-08-11 14:01:45 -0700229 if (cpuinfo_has_arm_neon_dot()) {
Frank Barchardde9c64a2021-08-17 18:32:50 -0700230 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_2x16c4__neondot);
231 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_2x16c4__neondot);
232 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
233 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
Frank Barchard20255152021-08-11 14:01:45 -0700234 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
Frank Barcharde0331262021-08-11 23:18:59 -0700235 xnn_params.qu8.gemm.mr = 2;
Frank Barchardde9c64a2021-08-17 18:32:50 -0700236 xnn_params.qu8.gemm.nr = 16;
Frank Barchard20255152021-08-11 14:01:45 -0700237 xnn_params.qu8.gemm.log2_kr = 2;
238 } else {
239 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane);
240 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane);
241 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
242 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
243 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
244 xnn_params.qu8.gemm.mr = 4;
245 xnn_params.qu8.gemm.nr = 8;
246 }
Frank Barchard354cbc62021-09-27 21:42:41 -0700247 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mul8;
Marat Dukhan73a899a2021-07-27 00:10:38 -0700248 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
Frank Barchard0d065732021-08-31 00:01:40 -0700249 xnn_params.qu8.dwconv[0].channel_tile = 16;
Marat Dukhan08b7a972020-07-14 18:17:29 -0700250 xnn_params.qu8.dwconv[0].primary_tile = 9;
Frank Barchard354cbc62021-09-27 21:42:41 -0700251 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_rndnu_ukernel_up8x25__neon_mul8;
Marat Dukhan73a899a2021-07-27 00:10:38 -0700252 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
Marat Dukhan43b46ee2021-07-15 19:07:50 -0700253 xnn_params.qu8.dwconv[1].channel_tile = 8;
254 xnn_params.qu8.dwconv[1].primary_tile = 25;
255
Marat Dukhan08b7a972020-07-14 18:17:29 -0700256 xnn_params.qu8.avgpool = (struct avgpool_parameters) {
257 .up = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__neon_c8,
258 .mp = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__neon_c8,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700259 .mr = 9,
260 .qr = 8,
261 };
Marat Dukhan08b7a972020-07-14 18:17:29 -0700262 xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
263 .up = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_ukernel_7x__neon_c8,
264 .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_ukernel_7p7x__neon_c8,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700265 .mr = 7,
266 };
Marat Dukhandb007cd2021-07-20 23:42:39 -0700267 xnn_params.qu8.vadd = (struct vbinary_parameters) {
Frank Barchard0a3093c2021-08-31 09:58:11 -0700268 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__neon_ld64_x16,
269 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__neon_ld64_x16,
270 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__neon_ld64_x16,
Marat Dukhan64287252021-09-07 16:20:03 -0700271 .init.qu8_addsub = xnn_init_qu8_add_minmax_neon_params,
Marat Dukhandb007cd2021-07-20 23:42:39 -0700272 .element_tile = 8,
273 };
Marat Dukhan0853b8a2021-08-03 01:01:53 -0700274 if (cpuinfo_has_arm_neon_v8()) {
275 xnn_params.qu8.vmul = (struct vbinary_parameters) {
276 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_fp32_ukernel__neonv8_ld64_x16,
277 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__neonv8_ld64_x16,
278 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__neonv8_ld64_x16,
279 .init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_neonv8_params,
280 .element_tile = 16,
281 };
282 } else {
283 xnn_params.qu8.vmul = (struct vbinary_parameters) {
284 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_fp32_ukernel__neon_ld64_x16,
285 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__neon_ld64_x16,
286 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__neon_ld64_x16,
287 .init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_neon_params,
288 .element_tile = 16,
289 };
290 }
Marat Dukhan08b7a972020-07-14 18:17:29 -0700291 #endif // XNN_NO_QU8_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -0700292
Marat Dukhan23147532021-08-16 07:26:56 -0700293 /**************************** S8 micro-kernels ****************************/
294 #ifndef XNN_NO_S8_OPERATORS
295 init_flags |= XNN_INIT_FLAG_S8;
296
Marat Dukhan61c0c9e2021-08-16 23:16:14 -0700297 xnn_params.s8.clamp = (struct vunary_parameters) {
298 .ukernel = (xnn_univector_ukernel_function) xnn_s8_vclamp_ukernel__neon_x64,
299 .init.s8_minmax = xnn_init_s8_minmax_neon_params,
300 .element_tile = 64,
301 };
Marat Dukhan24abe6b2021-11-24 15:28:57 -0800302 xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
303 .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__neon_c8,
304 .pixel_tile = 1,
305 .channel_tile = 8,
306 };
Marat Dukhan23147532021-08-16 07:26:56 -0700307 xnn_params.s8.maxpool = (struct maxpool_parameters) {
308 .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__neon_c16,
Marat Dukhandc5c1482021-08-16 09:03:15 -0700309 .init.s8 = xnn_init_s8_minmax_neon_params,
Marat Dukhan23147532021-08-16 07:26:56 -0700310 .mr = 9,
311 .qr = 8,
312 };
313 #endif // XNN_NO_S8_OPERATORS
314
Marat Dukhan3b745a42020-05-10 21:43:25 -0700315 /**************************** U8 micro-kernels ****************************/
316 #ifndef XNN_NO_U8_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -0700317 init_flags |= XNN_INIT_FLAG_U8;
318
Marat Dukhan94912792021-08-16 21:40:30 -0700319 xnn_params.u8.clamp = (struct vunary_parameters) {
320 .ukernel = (xnn_univector_ukernel_function) xnn_u8_vclamp_ukernel__neon_x64,
321 .init.u8_minmax = xnn_init_u8_minmax_neon_params,
322 .element_tile = 64,
323 };
Marat Dukhan24abe6b2021-11-24 15:28:57 -0800324 xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
325 .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__neon_c8,
326 .pixel_tile = 1,
327 .channel_tile = 8,
328 };
Marat Dukhan3b745a42020-05-10 21:43:25 -0700329 xnn_params.u8.maxpool = (struct maxpool_parameters) {
330 .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__neon_c16,
Marat Dukhan2ea50a02021-08-16 12:59:19 -0700331 .init.u8 = xnn_init_u8_minmax_neon_params,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700332 .mr = 9,
333 .qr = 8,
334 };
Marat Dukhan3b745a42020-05-10 21:43:25 -0700335 xnn_params.u8.rmax = xnn_u8_rmax_ukernel__neon;
336 xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
337 #endif // XNN_NO_U8_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -0700338
Marat Dukhan3b745a42020-05-10 21:43:25 -0700339 /**************************** X8 micro-kernels ****************************/
340 #ifndef XNN_NO_X8_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -0700341 init_flags |= XNN_INIT_FLAG_X8;
342
Marat Dukhand67539d2021-09-08 23:06:03 -0700343 xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar_x4;
Marat Dukhan3b745a42020-05-10 21:43:25 -0700344 xnn_params.x8.zip = (struct zip_parameters) {
345 .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__neon,
346 .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__neon,
347 .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__neon,
348 .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__neon,
349 };
350 #endif // XNN_NO_X8_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -0700351
Marat Dukhan3b745a42020-05-10 21:43:25 -0700352 /**************************** F32 micro-kernels ****************************/
353 #ifndef XNN_NO_F32_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -0700354 init_flags |= XNN_INIT_FLAG_F32;
355
Marat Dukhan3b745a42020-05-10 21:43:25 -0700356 #if XNN_ENABLE_ASSEMBLY
357 switch (cpuinfo_get_uarch(0)->uarch) {
358 case cpuinfo_uarch_cortex_a5:
359 case cpuinfo_uarch_cortex_a7:
Frank Barchard490febe2020-07-16 18:42:17 -0700360 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a7);
361 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a7);
Marat Dukhan3b745a42020-05-10 21:43:25 -0700362 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
363 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700364 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhan3b745a42020-05-10 21:43:25 -0700365 xnn_params.f32.gemm.mr = 4;
366 xnn_params.f32.gemm.nr = 8;
Marat Dukhan05702cf2020-03-26 15:41:33 -0700367 break;
Marat Dukhan05702cf2020-03-26 15:41:33 -0700368
Marat Dukhan3b745a42020-05-10 21:43:25 -0700369 case cpuinfo_uarch_cortex_a53:
370 case cpuinfo_uarch_cortex_a55r0:
371 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53);
372 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53);
373 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
374 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700375 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhan3b745a42020-05-10 21:43:25 -0700376 xnn_params.f32.gemm.mr = 4;
377 xnn_params.f32.gemm.nr = 8;
378 break;
379
Frank Barchardf975ee02021-11-05 16:01:00 -0700380 case cpuinfo_uarch_cortex_a35:
Marat Dukhan3b745a42020-05-10 21:43:25 -0700381 case cpuinfo_uarch_cortex_a55:
382 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55);
383 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55);
384 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
385 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700386 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhan3b745a42020-05-10 21:43:25 -0700387 xnn_params.f32.gemm.mr = 4;
388 xnn_params.f32.gemm.nr = 8;
389 break;
390
391 case cpuinfo_uarch_cortex_a57:
392 case cpuinfo_uarch_cortex_a72:
393 case cpuinfo_uarch_cortex_a73:
394 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_pld_cortex_a75);
395 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_pld_cortex_a75);
396 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
397 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700398 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhan3b745a42020-05-10 21:43:25 -0700399 xnn_params.f32.gemm.mr = 4;
400 xnn_params.f32.gemm.nr = 8;
401 break;
402
403 case cpuinfo_uarch_krait:
404 default:
405 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75);
406 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75);
407 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
408 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700409 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhan3b745a42020-05-10 21:43:25 -0700410 xnn_params.f32.gemm.mr = 4;
411 xnn_params.f32.gemm.nr = 8;
412 break;
413 }
414 #if XNN_MAX_UARCH_TYPES > 1
415 {
416 /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
417 const uint32_t mr = xnn_params.f32.gemm.mr;
418 const uint32_t nr = xnn_params.f32.gemm.nr;
419 const uint32_t log2_sr = xnn_params.f32.gemm.log2_sr;
420 for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
421 const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
422 if (uarch_info == NULL) {
423 /* No more microarchitectures in the system */
Marat Dukhan05702cf2020-03-26 15:41:33 -0700424 break;
Marat Dukhan3b745a42020-05-10 21:43:25 -0700425 }
426
427 switch (uarch_info->uarch) {
428 case cpuinfo_uarch_cortex_a53:
429 case cpuinfo_uarch_cortex_a55r0:
430 if (mr == 4 && nr == 8 && log2_sr == 0) {
431 xnn_params.f32.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53;
432 xnn_params.f32.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53;
433 xnn_params.f32.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64;
434 xnn_params.f32.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64;
435 }
436 break;
437 case cpuinfo_uarch_cortex_a55:
438 if (mr == 4 && nr == 8 && log2_sr == 0) {
439 xnn_params.f32.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55;
440 xnn_params.f32.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55;
441 xnn_params.f32.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64;
442 xnn_params.f32.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64;
443 }
444 break;
445 default:
446 break;
447 }
Marat Dukhan05702cf2020-03-26 15:41:33 -0700448 }
449 }
Marat Dukhan3b745a42020-05-10 21:43:25 -0700450 #endif // XNN_MAX_UARCH_TYPES > 1
451 #else // XNN_ENABLE_ASSEMBLY
452 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128);
453 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__neon_lane_ld128);
454 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
455 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700456 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhan3b745a42020-05-10 21:43:25 -0700457 xnn_params.f32.gemm.mr = 4;
458 xnn_params.f32.gemm.nr = 8;
459 #endif // XNN_ENABLE_ASSEMBLY
460 xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2__neon_lane_ld64);
461 xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2__neon_lane_ld64);
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700462 xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhan3b745a42020-05-10 21:43:25 -0700463 xnn_params.f32.gemm2.mr = 4;
464 xnn_params.f32.gemm2.nr = 2;
465
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -0700466 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x3__neon;
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700467 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_scalar_params;
Frank Barcharddbe781b2021-10-18 10:29:52 -0700468 xnn_params.f32.dwconv[0].channel_tile = 8,
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -0700469 xnn_params.f32.dwconv[0].primary_tile = 3,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700470
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -0700471 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x4__neon;
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700472 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_scalar_params;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -0700473 xnn_params.f32.dwconv[1].channel_tile = 8,
474 xnn_params.f32.dwconv[1].primary_tile = 4,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700475
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -0700476 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x9__neon;
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700477 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
Frank Barcharddbe781b2021-10-18 10:29:52 -0700478 xnn_params.f32.dwconv[2].channel_tile = 8;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -0700479 xnn_params.f32.dwconv[2].primary_tile = 9;
480
481 xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x25__neon_acc2;
482 xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_scalar_params;
483 xnn_params.f32.dwconv[3].channel_tile = 8;
484 xnn_params.f32.dwconv[3].primary_tile = 25;
Marat Dukhan3b745a42020-05-10 21:43:25 -0700485
486 xnn_params.f32.avgpool = (struct avgpool_parameters) {
487 .up = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__neon_c4,
488 .mp = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__neon_c4,
489 .mr = 9,
490 .qr = 8,
491 };
492 xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
493 .up = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__neon_c4,
494 .mp = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__neon_c4,
495 .mr = 9,
496 .qr = 8,
497 };
498 xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
499 .up = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__neon_c4,
500 .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4,
501 .mr = 7,
502 };
503 xnn_params.f32.maxpool = (struct maxpool_parameters) {
504 .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__neon_c4,
Marat Dukhan91ae1652021-08-15 19:19:49 -0700505 .init.f32 = xnn_init_f32_minmax_scalar_params,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700506 .mr = 9,
507 .qr = 8,
508 };
509 xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
Marat Dukhanef25c6d2020-07-24 00:59:40 -0700510 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__neon_c4,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700511 .mr = 4,
512 };
513 xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
Marat Dukhanef25c6d2020-07-24 00:59:40 -0700514 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__neon_c4,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700515 .mr = 9,
516 };
517 xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
Marat Dukhanef25c6d2020-07-24 00:59:40 -0700518 .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__neon_c4,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700519 .mr = 9,
520 .qr = 8,
521 };
522 xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
523 .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__neon_c8,
524 .pixel_tile = 1,
525 .channel_tile = 8,
526 };
Marat Dukhan5020b962020-06-08 13:30:10 -0700527 xnn_params.f32.abs = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__neon_x8;
Marat Dukhan94912792021-08-16 21:40:30 -0700528 xnn_params.f32.clamp = (struct vunary_parameters) {
529 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vclamp_ukernel__neon_x8,
530 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
531 .element_tile = 8,
532 };
Marat Dukhanb6bd4bc2020-12-01 17:01:40 -0800533 if (cpuinfo_has_arm_neon_fma()) {
534 xnn_params.f32.elu = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__neonfma_rr1_p6_x8;
535 } else {
536 xnn_params.f32.elu = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__neon_rr2_lut16_p3_x8;
537 }
Marat Dukhan6674d692021-05-05 22:27:00 -0700538 xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__neon_x16;
Marat Dukhan28813332020-06-10 18:05:38 -0700539 xnn_params.f32.lrelu = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__neon_x8;
Marat Dukhan5020b962020-06-08 13:30:10 -0700540 xnn_params.f32.neg = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__neon_x8;
Marat Dukhan64e52512020-06-09 13:41:16 -0700541 if (cpuinfo_has_arm_neon_v8()) {
542 xnn_params.f32.rndne = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__neonv8_x8;
543 xnn_params.f32.rndz = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__neonv8_x8;
544 xnn_params.f32.rndu = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__neonv8_x8;
545 xnn_params.f32.rndd = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__neonv8_x8;
546 } else {
547 xnn_params.f32.rndne = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__neon_x8;
548 xnn_params.f32.rndz = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__neon_x8;
549 xnn_params.f32.rndu = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__neon_x8;
550 xnn_params.f32.rndd = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__neon_x8;
551 }
Marat Dukhan6674d692021-05-05 22:27:00 -0700552 xnn_params.f32.sigmoid = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__neon_rr2_lut64_p2_nr2recps_x8;
Marat Dukhan5020b962020-06-08 13:30:10 -0700553 xnn_params.f32.sqr = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__neon_x8;
Marat Dukhan6804bbd2020-06-30 19:26:11 -0700554 xnn_params.f32.sqrt = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__scalar_sqrt_x1;
Marat Dukhan3b745a42020-05-10 21:43:25 -0700555 xnn_params.f32.prelu = (struct prelu_parameters) {
556 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__neon_2x8,
557 .row_tile = 2,
558 .channel_tile = 8,
559 };
560 xnn_params.f32.raddstoreexpminusmax = xnn_f32_raddstoreexpminusmax_ukernel__neon_lut64_p2_x8;
561 xnn_params.f32.rmax = xnn_f32_rmax_ukernel__neon;
562 xnn_params.f32.vadd = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -0700563 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__neon_x8,
564 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__neon_x8,
565 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__neon_x8,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700566 .element_tile = 8,
567 };
568 xnn_params.f32.vdiv = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -0700569 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__scalar_x2,
570 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__scalar_x2,
571 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__scalar_x2,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700572 .element_tile = 2,
573 };
574 xnn_params.f32.vmax = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -0700575 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__neon_x8,
576 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__neon_x8,
577 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__neon_x8,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700578 .element_tile = 8,
579 };
580 xnn_params.f32.vmin = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -0700581 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__neon_x8,
582 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__neon_x8,
583 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__neon_x8,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700584 .element_tile = 8,
585 };
586 xnn_params.f32.vmul = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -0700587 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__neon_x8,
588 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__neon_x8,
589 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__neon_x8,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700590 .element_tile = 8,
591 };
592 xnn_params.f32.vsub = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -0700593 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__neon_x8,
594 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__neon_x8,
595 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__neon_x8,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700596 .element_tile = 8,
597 };
Marat Dukhanf7399262020-06-05 10:58:44 -0700598 xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -0700599 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__neon_x8,
600 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__neon_x8,
601 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__neon_x8,
Marat Dukhanf7399262020-06-05 10:58:44 -0700602 .element_tile = 8,
603 };
Marat Dukhan3b745a42020-05-10 21:43:25 -0700604 xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
Marat Dukhan9531e9f2020-07-24 15:25:02 -0700605 .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c4__neon_2x,
Marat Dukhanc58453f2021-05-24 14:47:38 -0700606 .init.f32 = xnn_init_f32_minmax_scalar_params,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700607 .channel_tile = 4,
608 .row_tile = 2,
609 };
Artsiom Ablavatski97918102020-10-27 15:52:59 -0700610 #ifndef XNN_NO_NCHW_OPERATORS
Marat Dukhancfbed0a2020-12-08 10:01:51 -0800611 init_flags |= XNN_INIT_FLAG_CHW_OPT;
612
Marat Dukhan3e913382020-12-07 13:36:08 -0800613 xnn_params.f32.spmm = (struct spmm_parameters) {
614 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x1__neon,
615 .mr = 32,
616 .nr = 1,
617 };
Marat Dukhanc7634882020-12-07 15:11:12 -0800618 xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
619 .ukernel_with_symm_padding =
620 (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neon_2x2,
621 .output_channel_tile = 4,
622 .output_height_tile = 2,
623 .output_width_tile = 2,
624 };
Marat Dukhan3e913382020-12-07 13:36:08 -0800625 xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
626 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_2x4,
Marat Dukhan3e913382020-12-07 13:36:08 -0800627 .output_width_tile = 4,
628 .output_height_tile = 2,
629 };
630 xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
631 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neon_1x4,
Marat Dukhan3e913382020-12-07 13:36:08 -0800632 .output_width_tile = 4,
633 .output_height_tile = 1,
634 };
635 xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
636 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4,
Marat Dukhan3e913382020-12-07 13:36:08 -0800637 .output_width_tile = 4,
638 .output_height_tile = 1,
639 };
640 xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
641 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4,
Marat Dukhan3e913382020-12-07 13:36:08 -0800642 .output_width_tile = 4,
643 .output_height_tile = 1,
644 };
645 xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
646 .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__neon_x4,
647 .channel_tile = 4,
648 };
Artsiom Ablavatski97918102020-10-27 15:52:59 -0700649 xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
Artsiom Ablavatski2202c812021-01-22 14:16:43 -0800650 .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__neon_p8,
Artsiom Ablavatski97918102020-10-27 15:52:59 -0700651 .channel_tile = 1,
Artsiom Ablavatskib3ffd582021-03-31 13:00:08 -0700652 .pixel_tile = 8,
Artsiom Ablavatski97918102020-10-27 15:52:59 -0700653 };
654 #endif // XNN_NO_NCHW_OPERATORS
Marat Dukhan3b745a42020-05-10 21:43:25 -0700655 #endif // XNN_NO_F32_OPERATORS
656
Marat Dukhanaf2ba002021-10-24 14:21:41 -0700657 /*************************** VCVT micro-kernels ***************************/
658 #ifndef XNN_NO_VCVT_OPERATORS
659 init_flags |= XNN_INIT_FLAG_VCVT;
660
661 if (cpuinfo_has_arm_neon_fp16()) {
662 xnn_params.vcvt.f16_to_f32 = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__neonfp16_x16;
Marat Dukhana0c61682021-11-10 19:23:41 -0800663 xnn_params.vcvt.f32_to_f16 = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__neonfp16_x16;
Marat Dukhanaf2ba002021-10-24 14:21:41 -0700664 } else {
665 xnn_params.vcvt.f16_to_f32 = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__neon_int16_x16;
Marat Dukhana0c61682021-11-10 19:23:41 -0800666 xnn_params.vcvt.f32_to_f16 = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__neon_x8;
Marat Dukhanaf2ba002021-10-24 14:21:41 -0700667 }
Marat Dukhaned2d7762021-12-03 23:51:19 -0800668 if (cpuinfo_has_arm_neon_v8()) {
669 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
670 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__neonv8_x32,
671 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_neonv8_params,
672 .element_tile = 32,
673 };
674 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
675 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__neonv8_x32,
676 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_neonv8_params,
677 .element_tile = 32,
678 };
679 } else {
680 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
681 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__neon_x32,
682 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_neon_params,
683 .element_tile = 32,
684 };
685 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
686 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__neon_x32,
687 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_neon_params,
688 .element_tile = 32,
689 };
690 }
Marat Dukhanaf2ba002021-10-24 14:21:41 -0700691 #endif // XNN_NO_VCVT_OPERATORS
692
Marat Dukhan3b745a42020-05-10 21:43:25 -0700693 /**************************** X32 micro-kernels ****************************/
694 #ifndef XNN_NO_X32_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -0700695 init_flags |= XNN_INIT_FLAG_X32;
696
Marat Dukhan3b745a42020-05-10 21:43:25 -0700697 xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__neon;
698 xnn_params.x32.zip = (struct zip_parameters) {
699 .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__neon,
700 .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__neon,
701 .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__neon,
702 .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__neon,
703 };
Artsiom Ablavatskibbe85062020-11-05 14:07:37 -0800704 #ifndef XNN_NO_NCHW_OPERATORS
Marat Dukhanad71b9a2020-11-20 00:01:51 -0800705 xnn_params.x32.depthtospace2d_chw2hwc = (struct depthtospace2d_chw2hwc_parameters) {
706 .ukernel = (xnn_depthtospace2d_chw2hwc_ukernel_function) xnn_x32_depthtospace2d_chw2hwc_ukernel__scalar,
Artsiom Ablavatskibbe85062020-11-05 14:07:37 -0800707 .channel_tile = 1,
708 .pixel_tile = 1,
709 };
710 #endif // XNN_NO_NCHW_OPERATORS
Marat Dukhan3b745a42020-05-10 21:43:25 -0700711 #endif // XNN_NO_X32_OPERATORS
Marat Dukhan933051b2021-08-07 16:26:15 -0700712
713 /**************************** XX micro-kernels ****************************/
714 #ifndef XNN_NO_XX_OPERATORS
715 init_flags |= XNN_INIT_FLAG_XX;
716
717 xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
718 xnn_params.xx.fill = (struct fill_parameters) {
719 .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__neon_x64,
720 .row_tile = 1,
721 };
Marat Dukhan0461f2d2021-08-08 12:36:29 -0700722 xnn_params.xx.pad = (struct pad_parameters) {
723 .ukernel = (xnn_pad_ukernel_function) xnn_xx_pad_ukernel__neon,
724 .row_tile = 1,
725 };
Marat Dukhan933051b2021-08-07 16:26:15 -0700726 #endif // XNN_NO_XX_OPERATORS
727
Marat Dukhan3b745a42020-05-10 21:43:25 -0700728 } else if (!XNN_PLATFORM_MOBILE) {
Marat Dukhan933051b2021-08-07 16:26:15 -0700729
Marat Dukhan3b9b4bc2021-08-24 00:23:07 -0700730 /*************************** QS8 micro-kernels ***************************/
Marat Dukhan66a3ca12021-08-06 18:24:19 -0700731 #ifndef XNN_NO_QS8_OPERATORS
732 init_flags |= XNN_INIT_FLAG_QS8;
733
734 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_2x2__scalar_magic);
735 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_2x2__scalar_magic);
736 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x2__scalar_magic);
737 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x2__scalar_magic);
738 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_magic_params;
739 xnn_params.qs8.gemm.mr = 2;
740 xnn_params.qs8.gemm.nr = 2;
741
742 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up1x9__scalar_magic;
743 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_magic_params;
744 xnn_params.qs8.dwconv[0].channel_tile = 1;
745 xnn_params.qs8.dwconv[0].primary_tile = 9;
746 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up1x25__scalar_magic;
747 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_magic_params;
748 xnn_params.qs8.dwconv[1].channel_tile = 1;
749 xnn_params.qs8.dwconv[1].primary_tile = 25;
750
751 xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
752 .up = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_ukernel_7x__scalar_c1,
753 .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_ukernel_7p7x__scalar_c1,
754 .mr = 7,
755 };
756 xnn_params.qs8.vadd = (struct vbinary_parameters) {
757 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__scalar_x1,
758 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__scalar_x1,
759 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__scalar_x1,
Marat Dukhan64287252021-09-07 16:20:03 -0700760 .init.qs8_addsub = xnn_init_qs8_add_minmax_scalar_params,
Marat Dukhan66a3ca12021-08-06 18:24:19 -0700761 .element_tile = 1,
762 };
763 xnn_params.qs8.vmul = (struct vbinary_parameters) {
764 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_fp32_ukernel__scalar_x4,
765 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_x4,
766 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_x4,
767 .init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_scalar_params,
768 .element_tile = 4,
769 };
770 #endif // XNN_NO_QS8_OPERATORS
771
772 /*************************** QU8 micro-kernels ***************************/
Marat Dukhan08b7a972020-07-14 18:17:29 -0700773 #ifndef XNN_NO_QU8_OPERATORS
774 init_flags |= XNN_INIT_FLAG_QU8;
Marat Dukhan854fb6b2020-06-19 12:33:44 -0700775
Marat Dukhan43b46ee2021-07-15 19:07:50 -0700776 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_magic);
777 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_2x2__scalar_magic);
778 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_magic);
779 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_magic);
780 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_magic_params;
Marat Dukhan08b7a972020-07-14 18:17:29 -0700781 xnn_params.qu8.gemm.mr = 2;
782 xnn_params.qu8.gemm.nr = 2;
Marat Dukhan3b745a42020-05-10 21:43:25 -0700783
Marat Dukhan43b46ee2021-07-15 19:07:50 -0700784 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up1x9__scalar_magic;
785 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_magic_params;
Marat Dukhan08b7a972020-07-14 18:17:29 -0700786 xnn_params.qu8.dwconv[0].channel_tile = 1;
787 xnn_params.qu8.dwconv[0].primary_tile = 9;
Marat Dukhan43b46ee2021-07-15 19:07:50 -0700788 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up1x25__scalar_magic;
789 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_magic_params;
790 xnn_params.qu8.dwconv[1].channel_tile = 1;
791 xnn_params.qu8.dwconv[1].primary_tile = 25;
Marat Dukhan3b745a42020-05-10 21:43:25 -0700792
Marat Dukhan08b7a972020-07-14 18:17:29 -0700793 xnn_params.qu8.avgpool = (struct avgpool_parameters) {
794 .up = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__scalar_c1,
795 .mp = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__scalar_c1,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700796 .mr = 9,
797 .qr = 8,
798 };
Marat Dukhan08b7a972020-07-14 18:17:29 -0700799 xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
800 .up = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_ukernel_7x__scalar_c1,
801 .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_ukernel_7p7x__scalar_c1,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700802 .mr = 7,
803 };
Marat Dukhandb007cd2021-07-20 23:42:39 -0700804 xnn_params.qu8.vadd = (struct vbinary_parameters) {
805 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__scalar_x1,
806 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__scalar_x1,
807 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__scalar_x1,
Marat Dukhan64287252021-09-07 16:20:03 -0700808 .init.qu8_addsub = xnn_init_qu8_add_minmax_scalar_params,
Marat Dukhandb007cd2021-07-20 23:42:39 -0700809 .element_tile = 1,
810 };
Marat Dukhan3c5e6622021-08-06 00:38:05 -0700811 xnn_params.qu8.vmul = (struct vbinary_parameters) {
812 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_fp32_ukernel__scalar_x4,
813 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_x4,
814 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_x4,
815 .init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_scalar_params,
816 .element_tile = 4,
817 };
Marat Dukhan08b7a972020-07-14 18:17:29 -0700818 #endif // XNN_NO_QU8_OPERATORS
Marat Dukhan3b745a42020-05-10 21:43:25 -0700819
Marat Dukhan23147532021-08-16 07:26:56 -0700820 /**************************** S8 micro-kernels ****************************/
821 #ifndef XNN_NO_S8_OPERATORS
822 init_flags |= XNN_INIT_FLAG_S8;
823
Marat Dukhan61c0c9e2021-08-16 23:16:14 -0700824 xnn_params.s8.clamp = (struct vunary_parameters) {
825 .ukernel = (xnn_univector_ukernel_function) xnn_s8_vclamp_ukernel__scalar_x4,
Marat Dukhan3b9b4bc2021-08-24 00:23:07 -0700826 .init.s8_minmax = xnn_init_s8_minmax_scalar_params,
Marat Dukhan61c0c9e2021-08-16 23:16:14 -0700827 .element_tile = 4,
828 };
Marat Dukhan24abe6b2021-11-24 15:28:57 -0800829 xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
830 .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__scalar_c1,
831 .pixel_tile = 1,
832 .channel_tile = 1,
833 };
Marat Dukhan23147532021-08-16 07:26:56 -0700834 xnn_params.s8.maxpool = (struct maxpool_parameters) {
835 .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__scalar_c1,
836 .init.s8 = xnn_init_s8_minmax_scalar_params,
837 .mr = 9,
838 .qr = 8,
839 };
840 #endif // XNN_NO_S8_OPERATORS
841
Marat Dukhan3b745a42020-05-10 21:43:25 -0700842 /**************************** U8 micro-kernels ****************************/
843 #ifndef XNN_NO_U8_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -0700844 init_flags |= XNN_INIT_FLAG_U8;
845
Marat Dukhan94912792021-08-16 21:40:30 -0700846 xnn_params.u8.clamp = (struct vunary_parameters) {
847 .ukernel = (xnn_univector_ukernel_function) xnn_u8_vclamp_ukernel__scalar_x4,
Marat Dukhan3b9b4bc2021-08-24 00:23:07 -0700848 .init.u8_minmax = xnn_init_u8_minmax_scalar_params,
Marat Dukhan94912792021-08-16 21:40:30 -0700849 .element_tile = 4,
850 };
Marat Dukhan24abe6b2021-11-24 15:28:57 -0800851 xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
852 .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__scalar_c1,
853 .pixel_tile = 1,
854 .channel_tile = 1,
855 };
Marat Dukhan3b745a42020-05-10 21:43:25 -0700856 xnn_params.u8.maxpool = (struct maxpool_parameters) {
857 .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__scalar_c1,
Marat Dukhan91ae1652021-08-15 19:19:49 -0700858 .init.u8 = xnn_init_u8_minmax_scalar_params,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700859 .mr = 9,
860 .qr = 8,
861 };
Marat Dukhan3b745a42020-05-10 21:43:25 -0700862 xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
863 xnn_params.u8.rmax = xnn_u8_rmax_ukernel__scalar;
864 #endif // XNN_NO_U8_OPERATORS
865
866 /**************************** X8 micro-kernels ****************************/
867 #ifndef XNN_NO_X8_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -0700868 init_flags |= XNN_INIT_FLAG_X8;
869
Marat Dukhand67539d2021-09-08 23:06:03 -0700870 xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar_x4;
Marat Dukhan3b745a42020-05-10 21:43:25 -0700871 xnn_params.x8.zip = (struct zip_parameters) {
872 .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__scalar,
873 .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__scalar,
874 .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__scalar,
875 .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__scalar,
876 };
877 #endif // XNN_NO_X8_OPERATORS
878
879 /**************************** F32 micro-kernels ****************************/
880 #ifndef XNN_NO_F32_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -0700881 init_flags |= XNN_INIT_FLAG_F32;
882
Marat Dukhan3b745a42020-05-10 21:43:25 -0700883 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x4__scalar);
884 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x4__scalar);
885 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x4__scalar);
886 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x4__scalar);
Marat Dukhan467f6362020-05-22 23:21:55 -0700887 xnn_params.f32.gemm.relu.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_4x4__scalar);
888 xnn_params.f32.gemm.relu.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_4x4__scalar);
889 xnn_params.f32.gemm.relu.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x4__scalar);
890 xnn_params.f32.gemm.relu.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x4__scalar);
Marat Dukhan3b745a42020-05-10 21:43:25 -0700891 xnn_params.f32.gemm.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x4__scalar);
892 xnn_params.f32.gemm.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x4__scalar);
893 xnn_params.f32.gemm.linear.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x4__scalar);
894 xnn_params.f32.gemm.linear.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x4__scalar);
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700895 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -0700896 xnn_params.f32.gemm.mr = 4;
Marat Dukhan3b745a42020-05-10 21:43:25 -0700897 xnn_params.f32.gemm.nr = 4;
Marat Dukhanaefaef32020-04-09 07:09:34 -0700898
Marat Dukhan3b745a42020-05-10 21:43:25 -0700899 xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2__scalar);
900 xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2__scalar),
901 xnn_params.f32.gemm2.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x2__scalar);
902 xnn_params.f32.gemm2.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2__scalar),
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700903 xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhan3b745a42020-05-10 21:43:25 -0700904 xnn_params.f32.gemm2.mr = 4;
905 xnn_params.f32.gemm2.nr = 2;
Marat Dukhanaefaef32020-04-09 07:09:34 -0700906
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -0700907 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x3__scalar_acc2;
908 xnn_params.f32.dwconv[0].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x3__scalar_acc2;
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700909 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhan3b745a42020-05-10 21:43:25 -0700910 xnn_params.f32.dwconv[0].channel_tile = 1;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -0700911 xnn_params.f32.dwconv[0].primary_tile = 3;
Marat Dukhanaefaef32020-04-09 07:09:34 -0700912
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -0700913 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x4__scalar_acc2;
914 xnn_params.f32.dwconv[1].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x4__scalar_acc2;
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700915 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhan3b745a42020-05-10 21:43:25 -0700916 xnn_params.f32.dwconv[1].channel_tile = 1;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -0700917 xnn_params.f32.dwconv[1].primary_tile = 4;
Marat Dukhanaefaef32020-04-09 07:09:34 -0700918
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -0700919 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x9__scalar_acc2;
920 xnn_params.f32.dwconv[2].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x9__scalar_acc2;
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700921 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhan3b745a42020-05-10 21:43:25 -0700922 xnn_params.f32.dwconv[2].channel_tile = 1;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -0700923 xnn_params.f32.dwconv[2].primary_tile = 9;
924
925 xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x25__scalar_acc2;
926 xnn_params.f32.dwconv[3].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x25__scalar_acc2;
927 xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_scalar_params;
928 xnn_params.f32.dwconv[3].channel_tile = 1;
929 xnn_params.f32.dwconv[3].primary_tile = 25;
XNNPACK Teamb455b122019-09-27 18:10:33 -0700930
Marat Dukhan3b745a42020-05-10 21:43:25 -0700931 xnn_params.f32.avgpool = (struct avgpool_parameters) {
932 .up = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__scalar_c1,
933 .mp = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__scalar_c1,
934 .mr = 9,
935 .qr = 8,
936 };
937 xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
938 .up = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__scalar_c1,
939 .mp = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__scalar_c1,
940 .mr = 9,
941 .qr = 8,
942 };
943 xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
944 .up = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__scalar_c1,
945 .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__scalar_c1,
946 .mr = 7,
947 };
948 xnn_params.f32.maxpool = (struct maxpool_parameters) {
949 .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__scalar_c1,
Marat Dukhan91ae1652021-08-15 19:19:49 -0700950 .init.f32 = xnn_init_f32_minmax_scalar_params,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700951 .mr = 9,
952 .qr = 8,
953 };
954 xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
955 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__scalar_c1,
956 .mr = 4,
957 };
958 xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
959 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__scalar_c1,
960 .mr = 9,
961 };
962 xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
963 .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1,
964 .mr = 9,
965 .qr = 8,
966 };
967 xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
968 .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__scalar_c2,
969 .pixel_tile = 1,
970 .channel_tile = 2,
971 };
Marat Dukhan5020b962020-06-08 13:30:10 -0700972 xnn_params.f32.abs = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__scalar_x4;
Marat Dukhan94912792021-08-16 21:40:30 -0700973 xnn_params.f32.clamp = (struct vunary_parameters) {
974 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vclamp_ukernel__scalar_x4,
975 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
976 .element_tile = 4,
977 };
Marat Dukhanb6bd4bc2020-12-01 17:01:40 -0800978 xnn_params.f32.elu = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x4;
Marat Dukhan6674d692021-05-05 22:27:00 -0700979 xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__scalar_x4;
Marat Dukhan28813332020-06-10 18:05:38 -0700980 xnn_params.f32.lrelu = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__scalar_x4;
Marat Dukhan5020b962020-06-08 13:30:10 -0700981 xnn_params.f32.neg = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__scalar_x4;
Marat Dukhan64e52512020-06-09 13:41:16 -0700982 xnn_params.f32.rndne = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__scalar_libm_x1;
983 xnn_params.f32.rndz = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__scalar_libm_x1;
984 xnn_params.f32.rndu = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__scalar_libm_x1;
985 xnn_params.f32.rndd = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__scalar_libm_x1;
Marat Dukhan6674d692021-05-05 22:27:00 -0700986 xnn_params.f32.sigmoid = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__scalar_lut64_p2_div_x2;
Marat Dukhan5020b962020-06-08 13:30:10 -0700987 xnn_params.f32.sqr = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__scalar_x4;
Marat Dukhan6804bbd2020-06-30 19:26:11 -0700988 xnn_params.f32.sqrt = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__scalar_sqrt_x1;
Marat Dukhan3b745a42020-05-10 21:43:25 -0700989 xnn_params.f32.prelu = (struct prelu_parameters) {
990 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__scalar_2x4,
991 .row_tile = 4,
992 .channel_tile = 4,
993 };
994 xnn_params.f32.raddstoreexpminusmax = xnn_f32_raddstoreexpminusmax_ukernel__scalar_p5_x4_acc2;
995 xnn_params.f32.rmax = xnn_f32_rmax_ukernel__scalar;
996 xnn_params.f32.vadd = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -0700997 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__scalar_x8,
998 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__scalar_x8,
999 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__scalar_x8,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001000 .element_tile = 8,
1001 };
1002 xnn_params.f32.vdiv = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07001003 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__scalar_x2,
1004 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__scalar_x2,
1005 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__scalar_x2,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001006 .element_tile = 2,
1007 };
1008 xnn_params.f32.vmax = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07001009 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__scalar_x8,
1010 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__scalar_x8,
1011 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__scalar_x8,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001012 .element_tile = 8,
1013 };
1014 xnn_params.f32.vmin = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07001015 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__scalar_x8,
1016 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__scalar_x8,
1017 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__scalar_x8,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001018 .element_tile = 8,
1019 };
1020 xnn_params.f32.vmul = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07001021 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__scalar_x8,
1022 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__scalar_x8,
1023 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__scalar_x8,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001024 .element_tile = 8,
1025 };
1026 xnn_params.f32.vsub = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07001027 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__scalar_x8,
1028 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__scalar_x8,
1029 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__scalar_x8,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001030 .element_tile = 8,
1031 };
Marat Dukhanf7399262020-06-05 10:58:44 -07001032 xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07001033 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__scalar_x8,
1034 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x8,
1035 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x8,
Marat Dukhanf7399262020-06-05 10:58:44 -07001036 .element_tile = 8,
1037 };
Marat Dukhan3b745a42020-05-10 21:43:25 -07001038 xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
Marat Dukhan9531e9f2020-07-24 15:25:02 -07001039 .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c1__scalar_2x,
Marat Dukhanc58453f2021-05-24 14:47:38 -07001040 .init.f32 = xnn_init_f32_minmax_scalar_params,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001041 .channel_tile = 1,
1042 .row_tile = 2,
1043 };
1044 #ifndef XNN_NO_NCHW_OPERATORS
Marat Dukhancfbed0a2020-12-08 10:01:51 -08001045 init_flags |= XNN_INIT_FLAG_CHW_OPT;
1046
Marat Dukhan3b745a42020-05-10 21:43:25 -07001047 xnn_params.f32.spmm = (struct spmm_parameters) {
1048 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x1__scalar,
1049 .mr = 8,
1050 .nr = 1,
1051 };
1052 xnn_params.f32.spmm2 = (struct spmm_parameters) {
1053 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x2__scalar,
1054 .mr = 8,
1055 .nr = 2,
1056 };
1057 xnn_params.f32.spmm4 = (struct spmm_parameters) {
1058 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x4__scalar,
1059 .mr = 8,
1060 .nr = 4,
1061 };
Marat Dukhan1f29b802020-05-15 23:46:39 -07001062 xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
Marat Dukhan3b745a42020-05-10 21:43:25 -07001063 .ukernel_with_symm_padding =
Marat Dukhan1f29b802020-05-15 23:46:39 -07001064 (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__scalar_1x1,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001065 .output_channel_tile = 4,
1066 .output_height_tile = 1,
1067 .output_width_tile = 1,
1068 };
Marat Dukhanbf715f92020-10-23 20:17:00 -07001069 xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
Marat Dukhan4ddfab42020-12-07 17:33:11 -08001070 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1,
Marat Dukhan4ddfab42020-12-07 17:33:11 -08001071 .output_width_tile = 1,
1072 .output_height_tile = 4,
1073 };
1074 xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
1075 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_2x1_acc2,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001076 .output_width_tile = 1,
Marat Dukhan91249d22020-10-24 12:02:51 -07001077 .output_height_tile = 2,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001078 };
Marat Dukhanbf715f92020-10-23 20:17:00 -07001079 xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
Marat Dukhan4ddfab42020-12-07 17:33:11 -08001080 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001081 .output_width_tile = 1,
Marat Dukhan4ddfab42020-12-07 17:33:11 -08001082 .output_height_tile = 2,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001083 };
Marat Dukhanbf715f92020-10-23 20:17:00 -07001084 xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
Marat Dukhan4ddfab42020-12-07 17:33:11 -08001085 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc2,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001086 .output_width_tile = 1,
Marat Dukhan4ddfab42020-12-07 17:33:11 -08001087 .output_height_tile = 2,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001088 };
Marat Dukhan1f29b802020-05-15 23:46:39 -07001089 xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
1090 .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__scalar_x1,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001091 .channel_tile = 1,
1092 };
Artsiom Ablavatski97918102020-10-27 15:52:59 -07001093 xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
1094 .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__scalar_p4,
1095 .channel_tile = 1,
1096 .pixel_tile = 4,
1097 };
Marat Dukhan3b745a42020-05-10 21:43:25 -07001098 #endif // XNN_NO_NCHW_OPERATORS
1099 #endif // XNN_NO_F32_OPERATORS
1100
Marat Dukhanaf2ba002021-10-24 14:21:41 -07001101 /*************************** VCVT micro-kernels ***************************/
1102 #ifndef XNN_NO_VCVT_OPERATORS
1103 init_flags |= XNN_INIT_FLAG_VCVT;
1104
1105 xnn_params.vcvt.f16_to_f32 = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__scalar_float_x4;
Marat Dukhana0c61682021-11-10 19:23:41 -08001106 xnn_params.vcvt.f32_to_f16 = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__scalar_fabsf_x2;
Marat Dukhaned2d7762021-12-03 23:51:19 -08001107 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
Marat Dukhan430b1732021-12-04 02:53:12 -08001108 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__scalar_magic_iminmax_x4,
1109 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_scalar_magic_iminmax_params,
Marat Dukhaned2d7762021-12-03 23:51:19 -08001110 .element_tile = 4,
1111 };
1112 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
Marat Dukhan430b1732021-12-04 02:53:12 -08001113 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__scalar_magic_iminmax_x4,
1114 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_scalar_magic_iminmax_params,
Marat Dukhaned2d7762021-12-03 23:51:19 -08001115 .element_tile = 4,
1116 };
Marat Dukhanaf2ba002021-10-24 14:21:41 -07001117 #endif // XNN_NO_VCVT_OPERATORS
1118
Marat Dukhan3b745a42020-05-10 21:43:25 -07001119 /**************************** X32 micro-kernels ****************************/
1120 #ifndef XNN_NO_X32_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07001121 init_flags |= XNN_INIT_FLAG_X32;
1122
Marat Dukhan3b745a42020-05-10 21:43:25 -07001123 xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__scalar;
1124 xnn_params.x32.zip = (struct zip_parameters) {
1125 .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__scalar,
1126 .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__scalar,
1127 .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__scalar,
1128 .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__scalar,
1129 };
Artsiom Ablavatskibbe85062020-11-05 14:07:37 -08001130 #ifndef XNN_NO_NCHW_OPERATORS
Marat Dukhanad71b9a2020-11-20 00:01:51 -08001131 xnn_params.x32.depthtospace2d_chw2hwc = (struct depthtospace2d_chw2hwc_parameters) {
1132 .ukernel = (xnn_depthtospace2d_chw2hwc_ukernel_function) xnn_x32_depthtospace2d_chw2hwc_ukernel__scalar,
Artsiom Ablavatskibbe85062020-11-05 14:07:37 -08001133 .channel_tile = 1,
1134 .pixel_tile = 1,
1135 };
1136 #endif // XNN_NO_NCHW_OPERATORS
Marat Dukhan3b745a42020-05-10 21:43:25 -07001137 #endif // XNN_NO_X32_OPERATORS
Marat Dukhan933051b2021-08-07 16:26:15 -07001138
1139 /**************************** XX micro-kernels ****************************/
1140 #ifndef XNN_NO_XX_OPERATORS
1141 init_flags |= XNN_INIT_FLAG_XX;
1142
1143 xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
1144 xnn_params.xx.fill = (struct fill_parameters) {
1145 .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__scalar_x16,
1146 .row_tile = 1,
1147 };
Marat Dukhan0461f2d2021-08-08 12:36:29 -07001148 xnn_params.xx.pad = (struct pad_parameters) {
1149 .ukernel = (xnn_pad_ukernel_function) xnn_xx_pad_ukernel__scalar,
1150 .row_tile = 1,
1151 };
Marat Dukhan933051b2021-08-07 16:26:15 -07001152 #endif // XNN_NO_XX_OPERATORS
Marat Dukhan3b745a42020-05-10 21:43:25 -07001153 }
XNNPACK Teamb455b122019-09-27 18:10:33 -07001154
Marat Dukhan1dadbf72019-10-01 10:46:20 -07001155#elif XNN_ARCH_ARM64
XNNPACK Teamb455b122019-09-27 18:10:33 -07001156
Marat Dukhan898d5852021-06-30 21:18:34 -07001157 /**************************** QC8 micro-kernels ****************************/
1158 #ifndef XNN_NO_QC8_OPERATORS
1159 init_flags |= XNN_INIT_FLAG_QC8;
1160
Marat Dukhan75d1b792021-07-01 13:00:28 -07001161 #if XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
1162 #if XNN_ENABLE_ASSEMBLY
1163 if (cpuinfo_has_arm_neon_dot()) {
1164 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128);
1165 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot);
1166 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128);
1167 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16c4__neondot);
Marat Dukhan7988a182021-12-06 22:00:33 -08001168 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Marat Dukhan75d1b792021-07-01 13:00:28 -07001169 xnn_params.qc8.gemm.mr = 4;
1170 xnn_params.qc8.gemm.nr = 16;
1171 xnn_params.qc8.gemm.log2_kr = 2;
1172 } else {
Frank Barcharde22685a2021-11-12 11:36:58 -08001173 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal);
1174 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal);
1175 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal);
1176 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal);
Marat Dukhan7988a182021-12-06 22:00:33 -08001177 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Marat Dukhan75d1b792021-07-01 13:00:28 -07001178 xnn_params.qc8.gemm.mr = 2;
1179 xnn_params.qc8.gemm.nr = 8;
1180 xnn_params.qc8.gemm.log2_kr = 3;
1181 }
1182 #else // !XNN_ENABLE_ASSEMBLY
1183 if (cpuinfo_has_arm_neon_dot()) {
1184 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__neondot);
1185 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot);
1186 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__neondot);
1187 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16c4__neondot);
Marat Dukhan7988a182021-12-06 22:00:33 -08001188 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Marat Dukhan75d1b792021-07-01 13:00:28 -07001189 xnn_params.qc8.gemm.mr = 4;
1190 xnn_params.qc8.gemm.nr = 16;
1191 xnn_params.qc8.gemm.log2_kr = 2;
1192 } else {
Frank Barcharde22685a2021-11-12 11:36:58 -08001193 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
1194 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
1195 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
1196 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
Marat Dukhan7988a182021-12-06 22:00:33 -08001197 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Marat Dukhan75d1b792021-07-01 13:00:28 -07001198 xnn_params.qc8.gemm.mr = 2;
1199 xnn_params.qc8.gemm.nr = 8;
1200 xnn_params.qc8.gemm.log2_kr = 1;
Frank Barcharde7043ff2021-11-10 14:50:08 -08001201 xnn_params.qc8.gemm.log2_sr = 2;
Marat Dukhan75d1b792021-07-01 13:00:28 -07001202 }
1203 #endif // XNN_ENABLE_ASSEMBLY
1204 #else // !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
1205 #if XNN_ENABLE_ASSEMBLY
1206 if (cpuinfo_has_arm_neon_dot()) {
1207 switch (cpuinfo_get_core(0)->uarch) {
1208 case cpuinfo_uarch_cortex_a55:
1209 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55);
1210 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55);
1211 break;
1212 case cpuinfo_uarch_cortex_x1:
1213 case cpuinfo_uarch_cortex_a78:
1214 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128);
1215 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128);
1216 break;
1217 default:
1218 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64);
1219 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64);
1220 break;
1221 }
1222 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot);
1223 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16c4__neondot);
Marat Dukhan7988a182021-12-06 22:00:33 -08001224 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Marat Dukhan75d1b792021-07-01 13:00:28 -07001225 xnn_params.qc8.gemm.mr = 4;
1226 xnn_params.qc8.gemm.nr = 16;
1227 xnn_params.qc8.gemm.log2_kr = 2;
1228 } else {
1229 switch (cpuinfo_get_core(0)->uarch) {
Frank Barchard6c34dbf2021-11-22 16:14:53 -08001230 case cpuinfo_uarch_cortex_a35:
1231 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64);
1232 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64);
1233 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane);
1234 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane);
Marat Dukhan7988a182021-12-06 22:00:33 -08001235 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Frank Barchard6c34dbf2021-11-22 16:14:53 -08001236 xnn_params.qc8.gemm.mr = 4;
1237 xnn_params.qc8.gemm.nr = 16;
1238 break;
1239
Marat Dukhan75d1b792021-07-01 13:00:28 -07001240 case cpuinfo_uarch_cortex_a53:
1241 case cpuinfo_uarch_cortex_a55r0:
1242 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53);
1243 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53);
1244 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane);
1245 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane);
Marat Dukhan7988a182021-12-06 22:00:33 -08001246 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Marat Dukhan75d1b792021-07-01 13:00:28 -07001247 xnn_params.qc8.gemm.mr = 4;
1248 xnn_params.qc8.gemm.nr = 16;
1249 break;
1250
1251 case cpuinfo_uarch_cortex_a72:
1252 case cpuinfo_uarch_cortex_a73:
1253 case cpuinfo_uarch_kryo:
Frank Barcharde22685a2021-11-12 11:36:58 -08001254 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm);
1255 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm);
1256 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm);
1257 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm);
Marat Dukhan7988a182021-12-06 22:00:33 -08001258 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Marat Dukhan75d1b792021-07-01 13:00:28 -07001259 xnn_params.qc8.gemm.mr = 2;
1260 xnn_params.qc8.gemm.nr = 8;
1261 xnn_params.qc8.gemm.log2_kr = 3;
1262 break;
1263
1264 default:
Frank Barcharde22685a2021-11-12 11:36:58 -08001265 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal);
1266 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal);
1267 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal);
1268 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal);
Marat Dukhan7988a182021-12-06 22:00:33 -08001269 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Marat Dukhan75d1b792021-07-01 13:00:28 -07001270 xnn_params.qc8.gemm.mr = 2;
1271 xnn_params.qc8.gemm.nr = 8;
1272 xnn_params.qc8.gemm.log2_kr = 3;
1273 break;
1274 }
1275 }
1276 #if XNN_MAX_UARCH_TYPES > 1
1277 {
1278 /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
1279 const uint32_t mr = xnn_params.qc8.gemm.mr;
1280 const uint32_t nr = xnn_params.qc8.gemm.nr;
1281 const uint32_t log2_kr = xnn_params.qc8.gemm.log2_kr;
1282 for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
1283 const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
1284 if (uarch_info == NULL) {
1285 /* No more microarchitectures in the system */
1286 break;
1287 }
1288
1289 switch (uarch_info->uarch) {
1290 case cpuinfo_uarch_cortex_a53:
1291 case cpuinfo_uarch_cortex_a55r0:
1292 if (mr == 2 && nr == 8 && log2_kr == 3) {
Frank Barcharde22685a2021-11-12 11:36:58 -08001293 xnn_params.qc8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53;
1294 xnn_params.qc8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53;
1295 xnn_params.qc8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53;
1296 xnn_params.qc8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53;
Marat Dukhan75d1b792021-07-01 13:00:28 -07001297 }
1298 break;
1299
1300 case cpuinfo_uarch_cortex_a55:
Frank Barchardc37b8da2021-09-01 00:35:19 -07001301 if (mr == 4 && nr == 16 && log2_kr == 2 && cpuinfo_has_arm_neon_dot()) {
Marat Dukhan75d1b792021-07-01 13:00:28 -07001302 xnn_params.qc8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55;
1303 xnn_params.qc8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55;
1304 xnn_params.qc8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot;
1305 xnn_params.qc8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16c4__neondot;
1306 }
1307 break;
1308 default:
1309 break;
1310 }
1311 }
1312 }
1313 #endif // XNN_MAX_UARCH_TYPES > 1
1314 #else // !XNN_ENABLE_ASSEMBLY
1315 if (cpuinfo_has_arm_neon_dot()) {
1316 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__neondot);
1317 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot);
1318 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__neondot);
1319 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16c4__neondot);
Marat Dukhan7988a182021-12-06 22:00:33 -08001320 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Marat Dukhan75d1b792021-07-01 13:00:28 -07001321 xnn_params.qc8.gemm.mr = 4;
1322 xnn_params.qc8.gemm.nr = 16;
1323 xnn_params.qc8.gemm.log2_kr = 2;
1324 } else {
Frank Barcharde22685a2021-11-12 11:36:58 -08001325 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
1326 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
1327 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
1328 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
Marat Dukhan7988a182021-12-06 22:00:33 -08001329 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Marat Dukhan75d1b792021-07-01 13:00:28 -07001330 xnn_params.qc8.gemm.mr = 2;
1331 xnn_params.qc8.gemm.nr = 8;
1332 xnn_params.qc8.gemm.log2_kr = 1;
Frank Barcharde7043ff2021-11-10 14:50:08 -08001333 xnn_params.qc8.gemm.log2_sr = 2;
Marat Dukhan75d1b792021-07-01 13:00:28 -07001334 }
1335 #endif // XNN_ENABLE_ASSEMBLY
1336 #endif // XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
Marat Dukhan898d5852021-06-30 21:18:34 -07001337
Frank Barchard0d065732021-08-31 00:01:40 -07001338 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__neonv8_mla8_ld64;
Marat Dukhan7988a182021-12-06 22:00:33 -08001339 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Frank Barchard0d065732021-08-31 00:01:40 -07001340 xnn_params.qc8.dwconv[0].channel_tile = 16;
Marat Dukhan898d5852021-06-30 21:18:34 -07001341 xnn_params.qc8.dwconv[0].primary_tile = 9;
Frank Barchard7da8b022021-08-31 09:49:10 -07001342 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x25__neonv8_mla8_ld64;
Marat Dukhan7988a182021-12-06 22:00:33 -08001343 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Frank Barchard7da8b022021-08-31 09:49:10 -07001344 xnn_params.qc8.dwconv[1].channel_tile = 16;
Marat Dukhan898d5852021-06-30 21:18:34 -07001345 xnn_params.qc8.dwconv[1].primary_tile = 25;
1346 #endif // XNN_NO_QC8_OPERATORS
1347
Marat Dukhanf28cddf2020-08-10 14:05:02 -07001348 /**************************** QS8 micro-kernels ****************************/
1349 #ifndef XNN_NO_QS8_OPERATORS
1350 init_flags |= XNN_INIT_FLAG_QS8;
1351
Marat Dukhandfe47b92020-12-14 02:48:43 -08001352 #if XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
Frank Barchardbc0c7292020-10-06 13:36:54 -07001353 #if XNN_ENABLE_ASSEMBLY
Marat Dukhan31677ad2020-10-13 23:59:31 -07001354 if (cpuinfo_has_arm_neon_dot()) {
Frank Barchard22f9a9f2021-07-21 11:35:27 -07001355 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128);
1356 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
1357 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128);
1358 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
1359 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
Marat Dukhan31677ad2020-10-13 23:59:31 -07001360 xnn_params.qs8.gemm.mr = 4;
1361 xnn_params.qs8.gemm.nr = 16;
1362 xnn_params.qs8.gemm.log2_kr = 2;
1363 } else {
Frank Barcharde22685a2021-11-12 11:36:58 -08001364 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal);
1365 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal);
1366 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal);
1367 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal);
Frank Barchard22f9a9f2021-07-21 11:35:27 -07001368 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
Marat Dukhan31677ad2020-10-13 23:59:31 -07001369 xnn_params.qs8.gemm.mr = 2;
Frank Barcharda414daa2021-02-23 15:50:39 -08001370 xnn_params.qs8.gemm.nr = 8;
Frank Barchardbbf51822021-03-12 10:37:31 -08001371 xnn_params.qs8.gemm.log2_kr = 3;
Frank Barchard1e8590e2020-10-12 21:20:46 -07001372 }
Marat Dukhan31677ad2020-10-13 23:59:31 -07001373 #else // !XNN_ENABLE_ASSEMBLY
1374 if (cpuinfo_has_arm_neon_dot()) {
Frank Barchard22f9a9f2021-07-21 11:35:27 -07001375 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neondot);
1376 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
1377 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neondot);
1378 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
1379 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
Marat Dukhan31677ad2020-10-13 23:59:31 -07001380 xnn_params.qs8.gemm.mr = 4;
1381 xnn_params.qs8.gemm.nr = 16;
1382 xnn_params.qs8.gemm.log2_kr = 2;
1383 } else {
Frank Barcharde22685a2021-11-12 11:36:58 -08001384 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
1385 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
1386 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
1387 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
Frank Barchard22f9a9f2021-07-21 11:35:27 -07001388 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
Marat Dukhan31677ad2020-10-13 23:59:31 -07001389 xnn_params.qs8.gemm.mr = 2;
Frank Barcharda414daa2021-02-23 15:50:39 -08001390 xnn_params.qs8.gemm.nr = 8;
1391 xnn_params.qs8.gemm.log2_kr = 1;
Frank Barchard66ae2572021-11-02 17:36:21 -07001392 xnn_params.qs8.gemm.log2_sr = 2;
Marat Dukhan31677ad2020-10-13 23:59:31 -07001393 }
1394 #endif // XNN_ENABLE_ASSEMBLY
Marat Dukhandfe47b92020-12-14 02:48:43 -08001395 #else // !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
Marat Dukhan31677ad2020-10-13 23:59:31 -07001396 #if XNN_ENABLE_ASSEMBLY
1397 if (cpuinfo_has_arm_neon_dot()) {
1398 switch (cpuinfo_get_core(0)->uarch) {
1399 case cpuinfo_uarch_cortex_a55:
Frank Barchard22f9a9f2021-07-21 11:35:27 -07001400 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55);
1401 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55);
Marat Dukhan31677ad2020-10-13 23:59:31 -07001402 break;
Frank Barchard0ae35f22021-06-15 17:34:24 -07001403 case cpuinfo_uarch_cortex_x1:
1404 case cpuinfo_uarch_cortex_a78:
Frank Barchard22f9a9f2021-07-21 11:35:27 -07001405 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128);
1406 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128);
Frank Barchard0ae35f22021-06-15 17:34:24 -07001407 break;
Marat Dukhan31677ad2020-10-13 23:59:31 -07001408 default:
Frank Barchard22f9a9f2021-07-21 11:35:27 -07001409 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld64);
1410 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld64);
Marat Dukhan31677ad2020-10-13 23:59:31 -07001411 break;
1412 }
Frank Barchard22f9a9f2021-07-21 11:35:27 -07001413 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
1414 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
1415 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
Marat Dukhan31677ad2020-10-13 23:59:31 -07001416 xnn_params.qs8.gemm.mr = 4;
1417 xnn_params.qs8.gemm.nr = 16;
1418 xnn_params.qs8.gemm.log2_kr = 2;
1419 } else {
Frank Barchard2a995e72021-04-13 16:24:25 -07001420 switch (cpuinfo_get_core(0)->uarch) {
Frank Barchard6c34dbf2021-11-22 16:14:53 -08001421 case cpuinfo_uarch_cortex_a35:
1422 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64);
1423 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64);
1424 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
1425 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
1426 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
1427 xnn_params.qs8.gemm.mr = 4;
1428 xnn_params.qs8.gemm.nr = 16;
1429 break;
1430
Frank Barchard2a995e72021-04-13 16:24:25 -07001431 case cpuinfo_uarch_cortex_a53:
Frank Barchardfb5983d2021-04-20 14:09:08 -07001432 case cpuinfo_uarch_cortex_a55r0:
Frank Barchard22f9a9f2021-07-21 11:35:27 -07001433 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53);
1434 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53);
1435 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
1436 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
1437 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
Frank Barchardd4416d62021-05-17 15:51:37 -07001438 xnn_params.qs8.gemm.mr = 4;
1439 xnn_params.qs8.gemm.nr = 16;
Frank Barchard6ac1d182021-04-14 13:47:07 -07001440 break;
1441
Frank Barchard2a995e72021-04-13 16:24:25 -07001442 case cpuinfo_uarch_cortex_a72:
1443 case cpuinfo_uarch_cortex_a73:
1444 case cpuinfo_uarch_kryo:
Frank Barcharde22685a2021-11-12 11:36:58 -08001445 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm);
1446 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm);
1447 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm);
1448 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm);
Frank Barchard22f9a9f2021-07-21 11:35:27 -07001449 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
Frank Barchard2a995e72021-04-13 16:24:25 -07001450 xnn_params.qs8.gemm.mr = 2;
1451 xnn_params.qs8.gemm.nr = 8;
1452 xnn_params.qs8.gemm.log2_kr = 3;
Frank Barchardc77fc4c2021-04-14 13:28:01 -07001453 break;
Frank Barchard2a995e72021-04-13 16:24:25 -07001454
1455 default:
Frank Barcharde22685a2021-11-12 11:36:58 -08001456 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal);
1457 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal);
1458 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal);
1459 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal);
Frank Barchard22f9a9f2021-07-21 11:35:27 -07001460 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
Frank Barchard2a995e72021-04-13 16:24:25 -07001461 xnn_params.qs8.gemm.mr = 2;
1462 xnn_params.qs8.gemm.nr = 8;
1463 xnn_params.qs8.gemm.log2_kr = 3;
Frank Barchardc77fc4c2021-04-14 13:28:01 -07001464 break;
Frank Barchard2a995e72021-04-13 16:24:25 -07001465 }
Marat Dukhan31677ad2020-10-13 23:59:31 -07001466 }
1467 #if XNN_MAX_UARCH_TYPES > 1
1468 {
1469 /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
1470 const uint32_t mr = xnn_params.qs8.gemm.mr;
1471 const uint32_t nr = xnn_params.qs8.gemm.nr;
1472 const uint32_t log2_kr = xnn_params.qs8.gemm.log2_kr;
1473 for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
1474 const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
1475 if (uarch_info == NULL) {
1476 /* No more microarchitectures in the system */
1477 break;
1478 }
1479
1480 switch (uarch_info->uarch) {
Frank Barchard2a995e72021-04-13 16:24:25 -07001481 case cpuinfo_uarch_cortex_a53:
Frank Barchard90f520b2021-04-26 18:01:51 -07001482 case cpuinfo_uarch_cortex_a55r0:
Frank Barchard2a995e72021-04-13 16:24:25 -07001483 if (mr == 2 && nr == 8 && log2_kr == 3) {
Frank Barcharde22685a2021-11-12 11:36:58 -08001484 xnn_params.qs8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53;
1485 xnn_params.qs8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53;
1486 xnn_params.qs8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53;
1487 xnn_params.qs8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53;
Frank Barchard2a995e72021-04-13 16:24:25 -07001488 }
1489 break;
1490
Marat Dukhan31677ad2020-10-13 23:59:31 -07001491 case cpuinfo_uarch_cortex_a55:
Frank Barchardc37b8da2021-09-01 00:35:19 -07001492 if (mr == 4 && nr == 16 && log2_kr == 2 && cpuinfo_has_arm_neon_dot()) {
Frank Barchard22f9a9f2021-07-21 11:35:27 -07001493 xnn_params.qs8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55;
1494 xnn_params.qs8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55;
1495 xnn_params.qs8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot;
1496 xnn_params.qs8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot;
Marat Dukhan31677ad2020-10-13 23:59:31 -07001497 }
1498 break;
1499 default:
1500 break;
1501 }
1502 }
1503 }
1504 #endif // XNN_MAX_UARCH_TYPES > 1
1505 #else // !XNN_ENABLE_ASSEMBLY
1506 if (cpuinfo_has_arm_neon_dot()) {
Frank Barchard22f9a9f2021-07-21 11:35:27 -07001507 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neondot);
1508 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
1509 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neondot);
1510 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
1511 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
Marat Dukhan31677ad2020-10-13 23:59:31 -07001512 xnn_params.qs8.gemm.mr = 4;
1513 xnn_params.qs8.gemm.nr = 16;
1514 xnn_params.qs8.gemm.log2_kr = 2;
1515 } else {
Frank Barcharde22685a2021-11-12 11:36:58 -08001516 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
1517 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
1518 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
1519 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
Frank Barchard22f9a9f2021-07-21 11:35:27 -07001520 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
Marat Dukhan31677ad2020-10-13 23:59:31 -07001521 xnn_params.qs8.gemm.mr = 2;
Frank Barcharda414daa2021-02-23 15:50:39 -08001522 xnn_params.qs8.gemm.nr = 8;
1523 xnn_params.qs8.gemm.log2_kr = 1;
Frank Barcharde7043ff2021-11-10 14:50:08 -08001524 xnn_params.qs8.gemm.log2_sr = 2;
Marat Dukhan31677ad2020-10-13 23:59:31 -07001525 }
1526 #endif // XNN_ENABLE_ASSEMBLY
Marat Dukhandfe47b92020-12-14 02:48:43 -08001527 #endif // XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
Marat Dukhanf28cddf2020-08-10 14:05:02 -07001528
Frank Barchard0d065732021-08-31 00:01:40 -07001529 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mla8_ld64;
Marat Dukhan4ba70b72021-07-19 11:20:16 -07001530 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
Frank Barchard0d065732021-08-31 00:01:40 -07001531 xnn_params.qs8.dwconv[0].channel_tile = 16;
Marat Dukhanf28cddf2020-08-10 14:05:02 -07001532 xnn_params.qs8.dwconv[0].primary_tile = 9;
Frank Barchard7da8b022021-08-31 09:49:10 -07001533 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x25__neon_mla8_ld64;
Marat Dukhan4ba70b72021-07-19 11:20:16 -07001534 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
Frank Barchard7da8b022021-08-31 09:49:10 -07001535 xnn_params.qs8.dwconv[1].channel_tile = 16;
Marat Dukhan4ed14882021-05-12 17:50:40 -07001536 xnn_params.qs8.dwconv[1].primary_tile = 25;
Marat Dukhanf28cddf2020-08-10 14:05:02 -07001537
1538 xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
1539 .up = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_ukernel_7x__neon_c8_acc2,
1540 .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_ukernel_7p7x__neon_c8_acc2,
1541 .mr = 7,
1542 };
Marat Dukhanff209482020-09-03 14:26:53 -07001543
1544 xnn_params.qs8.vadd = (struct vbinary_parameters) {
Marat Dukhan01debd92021-07-29 18:14:21 -07001545 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__neon_ld64_x32,
1546 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x32,
1547 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x32,
Marat Dukhan64287252021-09-07 16:20:03 -07001548 .init.qs8_addsub = xnn_init_qs8_add_minmax_neon_params,
Marat Dukhan01debd92021-07-29 18:14:21 -07001549 .element_tile = 32,
Marat Dukhanff209482020-09-03 14:26:53 -07001550 };
Marat Dukhan0853b8a2021-08-03 01:01:53 -07001551 xnn_params.qs8.vmul = (struct vbinary_parameters) {
1552 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_fp32_ukernel__neonv8_ld64_x16,
1553 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__neonv8_ld64_x16,
1554 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__neonv8_ld64_x16,
1555 .init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_neonv8_params,
1556 .element_tile = 16,
1557 };
Marat Dukhanf28cddf2020-08-10 14:05:02 -07001558 #endif // XNN_NO_QS8_OPERATORS
1559
Marat Dukhan08b7a972020-07-14 18:17:29 -07001560 /**************************** QU8 micro-kernels ****************************/
1561 #ifndef XNN_NO_QU8_OPERATORS
1562 init_flags |= XNN_INIT_FLAG_QU8;
Frank Barchard20255152021-08-11 14:01:45 -07001563
Frank Barcharda962f1e2021-08-02 13:52:15 -07001564 #if XNN_ENABLE_ASSEMBLY
Frank Barchard20255152021-08-11 14:01:45 -07001565 if (cpuinfo_has_arm_neon_dot()) {
Frank Barchard8b698022021-08-26 11:17:32 -07001566 switch (cpuinfo_get_core(0)->uarch) {
1567 case cpuinfo_uarch_cortex_a55:
Frank Barcharda49e41f2021-08-31 20:30:24 -07001568 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55);
1569 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55);
1570 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
1571 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
Frank Barchard8b698022021-08-26 11:17:32 -07001572 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
1573 xnn_params.qu8.gemm.mr = 4;
Frank Barcharda49e41f2021-08-31 20:30:24 -07001574 xnn_params.qu8.gemm.nr = 16;
Frank Barchard8b698022021-08-26 11:17:32 -07001575 xnn_params.qu8.gemm.log2_kr = 2;
1576 break;
1577 default:
1578 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128);
1579 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128);
1580 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
1581 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
1582 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
1583 xnn_params.qu8.gemm.mr = 4;
1584 xnn_params.qu8.gemm.nr = 16;
1585 xnn_params.qu8.gemm.log2_kr = 2;
1586 break;
1587 }
Frank Barchard20255152021-08-11 14:01:45 -07001588 } else {
1589 switch (cpuinfo_get_core(0)->uarch) {
1590 case cpuinfo_uarch_cortex_a53:
1591 case cpuinfo_uarch_cortex_a55r0:
Frank Barchard20255152021-08-11 14:01:45 -07001592 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53);
1593 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53);
1594 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
1595 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
1596 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
1597 xnn_params.qu8.gemm.mr = 4;
1598 xnn_params.qu8.gemm.nr = 16;
1599 break;
Frank Barchardf479a1c2021-08-03 10:20:30 -07001600
Frank Barchard20255152021-08-11 14:01:45 -07001601 case cpuinfo_uarch_cortex_a57:
1602 case cpuinfo_uarch_cortex_a72:
1603 case cpuinfo_uarch_cortex_a73:
1604 case cpuinfo_uarch_cortex_a75:
1605 case cpuinfo_uarch_cortex_a76:
1606 case cpuinfo_uarch_exynos_m1:
1607 case cpuinfo_uarch_exynos_m2:
1608 case cpuinfo_uarch_exynos_m3:
1609 case cpuinfo_uarch_exynos_m4:
1610 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75);
1611 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75);
1612 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
1613 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
1614 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
1615 xnn_params.qu8.gemm.mr = 4;
1616 xnn_params.qu8.gemm.nr = 16;
1617 break;
Frank Barchardf479a1c2021-08-03 10:20:30 -07001618
Frank Barchard20255152021-08-11 14:01:45 -07001619 case cpuinfo_uarch_kryo:
1620 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane);
1621 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane);
1622 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
1623 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
1624 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
1625 xnn_params.qu8.gemm.mr = 4;
1626 xnn_params.qu8.gemm.nr = 16;
1627 break;
1628
1629 default:
1630 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75);
1631 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75);
1632 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
1633 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
1634 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
1635 xnn_params.qu8.gemm.mr = 4;
1636 xnn_params.qu8.gemm.nr = 16;
1637 break;
1638 }
Frank Barchardf479a1c2021-08-03 10:20:30 -07001639 }
Frank Barchardc37b8da2021-09-01 00:35:19 -07001640 #if XNN_MAX_UARCH_TYPES > 1
1641 {
1642 /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
1643 const uint32_t mr = xnn_params.qu8.gemm.mr;
1644 const uint32_t nr = xnn_params.qu8.gemm.nr;
1645 const uint32_t log2_kr = xnn_params.qu8.gemm.log2_kr;
1646 for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
1647 const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
1648 if (uarch_info == NULL) {
1649 /* No more microarchitectures in the system */
1650 break;
1651 }
1652
1653 switch (uarch_info->uarch) {
1654 case cpuinfo_uarch_cortex_a53:
1655 case cpuinfo_uarch_cortex_a55r0:
1656 if (mr == 4 && nr == 16 && log2_kr == 0) {
1657 xnn_params.qu8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53;
1658 xnn_params.qu8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53;
1659 }
1660 break;
1661
1662 case cpuinfo_uarch_cortex_a55:
1663 if (mr == 4 && nr == 16 && log2_kr == 2 && cpuinfo_has_arm_neon_dot()) {
1664 xnn_params.qu8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55;
1665 xnn_params.qu8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55;
1666 }
1667 break;
1668 default:
1669 break;
1670 }
1671 }
1672 }
1673 #endif // XNN_MAX_UARCH_TYPES > 1
Frank Barchard20255152021-08-11 14:01:45 -07001674 #else // !XNN_ENABLE_ASSEMBLY
1675 if (cpuinfo_has_arm_neon_dot()) {
1676 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__neondot);
1677 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16c4__neondot);
1678 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
1679 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
1680 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
1681 xnn_params.qu8.gemm.mr = 4;
1682 xnn_params.qu8.gemm.nr = 16;
1683 xnn_params.qu8.gemm.log2_kr = 2;
1684 } else {
1685 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane);
1686 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane);
1687 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
1688 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
1689 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
1690 xnn_params.qu8.gemm.mr = 4;
1691 xnn_params.qu8.gemm.nr = 16;
Marat Dukhan947805b2021-12-07 14:32:09 -08001692 }
Frank Barchard20255152021-08-11 14:01:45 -07001693 #endif // XNN_ENABLE_ASSEMBLY
Marat Dukhanaefaef32020-04-09 07:09:34 -07001694
Frank Barchard354cbc62021-09-27 21:42:41 -07001695 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mul8;
Marat Dukhan73a899a2021-07-27 00:10:38 -07001696 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
Frank Barchard354cbc62021-09-27 21:42:41 -07001697 xnn_params.qu8.dwconv[0].channel_tile = 16;
Marat Dukhan08b7a972020-07-14 18:17:29 -07001698 xnn_params.qu8.dwconv[0].primary_tile = 9;
Frank Barchard354cbc62021-09-27 21:42:41 -07001699 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_rndnu_ukernel_up8x25__neon_mul8;
Marat Dukhan73a899a2021-07-27 00:10:38 -07001700 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
Frank Barchard354cbc62021-09-27 21:42:41 -07001701 xnn_params.qu8.dwconv[1].channel_tile = 8;
Marat Dukhan81721352021-07-15 18:26:08 -07001702 xnn_params.qu8.dwconv[1].primary_tile = 25;
Marat Dukhanaefaef32020-04-09 07:09:34 -07001703
Marat Dukhan08b7a972020-07-14 18:17:29 -07001704 xnn_params.qu8.avgpool = (struct avgpool_parameters) {
1705 .up = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__neon_c8,
1706 .mp = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__neon_c8,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07001707 .mr = 9,
1708 .qr = 8,
1709 };
Marat Dukhan08b7a972020-07-14 18:17:29 -07001710 xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
1711 .up = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_ukernel_7x__neon_c8,
1712 .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_ukernel_7p7x__neon_c8,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07001713 .mr = 7,
1714 };
Marat Dukhandb007cd2021-07-20 23:42:39 -07001715 xnn_params.qu8.vadd = (struct vbinary_parameters) {
Frank Barchard0a3093c2021-08-31 09:58:11 -07001716 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__neon_ld64_x32,
1717 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__neon_ld64_x32,
1718 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__neon_ld64_x32,
Marat Dukhan64287252021-09-07 16:20:03 -07001719 .init.qu8_addsub = xnn_init_qu8_add_minmax_neon_params,
Marat Dukhandb007cd2021-07-20 23:42:39 -07001720 .element_tile = 8,
1721 };
Marat Dukhan0853b8a2021-08-03 01:01:53 -07001722 xnn_params.qu8.vmul = (struct vbinary_parameters) {
1723 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_fp32_ukernel__neonv8_ld64_x16,
1724 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__neonv8_ld64_x16,
1725 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__neonv8_ld64_x16,
1726 .init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_neonv8_params,
1727 .element_tile = 16,
1728 };
Marat Dukhan08b7a972020-07-14 18:17:29 -07001729 #endif // XNN_NO_QU8_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07001730
Marat Dukhan23147532021-08-16 07:26:56 -07001731 /**************************** S8 micro-kernels ****************************/
1732 #ifndef XNN_NO_S8_OPERATORS
1733 init_flags |= XNN_INIT_FLAG_S8;
1734
Marat Dukhan61c0c9e2021-08-16 23:16:14 -07001735 xnn_params.s8.clamp = (struct vunary_parameters) {
1736 .ukernel = (xnn_univector_ukernel_function) xnn_s8_vclamp_ukernel__neon_x64,
1737 .init.s8_minmax = xnn_init_s8_minmax_neon_params,
1738 .element_tile = 64,
1739 };
Marat Dukhan24abe6b2021-11-24 15:28:57 -08001740 xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
1741 .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__neon_c16,
1742 .pixel_tile = 1,
1743 .channel_tile = 16,
1744 };
Marat Dukhan23147532021-08-16 07:26:56 -07001745 xnn_params.s8.maxpool = (struct maxpool_parameters) {
1746 .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__neon_c16,
Marat Dukhandc5c1482021-08-16 09:03:15 -07001747 .init.s8 = xnn_init_s8_minmax_neon_params,
Marat Dukhan23147532021-08-16 07:26:56 -07001748 .mr = 9,
1749 .qr = 8,
1750 };
1751 #endif // XNN_NO_S8_OPERATORS
1752
XNNPACK Teamb455b122019-09-27 18:10:33 -07001753 /**************************** U8 micro-kernels ****************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07001754 #ifndef XNN_NO_U8_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07001755 init_flags |= XNN_INIT_FLAG_U8;
1756
Marat Dukhan94912792021-08-16 21:40:30 -07001757 xnn_params.u8.clamp = (struct vunary_parameters) {
1758 .ukernel = (xnn_univector_ukernel_function) xnn_u8_vclamp_ukernel__neon_x64,
1759 .init.u8_minmax = xnn_init_u8_minmax_neon_params,
1760 .element_tile = 64,
1761 };
Marat Dukhan24abe6b2021-11-24 15:28:57 -08001762 xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
1763 .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__neon_c16,
1764 .pixel_tile = 1,
1765 .channel_tile = 16,
1766 };
Marat Dukhan8fe54e42019-10-10 14:12:59 -07001767 xnn_params.u8.maxpool = (struct maxpool_parameters) {
Marat Dukhan99936602020-04-11 16:47:01 -07001768 .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__neon_c16,
Marat Dukhan2ea50a02021-08-16 12:59:19 -07001769 .init.u8 = xnn_init_u8_minmax_neon_params,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07001770 .mr = 9,
1771 .qr = 8,
1772 };
Marat Dukhan8fe54e42019-10-10 14:12:59 -07001773 xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
1774 xnn_params.u8.rmax = xnn_u8_rmax_ukernel__neon;
1775 #endif // XNN_NO_U8_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07001776
1777 /**************************** X8 micro-kernels ****************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07001778 #ifndef XNN_NO_X8_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07001779 init_flags |= XNN_INIT_FLAG_X8;
1780
Marat Dukhan98e054b2021-09-13 09:43:50 -07001781 xnn_params.x8.lut = xnn_x8_lut_ukernel__neon_tbx128x4_x64;
Marat Dukhan8fe54e42019-10-10 14:12:59 -07001782 xnn_params.x8.zip = (struct zip_parameters) {
1783 .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__neon,
1784 .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__neon,
1785 .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__neon,
1786 .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__neon,
1787 };
1788 #endif // XNN_NO_X8_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07001789
Frank Barchard7e2cbb02020-06-12 01:22:13 -07001790 /**************************** F16 micro-kernels ****************************/
1791 #ifndef XNN_NO_F16_OPERATORS
Marat Dukhan8d5d2592020-06-19 12:48:57 -07001792 if (cpuinfo_has_arm_neon_fp16_arith()) {
1793 init_flags |= XNN_INIT_FLAG_F16;
Frank Barchard7c3826e2021-06-07 15:14:16 -07001794 xnn_params.f16.gemm.mr = 6;
1795 xnn_params.f16.gemm.nr = 16;
Marat Dukhan854fb6b2020-06-19 12:33:44 -07001796
Frank Barchard6b73c4f2020-06-26 18:40:40 -07001797 #if XNN_ENABLE_ASSEMBLY
Frank Barchard7c3826e2021-06-07 15:14:16 -07001798 switch (cpuinfo_get_core(0)->uarch) {
1799 case cpuinfo_uarch_cortex_a55:
1800 xnn_params.f16.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55);
1801 break;
1802
Frank Barchard07f4a892021-06-07 18:26:08 -07001803 case cpuinfo_uarch_cortex_a75:
Frank Barchard7b48ddc2021-06-11 13:00:49 -07001804 case cpuinfo_uarch_cortex_x1:
Frank Barchard07f4a892021-06-07 18:26:08 -07001805 xnn_params.f16.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75);
1806 break;
1807
Frank Barchard7c3826e2021-06-07 15:14:16 -07001808 default:
1809 xnn_params.f16.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32);
1810 break;
1811 }
Frank Barchard6b73c4f2020-06-26 18:40:40 -07001812 xnn_params.f16.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32);
Frank Barchard7c3826e2021-06-07 15:14:16 -07001813
1814 #if XNN_MAX_UARCH_TYPES > 1
1815 {
1816 /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
1817 const uint32_t mr = xnn_params.f16.gemm.mr;
1818 const uint32_t nr = xnn_params.f16.gemm.nr;
1819 for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
1820 const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
1821 if (uarch_info == NULL) {
1822 /* No more microarchitectures in the system */
1823 break;
1824 }
1825
1826 switch (uarch_info->uarch) {
1827 case cpuinfo_uarch_cortex_a55:
1828 if (mr == 6 && nr == 16) {
1829 xnn_params.f16.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55;
1830 }
1831 break;
Frank Barchard07f4a892021-06-07 18:26:08 -07001832
Frank Barchardd2f454e2021-06-08 10:47:16 -07001833 case cpuinfo_uarch_cortex_a55r0:
1834 if (mr == 6 && nr == 16) {
1835 xnn_params.f16.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64;
1836 }
1837 break;
1838
Frank Barchard07f4a892021-06-07 18:26:08 -07001839 /* Cortex A75 is the medium core in Exynos 9820 (M4) */
1840 case cpuinfo_uarch_cortex_a75:
1841 if (mr == 6 && nr == 16) {
1842 xnn_params.f16.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75;
1843 }
1844 break;
1845
Frank Barchard7c3826e2021-06-07 15:14:16 -07001846 default:
1847 break;
1848 }
1849 }
1850 }
1851 #endif // XNN_MAX_UARCH_TYPES > 1
1852 #else // !XNN_ENABLE_ASSEMBLY
Frank Barchard6b73c4f2020-06-26 18:40:40 -07001853 xnn_params.f16.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64);
1854 xnn_params.f16.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64);
Frank Barchard7c3826e2021-06-07 15:14:16 -07001855 #endif // XNN_ENABLE_ASSEMBLY
Frank Barchard49b4dcc2020-06-26 14:07:19 -07001856 xnn_params.f16.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64);
Frank Barchard49b4dcc2020-06-26 14:07:19 -07001857 xnn_params.f16.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64);
Marat Dukhanc5a7a392021-05-21 16:04:31 -07001858 xnn_params.f16.gemm.init.f16 = xnn_init_f16_scaleminmax_params;
Frank Barchard49b4dcc2020-06-26 14:07:19 -07001859
1860 xnn_params.f16.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith;
Marat Dukhanc5a7a392021-05-21 16:04:31 -07001861 xnn_params.f16.dwconv[0].init.f16 = xnn_init_f16_minmax_params;
Frank Barchard49b4dcc2020-06-26 14:07:19 -07001862 xnn_params.f16.dwconv[0].channel_tile = 16;
1863 xnn_params.f16.dwconv[0].primary_tile = 4;
1864
1865 xnn_params.f16.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith;
Marat Dukhanc5a7a392021-05-21 16:04:31 -07001866 xnn_params.f16.dwconv[1].init.f16 = xnn_init_f16_minmax_params;
Frank Barchard49b4dcc2020-06-26 14:07:19 -07001867 xnn_params.f16.dwconv[1].channel_tile = 16;
1868 xnn_params.f16.dwconv[1].primary_tile = 9;
1869
1870 xnn_params.f16.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2;
Marat Dukhanc5a7a392021-05-21 16:04:31 -07001871 xnn_params.f16.dwconv[2].init.f16 = xnn_init_f16_minmax_params;
Frank Barchard49b4dcc2020-06-26 14:07:19 -07001872 xnn_params.f16.dwconv[2].channel_tile = 8;
1873 xnn_params.f16.dwconv[2].primary_tile = 25;
1874
Marat Dukhan8d5d2592020-06-19 12:48:57 -07001875 xnn_params.f16.gavgpool = (struct gavgpool_parameters) {
1876 .up = (xnn_gavgpool_unipass_ukernel_function) xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c8,
1877 .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c8,
1878 .mr = 7,
1879 };
Frank Barchard01898c02020-06-23 21:49:50 -07001880 xnn_params.f16.vadd = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07001881 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vadd_minmax_ukernel__neonfp16arith_x16,
1882 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vaddc_minmax_ukernel__neonfp16arith_x16,
1883 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vaddc_minmax_ukernel__neonfp16arith_x16,
Frank Barchard01898c02020-06-23 21:49:50 -07001884 .element_tile = 16,
1885 };
Frank Barchard0ea6a772020-09-09 15:26:31 -07001886 xnn_params.f16.vmul = (struct vbinary_parameters) {
1887 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmul_minmax_ukernel__neonfp16arith_x16,
1888 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmulc_minmax_ukernel__neonfp16arith_x16,
1889 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmulc_minmax_ukernel__neonfp16arith_x16,
1890 .element_tile = 16,
1891 };
Frank Barchard49b4dcc2020-06-26 14:07:19 -07001892 xnn_params.f16.vmulcaddc = (struct vmulcaddc_parameters) {
Marat Dukhan9531e9f2020-07-24 15:25:02 -07001893 .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f16_vmulcaddc_minmax_ukernel_c8__neonfp16arith_2x,
Marat Dukhanc58453f2021-05-24 14:47:38 -07001894 .init.f16 = xnn_init_f16_minmax_params,
Frank Barchard49b4dcc2020-06-26 14:07:19 -07001895 .channel_tile = 8,
1896 .row_tile = 2,
1897 };
Marat Dukhan6674d692021-05-05 22:27:00 -07001898 xnn_params.f16.hswish = (xnn_univector_ukernel_function) xnn_f16_vhswish_ukernel__neonfp16arith_x16;
Marat Dukhan8d5d2592020-06-19 12:48:57 -07001899 }
Frank Barchard7e2cbb02020-06-12 01:22:13 -07001900 #endif // XNN_NO_F16_OPERATORS
1901
XNNPACK Teamb455b122019-09-27 18:10:33 -07001902 /**************************** F32 micro-kernels ****************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07001903 #ifndef XNN_NO_F32_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07001904 init_flags |= XNN_INIT_FLAG_F32;
1905
Marat Dukhandfe47b92020-12-14 02:48:43 -08001906 #if XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
Frank Barchard0d1052c2020-03-23 17:28:13 -07001907 #if XNN_ENABLE_ASSEMBLY
Frank Barchard143a1102021-06-15 09:15:34 -07001908 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75);
1909 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75);
1910 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
1911 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001912 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07001913 xnn_params.f32.gemm.mr = 6;
1914 xnn_params.f32.gemm.nr = 8;
Frank Barchard0d1052c2020-03-23 17:28:13 -07001915 #else // !XNN_ENABLE_ASSEMBLY
Marat Dukhanaefaef32020-04-09 07:09:34 -07001916 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld64);
1917 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__neonfma_lane_ld64);
1918 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neonfma_lane_ld64);
1919 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001920 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07001921 xnn_params.f32.gemm.mr = 6;
1922 xnn_params.f32.gemm.nr = 8;
Frank Barchard0d1052c2020-03-23 17:28:13 -07001923 #endif // XNN_ENABLE_ASSEMBLY
Marat Dukhandfe47b92020-12-14 02:48:43 -08001924 #else // !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
Frank Barchard0d1052c2020-03-23 17:28:13 -07001925 #if XNN_ENABLE_ASSEMBLY
1926 switch (cpuinfo_get_core(0)->uarch) {
1927 case cpuinfo_uarch_cortex_a57:
Frank Barchard143a1102021-06-15 09:15:34 -07001928 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75);
1929 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75);
1930 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75);
1931 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001932 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07001933 xnn_params.f32.gemm.mr = 6;
1934 xnn_params.f32.gemm.nr = 8;
Frank Barchard0d1052c2020-03-23 17:28:13 -07001935 break;
1936 case cpuinfo_uarch_cortex_a72:
Frank Barchard143a1102021-06-15 09:15:34 -07001937 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75);
1938 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75);
1939 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
1940 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001941 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07001942 xnn_params.f32.gemm.mr = 4;
1943 xnn_params.f32.gemm.nr = 8;
Frank Barchard0d1052c2020-03-23 17:28:13 -07001944 break;
1945 case cpuinfo_uarch_cortex_a75:
1946 case cpuinfo_uarch_cortex_a76:
1947 case cpuinfo_uarch_exynos_m3:
1948 case cpuinfo_uarch_exynos_m4:
Frank Barchard143a1102021-06-15 09:15:34 -07001949 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75);
1950 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75);
1951 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
1952 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001953 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07001954 xnn_params.f32.gemm.mr = 6;
1955 xnn_params.f32.gemm.nr = 8;
Frank Barchard0d1052c2020-03-23 17:28:13 -07001956 break;
1957 case cpuinfo_uarch_exynos_m1:
1958 case cpuinfo_uarch_exynos_m2:
Marat Dukhanaefaef32020-04-09 07:09:34 -07001959 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma);
1960 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8s4__neonfma);
1961 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma);
1962 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8s4__neonfma);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001963 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07001964 xnn_params.f32.gemm.mr = 6;
1965 xnn_params.f32.gemm.nr = 8;
1966 xnn_params.f32.gemm.log2_sr = 2;
Frank Barchard0d1052c2020-03-23 17:28:13 -07001967 break;
Frank Barchard0d1052c2020-03-23 17:28:13 -07001968 case cpuinfo_uarch_cortex_a53:
1969 case cpuinfo_uarch_cortex_a55r0:
Marat Dukhanaefaef32020-04-09 07:09:34 -07001970 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53);
1971 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53);
1972 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53);
1973 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001974 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07001975 xnn_params.f32.gemm.mr = 6;
1976 xnn_params.f32.gemm.nr = 8;
Frank Barchard0d1052c2020-03-23 17:28:13 -07001977 break;
Frank Barchardf975ee02021-11-05 16:01:00 -07001978 case cpuinfo_uarch_cortex_a35:
Frank Barchard0d1052c2020-03-23 17:28:13 -07001979 case cpuinfo_uarch_cortex_a55:
Marat Dukhanaefaef32020-04-09 07:09:34 -07001980 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55);
1981 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55);
1982 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53);
1983 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001984 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07001985 xnn_params.f32.gemm.mr = 6;
1986 xnn_params.f32.gemm.nr = 8;
Frank Barchard0d1052c2020-03-23 17:28:13 -07001987 break;
1988 case cpuinfo_uarch_cortex_a73:
Marat Dukhanaefaef32020-04-09 07:09:34 -07001989 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a73);
1990 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a73);
Frank Barchard143a1102021-06-15 09:15:34 -07001991 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
1992 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001993 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07001994 xnn_params.f32.gemm.mr = 6;
1995 xnn_params.f32.gemm.nr = 8;
Frank Barchard0d1052c2020-03-23 17:28:13 -07001996 break;
Frank Barchard0d1052c2020-03-23 17:28:13 -07001997 case cpuinfo_uarch_cortex_a77:
1998 case cpuinfo_uarch_exynos_m5:
1999 case cpuinfo_uarch_kryo:
Frank Barchard143a1102021-06-15 09:15:34 -07002000 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75);
2001 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75);
2002 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75);
2003 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07002004 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07002005 xnn_params.f32.gemm.mr = 4;
2006 xnn_params.f32.gemm.nr = 8;
Frank Barchard0d1052c2020-03-23 17:28:13 -07002007 break;
Frank Barchard990b2af2021-06-14 11:49:15 -07002008 case cpuinfo_uarch_cortex_a78:
2009 case cpuinfo_uarch_cortex_x1:
2010 default:
2011 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld128);
Frank Barchard79cd5f92021-06-21 17:34:59 -07002012 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_ld128);
Frank Barchard990b2af2021-06-14 11:49:15 -07002013 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neonfma_lane_ld64);
2014 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64);
2015 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
2016 xnn_params.f32.gemm.mr = 6;
2017 xnn_params.f32.gemm.nr = 8;
2018 break;
Frank Barchard0d1052c2020-03-23 17:28:13 -07002019 }
Marat Dukhan05702cf2020-03-26 15:41:33 -07002020 #if XNN_MAX_UARCH_TYPES > 1
2021 {
2022 /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
2023 const uint32_t mr = xnn_params.f32.gemm.mr;
2024 const uint32_t nr = xnn_params.f32.gemm.nr;
2025 const uint32_t log2_sr = xnn_params.f32.gemm.log2_sr;
2026 for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
2027 const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
2028 if (uarch_info == NULL) {
2029 /* No more microarchitectures in the system */
2030 break;
2031 }
2032
2033 switch (uarch_info->uarch) {
2034 case cpuinfo_uarch_cortex_a53:
2035 case cpuinfo_uarch_cortex_a55r0:
2036 if (mr == 6 && nr == 8 && log2_sr == 0) {
Marat Dukhanaefaef32020-04-09 07:09:34 -07002037 xnn_params.f32.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53;
2038 xnn_params.f32.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53;
2039 xnn_params.f32.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
2040 xnn_params.f32.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
Marat Dukhan05702cf2020-03-26 15:41:33 -07002041 } else if (mr == 4 && nr == 8 && log2_sr == 0) {
Marat Dukhanaefaef32020-04-09 07:09:34 -07002042 xnn_params.f32.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53;
2043 xnn_params.f32.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53;
2044 xnn_params.f32.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
2045 xnn_params.f32.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
Marat Dukhan05702cf2020-03-26 15:41:33 -07002046 }
2047 break;
2048 case cpuinfo_uarch_cortex_a55:
2049 if (mr == 6 && nr == 8 && log2_sr == 0) {
Marat Dukhanaefaef32020-04-09 07:09:34 -07002050 xnn_params.f32.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55;
2051 xnn_params.f32.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55;
2052 xnn_params.f32.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
2053 xnn_params.f32.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
Marat Dukhan05702cf2020-03-26 15:41:33 -07002054 } else if (mr == 4 && nr == 8 && log2_sr == 0) {
Marat Dukhanaefaef32020-04-09 07:09:34 -07002055 xnn_params.f32.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55;
2056 xnn_params.f32.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55;
2057 xnn_params.f32.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
2058 xnn_params.f32.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
Marat Dukhan05702cf2020-03-26 15:41:33 -07002059 }
2060 break;
2061 default:
2062 break;
2063 }
2064 }
2065 }
2066 #endif // XNN_MAX_UARCH_TYPES > 1
Frank Barchard0d1052c2020-03-23 17:28:13 -07002067 #else // !XNN_ENABLE_ASSEMBLY
Marat Dukhanaefaef32020-04-09 07:09:34 -07002068 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld64);
2069 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__neonfma_lane_ld64);
2070 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neonfma_lane_ld64);
2071 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07002072 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07002073 xnn_params.f32.gemm.mr = 6;
2074 xnn_params.f32.gemm.nr = 8;
Marat Dukhan31677ad2020-10-13 23:59:31 -07002075 #endif // XNN_ENABLE_ASSEMBLY
Marat Dukhandfe47b92020-12-14 02:48:43 -08002076 #endif // XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
Marat Dukhanaefaef32020-04-09 07:09:34 -07002077 xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2__neonfma_lane_ld64);
2078 xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2__neonfma_lane_ld64);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07002079 xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07002080 xnn_params.f32.gemm2.mr = 4;
2081 xnn_params.f32.gemm2.nr = 2;
2082
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07002083 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x3__neonfma;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07002084 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanf5425ea2020-04-24 01:46:00 -07002085 xnn_params.f32.dwconv[0].channel_tile = 8;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07002086 xnn_params.f32.dwconv[0].primary_tile = 3;
2087
2088 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x4__neonfma;
2089 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_scalar_params;
2090 xnn_params.f32.dwconv[1].channel_tile = 8;
2091 xnn_params.f32.dwconv[1].primary_tile = 4;
Marat Dukhanaefaef32020-04-09 07:09:34 -07002092
Marat Dukhandfe47b92020-12-14 02:48:43 -08002093 #if XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07002094 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x9__neonfma;
2095 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
2096 xnn_params.f32.dwconv[2].channel_tile = 8;
2097 xnn_params.f32.dwconv[2].primary_tile = 9;
Marat Dukhandfe47b92020-12-14 02:48:43 -08002098 #else // !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
Frank Barchard0d1052c2020-03-23 17:28:13 -07002099 switch (cpuinfo_get_core(0)->uarch) {
2100 case cpuinfo_uarch_kryo:
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07002101 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x9__neonfma;
2102 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
2103 xnn_params.f32.dwconv[2].channel_tile = 8;
2104 xnn_params.f32.dwconv[2].primary_tile = 9;
Frank Barchard0d1052c2020-03-23 17:28:13 -07002105 break;
2106 #if XNN_ENABLE_ASSEMBLY
2107 case cpuinfo_uarch_cortex_a53:
2108 case cpuinfo_uarch_cortex_a55r0:
2109 case cpuinfo_uarch_cortex_a55:
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07002110 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up4x9__aarch64_neonfma_cortex_a55;
2111 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
2112 xnn_params.f32.dwconv[2].channel_tile = 4;
2113 xnn_params.f32.dwconv[2].primary_tile = 9;
Frank Barchard0d1052c2020-03-23 17:28:13 -07002114 break;
2115 #endif // XNN_ENABLE_ASSEMBLY
2116 default:
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07002117 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x9__neonfma;
2118 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
2119 xnn_params.f32.dwconv[2].channel_tile = 8;
2120 xnn_params.f32.dwconv[2].primary_tile = 9;
Frank Barchard0d1052c2020-03-23 17:28:13 -07002121 break;
2122 }
Marat Dukhandfe47b92020-12-14 02:48:43 -08002123 #endif // XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
Marat Dukhanaefaef32020-04-09 07:09:34 -07002124
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07002125 xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x25__neonfma_acc2;
2126 xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_scalar_params;
2127 xnn_params.f32.dwconv[3].channel_tile = 8;
2128 xnn_params.f32.dwconv[3].primary_tile = 25;
Marat Dukhanaefaef32020-04-09 07:09:34 -07002129
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002130 xnn_params.f32.avgpool = (struct avgpool_parameters) {
Marat Dukhan99936602020-04-11 16:47:01 -07002131 .up = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__neon_c4,
2132 .mp = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__neon_c4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002133 .mr = 9,
2134 .qr = 8,
2135 };
2136 xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
Marat Dukhan99936602020-04-11 16:47:01 -07002137 .up = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__neon_c4,
2138 .mp = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__neon_c4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002139 .mr = 9,
2140 .qr = 8,
2141 };
2142 xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
Marat Dukhan99936602020-04-11 16:47:01 -07002143 .up = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__neon_c4,
2144 .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002145 .mr = 7,
2146 };
2147 xnn_params.f32.maxpool = (struct maxpool_parameters) {
Marat Dukhan99936602020-04-11 16:47:01 -07002148 .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__neon_c4,
Marat Dukhan91ae1652021-08-15 19:19:49 -07002149 .init.f32 = xnn_init_f32_minmax_scalar_params,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002150 .mr = 9,
2151 .qr = 8,
2152 };
2153 xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
Marat Dukhanef25c6d2020-07-24 00:59:40 -07002154 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__neon_c4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002155 .mr = 4,
2156 };
2157 xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
Marat Dukhanef25c6d2020-07-24 00:59:40 -07002158 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__neon_c4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002159 .mr = 9,
2160 };
2161 xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
Marat Dukhanef25c6d2020-07-24 00:59:40 -07002162 .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__neon_c4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002163 .mr = 9,
2164 .qr = 8,
2165 };
Marat Dukhan660fd192020-03-10 04:55:30 -07002166 xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
2167 .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__neonfma_c8,
Marat Dukhan69722492019-11-11 19:55:50 -08002168 .pixel_tile = 1,
2169 .channel_tile = 8,
2170 };
Marat Dukhan5020b962020-06-08 13:30:10 -07002171 xnn_params.f32.abs = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__neon_x8;
Marat Dukhan94912792021-08-16 21:40:30 -07002172 xnn_params.f32.clamp = (struct vunary_parameters) {
2173 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vclamp_ukernel__neon_x8,
2174 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
2175 .element_tile = 8,
2176 };
Marat Dukhanb6bd4bc2020-12-01 17:01:40 -08002177 xnn_params.f32.elu = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__neonfma_rr1_lut16_p3_x16;
Marat Dukhan6674d692021-05-05 22:27:00 -07002178 xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__neon_x16;
Marat Dukhan28813332020-06-10 18:05:38 -07002179 xnn_params.f32.lrelu = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__neon_x8;
Marat Dukhan5020b962020-06-08 13:30:10 -07002180 xnn_params.f32.neg = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__neon_x8;
Marat Dukhan64e52512020-06-09 13:41:16 -07002181 xnn_params.f32.rndne = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__neonv8_x8;
2182 xnn_params.f32.rndz = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__neonv8_x8;
2183 xnn_params.f32.rndu = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__neonv8_x8;
2184 xnn_params.f32.rndd = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__neonv8_x8;
Marat Dukhan6674d692021-05-05 22:27:00 -07002185 xnn_params.f32.sigmoid = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut64_p2_nr2recps_x16;
Marat Dukhan5020b962020-06-08 13:30:10 -07002186 xnn_params.f32.sqr = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__neon_x8;
Marat Dukhan6804bbd2020-06-30 19:26:11 -07002187 xnn_params.f32.sqrt = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__neon_sqrt_x4;
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002188 xnn_params.f32.prelu = (struct prelu_parameters) {
Marat Dukhan69c3f2c2019-11-06 12:30:01 -08002189 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__neon_2x8,
2190 .row_tile = 2,
2191 .channel_tile = 8,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002192 };
Marat Dukhan1edc4542020-01-27 12:40:13 -08002193 xnn_params.f32.raddstoreexpminusmax = xnn_f32_raddstoreexpminusmax_ukernel__neonfma_lut64_p2_x16;
2194 xnn_params.f32.rmax = xnn_f32_rmax_ukernel__neon;
Marat Dukhanb1a0fc32019-12-02 19:32:02 -08002195 xnn_params.f32.vadd = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07002196 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__neon_x8,
2197 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__neon_x8,
2198 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__neon_x8,
Marat Dukhanb1a0fc32019-12-02 19:32:02 -08002199 .element_tile = 8,
2200 };
Marat Dukhan69180502019-12-06 15:00:31 -08002201 xnn_params.f32.vdiv = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07002202 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__neon_x8,
2203 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__neon_x8,
2204 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__neon_x8,
Marat Dukhan69180502019-12-06 15:00:31 -08002205 .element_tile = 8,
2206 };
Marat Dukhan79e7f842019-12-05 14:35:50 -08002207 xnn_params.f32.vmax = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07002208 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__neon_x8,
2209 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__neon_x8,
2210 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__neon_x8,
Marat Dukhan79e7f842019-12-05 14:35:50 -08002211 .element_tile = 8,
2212 };
2213 xnn_params.f32.vmin = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07002214 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__neon_x8,
2215 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__neon_x8,
2216 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__neon_x8,
Marat Dukhan79e7f842019-12-05 14:35:50 -08002217 .element_tile = 8,
2218 };
Marat Dukhan1e782c42019-11-21 17:02:40 -08002219 xnn_params.f32.vmul = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07002220 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__neon_x8,
2221 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__neon_x8,
2222 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__neon_x8,
Marat Dukhanca2733c2019-11-15 23:21:17 -08002223 .element_tile = 8,
2224 };
Marat Dukhan05f3f6d2019-12-03 15:13:53 -08002225 xnn_params.f32.vsub = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07002226 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__neon_x8,
2227 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__neon_x8,
2228 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__neon_x8,
Marat Dukhan05f3f6d2019-12-03 15:13:53 -08002229 .element_tile = 8,
2230 };
Marat Dukhanf7399262020-06-05 10:58:44 -07002231 xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07002232 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__neon_x8,
2233 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__neon_x8,
2234 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__neon_x8,
Marat Dukhanf7399262020-06-05 10:58:44 -07002235 .element_tile = 8,
2236 };
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002237 xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
Marat Dukhan9531e9f2020-07-24 15:25:02 -07002238 .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c4__neonfma_2x,
Marat Dukhanc58453f2021-05-24 14:47:38 -07002239 .init.f32 = xnn_init_f32_minmax_scalar_params,
Marat Dukhan49e6ee92019-11-06 15:55:29 -08002240 .channel_tile = 4,
2241 .row_tile = 2,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002242 };
Marat Dukhanefc47b82019-11-18 09:25:38 -08002243 #ifndef XNN_NO_NCHW_OPERATORS
Marat Dukhancfbed0a2020-12-08 10:01:51 -08002244 init_flags |= XNN_INIT_FLAG_CHW_OPT;
2245
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002246 xnn_params.f32.spmm = (struct spmm_parameters) {
Marat Dukhan4baa2ac2021-02-11 10:04:09 -08002247 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x1__neonfma_pipelined,
2248 .mr = 32,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002249 .nr = 1,
XNNPACK Teamb455b122019-09-27 18:10:33 -07002250 };
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002251 xnn_params.f32.spmm2 = (struct spmm_parameters) {
Marat Dukhan4baa2ac2021-02-11 10:04:09 -08002252 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x2__neonfma,
2253 .mr = 32,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002254 .nr = 2,
2255 };
2256 xnn_params.f32.spmm4 = (struct spmm_parameters) {
Marat Dukhan4baa2ac2021-02-11 10:04:09 -08002257 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x4__neonfma,
2258 .mr = 32,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002259 .nr = 4,
2260 };
Marat Dukhan1f29b802020-05-15 23:46:39 -07002261 xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002262 .ukernel_with_symm_padding =
Marat Dukhan1f29b802020-05-15 23:46:39 -07002263 (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfma_2x2,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002264 .output_channel_tile = 4,
2265 .output_height_tile = 2,
2266 .output_width_tile = 2,
2267 };
Marat Dukhanbf715f92020-10-23 20:17:00 -07002268 xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
2269 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_3x4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002270 .output_width_tile = 4,
2271 .output_height_tile = 3,
2272 };
Marat Dukhanbf715f92020-10-23 20:17:00 -07002273 xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
Marat Dukhan82f0c322020-10-25 19:17:35 -07002274 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neonfma_2x4_acc2,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002275 .output_width_tile = 4,
Marat Dukhan82f0c322020-10-25 19:17:35 -07002276 .output_height_tile = 2,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002277 };
Marat Dukhanbf715f92020-10-23 20:17:00 -07002278 xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
Marat Dukhan149f0ea2020-10-26 12:50:33 -07002279 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4,
Marat Dukhana99918a2019-11-15 14:40:12 -08002280 .output_width_tile = 4,
Marat Dukhan149f0ea2020-10-26 12:50:33 -07002281 .output_height_tile = 4,
Marat Dukhana99918a2019-11-15 14:40:12 -08002282 };
Marat Dukhanbf715f92020-10-23 20:17:00 -07002283 xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
2284 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc2,
Marat Dukhana99918a2019-11-15 14:40:12 -08002285 .output_width_tile = 4,
2286 .output_height_tile = 1,
2287 };
Marat Dukhan1f29b802020-05-15 23:46:39 -07002288 xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
2289 .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__neon_x4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002290 .channel_tile = 4,
2291 };
Artsiom Ablavatski97918102020-10-27 15:52:59 -07002292 xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
Artsiom Ablavatski2202c812021-01-22 14:16:43 -08002293 .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__neonfma_p8,
Artsiom Ablavatski97918102020-10-27 15:52:59 -07002294 .channel_tile = 1,
Artsiom Ablavatskib3ffd582021-03-31 13:00:08 -07002295 .pixel_tile = 8,
Artsiom Ablavatski97918102020-10-27 15:52:59 -07002296 };
Marat Dukhanefc47b82019-11-18 09:25:38 -08002297 #endif // XNN_NO_NCHW_OPERATORS
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002298 #endif // XNN_NO_F32_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07002299
Marat Dukhanaf2ba002021-10-24 14:21:41 -07002300 /*************************** VCVT micro-kernels ***************************/
// Conversion (VCVT) kernel table. The whole section is compiled out when
// XNN_NO_VCVT_OPERATORS is defined; otherwise XNN_INIT_FLAG_VCVT is OR-ed
// into init_flags before the kernels are assigned.
2301 #ifndef XNN_NO_VCVT_OPERATORS
2302 init_flags |= XNN_INIT_FLAG_VCVT;
2303
// f16<->f32 entries are plain function pointers (no params initializer or
// tile size is recorded for them here).
2304 xnn_params.vcvt.f16_to_f32 = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__neonfp16_x16;
Marat Dukhana0c61682021-11-10 19:23:41 -08002305 xnn_params.vcvt.f32_to_f16 = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__neonfp16_x16;
// f32->qs8 and f32->qu8 entries are vunary_parameters structs: each pairs a
// kernel with its matching *_neonv8_params initializer and an element tile
// of 32.
Marat Dukhaned2d7762021-12-03 23:51:19 -08002306 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
2307 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__neonv8_x32,
2308 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_neonv8_params,
2309 .element_tile = 32,
2310 };
2311 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
2312 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__neonv8_x32,
2313 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_neonv8_params,
2314 .element_tile = 32,
2315 };
Marat Dukhanaf2ba002021-10-24 14:21:41 -07002316 #endif // XNN_NO_VCVT_OPERATORS
2317
XNNPACK Teamb455b122019-09-27 18:10:33 -07002318 /**************************** X32 micro-kernels ****************************/
// X32 kernel table. Compiled out when XNN_NO_X32_OPERATORS is defined;
// otherwise XNN_INIT_FLAG_X32 is OR-ed into init_flags and the unpool and
// zip entries are assigned.
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002319 #ifndef XNN_NO_X32_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07002320 init_flags |= XNN_INIT_FLAG_X32;
2321
Marat Dukhan57dccd82020-04-14 00:53:10 -07002322 xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__neon;
// zip has three fixed-arity entries (x2, x3, x4) cast to the zipc function
// type and one xm entry cast to the zipv function type.
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002323 xnn_params.x32.zip = (struct zip_parameters) {
2324 .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__neon,
2325 .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__neon,
2326 .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__neon,
2327 .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__neon,
2328 };
// depthtospace2d_chw2hwc is assigned only when NCHW operators are enabled.
// It is the one entry in this section whose kernel name carries a __scalar
// suffix rather than __neon; both of its tile sizes are 1.
Artsiom Ablavatskibbe85062020-11-05 14:07:37 -08002329 #ifndef XNN_NO_NCHW_OPERATORS
Marat Dukhanad71b9a2020-11-20 00:01:51 -08002330 xnn_params.x32.depthtospace2d_chw2hwc = (struct depthtospace2d_chw2hwc_parameters) {
2331 .ukernel = (xnn_depthtospace2d_chw2hwc_ukernel_function) xnn_x32_depthtospace2d_chw2hwc_ukernel__scalar,
Artsiom Ablavatskibbe85062020-11-05 14:07:37 -08002332 .channel_tile = 1,
2333 .pixel_tile = 1,
2334 };
2335 #endif // XNN_NO_NCHW_OPERATORS
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002336 #endif // XNN_NO_X32_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07002337
Marat Dukhan048931b2020-11-24 20:53:54 -08002338 /**************************** XX micro-kernels ****************************/
// XX kernel table. Compiled out when XNN_NO_XX_OPERATORS is defined;
// otherwise XNN_INIT_FLAG_XX is OR-ed into init_flags before the copy, fill
// and pad entries are assigned.
2339 #ifndef XNN_NO_XX_OPERATORS
2340 init_flags |= XNN_INIT_FLAG_XX;
2341
// copy is a plain function pointer; fill and pad are parameter structs that
// also record a row tile of 1.
2342 xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
Marat Dukhan933051b2021-08-07 16:26:15 -07002343 xnn_params.xx.fill = (struct fill_parameters) {
2344 .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__neon_x64,
2345 .row_tile = 1,
2346 };
Marat Dukhan0461f2d2021-08-08 12:36:29 -07002347 xnn_params.xx.pad = (struct pad_parameters) {
2348 .ukernel = (xnn_pad_ukernel_function) xnn_xx_pad_ukernel__neon,
2349 .row_tile = 1,
2350 };
Marat Dukhan048931b2020-11-24 20:53:54 -08002351 #endif // XNN_NO_XX_OPERATORS
2352
Marat Dukhan933051b2021-08-07 16:26:15 -07002353#elif XNN_ARCH_X86 || XNN_ARCH_X86_64
// SSE2 is the baseline for the x86 path: the last-resort branches of the
// kernel-selection chains that follow all pick SSE2 kernels, so when SSE2 is
// unavailable initialization logs an error and returns before any x86
// kernel table is filled in.
2354 if (!cpuinfo_has_x86_sse2()) {
2355 xnn_log_error("XNNPACK initialization failed: SSE2 is not supported");
2356 return;
2357 }
2358
Marat Dukhan5e353862021-06-15 09:03:25 -07002359 /**************************** QC8 micro-kernels ****************************/
2360 #ifndef XNN_NO_QC8_OPERATORS
2361 init_flags |= XNN_INIT_FLAG_QC8;
2362
2363 if (cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
2364 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx);
2365 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c8__avx512skx);
2366 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx);
2367 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx);
2368 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_avx512_params;
2369 xnn_params.qc8.gemm.mr = 4;
2370 xnn_params.qc8.gemm.nr = 16;
2371 xnn_params.qc8.gemm.log2_kr = 3;
2372 } else if (cpuinfo_has_x86_xop()) {
2373 // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
2374 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64);
2375 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64);
2376 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64);
2377 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64);
2378 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_sse4_params;
2379 xnn_params.qc8.gemm.mr = 2;
2380 xnn_params.qc8.gemm.nr = 4;
2381 xnn_params.qc8.gemm.log2_kr = 3;
2382 } else if (cpuinfo_has_x86_avx2()) {
2383 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_3x8c8__avx2);
2384 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_3x8c8__avx2);
2385 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__avx2);
2386 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__avx2);
2387 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_avx2_params;
2388 xnn_params.qc8.gemm.mr = 3;
2389 xnn_params.qc8.gemm.nr = 8;
2390 xnn_params.qc8.gemm.log2_kr = 3;
2391 } else if (cpuinfo_has_x86_avx()) {
2392 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128);
2393 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128);
2394 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128);
2395 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128);
2396 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_sse4_params;
2397 xnn_params.qc8.gemm.mr = 2;
2398 xnn_params.qc8.gemm.nr = 4;
2399 xnn_params.qc8.gemm.log2_kr = 3;
2400 } else if (cpuinfo_has_x86_sse4_1()) {
2401 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64);
2402 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64);
2403 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64);
2404 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64);
2405 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_sse4_params;
2406 xnn_params.qc8.gemm.mr = 3;
2407 xnn_params.qc8.gemm.nr = 4;
2408 xnn_params.qc8.gemm.log2_kr = 3;
2409 } else {
2410 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64);
2411 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64);
2412 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64);
2413 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64);
2414 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_sse2_params;
2415 xnn_params.qc8.gemm.mr = 3;
2416 xnn_params.qc8.gemm.nr = 4;
2417 xnn_params.qc8.gemm.log2_kr = 3;
2418 }
2419
2420 if (cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
2421 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up32x9__avx512skx_mul32;
2422 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_avx512_params;
2423 xnn_params.qc8.dwconv[0].channel_tile = 32;
2424 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up32x25__avx512skx_mul32;
2425 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_avx512_params;
2426 xnn_params.qc8.dwconv[1].channel_tile = 32;
2427 } else if (cpuinfo_has_x86_xop()) {
2428 // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
Marat Dukhan28480592021-07-27 23:52:27 -07002429 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__xop_mul16_add16;
Marat Dukhan5e353862021-06-15 09:03:25 -07002430 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_sse4_params;
2431 xnn_params.qc8.dwconv[0].channel_tile = 16;
Marat Dukhan28480592021-07-27 23:52:27 -07002432 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x25__xop_mul16_add16;
Marat Dukhan5e353862021-06-15 09:03:25 -07002433 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_sse4_params;
2434 xnn_params.qc8.dwconv[1].channel_tile = 16;
2435 } else if (cpuinfo_has_x86_avx2()) {
2436 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul32;
2437 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_avx2_params;
2438 xnn_params.qc8.dwconv[0].channel_tile = 16;
2439 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x25__avx2_mul32;
2440 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_avx2_params;
2441 xnn_params.qc8.dwconv[1].channel_tile = 16;
2442 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhan28480592021-07-27 23:52:27 -07002443 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul16_add16;
Marat Dukhan5e353862021-06-15 09:03:25 -07002444 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_sse4_params;
2445 xnn_params.qc8.dwconv[0].channel_tile = 16;
Marat Dukhan28480592021-07-27 23:52:27 -07002446 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x25__avx_mul16_add16;
Marat Dukhan5e353862021-06-15 09:03:25 -07002447 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_sse4_params;
2448 xnn_params.qc8.dwconv[1].channel_tile = 16;
2449 } else if (cpuinfo_has_x86_sse4_1()) {
2450 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul16;
2451 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_sse4_params;
2452 xnn_params.qc8.dwconv[0].channel_tile = 8;
2453 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up8x25__sse41_mul16;
2454 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_sse4_params;
2455 xnn_params.qc8.dwconv[1].channel_tile = 8;
2456 } else if (cpuinfo_has_x86_sse2()) {
2457 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up8x9__sse2_mul16;
2458 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_sse2_params;
2459 xnn_params.qc8.dwconv[0].channel_tile = 8;
2460 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up8x25__sse2_mul16;
2461 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_sse2_params;
2462 xnn_params.qc8.dwconv[1].channel_tile = 8;
2463 }
2464 xnn_params.qc8.dwconv[0].primary_tile = 9;
2465 xnn_params.qc8.dwconv[1].primary_tile = 25;
2466 #endif // XNN_NO_QC8_OPERATORS
2467
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07002468 /**************************** QS8 micro-kernels ****************************/
2469 #ifndef XNN_NO_QS8_OPERATORS
2470 init_flags |= XNN_INIT_FLAG_QS8;
2471
Marat Dukhanbb00b1d2020-08-10 11:37:23 -07002472 if (cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
Marat Dukhan71855ee2021-05-25 19:05:06 -07002473 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx);
2474 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_4x16c8__avx512skx);
2475 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx);
2476 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx);
2477 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_avx512_params;
Marat Dukhanbb00b1d2020-08-10 11:37:23 -07002478 xnn_params.qs8.gemm.mr = 4;
2479 xnn_params.qs8.gemm.nr = 16;
2480 xnn_params.qs8.gemm.log2_kr = 3;
2481 } else if (cpuinfo_has_x86_xop()) {
Marat Dukhan75215d82020-08-07 23:08:03 -07002482 // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
Marat Dukhanc46e6712021-06-01 19:00:16 -07002483 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64);
2484 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64);
2485 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64);
2486 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64);
2487 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
Marat Dukhan75215d82020-08-07 23:08:03 -07002488 xnn_params.qs8.gemm.mr = 2;
2489 xnn_params.qs8.gemm.nr = 4;
2490 xnn_params.qs8.gemm.log2_kr = 3;
2491 } else if (cpuinfo_has_x86_avx2()) {
Marat Dukhan9b474cf2021-05-25 16:37:48 -07002492 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_3x8c8__avx2);
2493 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_3x8c8__avx2);
2494 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__avx2);
2495 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__avx2);
2496 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_avx2_params;
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07002497 xnn_params.qs8.gemm.mr = 3;
2498 xnn_params.qs8.gemm.nr = 8;
2499 xnn_params.qs8.gemm.log2_kr = 3;
Marat Dukhana3c16332021-04-02 15:03:27 -07002500 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhanc46e6712021-06-01 19:00:16 -07002501 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128);
2502 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128);
2503 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128);
2504 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128);
2505 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
Marat Dukhana3c16332021-04-02 15:03:27 -07002506 xnn_params.qs8.gemm.mr = 2;
2507 xnn_params.qs8.gemm.nr = 4;
2508 xnn_params.qs8.gemm.log2_kr = 3;
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07002509 } else if (cpuinfo_has_x86_sse4_1()) {
Marat Dukhanc46e6712021-06-01 19:00:16 -07002510 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64);
2511 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64);
2512 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64);
2513 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64);
2514 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07002515 xnn_params.qs8.gemm.mr = 3;
2516 xnn_params.qs8.gemm.nr = 4;
2517 xnn_params.qs8.gemm.log2_kr = 3;
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07002518 } else {
Marat Dukhanc46e6712021-06-01 19:00:16 -07002519 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64);
2520 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64);
2521 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64);
2522 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64);
2523 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse2_params;
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07002524 xnn_params.qs8.gemm.mr = 3;
2525 xnn_params.qs8.gemm.nr = 4;
2526 xnn_params.qs8.gemm.log2_kr = 3;
2527 }
2528
Marat Dukhan2ffc5e62020-09-06 22:33:38 -07002529 if (cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
Marat Dukhan71855ee2021-05-25 19:05:06 -07002530 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up32x9__avx512skx_mul32;
2531 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_avx512_params;
Marat Dukhan2ffc5e62020-09-06 22:33:38 -07002532 xnn_params.qs8.dwconv[0].channel_tile = 32;
Marat Dukhan71855ee2021-05-25 19:05:06 -07002533 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up32x25__avx512skx_mul32;
2534 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_avx512_params;
Marat Dukhan4ed14882021-05-12 17:50:40 -07002535 xnn_params.qs8.dwconv[1].channel_tile = 32;
Marat Dukhan3fd4e272021-04-10 11:16:42 -07002536 } else if (cpuinfo_has_x86_xop()) {
2537 // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
Marat Dukhan02f06e32021-07-27 14:33:47 -07002538 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__xop_mul16_add16;
Marat Dukhancaf48312021-06-01 20:20:58 -07002539 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
Marat Dukhan3fd4e272021-04-10 11:16:42 -07002540 xnn_params.qs8.dwconv[0].channel_tile = 16;
Marat Dukhan09668562021-07-26 16:52:20 -07002541 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x25__xop_mul16_add16;
Marat Dukhancaf48312021-06-01 20:20:58 -07002542 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
Marat Dukhan4ed14882021-05-12 17:50:40 -07002543 xnn_params.qs8.dwconv[1].channel_tile = 16;
Marat Dukhan2ffc5e62020-09-06 22:33:38 -07002544 } else if (cpuinfo_has_x86_avx2()) {
Marat Dukhan9b474cf2021-05-25 16:37:48 -07002545 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul32;
2546 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_avx2_params;
Marat Dukhand65a1522020-08-04 19:28:18 -07002547 xnn_params.qs8.dwconv[0].channel_tile = 16;
Marat Dukhan9b474cf2021-05-25 16:37:48 -07002548 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x25__avx2_mul32;
2549 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_avx2_params;
Marat Dukhan4ed14882021-05-12 17:50:40 -07002550 xnn_params.qs8.dwconv[1].channel_tile = 16;
Marat Dukhanfa0ab852021-04-02 17:30:49 -07002551 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhan09668562021-07-26 16:52:20 -07002552 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul16_add16;
Marat Dukhancaf48312021-06-01 20:20:58 -07002553 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
Marat Dukhanfa0ab852021-04-02 17:30:49 -07002554 xnn_params.qs8.dwconv[0].channel_tile = 16;
Marat Dukhan09668562021-07-26 16:52:20 -07002555 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x25__avx_mul16_add16;
Marat Dukhancaf48312021-06-01 20:20:58 -07002556 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
Marat Dukhan4ed14882021-05-12 17:50:40 -07002557 xnn_params.qs8.dwconv[1].channel_tile = 16;
Marat Dukhand65a1522020-08-04 19:28:18 -07002558 } else if (cpuinfo_has_x86_sse4_1()) {
Marat Dukhan09668562021-07-26 16:52:20 -07002559 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul16_add16;
Marat Dukhancaf48312021-06-01 20:20:58 -07002560 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
Marat Dukhand65a1522020-08-04 19:28:18 -07002561 xnn_params.qs8.dwconv[0].channel_tile = 8;
Marat Dukhan09668562021-07-26 16:52:20 -07002562 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up8x25__sse41_mul16_add16;
Marat Dukhancaf48312021-06-01 20:20:58 -07002563 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
Marat Dukhan4ed14882021-05-12 17:50:40 -07002564 xnn_params.qs8.dwconv[1].channel_tile = 8;
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07002565 } else if (cpuinfo_has_x86_sse2()) {
Marat Dukhan09668562021-07-26 16:52:20 -07002566 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__sse2_mul16_add16;
Marat Dukhancaf48312021-06-01 20:20:58 -07002567 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse2_params;
Marat Dukhand65a1522020-08-04 19:28:18 -07002568 xnn_params.qs8.dwconv[0].channel_tile = 8;
Marat Dukhan09668562021-07-26 16:52:20 -07002569 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up8x25__sse2_mul16_add16;
Marat Dukhancaf48312021-06-01 20:20:58 -07002570 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse2_params;
Marat Dukhan4ed14882021-05-12 17:50:40 -07002571 xnn_params.qs8.dwconv[1].channel_tile = 8;
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07002572 }
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07002573 xnn_params.qs8.dwconv[0].primary_tile = 9;
Marat Dukhan4ed14882021-05-12 17:50:40 -07002574 xnn_params.qs8.dwconv[1].primary_tile = 25;
Marat Dukhan9e0b5392020-08-07 02:29:34 -07002575
2576 if (cpuinfo_has_x86_sse4_1()) {
2577 xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
2578 .up = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_ukernel_7x__sse41_c8_acc2,
2579 .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse41_c8_acc2,
2580 .mr = 7,
2581 };
2582 } else if (cpuinfo_has_x86_ssse3()) {
2583 xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
2584 .up = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_ukernel_7x__ssse3_c8_acc2,
2585 .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_ukernel_7p7x__ssse3_c8_acc2,
2586 .mr = 7,
2587 };
2588 } else if (cpuinfo_has_x86_sse2()) {
2589 xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
2590 .up = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c8_acc2,
2591 .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c8_acc2,
2592 .mr = 7,
2593 };
2594 }
Marat Dukhanff209482020-09-03 14:26:53 -07002595
Marat Dukhane76049a2021-07-22 14:48:59 -07002596 if (cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
2597 xnn_params.qs8.vadd = (struct vbinary_parameters) {
2598 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__avx512skx_mul32_ld128_x16,
2599 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_x16,
2600 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_x16,
Marat Dukhan64287252021-09-07 16:20:03 -07002601 .init.qs8_addsub = xnn_init_qs8_add_minmax_avx512_params,
Marat Dukhane76049a2021-07-22 14:48:59 -07002602 .element_tile = 16,
2603 };
2604 } else if (cpuinfo_has_x86_xop()) {
Marat Dukhanbb9225e2020-09-06 22:40:56 -07002605 xnn_params.qs8.vadd = (struct vbinary_parameters) {
2606 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__xop_mul32_ld32_x8,
2607 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__xop_mul32_ld32_x8,
2608 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__xop_mul32_ld32_x8,
Marat Dukhan64287252021-09-07 16:20:03 -07002609 .init.qs8_addsub = xnn_init_qs8_add_minmax_sse4_mul32_params,
Marat Dukhanbb9225e2020-09-06 22:40:56 -07002610 .element_tile = 8,
2611 };
Marat Dukhan3eac69c2021-07-21 01:42:29 -07002612 } else if (cpuinfo_has_x86_avx2()) {
Marat Dukhan7679b1e2021-07-20 18:32:23 -07002613 xnn_params.qs8.vadd = (struct vbinary_parameters) {
2614 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__avx2_mul32_ld64_x16,
2615 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__avx2_mul32_ld64_x16,
2616 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__avx2_mul32_ld64_x16,
Marat Dukhan64287252021-09-07 16:20:03 -07002617 .init.qs8_addsub = xnn_init_qs8_add_minmax_avx2_params,
Marat Dukhan7679b1e2021-07-20 18:32:23 -07002618 .element_tile = 16,
2619 };
Marat Dukhane9c4b962021-04-02 16:56:55 -07002620 } else if (cpuinfo_has_x86_avx()) {
2621 xnn_params.qs8.vadd = (struct vbinary_parameters) {
2622 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__avx_mul32_ld32_x8,
2623 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__avx_mul32_ld32_x8,
2624 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__avx_mul32_ld32_x8,
Marat Dukhan64287252021-09-07 16:20:03 -07002625 .init.qs8_addsub = xnn_init_qs8_add_minmax_sse4_mul32_params,
Marat Dukhane9c4b962021-04-02 16:56:55 -07002626 .element_tile = 8,
2627 };
Marat Dukhanbb9225e2020-09-06 22:40:56 -07002628 } else if (cpuinfo_has_x86_sse4_1()) {
Marat Dukhanff209482020-09-03 14:26:53 -07002629 xnn_params.qs8.vadd = (struct vbinary_parameters) {
2630 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x8,
2631 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x8,
2632 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x8,
Marat Dukhan64287252021-09-07 16:20:03 -07002633 .init.qs8_addsub = xnn_init_qs8_add_minmax_sse4_mul16_params,
Marat Dukhanff209482020-09-03 14:26:53 -07002634 .element_tile = 8,
2635 };
2636 } else {
2637 xnn_params.qs8.vadd = (struct vbinary_parameters) {
2638 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x8,
2639 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8,
2640 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8,
Marat Dukhan64287252021-09-07 16:20:03 -07002641 .init.qs8_addsub = xnn_init_qs8_add_minmax_sse2_params,
Marat Dukhanff209482020-09-03 14:26:53 -07002642 .element_tile = 8,
2643 };
2644 }
Marat Dukhan0853b8a2021-08-03 01:01:53 -07002645 if (cpuinfo_has_x86_avx()) {
2646 xnn_params.qs8.vmul = (struct vbinary_parameters) {
2647 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_fp32_ukernel__avx_mul16_ld64_x16,
2648 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__avx_mul16_ld64_x16,
2649 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__avx_mul16_ld64_x16,
2650 .init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_sse4_params,
2651 .element_tile = 16,
2652 };
2653 } else if (cpuinfo_has_x86_sse4_1()) {
2654 xnn_params.qs8.vmul = (struct vbinary_parameters) {
2655 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_fp32_ukernel__sse41_mul16_ld64_x16,
2656 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__sse41_mul16_ld64_x16,
2657 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__sse41_mul16_ld64_x16,
2658 .init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_sse4_params,
2659 .element_tile = 16,
2660 };
2661 } else {
2662 xnn_params.qs8.vmul = (struct vbinary_parameters) {
2663 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_fp32_ukernel__sse2_mul16_ld64_x8,
2664 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__sse2_mul16_ld64_x8,
2665 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__sse2_mul16_ld64_x8,
2666 .init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_sse2_params,
2667 .element_tile = 8,
2668 };
2669 }
Marat Dukhan07e50402020-08-05 17:16:53 -07002670 #endif // XNN_NO_QS8_OPERATORS
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07002671
Marat Dukhan08b7a972020-07-14 18:17:29 -07002672 /**************************** QU8 micro-kernels ****************************/
2673 #ifndef XNN_NO_QU8_OPERATORS
2674 init_flags |= XNN_INIT_FLAG_QU8;
Marat Dukhan854fb6b2020-06-19 12:33:44 -07002675
Marat Dukhan3cf2e222021-07-08 11:38:45 -07002676 if (cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
2677 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx);
2678 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_4x16c8__avx512skx);
2679 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx);
2680 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx);
2681 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx512_params;
2682 xnn_params.qu8.gemm.mr = 4;
2683 xnn_params.qu8.gemm.nr = 16;
2684 xnn_params.qu8.gemm.log2_kr = 3;
2685 } else if (cpuinfo_has_x86_xop()) {
Marat Dukhan902ef7f2021-07-02 16:11:06 -07002686 // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
2687 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64);
2688 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64);
2689 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64);
2690 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64);
2691 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
2692 xnn_params.qu8.gemm.mr = 2;
2693 xnn_params.qu8.gemm.nr = 4;
2694 xnn_params.qu8.gemm.log2_kr = 3;
2695 } else if (cpuinfo_has_x86_avx2()) {
2696 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_3x8c8__avx2);
2697 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_3x8c8__avx2);
2698 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2);
2699 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x8c8__avx2);
2700 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx2_params;
2701 xnn_params.qu8.gemm.mr = 3;
2702 xnn_params.qu8.gemm.nr = 8;
2703 xnn_params.qu8.gemm.log2_kr = 3;
2704 } else if (cpuinfo_has_x86_avx()) {
2705 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128);
2706 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128);
2707 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128);
2708 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128);
2709 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
2710 xnn_params.qu8.gemm.mr = 2;
2711 xnn_params.qu8.gemm.nr = 4;
2712 xnn_params.qu8.gemm.log2_kr = 3;
2713 } else if (cpuinfo_has_x86_sse4_1()) {
2714 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64);
2715 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64);
2716 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64);
2717 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64);
2718 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
2719 xnn_params.qu8.gemm.mr = 3;
2720 xnn_params.qu8.gemm.nr = 4;
2721 xnn_params.qu8.gemm.log2_kr = 3;
2722 } else {
2723 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64);
2724 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64);
2725 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64);
2726 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64);
2727 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
2728 xnn_params.qu8.gemm.mr = 3;
2729 xnn_params.qu8.gemm.nr = 4;
2730 xnn_params.qu8.gemm.log2_kr = 3;
2731 }
Marat Dukhanaefaef32020-04-09 07:09:34 -07002732
Marat Dukhanabee3a72021-07-09 09:04:52 -07002733 if (cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
2734 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up32x9__avx512skx_mul32;
2735 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx512_params;
2736 xnn_params.qu8.dwconv[0].channel_tile = 32;
2737 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up32x25__avx512skx_mul32;
2738 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx512_params;
2739 xnn_params.qu8.dwconv[1].channel_tile = 32;
2740 } else if (cpuinfo_has_x86_xop()) {
2741 // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
2742 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__xop_mul32;
2743 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
2744 xnn_params.qu8.dwconv[0].channel_tile = 16;
2745 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__xop_mul32;
2746 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
2747 xnn_params.qu8.dwconv[1].channel_tile = 16;
2748 } else if (cpuinfo_has_x86_avx2()) {
2749 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul32;
2750 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx2_params;
2751 xnn_params.qu8.dwconv[0].channel_tile = 16;
2752 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__avx2_mul32;
2753 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx2_params;
2754 xnn_params.qu8.dwconv[1].channel_tile = 16;
2755 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhancaa7fc72021-07-27 07:48:24 -07002756 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul16;
Marat Dukhanabee3a72021-07-09 09:04:52 -07002757 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
2758 xnn_params.qu8.dwconv[0].channel_tile = 16;
Marat Dukhancaa7fc72021-07-27 07:48:24 -07002759 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__avx_mul16;
Marat Dukhanabee3a72021-07-09 09:04:52 -07002760 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
2761 xnn_params.qu8.dwconv[1].channel_tile = 16;
2762 } else if (cpuinfo_has_x86_sse4_1()) {
2763 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul16;
2764 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
2765 xnn_params.qu8.dwconv[0].channel_tile = 8;
2766 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__sse41_mul16;
2767 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
2768 xnn_params.qu8.dwconv[1].channel_tile = 8;
2769 } else if (cpuinfo_has_x86_sse2()) {
2770 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__sse2_mul16;
2771 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
2772 xnn_params.qu8.dwconv[0].channel_tile = 8;
2773 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__sse2_mul16;
2774 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
2775 xnn_params.qu8.dwconv[1].channel_tile = 8;
2776 }
Marat Dukhan08b7a972020-07-14 18:17:29 -07002777 xnn_params.qu8.dwconv[0].primary_tile = 9;
Marat Dukhanabee3a72021-07-09 09:04:52 -07002778 xnn_params.qu8.dwconv[1].primary_tile = 25;
Marat Dukhanaefaef32020-04-09 07:09:34 -07002779
Marat Dukhan08b7a972020-07-14 18:17:29 -07002780 xnn_params.qu8.avgpool = (struct avgpool_parameters) {
2781 .up = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__sse2_c8,
2782 .mp = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__sse2_c8,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002783 .mr = 9,
2784 .qr = 8,
2785 };
Marat Dukhan08b7a972020-07-14 18:17:29 -07002786 xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
2787 .up = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_ukernel_7x__sse2_c8,
2788 .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_ukernel_7p7x__sse2_c8,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002789 .mr = 7,
2790 };
Marat Dukhan3eac69c2021-07-21 01:42:29 -07002791
Marat Dukhane76049a2021-07-22 14:48:59 -07002792 if (cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
2793 xnn_params.qu8.vadd = (struct vbinary_parameters) {
2794 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__avx512skx_mul32_ld128_x16,
2795 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_x16,
2796 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_x16,
Marat Dukhan64287252021-09-07 16:20:03 -07002797 .init.qu8_addsub = xnn_init_qu8_add_minmax_avx512_params,
Marat Dukhane76049a2021-07-22 14:48:59 -07002798 .element_tile = 16,
2799 };
2800 } else if (cpuinfo_has_x86_xop()) {
Marat Dukhan3eac69c2021-07-21 01:42:29 -07002801 xnn_params.qu8.vadd = (struct vbinary_parameters) {
2802 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__xop_mul32_ld32_x8,
2803 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__xop_mul32_ld32_x8,
2804 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__xop_mul32_ld32_x8,
Marat Dukhan64287252021-09-07 16:20:03 -07002805 .init.qu8_addsub = xnn_init_qu8_add_minmax_sse4_params,
Marat Dukhan3eac69c2021-07-21 01:42:29 -07002806 .element_tile = 8,
2807 };
2808 } else if (cpuinfo_has_x86_avx2()) {
2809 xnn_params.qu8.vadd = (struct vbinary_parameters) {
2810 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__avx2_mul32_ld64_x16,
2811 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__avx2_mul32_ld64_x16,
2812 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__avx2_mul32_ld64_x16,
Marat Dukhan64287252021-09-07 16:20:03 -07002813 .init.qu8_addsub = xnn_init_qu8_add_minmax_avx2_params,
Marat Dukhan3eac69c2021-07-21 01:42:29 -07002814 .element_tile = 16,
2815 };
2816 } else if (cpuinfo_has_x86_avx()) {
2817 xnn_params.qu8.vadd = (struct vbinary_parameters) {
2818 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__avx_mul32_ld32_x8,
2819 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__avx_mul32_ld32_x8,
2820 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__avx_mul32_ld32_x8,
Marat Dukhan64287252021-09-07 16:20:03 -07002821 .init.qu8_addsub = xnn_init_qu8_add_minmax_sse4_params,
Marat Dukhan3eac69c2021-07-21 01:42:29 -07002822 .element_tile = 8,
2823 };
2824 } else if (cpuinfo_has_x86_sse4_1()) {
2825 xnn_params.qu8.vadd = (struct vbinary_parameters) {
2826 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__sse41_mul16_ld64_x8,
2827 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__sse41_mul16_ld64_x8,
2828 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__sse41_mul16_ld64_x8,
Marat Dukhan64287252021-09-07 16:20:03 -07002829 .init.qu8_addsub = xnn_init_qu8_add_minmax_sse2_params,
Marat Dukhan3eac69c2021-07-21 01:42:29 -07002830 .element_tile = 8,
2831 };
2832 } else {
2833 xnn_params.qu8.vadd = (struct vbinary_parameters) {
2834 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__sse2_mul16_ld64_x8,
2835 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8,
2836 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8,
Marat Dukhan64287252021-09-07 16:20:03 -07002837 .init.qu8_addsub = xnn_init_qu8_add_minmax_sse2_params,
Marat Dukhan3eac69c2021-07-21 01:42:29 -07002838 .element_tile = 8,
2839 };
2840 }
Marat Dukhan0853b8a2021-08-03 01:01:53 -07002841 if (cpuinfo_has_x86_avx()) {
2842 xnn_params.qu8.vmul = (struct vbinary_parameters) {
2843 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_fp32_ukernel__avx_mul16_ld64_x16,
2844 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__avx_mul16_ld64_x16,
2845 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__avx_mul16_ld64_x16,
2846 .init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_sse2_params,
2847 .element_tile = 16,
2848 };
2849 } else if (cpuinfo_has_x86_sse4_1()) {
2850 xnn_params.qu8.vmul = (struct vbinary_parameters) {
2851 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_fp32_ukernel__sse41_mul16_ld64_x16,
2852 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__sse41_mul16_ld64_x16,
2853 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__sse41_mul16_ld64_x16,
2854 .init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_sse2_params,
2855 .element_tile = 16,
2856 };
2857 } else {
2858 xnn_params.qu8.vmul = (struct vbinary_parameters) {
2859 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_fp32_ukernel__sse2_mul16_ld64_x8,
2860 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__sse2_mul16_ld64_x8,
2861 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__sse2_mul16_ld64_x8,
2862 .init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_sse2_params,
2863 .element_tile = 8,
2864 };
2865 }
Marat Dukhan08b7a972020-07-14 18:17:29 -07002866 #endif // XNN_NO_QU8_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07002867
2868 /**************************** S8 micro-kernels ****************************/
Marat Dukhan23147532021-08-16 07:26:56 -07002869 #ifndef XNN_NO_S8_OPERATORS
2870 init_flags |= XNN_INIT_FLAG_S8;
2871
2872 if (cpuinfo_has_x86_sse4_1()) {
Marat Dukhan61c0c9e2021-08-16 23:16:14 -07002873 xnn_params.s8.clamp = (struct vunary_parameters) {
2874 .ukernel = (xnn_univector_ukernel_function) xnn_s8_vclamp_ukernel__sse41_x64,
2875 .init.s8_minmax = xnn_init_s8_minmax_sse4_params,
2876 .element_tile = 64,
2877 };
Marat Dukhan24abe6b2021-11-24 15:28:57 -08002878 xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
2879 .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__sse41_c16,
2880 .pixel_tile = 1,
2881 .channel_tile = 16,
2882 };
Marat Dukhan23147532021-08-16 07:26:56 -07002883 xnn_params.s8.maxpool = (struct maxpool_parameters) {
2884 .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__sse41_c16,
2885 .init.s8 = xnn_init_s8_minmax_sse4_params,
2886 .mr = 9,
2887 .qr = 8,
2888 };
2889 } else {
Marat Dukhan61c0c9e2021-08-16 23:16:14 -07002890 xnn_params.s8.clamp = (struct vunary_parameters) {
2891 .ukernel = (xnn_univector_ukernel_function) xnn_s8_vclamp_ukernel__sse2_x64,
2892 .init.s8_minmax = xnn_init_s8_minmax_sse2_params,
2893 .element_tile = 64,
2894 };
Marat Dukhan24abe6b2021-11-24 15:28:57 -08002895 xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
2896 .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__sse2_c8,
2897 .pixel_tile = 1,
2898 .channel_tile = 8,
2899 };
Marat Dukhan23147532021-08-16 07:26:56 -07002900 xnn_params.s8.maxpool = (struct maxpool_parameters) {
2901 .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__sse2_c16,
2902 .init.s8 = xnn_init_s8_minmax_sse2_params,
2903 .mr = 9,
2904 .qr = 8,
2905 };
2906 }
Marat Dukhan94912792021-08-16 21:40:30 -07002907 #endif // XNN_NO_S8_OPERATORS
Marat Dukhan23147532021-08-16 07:26:56 -07002908
2909 /**************************** U8 micro-kernels ****************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002910 #ifndef XNN_NO_U8_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07002911 init_flags |= XNN_INIT_FLAG_U8;
2912
Marat Dukhan94912792021-08-16 21:40:30 -07002913 xnn_params.u8.clamp = (struct vunary_parameters) {
2914 .ukernel = (xnn_univector_ukernel_function) xnn_u8_vclamp_ukernel__sse2_x64,
2915 .init.u8_minmax = xnn_init_u8_minmax_sse2_params,
2916 .element_tile = 64,
2917 };
Marat Dukhan24abe6b2021-11-24 15:28:57 -08002918 if (cpuinfo_has_x86_sse4_1()) {
2919 xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
2920 .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__sse41_c16,
2921 .pixel_tile = 1,
2922 .channel_tile = 16,
2923 };
2924 } else {
2925 xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
2926 .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__sse2_c8,
2927 .pixel_tile = 1,
2928 .channel_tile = 8,
2929 };
2930 }
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002931 xnn_params.u8.maxpool = (struct maxpool_parameters) {
Marat Dukhan99936602020-04-11 16:47:01 -07002932 .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__sse2_c16,
Marat Dukhan91ae1652021-08-15 19:19:49 -07002933 .init.u8 = xnn_init_u8_minmax_sse2_params,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002934 .mr = 9,
2935 .qr = 8,
2936 };
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002937 xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
2938 xnn_params.u8.rmax = xnn_u8_rmax_ukernel__sse2;
2939 #endif // XNN_NO_U8_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07002940
2941 /**************************** X8 micro-kernels ****************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002942 #ifndef XNN_NO_X8_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07002943 init_flags |= XNN_INIT_FLAG_X8;
2944
Marat Dukhan98e054b2021-09-13 09:43:50 -07002945 if (cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
2946 xnn_params.x8.lut = xnn_x8_lut_ukernel__avx512skx_vpshufb_x64;
2947 } else if (cpuinfo_has_x86_avx2()) {
2948 xnn_params.x8.lut = xnn_x8_lut_ukernel__avx2_x128;
2949 } else if (cpuinfo_has_x86_avx()) {
2950 xnn_params.x8.lut = xnn_x8_lut_ukernel__avx_x64;
2951 } else {
2952 // Note: SSSE3 version is usually slower than scalar
2953 xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar_x4;
2954 }
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002955 xnn_params.x8.zip = (struct zip_parameters) {
2956 .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__sse2,
2957 .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__sse2,
2958 .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__sse2,
2959 .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__sse2,
2960 };
2961 #endif // XNN_NO_X8_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07002962
2963 /**************************** F32 micro-kernels ****************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002964 #ifndef XNN_NO_F32_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07002965 init_flags |= XNN_INIT_FLAG_F32;
2966
Marat Dukhan0f349c42019-11-27 11:58:54 -08002967 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
Marat Dukhanaefaef32020-04-09 07:09:34 -07002968 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_7x16__avx512f_broadcast);
2969 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_7x16__avx512f_broadcast);
2970 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast);
2971 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x16__avx512f_broadcast);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07002972 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07002973 xnn_params.f32.gemm.mr = 7;
2974 xnn_params.f32.gemm.nr = 16;
Marat Dukhan0f349c42019-11-27 11:58:54 -08002975 } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_fma3()) {
Marat Dukhan27121322019-12-09 14:57:40 -08002976 switch (cpuinfo_get_core(0)->uarch) {
2977 case cpuinfo_uarch_zen:
Marat Dukhanb3801eb2020-03-12 13:41:11 -07002978 case cpuinfo_uarch_dhyana:
Marat Dukhanaefaef32020-04-09 07:09:34 -07002979 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast);
2980 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast);
2981 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast);
2982 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07002983 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_avx_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07002984 xnn_params.f32.gemm.mr = 4;
2985 xnn_params.f32.gemm.nr = 16;
2986 xnn_params.f32.gemm.log2_sr = 2;
Marat Dukhan27121322019-12-09 14:57:40 -08002987 break;
2988 default:
Marat Dukhanaefaef32020-04-09 07:09:34 -07002989 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_5x16__fma3_broadcast);
2990 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_5x16__fma3_broadcast);
2991 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast);
2992 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x16__fma3_broadcast);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07002993 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_avx_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07002994 xnn_params.f32.gemm.mr = 5;
2995 xnn_params.f32.gemm.nr = 16;
Marat Dukhan27121322019-12-09 14:57:40 -08002996 break;
2997 }
Marat Dukhan1025ea32019-11-21 16:01:08 -08002998 } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx()) {
Marat Dukhanaefaef32020-04-09 07:09:34 -07002999 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_5x16__avx_broadcast);
3000 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_5x16__avx_broadcast);
3001 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x16__avx_broadcast);
3002 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x16__avx_broadcast);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003003 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_avx_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003004 xnn_params.f32.gemm.mr = 5;
3005 xnn_params.f32.gemm.nr = 16;
Marat Dukhan1025ea32019-11-21 16:01:08 -08003006 } else {
Marat Dukhanaefaef32020-04-09 07:09:34 -07003007 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__sse_load1);
3008 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__sse_load1);
3009 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__sse_load1);
3010 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__sse_load1);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003011 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_sse_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003012 xnn_params.f32.gemm.mr = 4;
3013 xnn_params.f32.gemm.nr = 8;
Marat Dukhan1025ea32019-11-21 16:01:08 -08003014 }
Marat Dukhanaefaef32020-04-09 07:09:34 -07003015 xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2c4__sse);
3016 xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2c4__sse);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003017 xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_sse_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003018 xnn_params.f32.gemm2.mr = 4;
3019 xnn_params.f32.gemm2.nr = 2;
3020 xnn_params.f32.gemm2.log2_kr = 2;
3021
Marat Dukhan479f87e2019-11-27 15:17:06 -08003022 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003023 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x3__avx512f;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003024 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003025 xnn_params.f32.dwconv[0].channel_tile = 16;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003026 xnn_params.f32.dwconv[0].primary_tile = 3;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003027
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003028 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x4__avx512f;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003029 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003030 xnn_params.f32.dwconv[1].channel_tile = 16;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003031 xnn_params.f32.dwconv[1].primary_tile = 4;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003032
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003033 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x9__avx512f;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003034 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003035 xnn_params.f32.dwconv[2].channel_tile = 16;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003036 xnn_params.f32.dwconv[2].primary_tile = 9;
3037
3038 xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x25__avx512f;
3039 xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_scalar_params;
3040 xnn_params.f32.dwconv[3].channel_tile = 16;
3041 xnn_params.f32.dwconv[3].primary_tile = 25;
Marat Dukhan479f87e2019-11-27 15:17:06 -08003042 } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_fma3()) {
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003043 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x3__fma3;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003044 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_avx_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003045 xnn_params.f32.dwconv[0].channel_tile = 16;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003046 xnn_params.f32.dwconv[0].primary_tile = 3;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003047
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003048 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x4__fma3;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003049 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_avx_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003050 xnn_params.f32.dwconv[1].channel_tile = 16;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003051 xnn_params.f32.dwconv[1].primary_tile = 4;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003052
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003053 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x9__fma3;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003054 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_avx_params;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003055 xnn_params.f32.dwconv[2].channel_tile = 16;
3056 xnn_params.f32.dwconv[2].primary_tile = 9;
3057
3058 xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x25__fma3;
3059 xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_avx_params;
3060 xnn_params.f32.dwconv[3].channel_tile = 8;
3061 xnn_params.f32.dwconv[3].primary_tile = 25;
Marat Dukhan17ec5f32019-11-22 13:34:16 -08003062 } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx()) {
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003063 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x3__avx;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003064 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_avx_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003065 xnn_params.f32.dwconv[0].channel_tile = 16;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003066 xnn_params.f32.dwconv[0].primary_tile = 3;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003067
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003068 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x4__avx;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003069 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_avx_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003070 xnn_params.f32.dwconv[1].channel_tile = 16;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003071 xnn_params.f32.dwconv[1].primary_tile = 4;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003072
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003073 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x9__avx;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003074 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_avx_params;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003075 xnn_params.f32.dwconv[2].channel_tile = 16;
3076 xnn_params.f32.dwconv[2].primary_tile = 9;
3077
3078 xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x25__avx;
3079 xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_avx_params;
3080 xnn_params.f32.dwconv[3].channel_tile = 8;
3081 xnn_params.f32.dwconv[3].primary_tile = 25;
Marat Dukhan17ec5f32019-11-22 13:34:16 -08003082 } else {
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003083 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x3__sse;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003084 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_sse_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003085 xnn_params.f32.dwconv[0].channel_tile = 8;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003086 xnn_params.f32.dwconv[0].primary_tile = 3;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003087
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003088 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x4__sse;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003089 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_sse_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003090 xnn_params.f32.dwconv[1].channel_tile = 8;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003091 xnn_params.f32.dwconv[1].primary_tile = 4;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003092
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003093 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x9__sse;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003094 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_sse_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003095 xnn_params.f32.dwconv[2].channel_tile = 8;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003096 xnn_params.f32.dwconv[2].primary_tile = 9;
3097
3098 xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x25__sse;
3099 xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_sse_params;
3100 xnn_params.f32.dwconv[3].channel_tile = 8;
3101 xnn_params.f32.dwconv[3].primary_tile = 25;
Marat Dukhan17ec5f32019-11-22 13:34:16 -08003102 }
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003103 xnn_params.f32.avgpool = (struct avgpool_parameters) {
Marat Dukhan99936602020-04-11 16:47:01 -07003104 .up = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__sse_c4,
3105 .mp = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__sse_c4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003106 .mr = 9,
3107 .qr = 8,
3108 };
3109 xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
Marat Dukhan99936602020-04-11 16:47:01 -07003110 .up = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__sse_c4,
3111 .mp = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__sse_c4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003112 .mr = 9,
3113 .qr = 8,
3114 };
3115 xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
Marat Dukhan99936602020-04-11 16:47:01 -07003116 .up = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__sse_c4,
3117 .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003118 .mr = 7,
3119 };
3120 xnn_params.f32.maxpool = (struct maxpool_parameters) {
Marat Dukhan99936602020-04-11 16:47:01 -07003121 .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__sse_c4,
Marat Dukhan91ae1652021-08-15 19:19:49 -07003122 .init.f32 = xnn_init_f32_minmax_sse_params,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003123 .mr = 9,
3124 .qr = 8,
3125 };
3126 xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
Marat Dukhan99936602020-04-11 16:47:01 -07003127 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__sse2_c4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003128 .mr = 4,
3129 };
3130 xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
Marat Dukhan99936602020-04-11 16:47:01 -07003131 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__sse2_c4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003132 .mr = 9,
3133 };
3134 xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
Marat Dukhan99936602020-04-11 16:47:01 -07003135 .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003136 .mr = 9,
3137 .qr = 8,
3138 };
Marat Dukhan660fd192020-03-10 04:55:30 -07003139 xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
3140 .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__sse_c8,
Marat Dukhan69722492019-11-11 19:55:50 -08003141 .pixel_tile = 1,
3142 .channel_tile = 8,
3143 };
Marat Dukhane2c3f292019-11-27 15:40:54 -08003144 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
Marat Dukhan5020b962020-06-08 13:30:10 -07003145 xnn_params.f32.abs = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__avx512f_x16;
3146 } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx()) {
3147 xnn_params.f32.abs = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__avx_x16;
3148 } else {
3149 xnn_params.f32.abs = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__sse_x8;
3150 }
3151 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
Marat Dukhan94912792021-08-16 21:40:30 -07003152 xnn_params.f32.clamp = (struct vunary_parameters) {
3153 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vclamp_ukernel__avx512f_x16,
3154 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
3155 .element_tile = 16,
3156 };
Marat Dukhane2c3f292019-11-27 15:40:54 -08003157 } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx()) {
Marat Dukhan94912792021-08-16 21:40:30 -07003158 xnn_params.f32.clamp = (struct vunary_parameters) {
3159 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vclamp_ukernel__avx_x16,
3160 .init.f32_minmax = xnn_init_f32_minmax_avx_params,
3161 .element_tile = 16,
3162 };
Marat Dukhane2c3f292019-11-27 15:40:54 -08003163 } else {
Marat Dukhan94912792021-08-16 21:40:30 -07003164 xnn_params.f32.clamp = (struct vunary_parameters) {
3165 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vclamp_ukernel__sse_x8,
3166 .init.f32_minmax = xnn_init_f32_minmax_sse_params,
3167 .element_tile = 8,
3168 };
Marat Dukhane2c3f292019-11-27 15:40:54 -08003169 }
Marat Dukhan662faa02019-12-09 22:48:16 -08003170 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
Marat Dukhanb6bd4bc2020-12-01 17:01:40 -08003171 xnn_params.f32.elu = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__avx512f_rr1_lut16_p3_perm_x64;
3172 } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx2()) {
3173 xnn_params.f32.elu = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x56;
3174 } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx()) {
3175 xnn_params.f32.elu = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_x32;
3176 } else {
3177 xnn_params.f32.elu = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__sse2_rr2_lut16_p3_x12;
3178 }
3179 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
Marat Dukhan6674d692021-05-05 22:27:00 -07003180 xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__avx512f_x16;
Marat Dukhan662faa02019-12-09 22:48:16 -08003181 } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_fma3()) {
Marat Dukhan6674d692021-05-05 22:27:00 -07003182 xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__fma3_x16;
Marat Dukhan662faa02019-12-09 22:48:16 -08003183 } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx()) {
Marat Dukhan6674d692021-05-05 22:27:00 -07003184 xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__avx_x16;
Marat Dukhan662faa02019-12-09 22:48:16 -08003185 } else {
Marat Dukhan6674d692021-05-05 22:27:00 -07003186 xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__sse_x8;
Marat Dukhan662faa02019-12-09 22:48:16 -08003187 }
Marat Dukhan5020b962020-06-08 13:30:10 -07003188 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
Marat Dukhan28813332020-06-10 18:05:38 -07003189 xnn_params.f32.lrelu = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__avx512f_x16;
3190 } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx()) {
3191 xnn_params.f32.lrelu = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__avx_x16;
Marat Dukhan0d3f4672020-06-25 16:42:58 -07003192 } else if (cpuinfo_has_x86_sse4_1()) {
3193 xnn_params.f32.lrelu = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__sse41_x8;
Marat Dukhan28813332020-06-10 18:05:38 -07003194 } else {
3195 xnn_params.f32.lrelu = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__sse_x8;
3196 }
3197 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
Marat Dukhan5020b962020-06-08 13:30:10 -07003198 xnn_params.f32.neg = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__avx512f_x16;
3199 } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx()) {
3200 xnn_params.f32.neg = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__avx_x16;
3201 } else {
3202 xnn_params.f32.neg = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__sse_x8;
3203 }
Marat Dukhan64e52512020-06-09 13:41:16 -07003204 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
3205 xnn_params.f32.rndne = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__avx512f_x16;
3206 xnn_params.f32.rndz = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__avx512f_x16;
3207 xnn_params.f32.rndu = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__avx512f_x16;
3208 xnn_params.f32.rndd = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__avx512f_x16;
3209 } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx()) {
3210 xnn_params.f32.rndne = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__avx_x16;
3211 xnn_params.f32.rndz = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__avx_x16;
3212 xnn_params.f32.rndu = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__avx_x16;
3213 xnn_params.f32.rndd = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__avx_x16;
3214 } else if (cpuinfo_has_x86_sse4_1()) {
3215 xnn_params.f32.rndne = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__sse41_x8;
3216 xnn_params.f32.rndz = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__sse41_x8;
3217 xnn_params.f32.rndu = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__sse41_x8;
3218 xnn_params.f32.rndd = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__sse41_x8;
3219 } else {
3220 xnn_params.f32.rndne = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__sse2_x8;
3221 xnn_params.f32.rndz = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__sse2_x8;
3222 xnn_params.f32.rndu = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__sse2_x8;
3223 xnn_params.f32.rndd = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__sse2_x8;
3224 }
Marat Dukhand9ca7e62020-09-23 23:45:29 -07003225 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
Marat Dukhan6674d692021-05-05 22:27:00 -07003226 xnn_params.f32.sigmoid = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__avx512f_rr2_lut32_p2_perm2_scalef_div_x64;
Marat Dukhand9ca7e62020-09-23 23:45:29 -07003227 } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx2()) {
Marat Dukhan6674d692021-05-05 22:27:00 -07003228 xnn_params.f32.sigmoid = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_div_x40;
T.J. Alumbaughdc2b29c2020-10-14 13:56:08 -07003229 } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx()) {
Marat Dukhan6674d692021-05-05 22:27:00 -07003230 xnn_params.f32.sigmoid = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__avx_rr2_p5_nr2_x40;
Marat Dukhan6dd71362020-09-17 23:11:21 -07003231 } else if (cpuinfo_has_x86_sse4_1()) {
Marat Dukhan6674d692021-05-05 22:27:00 -07003232 xnn_params.f32.sigmoid = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__sse41_lut64_p2_div_x8;
Marat Dukhanfa0a4322020-01-06 16:14:29 -08003233 } else {
Marat Dukhan6674d692021-05-05 22:27:00 -07003234 xnn_params.f32.sigmoid = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__sse2_lut64_p2_div_x8;
Marat Dukhanfa0a4322020-01-06 16:14:29 -08003235 }
Marat Dukhan90eca0a2020-03-11 00:52:23 -07003236 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
Marat Dukhan5020b962020-06-08 13:30:10 -07003237 xnn_params.f32.sqr = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__avx512f_x16;
3238 } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx()) {
3239 xnn_params.f32.sqr = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__avx_x16;
3240 } else {
3241 xnn_params.f32.sqr = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__sse_x8;
3242 }
Marat Dukhan6804bbd2020-06-30 19:26:11 -07003243 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx()) {
3244 xnn_params.f32.sqrt = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__avx_sqrt_x8;
3245 } else {
3246 xnn_params.f32.sqrt = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__sse_sqrt_x4;
3247 }
Marat Dukhan5020b962020-06-08 13:30:10 -07003248 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
Marat Dukhan90eca0a2020-03-11 00:52:23 -07003249 xnn_params.f32.prelu = (struct prelu_parameters) {
3250 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__avx512f_2x16,
3251 .row_tile = 2,
3252 .channel_tile = 16,
3253 };
3254 } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx()) {
3255 xnn_params.f32.prelu = (struct prelu_parameters) {
3256 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__avx_2x16,
3257 .row_tile = 2,
3258 .channel_tile = 16,
3259 };
Marat Dukhan39b5e942020-06-24 15:03:48 -07003260 } else if (cpuinfo_has_x86_sse4_1()) {
3261 xnn_params.f32.prelu = (struct prelu_parameters) {
3262 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__sse41_2x8,
3263 .row_tile = 2,
3264 .channel_tile = 8,
3265 };
Marat Dukhan90eca0a2020-03-11 00:52:23 -07003266 } else {
3267 xnn_params.f32.prelu = (struct prelu_parameters) {
3268 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__sse2_2x8,
3269 .row_tile = 2,
3270 .channel_tile = 8,
3271 };
3272 }
Marat Dukhan1edc4542020-01-27 12:40:13 -08003273 xnn_params.f32.raddstoreexpminusmax = xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x20_acc2;
3274 xnn_params.f32.rmax = xnn_f32_rmax_ukernel__sse;
Marat Dukhan9a88efe2019-12-10 15:54:24 -08003275 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
3276 xnn_params.f32.vadd = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07003277 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__avx512f_x32,
3278 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__avx512f_x32,
3279 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__avx512f_x32,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08003280 .element_tile = 32,
3281 };
3282 xnn_params.f32.vdiv = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07003283 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__avx512f_x32,
3284 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__avx512f_x32,
3285 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__avx512f_x32,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08003286 .element_tile = 32,
3287 };
3288 xnn_params.f32.vmax = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07003289 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__avx512f_x32,
3290 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__avx512f_x32,
3291 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__avx512f_x32,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08003292 .element_tile = 32,
3293 };
3294 xnn_params.f32.vmin = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07003295 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__avx512f_x32,
3296 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__avx512f_x32,
3297 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__avx512f_x32,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08003298 .element_tile = 32,
3299 };
3300 xnn_params.f32.vmul = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07003301 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__avx512f_x32,
3302 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__avx512f_x32,
3303 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__avx512f_x32,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08003304 .element_tile = 32,
3305 };
3306 xnn_params.f32.vsub = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07003307 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__avx512f_x32,
3308 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__avx512f_x32,
3309 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__avx512f_x32,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08003310 .element_tile = 32,
3311 };
Marat Dukhanf7399262020-06-05 10:58:44 -07003312 xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07003313 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__avx512f_x32,
3314 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__avx512f_x32,
3315 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__avx512f_x32,
Marat Dukhanf7399262020-06-05 10:58:44 -07003316 .element_tile = 32,
3317 };
Marat Dukhan9a88efe2019-12-10 15:54:24 -08003318 } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx()) {
3319 xnn_params.f32.vadd = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07003320 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__avx_x16,
3321 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__avx_x16,
3322 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__avx_x16,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08003323 .element_tile = 16,
3324 };
3325 xnn_params.f32.vdiv = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07003326 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__avx_x16,
3327 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__avx_x16,
3328 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__avx_x16,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08003329 .element_tile = 16,
3330 };
3331 xnn_params.f32.vmax = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07003332 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__avx_x16,
3333 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__avx_x16,
3334 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__avx_x16,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08003335 .element_tile = 16,
3336 };
3337 xnn_params.f32.vmin = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07003338 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__avx_x16,
3339 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__avx_x16,
3340 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__avx_x16,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08003341 .element_tile = 16,
3342 };
3343 xnn_params.f32.vmul = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07003344 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__avx_x16,
3345 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__avx_x16,
3346 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__avx_x16,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08003347 .element_tile = 16,
3348 };
3349 xnn_params.f32.vsub = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07003350 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__avx_x16,
3351 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__avx_x16,
3352 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__avx_x16,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08003353 .element_tile = 16,
3354 };
Marat Dukhanf7399262020-06-05 10:58:44 -07003355 xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07003356 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__avx_x16,
3357 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__avx_x16,
3358 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__avx_x16,
Marat Dukhanf7399262020-06-05 10:58:44 -07003359 .element_tile = 16,
3360 };
Marat Dukhan9a88efe2019-12-10 15:54:24 -08003361 } else {
3362 xnn_params.f32.vadd = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07003363 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__sse_x8,
3364 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__sse_x8,
3365 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__sse_x8,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08003366 .element_tile = 8,
3367 };
3368 xnn_params.f32.vdiv = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07003369 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__sse_x8,
3370 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__sse_x8,
3371 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__sse_x8,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08003372 .element_tile = 8,
3373 };
3374 xnn_params.f32.vmax = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07003375 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__sse_x8,
3376 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__sse_x8,
3377 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__sse_x8,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08003378 .element_tile = 8,
3379 };
3380 xnn_params.f32.vmin = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07003381 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__sse_x8,
3382 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__sse_x8,
3383 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__sse_x8,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08003384 .element_tile = 8,
3385 };
3386 xnn_params.f32.vmul = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07003387 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__sse_x8,
3388 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__sse_x8,
3389 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__sse_x8,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08003390 .element_tile = 8,
3391 };
3392 xnn_params.f32.vsub = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07003393 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__sse_x8,
3394 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__sse_x8,
3395 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__sse_x8,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08003396 .element_tile = 8,
3397 };
Marat Dukhanf7399262020-06-05 10:58:44 -07003398 xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07003399 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__sse_x8,
3400 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__sse_x8,
3401 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__sse_x8,
Marat Dukhanf7399262020-06-05 10:58:44 -07003402 .element_tile = 8,
3403 };
Marat Dukhan9a88efe2019-12-10 15:54:24 -08003404 }
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003405 xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
Marat Dukhan9531e9f2020-07-24 15:25:02 -07003406 .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c4__sse_2x,
Marat Dukhanc58453f2021-05-24 14:47:38 -07003407 .init.f32 = xnn_init_f32_minmax_sse_params,
Marat Dukhan49e6ee92019-11-06 15:55:29 -08003408 .channel_tile = 4,
3409 .row_tile = 2,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003410 };
Marat Dukhanefc47b82019-11-18 09:25:38 -08003411 #ifndef XNN_NO_NCHW_OPERATORS
Marat Dukhancfbed0a2020-12-08 10:01:51 -08003412 // Sparse microkernels on x86 currently target only SSE, and on processors
3413 // with AVX ISA dense inference is expected to be faster than sparse.
3414 if (!cpuinfo_has_x86_avx()) {
3415 init_flags |= XNN_INIT_FLAG_CHW_OPT;
3416 }
3417
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003418 xnn_params.f32.spmm = (struct spmm_parameters) {
Frank Barchard4fd38b22020-10-30 17:10:11 -07003419 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x1__sse,
3420 .mr = 32,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003421 .nr = 1,
3422 };
Erich Elsen5b2e07a2020-06-09 03:27:59 -07003423 xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
3424 .ukernel_with_symm_padding =
3425 (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2,
3426 .output_channel_tile = 4,
3427 .output_height_tile = 2,
3428 .output_width_tile = 2,
3429 };
Frank Barchard0b18cb32020-11-23 10:50:44 -08003430 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_ssse3()) {
3431 xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
3432 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_2x4_acc2,
Frank Barchard0b18cb32020-11-23 10:50:44 -08003433 .output_width_tile = 4,
3434 .output_height_tile = 2,
3435 };
3436 } else {
3437 xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
3438 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_2x4_acc2,
Frank Barchard0b18cb32020-11-23 10:50:44 -08003439 .output_width_tile = 4,
3440 .output_height_tile = 2,
3441 };
3442 }
Marat Dukhanbf715f92020-10-23 20:17:00 -07003443 xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
3444 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__sse_1x4_acc3,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003445 .output_width_tile = 4,
3446 .output_height_tile = 1,
3447 };
Marat Dukhand0503892020-10-30 08:22:04 -07003448 xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
3449 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4,
Marat Dukhand0503892020-10-30 08:22:04 -07003450 .output_width_tile = 4,
3451 .output_height_tile = 4,
3452 };
Marat Dukhanccca2142020-10-30 17:32:45 -07003453 xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
3454 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4,
Marat Dukhanccca2142020-10-30 17:32:45 -07003455 .output_width_tile = 4,
3456 .output_height_tile = 2,
3457 };
Marat Dukhan1f29b802020-05-15 23:46:39 -07003458 xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
3459 .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__sse_x4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003460 .channel_tile = 4,
3461 };
Artsiom Ablavatski97918102020-10-27 15:52:59 -07003462 xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
Artsiom Ablavatskib3ffd582021-03-31 13:00:08 -07003463 .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__sse_p8,
Artsiom Ablavatski97918102020-10-27 15:52:59 -07003464 .channel_tile = 1,
Artsiom Ablavatskib3ffd582021-03-31 13:00:08 -07003465 .pixel_tile = 8,
Artsiom Ablavatski97918102020-10-27 15:52:59 -07003466 };
Marat Dukhanefc47b82019-11-18 09:25:38 -08003467 #endif // XNN_NO_NCHW_OPERATORS
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003468 #endif // XNN_NO_F32_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07003469
Marat Dukhanaf2ba002021-10-24 14:21:41 -07003470 /*************************** VCVT micro-kernels ***************************/
3471 #ifndef XNN_NO_VCVT_OPERATORS
3472 init_flags |= XNN_INIT_FLAG_VCVT;
3473
3474 if (cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
3475 xnn_params.vcvt.f16_to_f32 = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__avx512skx_x16;
Marat Dukhana0c61682021-11-10 19:23:41 -08003476 xnn_params.vcvt.f32_to_f16 = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__avx512skx_x16;
Marat Dukhanaf2ba002021-10-24 14:21:41 -07003477 } else if (cpuinfo_has_x86_f16c()) {
3478 xnn_params.vcvt.f16_to_f32 = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__f16c_x16;
Marat Dukhana0c61682021-11-10 19:23:41 -08003479 xnn_params.vcvt.f32_to_f16 = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__f16c_x16;
Marat Dukhanaf2ba002021-10-24 14:21:41 -07003480 } else if (cpuinfo_has_x86_avx()) {
3481 xnn_params.vcvt.f16_to_f32 = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__avx_int16_x16;
Marat Dukhana0c61682021-11-10 19:23:41 -08003482 xnn_params.vcvt.f32_to_f16 = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__avx_x24;
Marat Dukhanaf2ba002021-10-24 14:21:41 -07003483 } else if (cpuinfo_has_x86_sse4_1()) {
3484 xnn_params.vcvt.f16_to_f32 = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__sse41_int16_x16;
Marat Dukhana0c61682021-11-10 19:23:41 -08003485 xnn_params.vcvt.f32_to_f16 = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__sse41_x8;
Marat Dukhanaf2ba002021-10-24 14:21:41 -07003486 } else {
3487 xnn_params.vcvt.f16_to_f32 = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__sse2_int16_x32;
Marat Dukhana0c61682021-11-10 19:23:41 -08003488 xnn_params.vcvt.f32_to_f16 = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__sse2_x16;
Marat Dukhanaf2ba002021-10-24 14:21:41 -07003489 }
Marat Dukhaned2d7762021-12-03 23:51:19 -08003490 if (cpuinfo_has_x86_sse4_1()) {
3491 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
3492 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__sse41_x32,
3493 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_sse4_params,
3494 .element_tile = 32,
3495 };
3496 } else {
3497 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
3498 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__sse2_x32,
3499 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_sse2_params,
3500 .element_tile = 32,
3501 };
3502 }
3503 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
3504 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__sse2_x32,
3505 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_sse2_params,
3506 .element_tile = 32,
3507 };
Marat Dukhanaf2ba002021-10-24 14:21:41 -07003508 #endif // XNN_NO_VCVT_OPERATORS
3509
XNNPACK Teamb455b122019-09-27 18:10:33 -07003510 /**************************** X32 micro-kernels ****************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003511 #ifndef XNN_NO_X32_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07003512 init_flags |= XNN_INIT_FLAG_X32;
3513
Marat Dukhan57dccd82020-04-14 00:53:10 -07003514 xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__sse2;
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003515 xnn_params.x32.zip = (struct zip_parameters) {
3516 .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__sse2,
3517 .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__sse2,
3518 .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__sse2,
3519 .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__sse2,
3520 };
Artsiom Ablavatskibbe85062020-11-05 14:07:37 -08003521 #ifndef XNN_NO_NCHW_OPERATORS
Marat Dukhanad71b9a2020-11-20 00:01:51 -08003522 xnn_params.x32.depthtospace2d_chw2hwc = (struct depthtospace2d_chw2hwc_parameters) {
3523 .ukernel = (xnn_depthtospace2d_chw2hwc_ukernel_function) xnn_x32_depthtospace2d_chw2hwc_ukernel__scalar,
Artsiom Ablavatskibbe85062020-11-05 14:07:37 -08003524 .channel_tile = 1,
3525 .pixel_tile = 1,
3526 };
3527 #endif // XNN_NO_NCHW_OPERATORS
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003528 #endif // XNN_NO_X32_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07003529
Marat Dukhan048931b2020-11-24 20:53:54 -08003530 /**************************** XX micro-kernels ****************************/
3531 #ifndef XNN_NO_XX_OPERATORS
3532 init_flags |= XNN_INIT_FLAG_XX;
3533
3534 xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
Marat Dukhan933051b2021-08-07 16:26:15 -07003535 xnn_params.xx.fill = (struct fill_parameters) {
3536 .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__sse2_x64,
3537 .row_tile = 1,
3538 };
Marat Dukhan0461f2d2021-08-08 12:36:29 -07003539 xnn_params.xx.pad = (struct pad_parameters) {
3540 .ukernel = (xnn_pad_ukernel_function) xnn_xx_pad_ukernel__sse2,
3541 .row_tile = 1,
3542 };
Marat Dukhan048931b2020-11-24 20:53:54 -08003543 #endif
3544
Marat Dukhan933051b2021-08-07 16:26:15 -07003545#elif XNN_ARCH_WASMSIMD
3546
Marat Dukhan898d5852021-06-30 21:18:34 -07003547 /**************************** QC8 micro-kernels ****************************/
// This section sets XNN_INIT_FLAG_QC8 and populates only xnn_params.qc8.*, so it
// is gated on the QC8 opt-out macro (the same macro named on the closing #endif).
3548 #ifndef XNN_NO_QC8_OPERATORS
3549 init_flags |= XNN_INIT_FLAG_QC8;
3550
// GEMM/IGEMM selection: builds with XNN_WASMSIMD_VERSION >= 88 use the
// dot16x2_ld128 (c2) kernels, all others use the mul16_ld64 (c8) kernels.
// Both branches register mr = 3 and nr = 4; only log2_kr differs (1 vs. 3).
Marat Dukhan189c1d02021-09-03 15:39:54 -07003551 #if defined(XNN_WASMSIMD_VERSION) && (XNN_WASMSIMD_VERSION >= 88)
3552 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128);
3553 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128);
3554 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128);
3555 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128);
3556 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_wasmsimd_params;
3557 xnn_params.qc8.gemm.mr = 3;
3558 xnn_params.qc8.gemm.nr = 4;
3559 xnn_params.qc8.gemm.log2_kr = 1;
3560 #else // XNN_WASMSIMD_VERSION >= 88
3561 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64);
3562 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64);
3563 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64);
3564 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64);
3565 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_wasmsimd_params;
3566 xnn_params.qc8.gemm.mr = 3;
3567 xnn_params.qc8.gemm.nr = 4;
3568 xnn_params.qc8.gemm.log2_kr = 3;
3569 #endif // XNN_WASMSIMD_VERSION >= 88
Marat Dukhan898d5852021-06-30 21:18:34 -07003570
// Depthwise convolution table: entry 0 is registered with primary_tile = 9,
// entry 1 with primary_tile = 25; both use channel_tile = 16 and the same
// params initializer as the GEMM kernels above.
Marat Dukhan9cedb592021-08-17 17:25:24 -07003571 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__wasmsimd_mul16_add16;
Marat Dukhan898d5852021-06-30 21:18:34 -07003572 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_wasmsimd_params;
Marat Dukhan9cedb592021-08-17 17:25:24 -07003573 xnn_params.qc8.dwconv[0].channel_tile = 16;
Marat Dukhan898d5852021-06-30 21:18:34 -07003574 xnn_params.qc8.dwconv[0].primary_tile = 9;
Marat Dukhan9cedb592021-08-17 17:25:24 -07003575 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x25__wasmsimd_mul16_add16;
Marat Dukhan898d5852021-06-30 21:18:34 -07003576 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_wasmsimd_params;
Marat Dukhan9cedb592021-08-17 17:25:24 -07003577 xnn_params.qc8.dwconv[1].channel_tile = 16;
Marat Dukhan898d5852021-06-30 21:18:34 -07003578 xnn_params.qc8.dwconv[1].primary_tile = 25;
3579 #endif // XNN_NO_QC8_OPERATORS
3580
Marat Dukhan07e50402020-08-05 17:16:53 -07003581 /**************************** QS8 micro-kernels ****************************/
3582 #ifndef XNN_NO_QS8_OPERATORS
3583 init_flags |= XNN_INIT_FLAG_QS8;
3584
Marat Dukhan189c1d02021-09-03 15:39:54 -07003585 #if defined(XNN_WASMSIMD_VERSION) && (XNN_WASMSIMD_VERSION >= 88)
3586 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128);
3587 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128);
3588 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128);
3589 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128);
3590 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_wasmsimd_params;
3591 xnn_params.qs8.gemm.mr = 3;
3592 xnn_params.qs8.gemm.nr = 4;
3593 xnn_params.qs8.gemm.log2_kr = 1;
3594 #else // XNN_WASMSIMD_VERSION >= 88
3595 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64);
3596 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64);
3597 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64);
3598 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64);
3599 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_wasmsimd_params;
3600 xnn_params.qs8.gemm.mr = 3;
3601 xnn_params.qs8.gemm.nr = 4;
3602 xnn_params.qs8.gemm.log2_kr = 3;
3603 #endif // XNN_WASMSIMD_VERSION >= 88
Marat Dukhan07e50402020-08-05 17:16:53 -07003604
Marat Dukhan9cedb592021-08-17 17:25:24 -07003605 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__wasmsimd_mul16_add16;
Marat Dukhan400e7cb2021-08-07 15:14:54 -07003606 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_wasmsimd_params;
Marat Dukhan9cedb592021-08-17 17:25:24 -07003607 xnn_params.qs8.dwconv[0].channel_tile = 16;
Marat Dukhan07e50402020-08-05 17:16:53 -07003608 xnn_params.qs8.dwconv[0].primary_tile = 9;
Marat Dukhan9cedb592021-08-17 17:25:24 -07003609 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x25__wasmsimd_mul16_add16;
Marat Dukhan400e7cb2021-08-07 15:14:54 -07003610 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_wasmsimd_params;
Marat Dukhan9cedb592021-08-17 17:25:24 -07003611 xnn_params.qs8.dwconv[1].channel_tile = 16;
Marat Dukhan4ed14882021-05-12 17:50:40 -07003612 xnn_params.qs8.dwconv[1].primary_tile = 25;
Marat Dukhan9e0b5392020-08-07 02:29:34 -07003613
3614 xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
3615 .up = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_ukernel_7x__wasmsimd_c8_acc2,
3616 .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_ukernel_7p7x__wasmsimd_c8_acc2,
3617 .mr = 7,
3618 };
Marat Dukhanff209482020-09-03 14:26:53 -07003619
3620 xnn_params.qs8.vadd = (struct vbinary_parameters) {
Marat Dukhane20a8732021-12-07 17:11:37 -08003621 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__wasmsimd_x32,
3622 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__wasmsimd_x32,
3623 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__wasmsimd_x32,
Marat Dukhan64287252021-09-07 16:20:03 -07003624 .init.qs8_addsub = xnn_init_qs8_add_minmax_wasmsimd_params,
Marat Dukhane20a8732021-12-07 17:11:37 -08003625 .element_tile = 32,
Marat Dukhanff209482020-09-03 14:26:53 -07003626 };
Marat Dukhan0853b8a2021-08-03 01:01:53 -07003627 xnn_params.qs8.vmul = (struct vbinary_parameters) {
3628 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_fp32_ukernel__wasmsimd_mul32_ld64_x8,
3629 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__wasmsimd_mul32_ld64_x8,
3630 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__wasmsimd_mul32_ld64_x8,
3631 .init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_wasmsimd_params,
3632 .element_tile = 8,
3633 };
Marat Dukhan07e50402020-08-05 17:16:53 -07003634 #endif // XNN_NO_QS8_OPERATORS
3635
Marat Dukhan08b7a972020-07-14 18:17:29 -07003636 /**************************** QU8 micro-kernels ****************************/
3637 #ifndef XNN_NO_QU8_OPERATORS
3638 init_flags |= XNN_INIT_FLAG_QU8;
Marat Dukhan854fb6b2020-06-19 12:33:44 -07003639
Marat Dukhan189c1d02021-09-03 15:39:54 -07003640 #if defined(XNN_WASMSIMD_VERSION) && (XNN_WASMSIMD_VERSION >= 88)
3641 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128);
3642 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128);
3643 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128);
3644 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128);
3645 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_wasmsimd_params;
3646 xnn_params.qu8.gemm.mr = 3;
3647 xnn_params.qu8.gemm.nr = 4;
3648 xnn_params.qu8.gemm.log2_kr = 1;
3649 #else // XNN_WASMSIMD_VERSION >= 88
3650 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64);
3651 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64);
3652 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64);
3653 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64);
3654 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_wasmsimd_params;
3655 xnn_params.qu8.gemm.mr = 3;
3656 xnn_params.qu8.gemm.nr = 4;
3657 xnn_params.qu8.gemm.log2_kr = 3;
3658 #endif // XNN_WASMSIMD_VERSION >= 88
Marat Dukhanaefaef32020-04-09 07:09:34 -07003659
Marat Dukhana97e9752021-07-15 16:30:41 -07003660 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__wasmsimd_mul16;
3661 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_wasmsimd_params;
3662 xnn_params.qu8.dwconv[0].channel_tile = 8;
Marat Dukhan08b7a972020-07-14 18:17:29 -07003663 xnn_params.qu8.dwconv[0].primary_tile = 9;
Marat Dukhana97e9752021-07-15 16:30:41 -07003664 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__wasmsimd_mul16;
3665 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_wasmsimd_params;
3666 xnn_params.qu8.dwconv[1].channel_tile = 8;
3667 xnn_params.qu8.dwconv[1].primary_tile = 25;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003668
Marat Dukhan08b7a972020-07-14 18:17:29 -07003669 xnn_params.qu8.avgpool = (struct avgpool_parameters) {
3670 .up = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__scalar_c1,
3671 .mp = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__scalar_c1,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003672 .mr = 9,
3673 .qr = 8,
3674 };
Marat Dukhan08b7a972020-07-14 18:17:29 -07003675 xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
3676 .up = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_ukernel_7x__scalar_c1,
3677 .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_ukernel_7p7x__scalar_c1,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003678 .mr = 7,
3679 };
Marat Dukhandb007cd2021-07-20 23:42:39 -07003680
3681 xnn_params.qu8.vadd = (struct vbinary_parameters) {
Marat Dukhane20a8732021-12-07 17:11:37 -08003682 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__wasmsimd_x32,
3683 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__wasmsimd_x32,
3684 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__wasmsimd_x32,
Marat Dukhan64287252021-09-07 16:20:03 -07003685 .init.qu8_addsub = xnn_init_qu8_add_minmax_wasmsimd_params,
Marat Dukhane20a8732021-12-07 17:11:37 -08003686 .element_tile = 32,
Marat Dukhandb007cd2021-07-20 23:42:39 -07003687 };
Marat Dukhan0853b8a2021-08-03 01:01:53 -07003688 xnn_params.qu8.vmul = (struct vbinary_parameters) {
3689 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_fp32_ukernel__wasmsimd_mul32_ld64_x8,
3690 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__wasmsimd_mul32_ld64_x8,
3691 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__wasmsimd_mul32_ld64_x8,
3692 .init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_wasmsimd_params,
3693 .element_tile = 8,
3694 };
Marat Dukhan08b7a972020-07-14 18:17:29 -07003695 #endif // XNN_NO_QU8_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07003696
Marat Dukhandc5c1482021-08-16 09:03:15 -07003697 /**************************** S8 micro-kernels ****************************/
3698 #ifndef XNN_NO_S8_OPERATORS
3699 init_flags |= XNN_INIT_FLAG_S8;
3700
Marat Dukhan61c0c9e2021-08-16 23:16:14 -07003701 xnn_params.s8.clamp = (struct vunary_parameters) {
3702 .ukernel = (xnn_univector_ukernel_function) xnn_s8_vclamp_ukernel__wasmsimd_x64,
3703 .init.s8_minmax = xnn_init_s8_minmax_wasmsimd_params,
3704 .element_tile = 64,
3705 };
Marat Dukhan24abe6b2021-11-24 15:28:57 -08003706 #if defined(XNN_WASMSIMD_VERSION) && (XNN_WASMSIMD_VERSION >= 88)
3707 xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
3708 .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__wasmsimd_dot16x2_c8,
3709 .pixel_tile = 1,
3710 .channel_tile = 8,
3711 };
3712 #else // XNN_WASMSIMD_VERSION >= 88
3713 xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
3714 .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__wasmsimd_mul32_c8,
3715 .pixel_tile = 1,
3716 .channel_tile = 8,
3717 };
3718 #endif // XNN_WASMSIMD_VERSION >= 88
Marat Dukhandc5c1482021-08-16 09:03:15 -07003719 xnn_params.s8.maxpool = (struct maxpool_parameters) {
3720 .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__wasmsimd_c16,
3721 .init.s8 = xnn_init_s8_minmax_wasmsimd_params,
3722 .mr = 9,
3723 .qr = 8,
3724 };
3725 #endif // XNN_NO_S8_OPERATORS
3726
XNNPACK Teamb455b122019-09-27 18:10:33 -07003727 /**************************** U8 micro-kernels ****************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003728 #ifndef XNN_NO_U8_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07003729 init_flags |= XNN_INIT_FLAG_U8;
3730
Marat Dukhan94912792021-08-16 21:40:30 -07003731 xnn_params.u8.clamp = (struct vunary_parameters) {
Marat Dukhan61c0c9e2021-08-16 23:16:14 -07003732 .ukernel = (xnn_univector_ukernel_function) xnn_u8_vclamp_ukernel__wasmsimd_x64,
3733 .init.u8_minmax = xnn_init_u8_minmax_wasmsimd_params,
3734 .element_tile = 64,
Marat Dukhan94912792021-08-16 21:40:30 -07003735 };
Marat Dukhan24abe6b2021-11-24 15:28:57 -08003736 #if defined(XNN_WASMSIMD_VERSION) && (XNN_WASMSIMD_VERSION >= 88)
3737 xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
3738 .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__wasmsimd_dot16x2_c8,
3739 .pixel_tile = 1,
3740 .channel_tile = 8,
3741 };
3742 #else // XNN_WASMSIMD_VERSION >= 88
3743 xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
3744 .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__wasmsimd_mul32_c8,
3745 .pixel_tile = 1,
3746 .channel_tile = 8,
3747 };
3748 #endif // XNN_WASMSIMD_VERSION >= 88
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003749 xnn_params.u8.maxpool = (struct maxpool_parameters) {
Marat Dukhanf1589422021-08-15 20:37:06 -07003750 .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__wasmsimd_c16,
3751 .init.u8 = xnn_init_u8_minmax_wasmsimd_params,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003752 .mr = 9,
3753 .qr = 8,
3754 };
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003755 xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
3756 xnn_params.u8.rmax = xnn_u8_rmax_ukernel__scalar;
3757 #endif // XNN_NO_U8_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07003758
3759 /**************************** X8 micro-kernels ****************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003760 #ifndef XNN_NO_X8_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07003761 init_flags |= XNN_INIT_FLAG_X8;
3762
Marat Dukhand67539d2021-09-08 23:06:03 -07003763 xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar_x4;
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003764 xnn_params.x8.zip = (struct zip_parameters) {
3765 .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__scalar,
3766 .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__scalar,
3767 .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__scalar,
3768 .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__scalar,
3769 };
3770 #endif // XNN_NO_X8_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07003771
3772 /**************************** F32 micro-kernels ****************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003773 #ifndef XNN_NO_F32_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07003774 init_flags |= XNN_INIT_FLAG_F32;
3775
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003776 if (is_wasm_x86) {
Frank Barchard0725b8d2020-12-07 11:07:35 -08003777 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_splat);
3778 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_x86_splat);
3779 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat);
3780 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_x86_splat);
Marat Dukhan688f6d82020-07-14 17:02:11 -07003781 xnn_params.f32.gemm.relu.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_4x8__wasmsimd_splat);
3782 xnn_params.f32.gemm.relu.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_4x8__wasmsimd_splat);
3783 xnn_params.f32.gemm.relu.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat);
3784 xnn_params.f32.gemm.relu.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat);
Marat Dukhan802808c2020-06-16 11:01:17 -07003785 xnn_params.f32.gemm.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x8__wasmsimd_splat);
3786 xnn_params.f32.gemm.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x8__wasmsimd_splat);
3787 xnn_params.f32.gemm.linear.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__wasmsimd_splat);
3788 xnn_params.f32.gemm.linear.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__wasmsimd_splat);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003789 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003790 xnn_params.f32.gemm.mr = 4;
3791 xnn_params.f32.gemm.nr = 8;
Marat Dukhane39e6462020-07-09 01:33:36 -07003792
3793 xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86);
3794 xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_x86);
3795 xnn_params.f32.gemm2.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x2c4__wasmsimd);
3796 xnn_params.f32.gemm2.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2c4__wasmsimd);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003797 xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhane39e6462020-07-09 01:33:36 -07003798 xnn_params.f32.gemm2.mr = 4;
3799 xnn_params.f32.gemm2.nr = 2;
3800 xnn_params.f32.gemm2.log2_kr = 2;
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003801 } else {
Frank Barchard0725b8d2020-12-07 11:07:35 -08003802 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_splat);
3803 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_arm_splat);
3804 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat);
3805 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_arm_splat);
Marat Dukhan688f6d82020-07-14 17:02:11 -07003806 xnn_params.f32.gemm.relu.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_5x8__wasmsimd_splat);
3807 xnn_params.f32.gemm.relu.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_5x8__wasmsimd_splat);
3808 xnn_params.f32.gemm.relu.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat);
3809 xnn_params.f32.gemm.relu.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat);
Marat Dukhan802808c2020-06-16 11:01:17 -07003810 xnn_params.f32.gemm.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_5x8__wasmsimd_splat);
3811 xnn_params.f32.gemm.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_5x8__wasmsimd_splat);
3812 xnn_params.f32.gemm.linear.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__wasmsimd_splat);
3813 xnn_params.f32.gemm.linear.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__wasmsimd_splat);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003814 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhan1bbf96b2020-06-15 23:01:20 -07003815 xnn_params.f32.gemm.mr = 5;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003816 xnn_params.f32.gemm.nr = 8;
Marat Dukhane39e6462020-07-09 01:33:36 -07003817
3818 xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm);
3819 xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_arm);
3820 xnn_params.f32.gemm2.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x2c4__wasmsimd);
3821 xnn_params.f32.gemm2.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2c4__wasmsimd);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003822 xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhane39e6462020-07-09 01:33:36 -07003823 xnn_params.f32.gemm2.mr = 4;
3824 xnn_params.f32.gemm2.nr = 2;
3825 xnn_params.f32.gemm2.log2_kr = 2;
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003826 }
Marat Dukhanaefaef32020-04-09 07:09:34 -07003827
Marat Dukhanac014d72020-06-16 08:36:47 -07003828 if (is_wasm_x86) {
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003829 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x3__wasmsimd_x86;
3830 xnn_params.f32.dwconv[0].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up8x3__wasmsimd;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003831 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanac014d72020-06-16 08:36:47 -07003832 xnn_params.f32.dwconv[0].channel_tile = 8;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003833 xnn_params.f32.dwconv[0].primary_tile = 3;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003834
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003835 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x4__wasmsimd_x86;
3836 xnn_params.f32.dwconv[1].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up8x4__wasmsimd;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003837 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanac014d72020-06-16 08:36:47 -07003838 xnn_params.f32.dwconv[1].channel_tile = 8;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003839 xnn_params.f32.dwconv[1].primary_tile = 4;
3840
3841 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x9__wasmsimd_x86;
3842 xnn_params.f32.dwconv[2].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up8x9__wasmsimd;
3843 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
3844 xnn_params.f32.dwconv[2].channel_tile = 8;
3845 xnn_params.f32.dwconv[2].primary_tile = 9;
Marat Dukhanac014d72020-06-16 08:36:47 -07003846 } else {
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003847 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up4x3__wasmsimd_arm;
3848 xnn_params.f32.dwconv[0].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up4x3__wasmsimd;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003849 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanac014d72020-06-16 08:36:47 -07003850 xnn_params.f32.dwconv[0].channel_tile = 4;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003851 xnn_params.f32.dwconv[0].primary_tile = 3;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003852
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003853 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up4x4__wasmsimd_arm;
3854 xnn_params.f32.dwconv[1].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up4x4__wasmsimd;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003855 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanac014d72020-06-16 08:36:47 -07003856 xnn_params.f32.dwconv[1].channel_tile = 4;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003857 xnn_params.f32.dwconv[1].primary_tile = 4;
3858
3859 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up4x9__wasmsimd_arm;
3860 xnn_params.f32.dwconv[2].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up4x9__wasmsimd;
3861 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
3862 xnn_params.f32.dwconv[2].channel_tile = 4;
3863 xnn_params.f32.dwconv[2].primary_tile = 9;
Marat Dukhanac014d72020-06-16 08:36:47 -07003864 }
3865
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003866 xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up4x25__wasmsimd_arm;
3867 xnn_params.f32.dwconv[3].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up4x25__wasmsimd;
3868 xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_scalar_params;
3869 xnn_params.f32.dwconv[3].channel_tile = 4;
3870 xnn_params.f32.dwconv[3].primary_tile = 25;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003871
Marat Dukhan3b7432d2020-07-16 17:46:32 -07003872 if (is_wasm_x86) {
3873 xnn_params.f32.avgpool = (struct avgpool_parameters) {
3874 .up = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__wasmsimd_x86_c4,
3875 .mp = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__wasmsimd_x86_c4,
3876 .mr = 9,
3877 .qr = 8,
3878 };
Marat Dukhan1483c532020-07-16 18:08:19 -07003879 xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
3880 .up = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__wasmsimd_x86_c4,
3881 .mp = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__wasmsimd_x86_c4,
3882 .mr = 9,
3883 .qr = 8,
3884 };
Marat Dukhanc6016802020-07-16 18:51:28 -07003885 xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
3886 .up = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_x86_c4,
3887 .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_x86_c4,
3888 .mr = 7,
3889 };
Marat Dukhan3b7432d2020-07-16 17:46:32 -07003890 } else {
3891 xnn_params.f32.avgpool = (struct avgpool_parameters) {
3892 .up = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__wasmsimd_arm_c4,
3893 .mp = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__wasmsimd_arm_c4,
3894 .mr = 9,
3895 .qr = 8,
3896 };
Marat Dukhan1483c532020-07-16 18:08:19 -07003897 xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
3898 .up = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__wasmsimd_arm_c4,
3899 .mp = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__wasmsimd_arm_c4,
3900 .mr = 9,
3901 .qr = 8,
3902 };
Marat Dukhanc6016802020-07-16 18:51:28 -07003903 xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
3904 .up = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_arm_c4,
3905 .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_arm_c4,
3906 .mr = 7,
3907 };
Marat Dukhan3b7432d2020-07-16 17:46:32 -07003908 }
Marat Dukhanf6e24802020-07-08 22:20:40 -07003909 if (is_wasm_x86) {
3910 xnn_params.f32.maxpool = (struct maxpool_parameters) {
3911 .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__wasmsimd_x86_c4,
Marat Dukhan91ae1652021-08-15 19:19:49 -07003912 .init.f32 = xnn_init_f32_minmax_scalar_params,
Marat Dukhanf6e24802020-07-08 22:20:40 -07003913 .mr = 9,
3914 .qr = 8,
3915 };
3916 } else {
3917 xnn_params.f32.maxpool = (struct maxpool_parameters) {
3918 .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__wasmsimd_arm_c4,
Marat Dukhan91ae1652021-08-15 19:19:49 -07003919 .init.f32 = xnn_init_f32_minmax_scalar_params,
Marat Dukhanf6e24802020-07-08 22:20:40 -07003920 .mr = 9,
3921 .qr = 8,
3922 };
3923 }
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003924 xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
Marat Dukhan40f05522020-07-16 22:33:12 -07003925 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__wasmsimd_c4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003926 .mr = 4,
3927 };
3928 xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
Marat Dukhan40f05522020-07-16 22:33:12 -07003929 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__wasmsimd_c4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003930 .mr = 9,
3931 };
3932 xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
Marat Dukhan40f05522020-07-16 22:33:12 -07003933 .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__wasmsimd_c4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003934 .mr = 9,
3935 .qr = 8,
3936 };
Marat Dukhan660fd192020-03-10 04:55:30 -07003937 xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
Marat Dukhan00d1d6e2020-07-09 01:37:27 -07003938 .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__wasmsimd_c8,
Marat Dukhan69722492019-11-11 19:55:50 -08003939 .pixel_tile = 1,
3940 .channel_tile = 8,
3941 };
Marat Dukhan37c83512020-06-29 13:25:53 -07003942 xnn_params.f32.abs = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__wasmsimd_x8;
Marat Dukhanc303fe62020-06-26 10:09:25 -07003943 if (is_wasm_x86) {
Marat Dukhan94912792021-08-16 21:40:30 -07003944 xnn_params.f32.clamp = (struct vunary_parameters) {
3945 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vclamp_ukernel__wasmsimd_x86_x8,
3946 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
3947 .element_tile = 8,
3948 };
Marat Dukhanc303fe62020-06-26 10:09:25 -07003949 } else {
Marat Dukhan94912792021-08-16 21:40:30 -07003950 xnn_params.f32.clamp = (struct vunary_parameters) {
3951 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vclamp_ukernel__wasmsimd_arm_x8,
3952 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
3953 .element_tile = 8,
3954 };
Marat Dukhanc303fe62020-06-26 10:09:25 -07003955 }
Marat Dukhanb6bd4bc2020-12-01 17:01:40 -08003956 if (is_wasm_x86) {
3957 xnn_params.f32.elu = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__wasmsimd_x86_rr2_p6_x20;
3958 } else {
3959 xnn_params.f32.elu = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__wasmsimd_arm_rr2_p6_x20;
3960 }
Marat Dukhan6674d692021-05-05 22:27:00 -07003961 xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__wasmsimd_x16;
Marat Dukhanf4935a22020-07-16 15:59:10 -07003962 if (is_wasm_x86) {
3963 xnn_params.f32.lrelu = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__wasmsimd_minmax_x8;
3964 } else {
3965 xnn_params.f32.lrelu = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__wasmsimd_bitselect_x8;
3966 }
Marat Dukhan37c83512020-06-29 13:25:53 -07003967 xnn_params.f32.neg = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__wasmsimd_x8;
Marat Dukhan6674d692021-05-05 22:27:00 -07003968 xnn_params.f32.relu = (xnn_univector_ukernel_function) xnn_f32_vrelu_ukernel__wasmsimd_x16;
Marat Dukhan189c1d02021-09-03 15:39:54 -07003969 #if defined(XNN_WASMSIMD_VERSION) && (XNN_WASMSIMD_VERSION >= 91)
3970 xnn_params.f32.rndne = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__wasmsimd_native_x8;
3971 xnn_params.f32.rndz = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__wasmsimd_native_x8;
3972 xnn_params.f32.rndu = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__wasmsimd_native_x8;
3973 xnn_params.f32.rndd = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__wasmsimd_native_x8;
3974 #else // XNN_WASMSIMD_VERSION >= 91
3975 xnn_params.f32.rndne = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__wasmsimd_addsub_x8;
3976 if (is_wasm_x86) {
3977 xnn_params.f32.rndz = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__wasmsimd_addsub_x8;
3978 } else {
3979 xnn_params.f32.rndz = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__wasmsimd_cvt_x8;
3980 }
3981 xnn_params.f32.rndu = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__wasmsimd_addsub_x8;
3982 xnn_params.f32.rndd = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__wasmsimd_addsub_x8;
3983 #endif // XNN_WASMSIMD_VERSION >= 91
Marat Dukhan6674d692021-05-05 22:27:00 -07003984 xnn_params.f32.sigmoid = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__wasmsimd_p5_div_x16;
Marat Dukhan37c83512020-06-29 13:25:53 -07003985 xnn_params.f32.sqr = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__wasmsimd_x8;
Marat Dukhan6804bbd2020-06-30 19:26:11 -07003986 xnn_params.f32.sqrt = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__wasmsimd_sqrt_x8;
Marat Dukhan195f8eb2020-06-25 12:50:57 -07003987 if (is_wasm_x86) {
3988 xnn_params.f32.prelu = (struct prelu_parameters) {
Marat Dukhan78299282020-07-15 17:38:06 -07003989 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__wasmsimd_minmax_2x8,
Marat Dukhan195f8eb2020-06-25 12:50:57 -07003990 .row_tile = 2,
3991 .channel_tile = 8,
3992 };
3993 } else {
3994 xnn_params.f32.prelu = (struct prelu_parameters) {
Marat Dukhan78299282020-07-15 17:38:06 -07003995 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__wasmsimd_bitselect_2x8,
Marat Dukhan195f8eb2020-06-25 12:50:57 -07003996 .row_tile = 2,
3997 .channel_tile = 8,
3998 };
3999 }
Marat Dukhan52238f02020-07-16 15:30:28 -07004000 xnn_params.f32.raddstoreexpminusmax = xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_p5_x16_acc2;
Marat Dukhancdc56552020-06-26 19:49:41 -07004001 if (is_wasm_x86) {
Marat Dukhan0bf8afa2021-09-20 10:02:18 -07004002 xnn_params.f32.rmax = xnn_f32_rmax_ukernel__wasmsimd_x86;
Marat Dukhancdc56552020-06-26 19:49:41 -07004003 xnn_params.f32.vadd = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07004004 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__wasmsimd_x86_x16,
4005 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasmsimd_x86_x16,
4006 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasmsimd_x86_x16,
4007 .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_ukernel__wasmsimd_x16,
4008 .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__wasmsimd_x16,
4009 .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__wasmsimd_x16,
4010 .element_tile = 16,
Marat Dukhancdc56552020-06-26 19:49:41 -07004011 };
4012 xnn_params.f32.vdiv = (struct vbinary_parameters) {
Frank Barchardb392f8e2020-10-27 10:46:44 -07004013 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__wasmsimd_x86_x16,
4014 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__wasmsimd_x86_x16,
4015 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__wasmsimd_x86_x16,
4016 .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_ukernel__wasmsimd_x16,
4017 .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_ukernel__wasmsimd_x16,
4018 .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_ukernel__wasmsimd_x16,
4019 .element_tile = 16,
Marat Dukhancdc56552020-06-26 19:49:41 -07004020 };
4021 xnn_params.f32.vmax = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07004022 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__wasmsimd_x86_x16,
4023 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasmsimd_x86_x16,
4024 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasmsimd_x86_x16,
4025 .element_tile = 16,
Marat Dukhancdc56552020-06-26 19:49:41 -07004026 };
4027 xnn_params.f32.vmin = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07004028 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__wasmsimd_x86_x16,
4029 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasmsimd_x86_x16,
4030 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasmsimd_x86_x16,
Frank Barchardc67dd7f2020-07-06 11:23:57 -07004031
Frank Barchard9c7308f2020-08-31 17:03:01 -07004032 .element_tile = 16,
Marat Dukhancdc56552020-06-26 19:49:41 -07004033 };
4034 xnn_params.f32.vmul = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07004035 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__wasmsimd_x86_x16,
4036 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasmsimd_x86_x16,
4037 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasmsimd_x86_x16,
4038 .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_ukernel__wasmsimd_x16,
4039 .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__wasmsimd_x16,
4040 .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__wasmsimd_x16,
4041 .element_tile = 16,
Marat Dukhancdc56552020-06-26 19:49:41 -07004042 };
4043 xnn_params.f32.vsub = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07004044 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__wasmsimd_x86_x16,
4045 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__wasmsimd_x86_x16,
4046 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__wasmsimd_x86_x16,
4047 .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_ukernel__wasmsimd_x16,
4048 .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_ukernel__wasmsimd_x16,
4049 .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_ukernel__wasmsimd_x16,
4050 .element_tile = 16,
Marat Dukhancdc56552020-06-26 19:49:41 -07004051 };
4052 } else {
Marat Dukhan0bf8afa2021-09-20 10:02:18 -07004053 xnn_params.f32.rmax = xnn_f32_rmax_ukernel__wasmsimd_arm;
Marat Dukhancdc56552020-06-26 19:49:41 -07004054 xnn_params.f32.vadd = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07004055 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__wasmsimd_arm_x16,
4056 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasmsimd_arm_x16,
4057 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasmsimd_arm_x16,
4058 .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_ukernel__wasmsimd_x16,
4059 .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__wasmsimd_x16,
4060 .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__wasmsimd_x16,
4061 .element_tile = 16,
Marat Dukhancdc56552020-06-26 19:49:41 -07004062 };
4063 xnn_params.f32.vdiv = (struct vbinary_parameters) {
Frank Barchardb392f8e2020-10-27 10:46:44 -07004064 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__wasmsimd_arm_x16,
4065 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__wasmsimd_arm_x16,
4066 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__wasmsimd_arm_x16,
4067 .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_ukernel__wasmsimd_x16,
4068 .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_ukernel__wasmsimd_x16,
4069 .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_ukernel__wasmsimd_x16,
4070 .element_tile = 16,
Marat Dukhancdc56552020-06-26 19:49:41 -07004071 };
4072 xnn_params.f32.vmax = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07004073 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__wasmsimd_arm_x16,
4074 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasmsimd_arm_x16,
4075 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasmsimd_arm_x16,
4076 .element_tile = 16,
Marat Dukhancdc56552020-06-26 19:49:41 -07004077 };
4078 xnn_params.f32.vmin = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07004079 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__wasmsimd_arm_x16,
4080 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasmsimd_arm_x16,
4081 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasmsimd_arm_x16,
4082 .element_tile = 16,
Marat Dukhancdc56552020-06-26 19:49:41 -07004083 };
4084 xnn_params.f32.vmul = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07004085 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__wasmsimd_arm_x16,
4086 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasmsimd_arm_x16,
4087 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasmsimd_arm_x16,
4088 .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_ukernel__wasmsimd_x16,
4089 .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__wasmsimd_x16,
4090 .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__wasmsimd_x16,
4091 .element_tile = 16,
Marat Dukhancdc56552020-06-26 19:49:41 -07004092 };
4093 xnn_params.f32.vsub = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07004094 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__wasmsimd_arm_x16,
4095 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__wasmsimd_arm_x16,
4096 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__wasmsimd_arm_x16,
4097 .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_ukernel__wasmsimd_x16,
4098 .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_ukernel__wasmsimd_x16,
4099 .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_ukernel__wasmsimd_x16,
4100 .element_tile = 16,
Marat Dukhancdc56552020-06-26 19:49:41 -07004101 };
4102 }
Marat Dukhanf7399262020-06-05 10:58:44 -07004103 xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07004104 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__wasmsimd_x16,
4105 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__wasmsimd_x16,
4106 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__wasmsimd_x16,
4107 .element_tile = 16,
Marat Dukhanf7399262020-06-05 10:58:44 -07004108 };
Marat Dukhand816f622020-07-15 10:14:39 -07004109 if (is_wasm_x86) {
4110 xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
Marat Dukhan9531e9f2020-07-24 15:25:02 -07004111 .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c4__wasmsimd_x86_2x,
Marat Dukhanc58453f2021-05-24 14:47:38 -07004112 .init.f32 = xnn_init_f32_minmax_scalar_params,
Marat Dukhand816f622020-07-15 10:14:39 -07004113 .channel_tile = 4,
4114 .row_tile = 2,
4115 };
4116 } else {
4117 xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
Marat Dukhan9531e9f2020-07-24 15:25:02 -07004118 .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c4__wasmsimd_arm_2x,
Marat Dukhanc58453f2021-05-24 14:47:38 -07004119 .init.f32 = xnn_init_f32_minmax_scalar_params,
Marat Dukhand816f622020-07-15 10:14:39 -07004120 .channel_tile = 4,
4121 .row_tile = 2,
4122 };
4123 }
Erich Elsen6e80fdc2020-06-09 15:35:37 -07004124 #ifndef XNN_NO_NCHW_OPERATORS
Marat Dukhancfbed0a2020-12-08 10:01:51 -08004125 init_flags |= XNN_INIT_FLAG_CHW_OPT;
4126
Frank Barchard498cb502020-11-16 23:50:04 -08004127 if (is_wasm_x86) {
4128 xnn_params.f32.spmm = (struct spmm_parameters) {
4129 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x1__wasmsimd_x86,
4130 .mr = 32,
4131 .nr = 1,
4132 };
4133 } else {
4134 xnn_params.f32.spmm = (struct spmm_parameters) {
4135 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x1__wasmsimd_arm,
4136 .mr = 32,
4137 .nr = 1,
4138 };
4139 }
Erich Elsen0a1970e2020-06-10 09:24:59 -07004140 xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
4141 .ukernel_with_symm_padding =
Frank Barchard22136062020-11-24 18:44:46 -08004142 (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2,
Erich Elsen0a1970e2020-06-10 09:24:59 -07004143 .output_channel_tile = 4,
4144 .output_height_tile = 2,
4145 .output_width_tile = 2,
4146 };
Frank Barcharddb5c32d2020-11-16 23:58:42 -08004147 if (is_wasm_x86) {
4148 xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
Frank Barchard412e2f42020-12-11 11:40:50 -08004149 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_2x4,
Frank Barcharddb5c32d2020-11-16 23:58:42 -08004150 .output_width_tile = 4,
Frank Barchard97883b82020-11-23 13:01:03 -08004151 .output_height_tile = 2,
Frank Barcharddb5c32d2020-11-16 23:58:42 -08004152 };
4153 xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
Frank Barchard412e2f42020-12-11 11:40:50 -08004154 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_1x4_acc2,
Frank Barcharddb5c32d2020-11-16 23:58:42 -08004155 .output_width_tile = 4,
4156 .output_height_tile = 1,
4157 };
4158 xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
Frank Barchard412e2f42020-12-11 11:40:50 -08004159 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4,
Frank Barcharddb5c32d2020-11-16 23:58:42 -08004160 .output_width_tile = 4,
4161 .output_height_tile = 3,
4162 };
4163 xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
Frank Barchard412e2f42020-12-11 11:40:50 -08004164 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4_acc2,
Frank Barcharddb5c32d2020-11-16 23:58:42 -08004165 .output_width_tile = 4,
4166 .output_height_tile = 1,
4167 };
4168 } else {
4169 xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
Frank Barchard412e2f42020-12-11 11:40:50 -08004170 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_2x4,
Frank Barcharddb5c32d2020-11-16 23:58:42 -08004171 .output_width_tile = 4,
Frank Barchard97883b82020-11-23 13:01:03 -08004172 .output_height_tile = 2,
Frank Barcharddb5c32d2020-11-16 23:58:42 -08004173 };
4174 xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
Frank Barchard412e2f42020-12-11 11:40:50 -08004175 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_1x4_acc4,
Frank Barcharddb5c32d2020-11-16 23:58:42 -08004176 .output_width_tile = 4,
4177 .output_height_tile = 1,
4178 };
4179 xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
Frank Barchard412e2f42020-12-11 11:40:50 -08004180 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4,
Frank Barcharddb5c32d2020-11-16 23:58:42 -08004181 .output_width_tile = 4,
4182 .output_height_tile = 3,
4183 };
4184 xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
Frank Barchard412e2f42020-12-11 11:40:50 -08004185 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4_acc2,
Frank Barcharddb5c32d2020-11-16 23:58:42 -08004186 .output_width_tile = 4,
4187 .output_height_tile = 1,
4188 };
4189 }
Marat Dukhanc5045bf2020-07-27 18:16:35 -07004190 if (is_wasm_x86) {
4191 xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
4192 .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__wasmsimd_x86_x4,
4193 .channel_tile = 4,
4194 };
4195 } else {
4196 xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
4197 .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__wasmsimd_arm_x4,
4198 .channel_tile = 4,
4199 };
4200 }
Artsiom Ablavatski97918102020-10-27 15:52:59 -07004201 xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
4202 .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__wasmsimd_p8,
4203 .channel_tile = 1,
Artsiom Ablavatskib3ffd582021-03-31 13:00:08 -07004204 .pixel_tile = 8,
Artsiom Ablavatski97918102020-10-27 15:52:59 -07004205 };
Erich Elsen6e80fdc2020-06-09 15:35:37 -07004206 #endif // XNN_NO_NCHW_OPERATORS
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004207 #endif // XNN_NO_F32_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07004208
Marat Dukhanaf2ba002021-10-24 14:21:41 -07004209 /*************************** VCVT micro-kernels ***************************/
4210 #ifndef XNN_NO_VCVT_OPERATORS
4211 init_flags |= XNN_INIT_FLAG_VCVT;
4212
4213 xnn_params.vcvt.f16_to_f32 = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_x16;
Marat Dukhana0c61682021-11-10 19:23:41 -08004214 xnn_params.vcvt.f32_to_f16 = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__wasmsimd_x24;
Marat Dukhand52d20b2021-12-05 09:50:25 -08004215 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
4216 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__wasmsimd_magic_x32,
4217 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_wasmsimd_magic_params,
4218 .element_tile = 32,
4219 };
4220 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
4221 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__wasmsimd_magic_x32,
4222 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_wasmsimd_magic_params,
4223 .element_tile = 32,
4224 };
Marat Dukhanaf2ba002021-10-24 14:21:41 -07004225 #endif // XNN_NO_VCVT_OPERATORS
4226
XNNPACK Teamb455b122019-09-27 18:10:33 -07004227 /**************************** X32 micro-kernels ****************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004228 #ifndef XNN_NO_X32_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07004229 init_flags |= XNN_INIT_FLAG_X32;
4230
Marat Dukhan9d4bfa22020-07-16 19:07:04 -07004231 xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__wasmsimd;
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004232 xnn_params.x32.zip = (struct zip_parameters) {
Marat Dukhane3b78762020-07-16 20:02:58 -07004233 .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__wasmsimd,
4234 .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__wasmsimd,
4235 .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__wasmsimd,
4236 .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__wasmsimd,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004237 };
Artsiom Ablavatskibbe85062020-11-05 14:07:37 -08004238 #ifndef XNN_NO_NCHW_OPERATORS
Marat Dukhanad71b9a2020-11-20 00:01:51 -08004239 xnn_params.x32.depthtospace2d_chw2hwc = (struct depthtospace2d_chw2hwc_parameters) {
4240 .ukernel = (xnn_depthtospace2d_chw2hwc_ukernel_function) xnn_x32_depthtospace2d_chw2hwc_ukernel__scalar,
Artsiom Ablavatskibbe85062020-11-05 14:07:37 -08004241 .channel_tile = 1,
4242 .pixel_tile = 1,
4243 };
4244 #endif // XNN_NO_NCHW_OPERATORS
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004245 #endif // XNN_NO_X32_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07004246
Marat Dukhan048931b2020-11-24 20:53:54 -08004247 /**************************** XX micro-kernels ****************************/
4248 #ifndef XNN_NO_XX_OPERATORS
4249 init_flags |= XNN_INIT_FLAG_XX;
4250
4251 xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
Marat Dukhan933051b2021-08-07 16:26:15 -07004252 xnn_params.xx.fill = (struct fill_parameters) {
4253 .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__wasmsimd_x64,
4254 .row_tile = 1,
4255 };
Marat Dukhan0461f2d2021-08-08 12:36:29 -07004256 xnn_params.xx.pad = (struct pad_parameters) {
4257 .ukernel = (xnn_pad_ukernel_function) xnn_xx_pad_ukernel__wasmsimd,
4258 .row_tile = 1,
4259 };
Marat Dukhan048931b2020-11-24 20:53:54 -08004260 #endif
4261
Marat Dukhan933051b2021-08-07 16:26:15 -07004262#elif XNN_ARCH_WASM
4263
Marat Dukhan898d5852021-06-30 21:18:34 -07004264 /**************************** QC8 micro-kernels ****************************/
4265 #ifndef XNN_NO_QC8_OPERATORS
4266 init_flags |= XNN_INIT_FLAG_QC8;
4267
4268 if (is_wasm_x86) {
4269 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x2__scalar_magic);
4270 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x2__scalar_magic);
4271 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x2__scalar_magic);
4272 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x2__scalar_magic);
4273 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_scalar_magic_params;
4274 xnn_params.qc8.gemm.mr = 2;
4275 xnn_params.qc8.gemm.nr = 2;
4276 } else {
4277 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x4__scalar_magic);
4278 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x4__scalar_magic);
4279 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_magic);
4280 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4__scalar_magic);
4281 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_scalar_magic_params;
4282 xnn_params.qc8.gemm.mr = 4;
4283 xnn_params.qc8.gemm.nr = 4;
4284 }
4285
4286 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up2x9__scalar_magic;
4287 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_scalar_magic_params;
4288 xnn_params.qc8.dwconv[0].channel_tile = 2;
4289 xnn_params.qc8.dwconv[0].primary_tile = 9;
4290 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up2x25__scalar_magic;
4291 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_scalar_magic_params;
4292 xnn_params.qc8.dwconv[1].channel_tile = 2;
4293 xnn_params.qc8.dwconv[1].primary_tile = 25;
4294 #endif // XNN_NO_QC8_OPERATORS
4295
Marat Dukhan803c1f82021-05-12 00:13:37 -07004296 /**************************** QS8 micro-kernels ****************************/
4297 #ifndef XNN_NO_QS8_OPERATORS
4298 init_flags |= XNN_INIT_FLAG_QS8;
4299
4300 if (is_wasm_x86) {
Marat Dukhan3d76e552021-07-15 18:54:01 -07004301 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_2x2__scalar_magic);
4302 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_2x2__scalar_magic);
4303 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x2__scalar_magic);
4304 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x2__scalar_magic);
4305 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_magic_params;
Marat Dukhan803c1f82021-05-12 00:13:37 -07004306 xnn_params.qs8.gemm.mr = 2;
4307 xnn_params.qs8.gemm.nr = 2;
4308 } else {
Marat Dukhan3d76e552021-07-15 18:54:01 -07004309 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_4x4__scalar_magic);
4310 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_4x4__scalar_magic);
4311 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4__scalar_magic);
4312 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4__scalar_magic);
4313 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_magic_params;
Marat Dukhan803c1f82021-05-12 00:13:37 -07004314 xnn_params.qs8.gemm.mr = 4;
4315 xnn_params.qs8.gemm.nr = 4;
4316 }
4317
Marat Dukhan3d76e552021-07-15 18:54:01 -07004318 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up2x9__scalar_magic;
4319 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_magic_params;
Marat Dukhan803c1f82021-05-12 00:13:37 -07004320 xnn_params.qs8.dwconv[0].channel_tile = 2;
4321 xnn_params.qs8.dwconv[0].primary_tile = 9;
Marat Dukhan3d76e552021-07-15 18:54:01 -07004322 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up2x25__scalar_magic;
4323 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_magic_params;
Marat Dukhan4ed14882021-05-12 17:50:40 -07004324 xnn_params.qs8.dwconv[1].channel_tile = 2;
4325 xnn_params.qs8.dwconv[1].primary_tile = 25;
Marat Dukhan803c1f82021-05-12 00:13:37 -07004326
4327 xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
4328 .up = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_ukernel_7x__scalar_c4,
4329 .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_ukernel_7p7x__scalar_c4,
4330 .mr = 7,
4331 };
4332
4333 xnn_params.qs8.vadd = (struct vbinary_parameters) {
4334 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__scalar_x4,
4335 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__scalar_x4,
4336 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__scalar_x4,
Marat Dukhan64287252021-09-07 16:20:03 -07004337 .init.qs8_addsub = xnn_init_qs8_add_minmax_scalar_params,
Marat Dukhan803c1f82021-05-12 00:13:37 -07004338 .element_tile = 4,
4339 };
Marat Dukhan0853b8a2021-08-03 01:01:53 -07004340 xnn_params.qs8.vmul = (struct vbinary_parameters) {
4341 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_fp32_ukernel__scalar_x4,
4342 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_x4,
4343 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_x4,
4344 .init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_scalar_params,
4345 .element_tile = 4,
4346 };
Marat Dukhan803c1f82021-05-12 00:13:37 -07004347 #endif // XNN_NO_QS8_OPERATORS
4348
Marat Dukhan08b7a972020-07-14 18:17:29 -07004349 /**************************** QU8 micro-kernels ****************************/
4350 #ifndef XNN_NO_QU8_OPERATORS
4351 init_flags |= XNN_INIT_FLAG_QU8;
Marat Dukhan854fb6b2020-06-19 12:33:44 -07004352
Marat Dukhan3d76e552021-07-15 18:54:01 -07004353 if (is_wasm_x86) {
4354 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_magic);
4355 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_2x2__scalar_magic);
4356 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_magic);
4357 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_magic);
4358 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_magic_params;
4359 xnn_params.qu8.gemm.mr = 2;
4360 xnn_params.qu8.gemm.nr = 2;
4361 } else {
4362 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_magic);
4363 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_4x4__scalar_magic);
4364 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_magic);
4365 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4__scalar_magic);
4366 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_magic_params;
4367 xnn_params.qu8.gemm.mr = 4;
4368 xnn_params.qu8.gemm.nr = 4;
4369 }
Marat Dukhanaefaef32020-04-09 07:09:34 -07004370
Marat Dukhan3d76e552021-07-15 18:54:01 -07004371 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up2x9__scalar_magic;
4372 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_magic_params;
4373 xnn_params.qu8.dwconv[0].channel_tile = 2;
Marat Dukhan08b7a972020-07-14 18:17:29 -07004374 xnn_params.qu8.dwconv[0].primary_tile = 9;
Marat Dukhan3d76e552021-07-15 18:54:01 -07004375 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up2x25__scalar_magic;
4376 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_magic_params;
4377 xnn_params.qu8.dwconv[1].channel_tile = 2;
4378 xnn_params.qu8.dwconv[1].primary_tile = 25;
Marat Dukhanaefaef32020-04-09 07:09:34 -07004379
Marat Dukhan08b7a972020-07-14 18:17:29 -07004380 xnn_params.qu8.avgpool = (struct avgpool_parameters) {
4381 .up = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__scalar_c1,
4382 .mp = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__scalar_c1,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004383 .mr = 9,
4384 .qr = 8,
4385 };
Marat Dukhan08b7a972020-07-14 18:17:29 -07004386 xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
4387 .up = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_ukernel_7x__scalar_c1,
4388 .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_ukernel_7p7x__scalar_c1,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004389 .mr = 7,
4390 };
Marat Dukhandb007cd2021-07-20 23:42:39 -07004391
4392 xnn_params.qu8.vadd = (struct vbinary_parameters) {
4393 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__scalar_x4,
4394 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__scalar_x4,
4395 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__scalar_x4,
Marat Dukhan64287252021-09-07 16:20:03 -07004396 .init.qu8_addsub = xnn_init_qu8_add_minmax_scalar_params,
Marat Dukhandb007cd2021-07-20 23:42:39 -07004397 .element_tile = 4,
4398 };
Marat Dukhan0853b8a2021-08-03 01:01:53 -07004399 xnn_params.qu8.vmul = (struct vbinary_parameters) {
4400 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_fp32_ukernel__scalar_x4,
4401 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_x4,
4402 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_x4,
4403 .init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_scalar_params,
4404 .element_tile = 4,
4405 };
Marat Dukhan08b7a972020-07-14 18:17:29 -07004406 #endif // XNN_NO_QU8_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07004407
Marat Dukhandc5c1482021-08-16 09:03:15 -07004408 /**************************** S8 micro-kernels ****************************/
4409 #ifndef XNN_NO_S8_OPERATORS
4410 init_flags |= XNN_INIT_FLAG_S8;
4411
Marat Dukhan61c0c9e2021-08-16 23:16:14 -07004412 xnn_params.s8.clamp = (struct vunary_parameters) {
4413 .ukernel = (xnn_univector_ukernel_function) xnn_s8_vclamp_ukernel__scalar_x4,
4414 .init.s8_minmax = xnn_init_s8_minmax_scalar_params,
4415 .element_tile = 4,
4416 };
Marat Dukhan24abe6b2021-11-24 15:28:57 -08004417 xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
4418 .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__scalar_c1,
4419 .pixel_tile = 1,
4420 .channel_tile = 1,
4421 };
Marat Dukhandc5c1482021-08-16 09:03:15 -07004422 xnn_params.s8.maxpool = (struct maxpool_parameters) {
4423 .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__scalar_c1,
4424 .init.s8 = xnn_init_s8_minmax_scalar_params,
4425 .mr = 9,
4426 .qr = 8,
4427 };
4428 #endif // XNN_NO_S8_OPERATORS
4429
XNNPACK Teamb455b122019-09-27 18:10:33 -07004430 /**************************** U8 micro-kernels ****************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004431 #ifndef XNN_NO_U8_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07004432 init_flags |= XNN_INIT_FLAG_U8;
4433
Marat Dukhan94912792021-08-16 21:40:30 -07004434 xnn_params.u8.clamp = (struct vunary_parameters) {
4435 .ukernel = (xnn_univector_ukernel_function) xnn_u8_vclamp_ukernel__scalar_x4,
4436 .init.u8_minmax = xnn_init_u8_minmax_scalar_params,
4437 .element_tile = 4,
4438 };
Marat Dukhan24abe6b2021-11-24 15:28:57 -08004439 xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
4440 .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__scalar_c1,
4441 .pixel_tile = 1,
4442 .channel_tile = 1,
4443 };
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004444 xnn_params.u8.maxpool = (struct maxpool_parameters) {
Marat Dukhan99936602020-04-11 16:47:01 -07004445 .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__scalar_c1,
Marat Dukhan91ae1652021-08-15 19:19:49 -07004446 .init.u8 = xnn_init_u8_minmax_scalar_params,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004447 .mr = 9,
4448 .qr = 8,
4449 };
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004450 xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
4451 xnn_params.u8.rmax = xnn_u8_rmax_ukernel__scalar;
4452 #endif // XNN_NO_U8_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07004453
4454 /**************************** X8 micro-kernels ****************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004455 #ifndef XNN_NO_X8_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07004456 init_flags |= XNN_INIT_FLAG_X8;
4457
Marat Dukhand67539d2021-09-08 23:06:03 -07004458 xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar_x4;
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004459 xnn_params.x8.zip = (struct zip_parameters) {
4460 .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__scalar,
4461 .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__scalar,
4462 .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__scalar,
4463 .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__scalar,
4464 };
4465 #endif // XNN_NO_X8_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07004466
4467 /**************************** F32 micro-kernels ****************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004468 #ifndef XNN_NO_F32_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07004469 init_flags |= XNN_INIT_FLAG_F32;
4470
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004471 if (is_wasm_x86) {
Marat Dukhanaefaef32020-04-09 07:09:34 -07004472 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_2x4__scalar);
4473 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_2x4__scalar);
4474 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x4__wasm);
4475 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x4__wasm);
Marat Dukhan467f6362020-05-22 23:21:55 -07004476 xnn_params.f32.gemm.relu.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_2x4__scalar);
4477 xnn_params.f32.gemm.relu.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_2x4__scalar);
4478 xnn_params.f32.gemm.relu.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x4__wasm);
4479 xnn_params.f32.gemm.relu.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x4__wasm);
Marat Dukhan869c62d2020-04-09 17:17:55 -07004480 xnn_params.f32.gemm.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_2x4__scalar);
4481 xnn_params.f32.gemm.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_2x4__scalar);
4482 xnn_params.f32.gemm.linear.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x4__wasm);
4483 xnn_params.f32.gemm.linear.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x4__wasm);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07004484 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07004485 xnn_params.f32.gemm.mr = 2;
4486 xnn_params.f32.gemm.nr = 4;
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004487 } else {
Marat Dukhanaefaef32020-04-09 07:09:34 -07004488 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x4__wasm);
4489 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x4__wasm);
4490 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x4__wasm);
4491 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x4__wasm);
Marat Dukhan467f6362020-05-22 23:21:55 -07004492 xnn_params.f32.gemm.relu.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_4x4__wasm);
4493 xnn_params.f32.gemm.relu.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_4x4__wasm);
4494 xnn_params.f32.gemm.relu.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x4__wasm);
4495 xnn_params.f32.gemm.relu.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x4__wasm);
Marat Dukhan869c62d2020-04-09 17:17:55 -07004496 xnn_params.f32.gemm.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x4__wasm);
4497 xnn_params.f32.gemm.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x4__wasm);
4498 xnn_params.f32.gemm.linear.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x4__wasm);
4499 xnn_params.f32.gemm.linear.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x4__wasm);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07004500 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07004501 xnn_params.f32.gemm.mr = 4;
4502 xnn_params.f32.gemm.nr = 4;
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004503 }
Marat Dukhanaefaef32020-04-09 07:09:34 -07004504 xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2__wasm);
4505 xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2__wasm),
Marat Dukhan869c62d2020-04-09 17:17:55 -07004506 xnn_params.f32.gemm2.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x2__wasm);
4507 xnn_params.f32.gemm2.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2__wasm),
Marat Dukhan104ae5e2021-05-24 13:41:57 -07004508 xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07004509 xnn_params.f32.gemm2.mr = 4;
4510 xnn_params.f32.gemm2.nr = 2;
4511
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07004512 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x3__wasm_acc2;
4513 xnn_params.f32.dwconv[0].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x3__wasm_acc2;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07004514 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07004515 xnn_params.f32.dwconv[0].channel_tile = 1;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07004516 xnn_params.f32.dwconv[0].primary_tile = 3;
Marat Dukhanaefaef32020-04-09 07:09:34 -07004517
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07004518 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x4__wasm_acc2;
4519 xnn_params.f32.dwconv[1].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x4__wasm_acc2;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07004520 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07004521 xnn_params.f32.dwconv[1].channel_tile = 1;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07004522 xnn_params.f32.dwconv[1].primary_tile = 4;
Marat Dukhanaefaef32020-04-09 07:09:34 -07004523
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07004524 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x9__wasm_acc2;
4525 xnn_params.f32.dwconv[2].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x9__wasm_acc2;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07004526 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07004527 xnn_params.f32.dwconv[2].channel_tile = 1;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07004528 xnn_params.f32.dwconv[2].primary_tile = 9;
4529
4530 xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x25__wasm_acc2;
4531 xnn_params.f32.dwconv[3].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x25__wasm_acc2;
4532 xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_scalar_params;
4533 xnn_params.f32.dwconv[3].channel_tile = 1;
4534 xnn_params.f32.dwconv[3].primary_tile = 25;
Marat Dukhanaefaef32020-04-09 07:09:34 -07004535
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004536 xnn_params.f32.avgpool = (struct avgpool_parameters) {
Marat Dukhan99936602020-04-11 16:47:01 -07004537 .up = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__wasm_c1,
4538 .mp = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__wasm_c1,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004539 .mr = 9,
4540 .qr = 8,
4541 };
4542 xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
Marat Dukhan99936602020-04-11 16:47:01 -07004543 .up = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__wasm_c1,
4544 .mp = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__wasm_c1,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004545 .mr = 9,
4546 .qr = 8,
4547 };
4548 xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
Marat Dukhan99936602020-04-11 16:47:01 -07004549 .up = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__wasm_c1,
4550 .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__wasm_c1,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004551 .mr = 7,
4552 };
4553 xnn_params.f32.maxpool = (struct maxpool_parameters) {
Marat Dukhan99936602020-04-11 16:47:01 -07004554 .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__wasm_c1,
Marat Dukhan91ae1652021-08-15 19:19:49 -07004555 .init.f32 = xnn_init_f32_minmax_scalar_params,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004556 .mr = 9,
4557 .qr = 8,
4558 };
4559 xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
Marat Dukhan99936602020-04-11 16:47:01 -07004560 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__scalar_c1,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004561 .mr = 4,
4562 };
4563 xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
Marat Dukhan99936602020-04-11 16:47:01 -07004564 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__scalar_c1,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004565 .mr = 9,
4566 };
4567 xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
Marat Dukhan99936602020-04-11 16:47:01 -07004568 .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004569 .mr = 9,
4570 .qr = 8,
4571 };
Marat Dukhan660fd192020-03-10 04:55:30 -07004572 xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
4573 .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__scalar_c2,
Marat Dukhan69722492019-11-11 19:55:50 -08004574 .pixel_tile = 1,
4575 .channel_tile = 2,
4576 };
Marat Dukhan5020b962020-06-08 13:30:10 -07004577 xnn_params.f32.abs = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__scalar_x4;
Marat Dukhan94912792021-08-16 21:40:30 -07004578 xnn_params.f32.clamp = (struct vunary_parameters) {
4579 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vclamp_ukernel__wasm_x4,
4580 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
4581 .element_tile = 4,
4582 };
Marat Dukhanc303fe62020-06-26 10:09:25 -07004583 if (is_wasm_x86) {
Marat Dukhan6674d692021-05-05 22:27:00 -07004584 xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__scalar_x4;
Marat Dukhanc303fe62020-06-26 10:09:25 -07004585 } else {
Marat Dukhan6674d692021-05-05 22:27:00 -07004586 xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__wasm_x4;
Marat Dukhanc303fe62020-06-26 10:09:25 -07004587 }
Marat Dukhanb6bd4bc2020-12-01 17:01:40 -08004588 if (is_wasm_x86) {
4589 xnn_params.f32.elu = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x2;
4590 } else {
4591 xnn_params.f32.elu = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__wasm_rr2_p6_x6;
4592 }
Marat Dukhan28813332020-06-10 18:05:38 -07004593 xnn_params.f32.lrelu = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__scalar_x4;
Marat Dukhan5020b962020-06-08 13:30:10 -07004594 xnn_params.f32.neg = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__scalar_x4;
Frank Barchard62c5e232020-07-21 17:42:19 -07004595 if (is_wasm_x86) {
Marat Dukhan6674d692021-05-05 22:27:00 -07004596 xnn_params.f32.relu = (xnn_univector_ukernel_function) xnn_f32_vrelu_ukernel__scalar_x8;
Frank Barchard62c5e232020-07-21 17:42:19 -07004597 } else {
Marat Dukhan6674d692021-05-05 22:27:00 -07004598 xnn_params.f32.relu = (xnn_univector_ukernel_function) xnn_f32_vrelu_ukernel__wasm_x8;
Frank Barchard62c5e232020-07-21 17:42:19 -07004599 }
Marat Dukhan64e52512020-06-09 13:41:16 -07004600 xnn_params.f32.rndne = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__scalar_libm_x4;
4601 xnn_params.f32.rndz = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__scalar_libm_x4;
4602 xnn_params.f32.rndu = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__scalar_libm_x4;
4603 xnn_params.f32.rndd = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__scalar_libm_x4;
Marat Dukhan6674d692021-05-05 22:27:00 -07004604 xnn_params.f32.sigmoid = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__scalar_lut64_p2_div_x2;
Marat Dukhan5020b962020-06-08 13:30:10 -07004605 xnn_params.f32.sqr = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__scalar_x4;
Marat Dukhan6804bbd2020-06-30 19:26:11 -07004606 xnn_params.f32.sqrt = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__scalar_sqrt_x1;
Marat Dukhan7c1f8082020-06-25 13:26:20 -07004607 if (is_wasm_x86) {
4608 xnn_params.f32.prelu = (struct prelu_parameters) {
4609 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__scalar_2x4,
4610 .row_tile = 2,
4611 .channel_tile = 4,
4612 };
4613 } else {
4614 xnn_params.f32.prelu = (struct prelu_parameters) {
4615 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__wasm_2x4,
4616 .row_tile = 2,
4617 .channel_tile = 4,
4618 };
4619 }
Marat Dukhan1edc4542020-01-27 12:40:13 -08004620 xnn_params.f32.raddstoreexpminusmax = xnn_f32_raddstoreexpminusmax_ukernel__scalar_p5_x4_acc2;
4621 xnn_params.f32.rmax = xnn_f32_rmax_ukernel__scalar;
Marat Dukhanb1a0fc32019-12-02 19:32:02 -08004622 xnn_params.f32.vadd = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07004623 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__wasm_x8,
4624 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasm_x8,
4625 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasm_x8,
Marat Dukhanb1a0fc32019-12-02 19:32:02 -08004626 .element_tile = 8,
4627 };
Marat Dukhan69180502019-12-06 15:00:31 -08004628 xnn_params.f32.vdiv = (struct vbinary_parameters) {
Frank Barchardb392f8e2020-10-27 10:46:44 -07004629 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__wasm_x8,
4630 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__wasm_x8,
4631 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__wasm_x8,
4632 .element_tile = 8,
Marat Dukhan69180502019-12-06 15:00:31 -08004633 };
Marat Dukhan79e7f842019-12-05 14:35:50 -08004634 xnn_params.f32.vmax = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07004635 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__wasm_x8,
4636 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasm_x8,
4637 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasm_x8,
Marat Dukhan79e7f842019-12-05 14:35:50 -08004638 .element_tile = 8,
4639 };
4640 xnn_params.f32.vmin = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07004641 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__wasm_x8,
4642 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasm_x8,
4643 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasm_x8,
Marat Dukhan79e7f842019-12-05 14:35:50 -08004644 .element_tile = 8,
4645 };
Marat Dukhan1e782c42019-11-21 17:02:40 -08004646 xnn_params.f32.vmul = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07004647 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__wasm_x8,
4648 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasm_x8,
4649 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasm_x8,
Marat Dukhanca2733c2019-11-15 23:21:17 -08004650 .element_tile = 8,
4651 };
Marat Dukhan05f3f6d2019-12-03 15:13:53 -08004652 xnn_params.f32.vsub = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07004653 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__wasm_x8,
4654 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__wasm_x8,
4655 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__wasm_x8,
Marat Dukhan05f3f6d2019-12-03 15:13:53 -08004656 .element_tile = 8,
4657 };
Marat Dukhanf7399262020-06-05 10:58:44 -07004658 xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07004659 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__scalar_x8,
4660 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x8,
4661 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x8,
Marat Dukhanf7399262020-06-05 10:58:44 -07004662 .element_tile = 8,
4663 };
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004664 xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
Marat Dukhan9531e9f2020-07-24 15:25:02 -07004665 .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c1__wasm_2x,
Marat Dukhanc58453f2021-05-24 14:47:38 -07004666 .init.f32 = xnn_init_f32_minmax_scalar_params,
Marat Dukhan49e6ee92019-11-06 15:55:29 -08004667 .channel_tile = 1,
4668 .row_tile = 2,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004669 };
Marat Dukhanefc47b82019-11-18 09:25:38 -08004670 #ifndef XNN_NO_NCHW_OPERATORS
Marat Dukhancfbed0a2020-12-08 10:01:51 -08004671 init_flags |= XNN_INIT_FLAG_CHW_OPT;
4672
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004673 xnn_params.f32.spmm = (struct spmm_parameters) {
Marat Dukhan355ab432020-04-09 19:01:52 -07004674 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x1__scalar,
Marat Dukhanbff791e2019-10-24 11:05:37 -07004675 .mr = 8,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004676 .nr = 1,
4677 };
Erich Elsenc6afd9b2019-10-24 16:10:53 -07004678 xnn_params.f32.spmm2 = (struct spmm_parameters) {
Marat Dukhan355ab432020-04-09 19:01:52 -07004679 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x2__scalar,
Erich Elsenc6afd9b2019-10-24 16:10:53 -07004680 .mr = 8,
4681 .nr = 2,
4682 };
4683 xnn_params.f32.spmm4 = (struct spmm_parameters) {
Marat Dukhan355ab432020-04-09 19:01:52 -07004684 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x4__scalar,
Erich Elsenc6afd9b2019-10-24 16:10:53 -07004685 .mr = 8,
4686 .nr = 4,
4687 };
Marat Dukhan1f29b802020-05-15 23:46:39 -07004688 xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
Marat Dukhan14fe0b22019-10-23 21:20:07 -07004689 .ukernel_with_symm_padding =
Marat Dukhan1f29b802020-05-15 23:46:39 -07004690 (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__scalar_1x1,
Marat Dukhan14fe0b22019-10-23 21:20:07 -07004691 .output_channel_tile = 4,
4692 .output_height_tile = 1,
4693 .output_width_tile = 1,
4694 };
Marat Dukhanbf715f92020-10-23 20:17:00 -07004695 xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
Marat Dukhan91249d22020-10-24 12:02:51 -07004696 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1_acc2,
Marat Dukhan14fe0b22019-10-23 21:20:07 -07004697 .output_width_tile = 1,
Marat Dukhan91249d22020-10-24 12:02:51 -07004698 .output_height_tile = 2,
Marat Dukhan14fe0b22019-10-23 21:20:07 -07004699 };
Marat Dukhanbf715f92020-10-23 20:17:00 -07004700 xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
Marat Dukhancf5b3c32020-10-25 19:21:10 -07004701 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_1x1_acc2,
Marat Dukhan14fe0b22019-10-23 21:20:07 -07004702 .output_width_tile = 1,
4703 .output_height_tile = 1,
4704 };
Marat Dukhanbf715f92020-10-23 20:17:00 -07004705 xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
4706 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc5,
Marat Dukhana99918a2019-11-15 14:40:12 -08004707 .output_width_tile = 1,
4708 .output_height_tile = 1,
4709 };
Marat Dukhanbf715f92020-10-23 20:17:00 -07004710 xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
4711 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc5,
Marat Dukhana99918a2019-11-15 14:40:12 -08004712 .output_width_tile = 1,
4713 .output_height_tile = 1,
4714 };
Marat Dukhan1f29b802020-05-15 23:46:39 -07004715 xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
4716 .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__scalar_x1,
Marat Dukhan14fe0b22019-10-23 21:20:07 -07004717 .channel_tile = 1,
4718 };
Artsiom Ablavatski97918102020-10-27 15:52:59 -07004719 xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
4720 .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__scalar_p4,
4721 .channel_tile = 1,
4722 .pixel_tile = 4,
4723 };
Marat Dukhanefc47b82019-11-18 09:25:38 -08004724 #endif // XNN_NO_NCHW_OPERATORS
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004725 #endif // XNN_NO_F32_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07004726
Marat Dukhanaf2ba002021-10-24 14:21:41 -07004727 /*************************** VCVT micro-kernels ***************************/
4728 #ifndef XNN_NO_VCVT_OPERATORS
4729 init_flags |= XNN_INIT_FLAG_VCVT;
4730
4731 xnn_params.vcvt.f16_to_f32 = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__scalar_float_x1;
Marat Dukhana0c61682021-11-10 19:23:41 -08004732 xnn_params.vcvt.f32_to_f16 = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__scalar_bitcast_x4;
Marat Dukhan430b1732021-12-04 02:53:12 -08004733 if (is_wasm_x86) {
4734 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
4735 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__scalar_magic_iminmax_x1,
4736 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_scalar_magic_iminmax_params,
4737 .element_tile = 1,
4738 };
4739 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
4740 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__scalar_magic_iminmax_x1,
4741 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_scalar_magic_iminmax_params,
4742 .element_tile = 1,
4743 };
4744 } else {
4745 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
4746 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__wasm_magic_fminmax_x4,
4747 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_scalar_magic_fminmax_params,
4748 .element_tile = 4,
4749 };
4750 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
4751 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__wasm_magic_fminmax_x4,
4752 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_scalar_magic_fminmax_params,
4753 .element_tile = 4,
4754 };
4755 }
Marat Dukhanaf2ba002021-10-24 14:21:41 -07004756 #endif // XNN_NO_VCVT_OPERATORS
4757
XNNPACK Teamb455b122019-09-27 18:10:33 -07004758 /**************************** X32 micro-kernels ****************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004759 #ifndef XNN_NO_X32_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07004760 init_flags |= XNN_INIT_FLAG_X32;
4761
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004762 xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__scalar;
4763 xnn_params.x32.zip = (struct zip_parameters) {
4764 .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__scalar,
4765 .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__scalar,
4766 .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__scalar,
4767 .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__scalar,
4768 };
Artsiom Ablavatskibbe85062020-11-05 14:07:37 -08004769 #ifndef XNN_NO_NCHW_OPERATORS
Marat Dukhanad71b9a2020-11-20 00:01:51 -08004770 xnn_params.x32.depthtospace2d_chw2hwc = (struct depthtospace2d_chw2hwc_parameters) {
4771 .ukernel = (xnn_depthtospace2d_chw2hwc_ukernel_function) xnn_x32_depthtospace2d_chw2hwc_ukernel__scalar,
Artsiom Ablavatskibbe85062020-11-05 14:07:37 -08004772 .channel_tile = 1,
4773 .pixel_tile = 1,
4774 };
4775 #endif // XNN_NO_NCHW_OPERATORS
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004776 #endif // XNN_NO_X32_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07004777
Marat Dukhan933051b2021-08-07 16:26:15 -07004778 /**************************** XX micro-kernels ****************************/
4779 #ifndef XNN_NO_XX_OPERATORS
4780 init_flags |= XNN_INIT_FLAG_XX;
4781
4782 xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
4783 xnn_params.xx.fill = (struct fill_parameters) {
4784 .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__scalar_x16,
4785 .row_tile = 1,
4786 };
Marat Dukhan0461f2d2021-08-08 12:36:29 -07004787 xnn_params.xx.pad = (struct pad_parameters) {
4788 .ukernel = (xnn_pad_ukernel_function) xnn_xx_pad_ukernel__scalar,
4789 .row_tile = 1,
4790 };
Marat Dukhan933051b2021-08-07 16:26:15 -07004791 #endif
4792
Marat Dukhane3cb19b2021-05-07 03:38:49 -07004793#elif XNN_ARCH_RISCV
4794
Marat Dukhan803c1f82021-05-12 00:13:37 -07004795 /**************************** QS8 micro-kernels ****************************/
4796 #ifndef XNN_NO_QS8_OPERATORS
4797 init_flags |= XNN_INIT_FLAG_QS8;
4798
Frank Barchard22f9a9f2021-07-21 11:35:27 -07004799 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_3x4__scalar);
4800 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_3x4__scalar);
4801 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x4__scalar);
4802 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x4__scalar);
4803 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_scalar_params;
Marat Dukhan15a35c02021-05-12 11:40:03 -07004804 xnn_params.qs8.gemm.mr = 3;
Marat Dukhan803c1f82021-05-12 00:13:37 -07004805 xnn_params.qs8.gemm.nr = 4;
4806
Marat Dukhanb07c26a2021-05-24 19:44:51 -07004807 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up2x9__scalar;
Marat Dukhan9b474cf2021-05-25 16:37:48 -07004808 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_gemmlowp_scalar_params;
Marat Dukhan803c1f82021-05-12 00:13:37 -07004809 xnn_params.qs8.dwconv[0].channel_tile = 2;
4810 xnn_params.qs8.dwconv[0].primary_tile = 9;
Marat Dukhanb07c26a2021-05-24 19:44:51 -07004811 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up2x25__scalar;
Marat Dukhan9b474cf2021-05-25 16:37:48 -07004812 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_gemmlowp_scalar_params;
Marat Dukhan4ed14882021-05-12 17:50:40 -07004813 xnn_params.qs8.dwconv[1].channel_tile = 2;
4814 xnn_params.qs8.dwconv[1].primary_tile = 25;
Marat Dukhan803c1f82021-05-12 00:13:37 -07004815
4816 xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
4817 .up = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_ukernel_7x__scalar_c1,
4818 .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_ukernel_7p7x__scalar_c1,
4819 .mr = 7,
4820 };
4821
4822 xnn_params.qs8.vadd = (struct vbinary_parameters) {
4823 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__scalar_x4,
4824 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__scalar_x4,
4825 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__scalar_x4,
Marat Dukhan64287252021-09-07 16:20:03 -07004826 .init.qs8_addsub = xnn_init_qs8_add_minmax_scalar_params,
Marat Dukhan803c1f82021-05-12 00:13:37 -07004827 .element_tile = 4,
4828 };
4829 #endif // XNN_NO_QS8_OPERATORS
4830
Marat Dukhane3cb19b2021-05-07 03:38:49 -07004831 /**************************** QU8 micro-kernels ****************************/
4832 #ifndef XNN_NO_QU8_OPERATORS
4833 init_flags |= XNN_INIT_FLAG_QU8;
4834
Marat Dukhanc2e8f662021-07-01 17:06:34 -07004835 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_gemmlowp_ukernel_2x2__scalar);
4836 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_gemmlowp_ukernel_2x2__scalar);
Marat Dukhanc698c112021-07-01 18:52:10 -07004837 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_gemmlowp_scalar_params;
Marat Dukhane3cb19b2021-05-07 03:38:49 -07004838 xnn_params.qu8.gemm.mr = 2;
4839 xnn_params.qu8.gemm.nr = 2;
4840
Marat Dukhanc2e8f662021-07-01 17:06:34 -07004841 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_gemmlowp_ukernel_up1x9__scalar;
Marat Dukhanc698c112021-07-01 18:52:10 -07004842 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_gemmlowp_scalar_params;
Marat Dukhane3cb19b2021-05-07 03:38:49 -07004843 xnn_params.qu8.dwconv[0].channel_tile = 1;
4844 xnn_params.qu8.dwconv[0].primary_tile = 9;
4845
4846 xnn_params.qu8.avgpool = (struct avgpool_parameters) {
4847 .up = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__scalar_c1,
4848 .mp = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__scalar_c1,
4849 .mr = 9,
4850 .qr = 8,
4851 };
4852 xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
4853 .up = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_ukernel_7x__scalar_c1,
4854 .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_ukernel_7p7x__scalar_c1,
4855 .mr = 7,
4856 };
Marat Dukhandb007cd2021-07-20 23:42:39 -07004857
4858 xnn_params.qu8.vadd = (struct vbinary_parameters) {
4859 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__scalar_x4,
4860 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__scalar_x4,
4861 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__scalar_x4,
Marat Dukhan64287252021-09-07 16:20:03 -07004862 .init.qu8_addsub = xnn_init_qu8_add_minmax_scalar_params,
Marat Dukhandb007cd2021-07-20 23:42:39 -07004863 .element_tile = 4,
4864 };
Marat Dukhane3cb19b2021-05-07 03:38:49 -07004865 #endif // XNN_NO_QU8_OPERATORS
4866
4867 /**************************** U8 micro-kernels ****************************/
4868 #ifndef XNN_NO_U8_OPERATORS
4869 init_flags |= XNN_INIT_FLAG_U8;
4870
Marat Dukhan94912792021-08-16 21:40:30 -07004871 xnn_params.u8.clamp = (struct vunary_parameters) {
4872 .ukernel = (xnn_univector_ukernel_function) xnn_u8_vclamp_ukernel__scalar_x4,
4873 .init.u8_minmax = xnn_init_u8_minmax_scalar_params,
4874 .element_tile = 4,
4875 };
Marat Dukhane3cb19b2021-05-07 03:38:49 -07004876 xnn_params.u8.maxpool = (struct maxpool_parameters) {
4877 .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__scalar_c1,
Marat Dukhan91ae1652021-08-15 19:19:49 -07004878 .init.u8 = xnn_init_u8_minmax_scalar_params,
Marat Dukhane3cb19b2021-05-07 03:38:49 -07004879 .mr = 9,
4880 .qr = 8,
4881 };
Marat Dukhane3cb19b2021-05-07 03:38:49 -07004882 xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
4883 xnn_params.u8.rmax = xnn_u8_rmax_ukernel__scalar;
4884 #endif // XNN_NO_U8_OPERATORS
4885
4886 /**************************** X8 micro-kernels ****************************/
4887 #ifndef XNN_NO_X8_OPERATORS
4888 init_flags |= XNN_INIT_FLAG_X8;
4889
Marat Dukhand67539d2021-09-08 23:06:03 -07004890 xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar_x4;
Marat Dukhane3cb19b2021-05-07 03:38:49 -07004891 xnn_params.x8.zip = (struct zip_parameters) {
4892 .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__scalar,
4893 .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__scalar,
4894 .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__scalar,
4895 .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__scalar,
4896 };
4897 #endif // XNN_NO_X8_OPERATORS
4898
4899 /**************************** F32 micro-kernels ****************************/
4900 #ifndef XNN_NO_F32_OPERATORS
4901 init_flags |= XNN_INIT_FLAG_F32;
4902
4903 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x4__scalar);
4904 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x4__scalar);
4905 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x4__scalar);
4906 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x4__scalar);
4907 xnn_params.f32.gemm.relu.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_4x4__scalar);
4908 xnn_params.f32.gemm.relu.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_4x4__scalar);
4909 xnn_params.f32.gemm.relu.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x4__scalar);
4910 xnn_params.f32.gemm.relu.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x4__scalar);
4911 xnn_params.f32.gemm.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x4__scalar);
4912 xnn_params.f32.gemm.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x4__scalar);
4913 xnn_params.f32.gemm.linear.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x4__scalar);
4914 xnn_params.f32.gemm.linear.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x4__scalar);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07004915 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhane3cb19b2021-05-07 03:38:49 -07004916 xnn_params.f32.gemm.mr = 4;
4917 xnn_params.f32.gemm.nr = 4;
4918
4919 xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2__scalar);
4920 xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2__scalar),
4921 xnn_params.f32.gemm2.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x2__scalar);
4922 xnn_params.f32.gemm2.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2__scalar),
Marat Dukhan104ae5e2021-05-24 13:41:57 -07004923 xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhane3cb19b2021-05-07 03:38:49 -07004924 xnn_params.f32.gemm2.mr = 4;
4925 xnn_params.f32.gemm2.nr = 2;
4926
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07004927 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x3__scalar_acc2;
4928 xnn_params.f32.dwconv[0].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x3__scalar_acc2;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07004929 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhane3cb19b2021-05-07 03:38:49 -07004930 xnn_params.f32.dwconv[0].channel_tile = 1;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07004931 xnn_params.f32.dwconv[0].primary_tile = 3;
Marat Dukhane3cb19b2021-05-07 03:38:49 -07004932
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07004933 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x4__scalar_acc2;
4934 xnn_params.f32.dwconv[1].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x4__scalar_acc2;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07004935 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhane3cb19b2021-05-07 03:38:49 -07004936 xnn_params.f32.dwconv[1].channel_tile = 1;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07004937 xnn_params.f32.dwconv[1].primary_tile = 4;
Marat Dukhane3cb19b2021-05-07 03:38:49 -07004938
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07004939 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x9__scalar_acc2;
4940 xnn_params.f32.dwconv[2].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x9__scalar_acc2;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07004941 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhane3cb19b2021-05-07 03:38:49 -07004942 xnn_params.f32.dwconv[2].channel_tile = 1;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07004943 xnn_params.f32.dwconv[2].primary_tile = 9;
4944
4945 xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x25__scalar_acc2;
4946 xnn_params.f32.dwconv[3].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x25__scalar_acc2;
4947 xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_scalar_params;
4948 xnn_params.f32.dwconv[3].channel_tile = 1;
4949 xnn_params.f32.dwconv[3].primary_tile = 25;
Marat Dukhane3cb19b2021-05-07 03:38:49 -07004950
4951 xnn_params.f32.avgpool = (struct avgpool_parameters) {
4952 .up = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__scalar_c1,
4953 .mp = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__scalar_c1,
4954 .mr = 9,
4955 .qr = 8,
4956 };
4957 xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
4958 .up = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__scalar_c1,
4959 .mp = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__scalar_c1,
4960 .mr = 9,
4961 .qr = 8,
4962 };
4963 xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
4964 .up = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__scalar_c1,
4965 .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__scalar_c1,
4966 .mr = 7,
4967 };
4968 xnn_params.f32.maxpool = (struct maxpool_parameters) {
4969 .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__scalar_c1,
Marat Dukhan91ae1652021-08-15 19:19:49 -07004970 .init.f32 = xnn_init_f32_minmax_scalar_params,
Marat Dukhane3cb19b2021-05-07 03:38:49 -07004971 .mr = 9,
4972 .qr = 8,
4973 };
4974 xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
4975 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__scalar_c1,
4976 .mr = 4,
4977 };
4978 xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
4979 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__scalar_c1,
4980 .mr = 9,
4981 };
4982 xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
4983 .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1,
4984 .mr = 9,
4985 .qr = 8,
4986 };
4987 xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
4988 .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__scalar_c2,
4989 .pixel_tile = 1,
4990 .channel_tile = 2,
4991 };
4992 xnn_params.f32.abs = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__scalar_x4;
4993 xnn_params.f32.clamp = (xnn_univector_ukernel_function) xnn_f32_vclamp_ukernel__scalar_x4;
4994 xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__scalar_x4;
4995 xnn_params.f32.elu = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x2;
4996 xnn_params.f32.lrelu = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__scalar_x4;
4997 xnn_params.f32.neg = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__scalar_x4;
4998 xnn_params.f32.relu = (xnn_univector_ukernel_function) xnn_f32_vrelu_ukernel__scalar_x8;
4999 xnn_params.f32.rndne = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__scalar_libm_x4;
5000 xnn_params.f32.rndz = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__scalar_libm_x4;
5001 xnn_params.f32.rndu = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__scalar_libm_x4;
5002 xnn_params.f32.rndd = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__scalar_libm_x4;
5003 xnn_params.f32.sigmoid = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__scalar_lut64_p2_div_x2;
5004 xnn_params.f32.sqr = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__scalar_x4;
5005 xnn_params.f32.sqrt = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__scalar_sqrt_x1;
5006 xnn_params.f32.prelu = (struct prelu_parameters) {
5007 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__scalar_2x4,
5008 .row_tile = 2,
5009 .channel_tile = 4,
5010 };
5011 xnn_params.f32.raddstoreexpminusmax = xnn_f32_raddstoreexpminusmax_ukernel__scalar_p5_x4_acc2;
5012 xnn_params.f32.rmax = xnn_f32_rmax_ukernel__scalar;
5013 xnn_params.f32.vadd = (struct vbinary_parameters) {
5014 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__scalar_x8,
5015 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__scalar_x8,
5016 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__scalar_x8,
5017 .element_tile = 8,
5018 };
5019 xnn_params.f32.vdiv = (struct vbinary_parameters) {
5020 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__scalar_x8,
5021 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__scalar_x8,
5022 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__scalar_x8,
5023 .element_tile = 8,
5024 };
5025 xnn_params.f32.vmax = (struct vbinary_parameters) {
5026 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__scalar_x8,
5027 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__scalar_x8,
5028 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__scalar_x8,
5029 .element_tile = 8,
5030 };
5031 xnn_params.f32.vmin = (struct vbinary_parameters) {
5032 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__scalar_x8,
5033 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__scalar_x8,
5034 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__scalar_x8,
5035 .element_tile = 8,
5036 };
5037 xnn_params.f32.vmul = (struct vbinary_parameters) {
5038 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__scalar_x8,
5039 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__scalar_x8,
5040 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__scalar_x8,
5041 .element_tile = 8,
5042 };
5043 xnn_params.f32.vsub = (struct vbinary_parameters) {
5044 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__scalar_x8,
5045 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__scalar_x8,
5046 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__scalar_x8,
5047 .element_tile = 8,
5048 };
5049 xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
5050 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__scalar_x8,
5051 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x8,
5052 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x8,
5053 .element_tile = 8,
5054 };
5055 xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
5056 .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c1__scalar_2x,
Marat Dukhanc58453f2021-05-24 14:47:38 -07005057 .init.f32 = xnn_init_f32_minmax_scalar_params,
Marat Dukhane3cb19b2021-05-07 03:38:49 -07005058 .channel_tile = 1,
5059 .row_tile = 2,
5060 };
5061 #ifndef XNN_NO_NCHW_OPERATORS
5062 init_flags |= XNN_INIT_FLAG_CHW_OPT;
5063
5064 xnn_params.f32.spmm = (struct spmm_parameters) {
5065 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x1__scalar,
5066 .mr = 8,
5067 .nr = 1,
5068 };
5069 xnn_params.f32.spmm2 = (struct spmm_parameters) {
5070 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x2__scalar,
5071 .mr = 8,
5072 .nr = 2,
5073 };
5074 xnn_params.f32.spmm4 = (struct spmm_parameters) {
5075 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x4__scalar,
5076 .mr = 8,
5077 .nr = 4,
5078 };
5079 xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
5080 .ukernel_with_symm_padding =
5081 (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__scalar_1x1,
5082 .output_channel_tile = 4,
5083 .output_height_tile = 1,
5084 .output_width_tile = 1,
5085 };
5086 xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
5087 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1_acc2,
5088 .output_width_tile = 1,
5089 .output_height_tile = 2,
5090 };
5091 xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
5092 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_1x1_acc2,
5093 .output_width_tile = 1,
5094 .output_height_tile = 1,
5095 };
5096 xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
5097 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc5,
5098 .output_width_tile = 1,
5099 .output_height_tile = 1,
5100 };
5101 xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
5102 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc5,
5103 .output_width_tile = 1,
5104 .output_height_tile = 1,
5105 };
5106 xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
5107 .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__scalar_x1,
5108 .channel_tile = 1,
5109 };
5110 xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
5111 .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__scalar_p4,
5112 .channel_tile = 1,
5113 .pixel_tile = 4,
5114 };
5115 #endif // XNN_NO_NCHW_OPERATORS
5116 #endif // XNN_NO_F32_OPERATORS
5117
5118 /**************************** X32 micro-kernels ****************************/
5119 #ifndef XNN_NO_X32_OPERATORS
5120 init_flags |= XNN_INIT_FLAG_X32;
5121
Marat Dukhane3cb19b2021-05-07 03:38:49 -07005122 xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__scalar;
5123 xnn_params.x32.zip = (struct zip_parameters) {
5124 .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__scalar,
5125 .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__scalar,
5126 .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__scalar,
5127 .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__scalar,
5128 };
5129 #ifndef XNN_NO_NCHW_OPERATORS
5130 xnn_params.x32.depthtospace2d_chw2hwc = (struct depthtospace2d_chw2hwc_parameters) {
5131 .ukernel = (xnn_depthtospace2d_chw2hwc_ukernel_function) xnn_x32_depthtospace2d_chw2hwc_ukernel__scalar,
5132 .channel_tile = 1,
5133 .pixel_tile = 1,
5134 };
5135 #endif // XNN_NO_NCHW_OPERATORS
5136 #endif // XNN_NO_X32_OPERATORS
5137
Marat Dukhan0461f2d2021-08-08 12:36:29 -07005138 /**************************** XX micro-kernels ****************************/
5139 #ifndef XNN_NO_XX_OPERATORS
5140 init_flags |= XNN_INIT_FLAG_XX;
5141
5142 xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
5143 xnn_params.xx.fill = (struct fill_parameters) {
5144 .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__scalar_x16,
5145 .row_tile = 1,
5146 };
5147 xnn_params.xx.pad = (struct pad_parameters) {
5148 .ukernel = (xnn_pad_ukernel_function) xnn_xx_pad_ukernel__scalar,
5149 .row_tile = 1,
5150 };
5151 #endif
5152
XNNPACK Teamb455b122019-09-27 18:10:33 -07005153#else
5154 #error "Unsupported architecture"
5155#endif
Marat Dukhan496389f2021-04-07 15:47:12 -07005156
5157 memcpy(&xnn_params.allocator, init_allocator, sizeof(struct xnn_allocator));
Marat Dukhan854fb6b2020-06-19 12:33:44 -07005158 xnn_params.init_flags = init_flags;
XNNPACK Teamb455b122019-09-27 18:10:33 -07005159}
5160
#if defined(_WIN32)
  // Callback with the signature InitOnceExecuteOnce() expects. It forwards to
  // init() and always reports success; none of the callback arguments are used.
  static BOOL CALLBACK init_windows(PINIT_ONCE init_once, PVOID parameter, PVOID* context) {
    (void) init_once;
    (void) parameter;
    (void) context;

    init();
    return TRUE;
  }
#endif  // defined(_WIN32)
5167
Marat Dukhan04f03be2019-11-19 12:36:47 -08005168enum xnn_status xnn_initialize(const struct xnn_allocator* allocator) {
Marat Dukhand343c222019-10-07 09:22:14 -07005169 #ifndef __EMSCRIPTEN__
5170 if (!cpuinfo_initialize()) {
5171 return xnn_status_out_of_memory;
5172 }
5173 #endif
Marat Dukhan496389f2021-04-07 15:47:12 -07005174 if (allocator == NULL) {
5175 allocator = &xnn_default_allocator;
5176 }
5177 #ifdef _MSC_VER
Marat Dukhandf94d982021-06-01 12:21:33 -07005178 _InterlockedCompareExchangePointer((PVOID volatile*) &init_allocator, (PVOID) allocator, NULL);
Marat Dukhan496389f2021-04-07 15:47:12 -07005179 #else
5180 __sync_bool_compare_and_swap(&init_allocator, NULL, allocator);
5181 #endif
Marat Dukhan57133c02020-04-13 00:54:59 -07005182 #ifdef _WIN32
5183 InitOnceExecuteOnce(&init_guard, &init_windows, NULL, NULL);
5184 #else
5185 pthread_once(&init_guard, &init);
5186 #endif
Marat Dukhan854fb6b2020-06-19 12:33:44 -07005187 if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) != 0) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07005188 return xnn_status_success;
5189 } else {
5190 return xnn_status_unsupported_hardware;
5191 }
5192}
5193
5194enum xnn_status xnn_deinitialize(void) {
Marat Dukhand343c222019-10-07 09:22:14 -07005195 #ifndef __EMSCRIPTEN__
5196 cpuinfo_deinitialize();
5197 #endif
XNNPACK Teamb455b122019-09-27 18:10:33 -07005198 return xnn_status_success;
Marat Dukhan3b9b4bc2021-08-24 00:23:07 -07005199}