blob: 4c73574cc72500f9d881df5c1bb808588fb8cc3c [file] [log] [blame]
// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
8
Marat Dukhan01849012020-04-27 19:28:32 -07009#include <math.h>
XNNPACK Teamb455b122019-09-27 18:10:33 -070010#include <stdbool.h>
11#include <stddef.h>
12#include <stdint.h>
Marat Dukhan04f03be2019-11-19 12:36:47 -080013#include <string.h>
XNNPACK Teamb455b122019-09-27 18:10:33 -070014
Marat Dukhan57133c02020-04-13 00:54:59 -070015#ifdef _WIN32
16 #include <windows.h>
17#else
18 #include <pthread.h>
19#endif
XNNPACK Teamb455b122019-09-27 18:10:33 -070020
Marat Dukhan496389f2021-04-07 15:47:12 -070021#ifdef _MSC_VER
22 #include <intrin.h>
23#endif
24
Marat Dukhand343c222019-10-07 09:22:14 -070025#ifndef __EMSCRIPTEN__
26 #include <cpuinfo.h>
27#endif
XNNPACK Teamb455b122019-09-27 18:10:33 -070028
29#include <xnnpack.h>
Marat Dukhan496389f2021-04-07 15:47:12 -070030#include <xnnpack/allocator.h>
XNNPACK Teamb455b122019-09-27 18:10:33 -070031#include <xnnpack/argmaxpool.h>
32#include <xnnpack/avgpool.h>
Marat Dukhan1dadbf72019-10-01 10:46:20 -070033#include <xnnpack/common.h>
XNNPACK Teamb455b122019-09-27 18:10:33 -070034#include <xnnpack/conv.h>
35#include <xnnpack/dwconv.h>
Artsiom Ablavatskibbe85062020-11-05 14:07:37 -080036#include <xnnpack/depthtospace.h>
XNNPACK Teamb455b122019-09-27 18:10:33 -070037#include <xnnpack/gavgpool.h>
38#include <xnnpack/gemm.h>
Marat Dukhan4662b192020-05-21 15:52:03 -070039#include <xnnpack/fill.h>
Marat Dukhan660fd192020-03-10 04:55:30 -070040#include <xnnpack/ibilinear.h>
XNNPACK Teamb455b122019-09-27 18:10:33 -070041#include <xnnpack/igemm.h>
42#include <xnnpack/log.h>
43#include <xnnpack/lut.h>
44#include <xnnpack/maxpool.h>
45#include <xnnpack/pad.h>
46#include <xnnpack/params.h>
Marat Dukhanc5a7a392021-05-21 16:04:31 -070047#include <xnnpack/params-init.h>
XNNPACK Teamb455b122019-09-27 18:10:33 -070048#include <xnnpack/pavgpool.h>
49#include <xnnpack/prelu.h>
Marat Dukhan1edc4542020-01-27 12:40:13 -080050#include <xnnpack/raddstoreexpminusmax.h>
XNNPACK Teamb455b122019-09-27 18:10:33 -070051#include <xnnpack/rmax.h>
52#include <xnnpack/spmm.h>
53#include <xnnpack/unpool.h>
Marat Dukhan64287252021-09-07 16:20:03 -070054#include <xnnpack/vaddsub.h>
Marat Dukhan1e782c42019-11-21 17:02:40 -080055#include <xnnpack/vbinary.h>
Marat Dukhanaf2ba002021-10-24 14:21:41 -070056#include <xnnpack/vcvt.h>
Marat Dukhan0853b8a2021-08-03 01:01:53 -070057#include <xnnpack/vmul.h>
XNNPACK Teamb455b122019-09-27 18:10:33 -070058#include <xnnpack/vmulcaddc.h>
Marat Dukhan1e782c42019-11-21 17:02:40 -080059#include <xnnpack/vunary.h>
XNNPACK Teamb455b122019-09-27 18:10:33 -070060#include <xnnpack/zip.h>
61
// Default for the assembly switch: it is 1 unless the build has already
// defined XNN_ENABLE_ASSEMBLY. init() tests it with `#if XNN_ENABLE_ASSEMBLY`
// to choose between the aarch32 assembly micro-kernels and the intrinsics ones.
#ifndef XNN_ENABLE_ASSEMBLY
  #define XNN_ENABLE_ASSEMBLY 1
#endif
65
// File-local one-time-initialization guard, statically initialized with the
// platform's "not yet run" value: INIT_ONCE on Windows builds, pthread_once_t
// everywhere else.
// NOTE(review): <windows.h> is pulled in under `#ifdef _WIN32`, while this
// guard is selected by XNN_PLATFORM_WINDOWS - presumably the two conditions
// always agree; confirm against xnnpack/common.h.
#if XNN_PLATFORM_WINDOWS
  static INIT_ONCE init_guard = INIT_ONCE_STATIC_INIT;
#else
  static pthread_once_t init_guard = PTHREAD_ONCE_INIT;
#endif
XNNPACK Teamb455b122019-09-27 18:10:33 -070071
// File-local pointer to a read-only allocator description; starts out NULL.
// The pointer itself is volatile-qualified, the pointee is const.
// NOTE(review): presumably stored by the public initialization entry point,
// which is not visible in this part of the file - confirm where it is written.
static const struct xnn_allocator* volatile init_allocator = NULL;
73
XNNPACK Teamb455b122019-09-27 18:10:33 -070074struct xnn_parameters xnn_params = {
Marat Dukhan854fb6b2020-06-19 12:33:44 -070075 .init_flags = 0
XNNPACK Teamb455b122019-09-27 18:10:33 -070076};
77
Marat Dukhan01849012020-04-27 19:28:32 -070078static void init(void) {
Marat Dukhan4c617792021-12-21 15:47:58 -080079#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
Marat Dukhan01849012020-04-27 19:28:32 -070080 // Unlike most other architectures, on x86/x86-64 when floating-point instructions
81 // have no NaN arguments, but produce NaN output, the output NaN has sign bit set.
82 // We use it to distinguish x86/x86-64 from other architectures, by doing subtraction
83 // of two infinities (must produce NaN per IEEE 754 standard).
84 static const volatile float inf = INFINITY;
85 const bool is_wasm_x86 = signbit(inf - inf);
XNNPACK Teamb455b122019-09-27 18:10:33 -070086#endif
Marat Dukhan854fb6b2020-06-19 12:33:44 -070087 uint32_t init_flags = XNN_INIT_FLAG_XNNPACK;
XNNPACK Teamb455b122019-09-27 18:10:33 -070088
Marat Dukhan1dadbf72019-10-01 10:46:20 -070089#if XNN_ARCH_ARM
Frank Barchardbcdb1c12020-05-11 14:13:20 -070090 #if XNN_PLATFORM_MOBILE
Marat Dukhan3b745a42020-05-10 21:43:25 -070091 if (!cpuinfo_has_arm_neon()) {
92 xnn_log_error("XNNPACK initialization failed: NEON is not supported");
93 return;
94 }
95 #else
96 if (!cpuinfo_has_arm_vfpv2() && !cpuinfo_has_arm_vfpv3()) {
97 xnn_log_error("XNNPACK initialization failed: VFP is not supported");
98 return;
99 }
100 #endif
XNNPACK Teamb455b122019-09-27 18:10:33 -0700101
Marat Dukhan3b745a42020-05-10 21:43:25 -0700102 if (cpuinfo_has_arm_neon()) {
Frank Barchardb40ee632021-12-30 11:10:02 -0800103 /**************************** QC8 AArch32 micro-kernels ****************************/
Marat Dukhan898d5852021-06-30 21:18:34 -0700104 #ifndef XNN_NO_QC8_OPERATORS
105 init_flags |= XNN_INIT_FLAG_QC8;
106
Frank Barchardf290a142022-01-05 01:08:37 -0800107 #if XNN_ENABLE_ASSEMBLY
108 if (!XNN_PLATFORM_IOS && cpuinfo_has_arm_neon_dot()) {
Frank Barchard70137e42021-12-28 15:40:18 -0800109 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_ld64);
110 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_ld64);
111 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neondot);
112 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neondot);
113 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
114 xnn_params.qc8.gemm.mr = 4;
115 xnn_params.qc8.gemm.nr = 8;
116 xnn_params.qc8.gemm.log2_kr = 2;
Frank Barchardf290a142022-01-05 01:08:37 -0800117
118 } else {
119 switch (cpuinfo_get_uarch(0)->uarch) {
120 case cpuinfo_uarch_cortex_a53:
121 case cpuinfo_uarch_cortex_a72:
122 case cpuinfo_uarch_exynos_m1:
123 case cpuinfo_uarch_exynos_m2:
124 case cpuinfo_uarch_exynos_m3:
125 case cpuinfo_uarch_kryo:
126 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64);
127 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64);
128 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane);
129 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane);
130 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
131 xnn_params.qc8.gemm.mr = 4;
132 xnn_params.qc8.gemm.nr = 8;
133 break;
134
135 default:
136 if (cpuinfo_has_arm_neon_v8()) {
137 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64);
138 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64);
139 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane);
140 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane);
141 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
142 xnn_params.qc8.gemm.mr = 4;
143 xnn_params.qc8.gemm.nr = 8;
144 } else {
Frank Barchardd2e8d4d2022-01-14 17:18:53 -0800145 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
146 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
147 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane);
148 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__neon_mlal_lane);
Frank Barchardf290a142022-01-05 01:08:37 -0800149 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neon_params;
Frank Barchardd2e8d4d2022-01-14 17:18:53 -0800150 xnn_params.qc8.gemm.mr = 4;
Frank Barchardf290a142022-01-05 01:08:37 -0800151 xnn_params.qc8.gemm.nr = 8;
Frank Barchardf290a142022-01-05 01:08:37 -0800152 }
153 break;
154 }
155 }
156 #else // XNN_ENABLE_ASSEMBLY
157 if (!XNN_PLATFORM_IOS && cpuinfo_has_arm_neon_dot()) {
Frank Barchard70137e42021-12-28 15:40:18 -0800158 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__neondot);
Frank Barchard70137e42021-12-28 15:40:18 -0800159 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8c4__neondot);
Frank Barchardf290a142022-01-05 01:08:37 -0800160 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neondot);
Frank Barchard70137e42021-12-28 15:40:18 -0800161 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neondot);
162 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
163 xnn_params.qc8.gemm.mr = 4;
164 xnn_params.qc8.gemm.nr = 8;
165 xnn_params.qc8.gemm.log2_kr = 2;
Frank Barchardf290a142022-01-05 01:08:37 -0800166 } else if (cpuinfo_has_arm_v8()) {
167 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
168 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
169 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
170 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
171 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
172 xnn_params.qc8.gemm.mr = 2;
173 xnn_params.qc8.gemm.nr = 8;
174 xnn_params.qc8.gemm.log2_kr = 1;
175 xnn_params.qc8.gemm.log2_sr = 2;
176 } else {
177 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal);
178 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal);
179 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal);
180 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal);
181 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neon_params;
182 xnn_params.qc8.gemm.mr = 2;
183 xnn_params.qc8.gemm.nr = 8;
184 xnn_params.qc8.gemm.log2_kr = 1;
185 xnn_params.qc8.gemm.log2_sr = 2;
186 }
187 #endif // XNN_ENABLE_ASSEMBLY
Marat Dukhan898d5852021-06-30 21:18:34 -0700188
Frank Barchard0bc58012021-11-22 18:12:05 -0800189 if (cpuinfo_has_arm_neon_v8()) {
190 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__neonv8_mla8_ld64;
Marat Dukhan7988a182021-12-06 22:00:33 -0800191 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Frank Barchard0bc58012021-11-22 18:12:05 -0800192 xnn_params.qc8.dwconv[0].channel_tile = 16;
193 xnn_params.qc8.dwconv[0].primary_tile = 9;
194 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up8x25__neonv8_mla8_ld64;
Marat Dukhan7988a182021-12-06 22:00:33 -0800195 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Frank Barchard0bc58012021-11-22 18:12:05 -0800196 xnn_params.qc8.dwconv[1].channel_tile = 8;
197 xnn_params.qc8.dwconv[1].primary_tile = 25;
198 } else {
199 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__neon_mla8_ld64;
Marat Dukhan7988a182021-12-06 22:00:33 -0800200 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_neon_params;
Frank Barchard0bc58012021-11-22 18:12:05 -0800201 xnn_params.qc8.dwconv[0].channel_tile = 16;
202 xnn_params.qc8.dwconv[0].primary_tile = 9;
203 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up8x25__neon_mla8_ld64;
Marat Dukhan7988a182021-12-06 22:00:33 -0800204 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_neon_params;
Frank Barchard0bc58012021-11-22 18:12:05 -0800205 xnn_params.qc8.dwconv[1].channel_tile = 8;
206 xnn_params.qc8.dwconv[1].primary_tile = 25;
207 }
Marat Dukhan898d5852021-06-30 21:18:34 -0700208 #endif // XNN_NO_QC8_OPERATORS
209
Frank Barchardb40ee632021-12-30 11:10:02 -0800210 /**************************** QS8 AArch32 micro-kernels ****************************/
Marat Dukhanf28cddf2020-08-10 14:05:02 -0700211 #ifndef XNN_NO_QS8_OPERATORS
212 init_flags |= XNN_INIT_FLAG_QS8;
213
Frank Barchard95198162021-12-21 17:29:10 -0800214 #if XNN_ENABLE_ASSEMBLY
215 if (!XNN_PLATFORM_IOS && cpuinfo_has_arm_neon_dot()) {
Frank Barchard8c7355a2021-12-21 15:11:06 -0800216 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_ld64);
Frank Barchard8c7355a2021-12-21 15:11:06 -0800217 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_ld64);
Frank Barchard8c7355a2021-12-21 15:11:06 -0800218 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neondot);
Frank Barchard8c7355a2021-12-21 15:11:06 -0800219 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot);
220 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
221 xnn_params.qs8.gemm.mr = 4;
222 xnn_params.qs8.gemm.nr = 8;
223 xnn_params.qs8.gemm.log2_kr = 2;
Frank Barchard95198162021-12-21 17:29:10 -0800224 } else {
Frank Barchard1c852c92021-12-23 13:10:20 -0800225 switch (cpuinfo_get_uarch(0)->uarch) {
226 case cpuinfo_uarch_cortex_a53:
227 case cpuinfo_uarch_cortex_a72:
228 case cpuinfo_uarch_exynos_m1:
229 case cpuinfo_uarch_exynos_m2:
230 case cpuinfo_uarch_exynos_m3:
231 case cpuinfo_uarch_kryo:
232 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64);
233 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64);
234 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
235 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
236 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
237 xnn_params.qs8.gemm.mr = 4;
238 xnn_params.qs8.gemm.nr = 8;
239 break;
240 default:
241 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
242 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
243 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
244 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
245 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
246 xnn_params.qs8.gemm.mr = 4;
247 xnn_params.qs8.gemm.nr = 8;
248 break;
249 }
Frank Barchard95198162021-12-21 17:29:10 -0800250 }
251 #else // XNN_ENABLE_ASSEMBLY
252 if (!XNN_PLATFORM_IOS && cpuinfo_has_arm_neon_dot()) {
253 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neondot);
254 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neondot);
255 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neondot);
256 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot);
257 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
258 xnn_params.qs8.gemm.mr = 4;
259 xnn_params.qs8.gemm.nr = 8;
260 xnn_params.qs8.gemm.log2_kr = 2;
261 } else {
262 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
263 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
264 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
265 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
266 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
267 xnn_params.qs8.gemm.mr = 2;
268 xnn_params.qs8.gemm.nr = 8;
269 xnn_params.qs8.gemm.log2_kr = 1;
270 xnn_params.qs8.gemm.log2_sr = 2;
271 }
272 #endif // XNN_ENABLE_ASSEMBLY
Marat Dukhanf28cddf2020-08-10 14:05:02 -0700273
Frank Barchard0d065732021-08-31 00:01:40 -0700274 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mla8_ld64;
Marat Dukhanbe18f5c2021-07-16 18:46:39 -0700275 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
Frank Barchard0d065732021-08-31 00:01:40 -0700276 xnn_params.qs8.dwconv[0].channel_tile = 16;
Marat Dukhanf28cddf2020-08-10 14:05:02 -0700277 xnn_params.qs8.dwconv[0].primary_tile = 9;
Marat Dukhan5f2939f2021-07-23 13:38:32 -0700278 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_rndnu_ukernel_up8x25__neon_mla8_ld64;
Marat Dukhanbe18f5c2021-07-16 18:46:39 -0700279 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
Marat Dukhan4ed14882021-05-12 17:50:40 -0700280 xnn_params.qs8.dwconv[1].channel_tile = 8;
281 xnn_params.qs8.dwconv[1].primary_tile = 25;
Marat Dukhanf28cddf2020-08-10 14:05:02 -0700282
283 xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
Marat Dukhan85755042022-01-13 01:46:05 -0800284 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8,
285 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8,
286 .init.qs8 = xnn_init_qs8_avgpool_minmax_rndnu_neon_params,
287 .update.qs8 = xnn_update_qs8_avgpool_minmax_rndnu_neon_params,
Marat Dukhanda382d12022-01-07 19:51:20 -0800288 .row_tile = 7,
289 .channel_tile = 8,
Marat Dukhanf28cddf2020-08-10 14:05:02 -0700290 };
Marat Dukhanff209482020-09-03 14:26:53 -0700291
292 xnn_params.qs8.vadd = (struct vbinary_parameters) {
Marat Dukhan01debd92021-07-29 18:14:21 -0700293 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__neon_ld64_x16,
294 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x16,
295 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x16,
Marat Dukhan64287252021-09-07 16:20:03 -0700296 .init.qs8_addsub = xnn_init_qs8_add_minmax_neon_params,
Marat Dukhan01debd92021-07-29 18:14:21 -0700297 .element_tile = 16,
Marat Dukhanff209482020-09-03 14:26:53 -0700298 };
Marat Dukhan33a98fa2022-01-13 00:08:57 -0800299 xnn_params.qs8.vmul = (struct vbinary_parameters) {
300 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_rndnu_ukernel__neon_ld64_x16,
301 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16,
302 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16,
303 .init.qs8_mul = xnn_init_qs8_mul_minmax_rndnu_neon_params,
304 .element_tile = 16,
305 };
Marat Dukhanf28cddf2020-08-10 14:05:02 -0700306 #endif // XNN_NO_QS8_OPERATORS
307
Frank Barchardb40ee632021-12-30 11:10:02 -0800308 /*************************** QU8 AArch32 micro-kernels ***************************/
Marat Dukhan08b7a972020-07-14 18:17:29 -0700309 #ifndef XNN_NO_QU8_OPERATORS
310 init_flags |= XNN_INIT_FLAG_QU8;
Marat Dukhan854fb6b2020-06-19 12:33:44 -0700311
Frank Barchard20255152021-08-11 14:01:45 -0700312 if (cpuinfo_has_arm_neon_dot()) {
Frank Barchardde9c64a2021-08-17 18:32:50 -0700313 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_2x16c4__neondot);
314 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_2x16c4__neondot);
315 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
316 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
Frank Barchard20255152021-08-11 14:01:45 -0700317 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
Frank Barcharde0331262021-08-11 23:18:59 -0700318 xnn_params.qu8.gemm.mr = 2;
Frank Barchardde9c64a2021-08-17 18:32:50 -0700319 xnn_params.qu8.gemm.nr = 16;
Frank Barchard20255152021-08-11 14:01:45 -0700320 xnn_params.qu8.gemm.log2_kr = 2;
321 } else {
Frank Barchard8d07e402022-01-19 01:51:40 -0800322 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64);
323 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64);
Frank Barchard77817862022-01-11 23:20:38 -0800324 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
325 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
Frank Barchard20255152021-08-11 14:01:45 -0700326 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
Frank Barchard8d07e402022-01-19 01:51:40 -0800327 xnn_params.qu8.gemm.mr = 4;
Frank Barchard77817862022-01-11 23:20:38 -0800328 xnn_params.qu8.gemm.nr = 8;
Frank Barchard20255152021-08-11 14:01:45 -0700329 }
Frank Barchard354cbc62021-09-27 21:42:41 -0700330 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mul8;
Marat Dukhan73a899a2021-07-27 00:10:38 -0700331 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
Frank Barchard0d065732021-08-31 00:01:40 -0700332 xnn_params.qu8.dwconv[0].channel_tile = 16;
Marat Dukhan08b7a972020-07-14 18:17:29 -0700333 xnn_params.qu8.dwconv[0].primary_tile = 9;
Frank Barchard354cbc62021-09-27 21:42:41 -0700334 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_rndnu_ukernel_up8x25__neon_mul8;
Marat Dukhan73a899a2021-07-27 00:10:38 -0700335 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
Marat Dukhan43b46ee2021-07-15 19:07:50 -0700336 xnn_params.qu8.dwconv[1].channel_tile = 8;
337 xnn_params.qu8.dwconv[1].primary_tile = 25;
338
Marat Dukhan08b7a972020-07-14 18:17:29 -0700339 xnn_params.qu8.avgpool = (struct avgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -0800340 .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__neon_c8,
341 .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__neon_c8,
Marat Dukhan3c949a32022-01-09 20:12:33 -0800342 .init.qu8 = xnn_init_qu8_avgpool_minmax_neon_params,
Marat Dukhanda382d12022-01-07 19:51:20 -0800343 .primary_tile = 9,
344 .incremental_tile = 8,
345 .channel_tile = 8,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700346 };
Marat Dukhan08b7a972020-07-14 18:17:29 -0700347 xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
Marat Dukhan85755042022-01-13 01:46:05 -0800348 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8,
349 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8,
350 .init.qu8 = xnn_init_qu8_avgpool_minmax_rndnu_neon_params,
351 .update.qu8 = xnn_update_qu8_avgpool_minmax_rndnu_neon_params,
Marat Dukhanda382d12022-01-07 19:51:20 -0800352 .row_tile = 7,
353 .channel_tile = 8,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700354 };
Marat Dukhandb007cd2021-07-20 23:42:39 -0700355 xnn_params.qu8.vadd = (struct vbinary_parameters) {
Frank Barchard0a3093c2021-08-31 09:58:11 -0700356 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__neon_ld64_x16,
357 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__neon_ld64_x16,
358 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__neon_ld64_x16,
Marat Dukhan64287252021-09-07 16:20:03 -0700359 .init.qu8_addsub = xnn_init_qu8_add_minmax_neon_params,
Marat Dukhandb007cd2021-07-20 23:42:39 -0700360 .element_tile = 8,
361 };
Marat Dukhan33a98fa2022-01-13 00:08:57 -0800362 xnn_params.qu8.vmul = (struct vbinary_parameters) {
363 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_rndnu_ukernel__neon_ld64_x16,
364 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16,
365 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16,
366 .init.qu8_mul = xnn_init_qu8_mul_minmax_rndnu_neon_params,
367 .element_tile = 16,
368 };
Marat Dukhan08b7a972020-07-14 18:17:29 -0700369 #endif // XNN_NO_QU8_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -0700370
Frank Barchardb40ee632021-12-30 11:10:02 -0800371 /**************************** S8 AArch32 micro-kernels ****************************/
Marat Dukhan23147532021-08-16 07:26:56 -0700372 #ifndef XNN_NO_S8_OPERATORS
373 init_flags |= XNN_INIT_FLAG_S8;
374
Marat Dukhan61c0c9e2021-08-16 23:16:14 -0700375 xnn_params.s8.clamp = (struct vunary_parameters) {
376 .ukernel = (xnn_univector_ukernel_function) xnn_s8_vclamp_ukernel__neon_x64,
377 .init.s8_minmax = xnn_init_s8_minmax_neon_params,
378 .element_tile = 64,
379 };
Marat Dukhan24abe6b2021-11-24 15:28:57 -0800380 xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
381 .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__neon_c8,
382 .pixel_tile = 1,
383 .channel_tile = 8,
384 };
Marat Dukhan23147532021-08-16 07:26:56 -0700385 xnn_params.s8.maxpool = (struct maxpool_parameters) {
386 .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__neon_c16,
Marat Dukhandc5c1482021-08-16 09:03:15 -0700387 .init.s8 = xnn_init_s8_minmax_neon_params,
Marat Dukhan23147532021-08-16 07:26:56 -0700388 .mr = 9,
389 .qr = 8,
390 };
391 #endif // XNN_NO_S8_OPERATORS
392
Frank Barchardb40ee632021-12-30 11:10:02 -0800393 /**************************** U8 AArch32 micro-kernels ****************************/
Marat Dukhan3b745a42020-05-10 21:43:25 -0700394 #ifndef XNN_NO_U8_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -0700395 init_flags |= XNN_INIT_FLAG_U8;
396
Marat Dukhan94912792021-08-16 21:40:30 -0700397 xnn_params.u8.clamp = (struct vunary_parameters) {
398 .ukernel = (xnn_univector_ukernel_function) xnn_u8_vclamp_ukernel__neon_x64,
399 .init.u8_minmax = xnn_init_u8_minmax_neon_params,
400 .element_tile = 64,
401 };
Marat Dukhan24abe6b2021-11-24 15:28:57 -0800402 xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
403 .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__neon_c8,
404 .pixel_tile = 1,
405 .channel_tile = 8,
406 };
Marat Dukhan3b745a42020-05-10 21:43:25 -0700407 xnn_params.u8.maxpool = (struct maxpool_parameters) {
408 .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__neon_c16,
Marat Dukhan2ea50a02021-08-16 12:59:19 -0700409 .init.u8 = xnn_init_u8_minmax_neon_params,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700410 .mr = 9,
411 .qr = 8,
412 };
Marat Dukhan3b745a42020-05-10 21:43:25 -0700413 xnn_params.u8.rmax = xnn_u8_rmax_ukernel__neon;
414 xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
415 #endif // XNN_NO_U8_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -0700416
Frank Barchardb40ee632021-12-30 11:10:02 -0800417 /**************************** X8 AArch32 micro-kernels ****************************/
Marat Dukhan3b745a42020-05-10 21:43:25 -0700418 #ifndef XNN_NO_X8_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -0700419 init_flags |= XNN_INIT_FLAG_X8;
420
Marat Dukhand67539d2021-09-08 23:06:03 -0700421 xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar_x4;
Marat Dukhan3b745a42020-05-10 21:43:25 -0700422 xnn_params.x8.zip = (struct zip_parameters) {
423 .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__neon,
424 .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__neon,
425 .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__neon,
426 .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__neon,
427 };
428 #endif // XNN_NO_X8_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -0700429
Frank Barchardb40ee632021-12-30 11:10:02 -0800430 /**************************** F32 AArch32 micro-kernels ****************************/
Marat Dukhan3b745a42020-05-10 21:43:25 -0700431 #ifndef XNN_NO_F32_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -0700432 init_flags |= XNN_INIT_FLAG_F32;
433
Marat Dukhan3b745a42020-05-10 21:43:25 -0700434 #if XNN_ENABLE_ASSEMBLY
435 switch (cpuinfo_get_uarch(0)->uarch) {
436 case cpuinfo_uarch_cortex_a5:
437 case cpuinfo_uarch_cortex_a7:
Frank Barchard490febe2020-07-16 18:42:17 -0700438 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a7);
439 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a7);
Marat Dukhan3b745a42020-05-10 21:43:25 -0700440 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
441 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700442 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhan3b745a42020-05-10 21:43:25 -0700443 xnn_params.f32.gemm.mr = 4;
444 xnn_params.f32.gemm.nr = 8;
Marat Dukhan05702cf2020-03-26 15:41:33 -0700445 break;
Marat Dukhan05702cf2020-03-26 15:41:33 -0700446
Marat Dukhan3b745a42020-05-10 21:43:25 -0700447 case cpuinfo_uarch_cortex_a53:
448 case cpuinfo_uarch_cortex_a55r0:
449 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53);
450 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53);
451 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
452 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700453 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhan3b745a42020-05-10 21:43:25 -0700454 xnn_params.f32.gemm.mr = 4;
455 xnn_params.f32.gemm.nr = 8;
456 break;
457
Frank Barchardf975ee02021-11-05 16:01:00 -0700458 case cpuinfo_uarch_cortex_a35:
Marat Dukhan3b745a42020-05-10 21:43:25 -0700459 case cpuinfo_uarch_cortex_a55:
460 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55);
461 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55);
462 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
463 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700464 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhan3b745a42020-05-10 21:43:25 -0700465 xnn_params.f32.gemm.mr = 4;
466 xnn_params.f32.gemm.nr = 8;
467 break;
468
469 case cpuinfo_uarch_cortex_a57:
470 case cpuinfo_uarch_cortex_a72:
471 case cpuinfo_uarch_cortex_a73:
Frank Barchard78735862022-01-04 16:47:44 -0800472 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a75);
473 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a75);
Marat Dukhan3b745a42020-05-10 21:43:25 -0700474 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
475 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700476 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhan3b745a42020-05-10 21:43:25 -0700477 xnn_params.f32.gemm.mr = 4;
478 xnn_params.f32.gemm.nr = 8;
479 break;
480
481 case cpuinfo_uarch_krait:
482 default:
483 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75);
484 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75);
485 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
486 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700487 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhan3b745a42020-05-10 21:43:25 -0700488 xnn_params.f32.gemm.mr = 4;
489 xnn_params.f32.gemm.nr = 8;
490 break;
491 }
492 #if XNN_MAX_UARCH_TYPES > 1
493 {
494 /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
495 const uint32_t mr = xnn_params.f32.gemm.mr;
496 const uint32_t nr = xnn_params.f32.gemm.nr;
497 const uint32_t log2_sr = xnn_params.f32.gemm.log2_sr;
498 for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
499 const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
500 if (uarch_info == NULL) {
501 /* No more microarchitectures in the system */
Marat Dukhan05702cf2020-03-26 15:41:33 -0700502 break;
Marat Dukhan3b745a42020-05-10 21:43:25 -0700503 }
504
505 switch (uarch_info->uarch) {
506 case cpuinfo_uarch_cortex_a53:
507 case cpuinfo_uarch_cortex_a55r0:
508 if (mr == 4 && nr == 8 && log2_sr == 0) {
509 xnn_params.f32.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53;
510 xnn_params.f32.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53;
511 xnn_params.f32.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64;
512 xnn_params.f32.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64;
513 }
514 break;
515 case cpuinfo_uarch_cortex_a55:
516 if (mr == 4 && nr == 8 && log2_sr == 0) {
517 xnn_params.f32.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55;
518 xnn_params.f32.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55;
519 xnn_params.f32.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64;
520 xnn_params.f32.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64;
521 }
522 break;
523 default:
524 break;
525 }
Marat Dukhan05702cf2020-03-26 15:41:33 -0700526 }
527 }
Marat Dukhan3b745a42020-05-10 21:43:25 -0700528 #endif // XNN_MAX_UARCH_TYPES > 1
529 #else // XNN_ENABLE_ASSEMBLY
530 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128);
531 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__neon_lane_ld128);
532 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
533 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700534 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhan3b745a42020-05-10 21:43:25 -0700535 xnn_params.f32.gemm.mr = 4;
536 xnn_params.f32.gemm.nr = 8;
537 #endif // XNN_ENABLE_ASSEMBLY
538 xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2__neon_lane_ld64);
539 xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2__neon_lane_ld64);
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700540 xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhan3b745a42020-05-10 21:43:25 -0700541 xnn_params.f32.gemm2.mr = 4;
542 xnn_params.f32.gemm2.nr = 2;
543
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -0700544 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x3__neon;
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700545 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_scalar_params;
Frank Barcharddbe781b2021-10-18 10:29:52 -0700546 xnn_params.f32.dwconv[0].channel_tile = 8,
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -0700547 xnn_params.f32.dwconv[0].primary_tile = 3,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700548
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -0700549 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x4__neon;
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700550 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_scalar_params;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -0700551 xnn_params.f32.dwconv[1].channel_tile = 8,
552 xnn_params.f32.dwconv[1].primary_tile = 4,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700553
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -0700554 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x9__neon;
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700555 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
Frank Barcharddbe781b2021-10-18 10:29:52 -0700556 xnn_params.f32.dwconv[2].channel_tile = 8;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -0700557 xnn_params.f32.dwconv[2].primary_tile = 9;
558
559 xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x25__neon_acc2;
560 xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_scalar_params;
561 xnn_params.f32.dwconv[3].channel_tile = 8;
562 xnn_params.f32.dwconv[3].primary_tile = 25;
Marat Dukhan3b745a42020-05-10 21:43:25 -0700563
564 xnn_params.f32.avgpool = (struct avgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -0800565 .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__neon_c4,
566 .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__neon_c4,
567 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
568 .primary_tile = 9,
569 .incremental_tile = 8,
570 .channel_tile = 4,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700571 };
572 xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -0800573 .unipass = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__neon_c4,
574 .multipass = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__neon_c4,
575 .primary_tile = 9,
576 .incremental_tile = 8,
577 .channel_tile = 4,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700578 };
579 xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -0800580 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__neon_c4,
581 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4,
582 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
583 .update.f32 = xnn_update_f32_scaleminmax_scalar_params,
584 .row_tile = 7,
585 .channel_tile = 4,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700586 };
587 xnn_params.f32.maxpool = (struct maxpool_parameters) {
588 .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__neon_c4,
Marat Dukhan91ae1652021-08-15 19:19:49 -0700589 .init.f32 = xnn_init_f32_minmax_scalar_params,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700590 .mr = 9,
591 .qr = 8,
592 };
593 xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
Marat Dukhanef25c6d2020-07-24 00:59:40 -0700594 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__neon_c4,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700595 .mr = 4,
596 };
597 xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
Marat Dukhanef25c6d2020-07-24 00:59:40 -0700598 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__neon_c4,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700599 .mr = 9,
600 };
601 xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
Marat Dukhanef25c6d2020-07-24 00:59:40 -0700602 .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__neon_c4,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700603 .mr = 9,
604 .qr = 8,
605 };
606 xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
607 .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__neon_c8,
608 .pixel_tile = 1,
609 .channel_tile = 8,
610 };
Marat Dukhane5efb162021-12-31 10:26:13 -0800611 xnn_params.f32.abs = (struct vunary_parameters) {
612 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__neon_x8,
613 .element_tile = 8,
614 };
Marat Dukhan94912792021-08-16 21:40:30 -0700615 xnn_params.f32.clamp = (struct vunary_parameters) {
616 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vclamp_ukernel__neon_x8,
617 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
618 .element_tile = 8,
619 };
Marat Dukhanb6bd4bc2020-12-01 17:01:40 -0800620 if (cpuinfo_has_arm_neon_fma()) {
Marat Dukhan4a79ff22022-01-01 12:16:48 -0800621 xnn_params.f32.elu = (struct vunary_parameters) {
622 .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__neonfma_rr1_p6_x8,
623 .init.f32_elu = xnn_init_f32_elu_neonfma_rr1_p6_params,
624 .element_tile = 8,
625 };
Marat Dukhanb6bd4bc2020-12-01 17:01:40 -0800626 } else {
Marat Dukhan4a79ff22022-01-01 12:16:48 -0800627 xnn_params.f32.elu = (struct vunary_parameters) {
628 .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__neon_rr2_lut16_p3_x8,
629 .init.f32_elu = xnn_init_f32_elu_neon_rr2_lut16_p3_params,
630 .element_tile = 8,
631 };
Marat Dukhanb6bd4bc2020-12-01 17:01:40 -0800632 }
Marat Dukhan561d0682021-12-23 16:12:35 -0800633 xnn_params.f32.hswish = (struct vunary_parameters) {
634 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__neon_x16,
Marat Dukhan0d10cc72021-12-23 19:49:19 -0800635 .init.f32_hswish = xnn_init_f32_hswish_scalar_params,
Marat Dukhan561d0682021-12-23 16:12:35 -0800636 .element_tile = 16,
637 };
Marat Dukhan2894e992021-12-30 08:29:48 -0800638 xnn_params.f32.lrelu = (struct vunary_parameters) {
639 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__neon_x8,
640 .init.f32_lrelu = xnn_init_f32_lrelu_scalar_params,
641 .element_tile = 8,
642 };
Marat Dukhane5efb162021-12-31 10:26:13 -0800643 xnn_params.f32.neg = (struct vunary_parameters) {
644 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__neon_x8,
645 .element_tile = 8,
646 };
Marat Dukhan64e52512020-06-09 13:41:16 -0700647 if (cpuinfo_has_arm_neon_v8()) {
Marat Dukhan0e801372022-01-04 00:10:41 -0800648 xnn_params.f32.rndne = (struct vunary_parameters) {
649 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__neonv8_x8,
650 .element_tile = 8,
651 };
652 xnn_params.f32.rndz = (struct vunary_parameters) {
653 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__neonv8_x8,
654 .element_tile = 8,
655 };
656 xnn_params.f32.rndu = (struct vunary_parameters) {
657 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__neonv8_x8,
658 .element_tile = 8,
659 };
660 xnn_params.f32.rndd = (struct vunary_parameters) {
661 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__neonv8_x8,
662 .element_tile = 8,
663 };
Marat Dukhan64e52512020-06-09 13:41:16 -0700664 } else {
Marat Dukhan0e801372022-01-04 00:10:41 -0800665 xnn_params.f32.rndne = (struct vunary_parameters) {
666 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__neon_x8,
667 .element_tile = 8,
668 };
669 xnn_params.f32.rndz = (struct vunary_parameters) {
670 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__neon_x8,
671 .element_tile = 8,
672 };
673 xnn_params.f32.rndu = (struct vunary_parameters) {
674 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__neon_x8,
675 .element_tile = 8,
676 };
677 xnn_params.f32.rndd = (struct vunary_parameters) {
678 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__neon_x8,
679 .element_tile = 8,
680 };
Marat Dukhan64e52512020-06-09 13:41:16 -0700681 }
Marat Dukhance834ad2022-01-03 00:22:01 -0800682 xnn_params.f32.sigmoid = (struct vunary_parameters) {
683 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__neon_rr2_lut64_p2_nr2recps_x8,
Marat Dukhanbbfc27d2022-01-03 13:47:00 -0800684 .init.f32_sigmoid = xnn_init_f32_sigmoid_neon_rr2_lut64_p2_params,
Marat Dukhance834ad2022-01-03 00:22:01 -0800685 .element_tile = 8,
686 };
Marat Dukhane5efb162021-12-31 10:26:13 -0800687 xnn_params.f32.sqr = (struct vunary_parameters) {
688 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__neon_x8,
689 .element_tile = 8,
690 };
Marat Dukhane72b2822021-12-30 14:46:58 -0800691 xnn_params.f32.sqrt = (struct vunary_parameters) {
692 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__scalar_sqrt_x1,
693 .element_tile = 1,
694 };
Marat Dukhan3b745a42020-05-10 21:43:25 -0700695 xnn_params.f32.prelu = (struct prelu_parameters) {
696 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__neon_2x8,
697 .row_tile = 2,
698 .channel_tile = 8,
699 };
Marat Dukhan4a5c7712022-01-05 22:43:13 -0800700 xnn_params.f32.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
701 .ukernel = xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_x8,
702 .init = xnn_init_f32_expminus_neon_rr2_lut64_p2_params,
703 .element_tile = 8,
704 };
Marat Dukhan3b745a42020-05-10 21:43:25 -0700705 xnn_params.f32.rmax = xnn_f32_rmax_ukernel__neon;
706 xnn_params.f32.vadd = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -0700707 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__neon_x8,
708 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__neon_x8,
709 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__neon_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -0800710 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700711 .element_tile = 8,
712 };
713 xnn_params.f32.vdiv = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -0700714 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__scalar_x2,
715 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__scalar_x2,
716 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__scalar_x2,
Marat Dukhanf6004972021-12-30 11:23:02 -0800717 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700718 .element_tile = 2,
719 };
720 xnn_params.f32.vmax = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -0700721 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__neon_x8,
722 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__neon_x8,
723 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__neon_x8,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700724 .element_tile = 8,
725 };
726 xnn_params.f32.vmin = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -0700727 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__neon_x8,
728 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__neon_x8,
729 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__neon_x8,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700730 .element_tile = 8,
731 };
732 xnn_params.f32.vmul = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -0700733 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__neon_x8,
734 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__neon_x8,
735 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__neon_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -0800736 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700737 .element_tile = 8,
738 };
739 xnn_params.f32.vsub = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -0700740 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__neon_x8,
741 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__neon_x8,
742 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__neon_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -0800743 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700744 .element_tile = 8,
745 };
Marat Dukhanf7399262020-06-05 10:58:44 -0700746 xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -0700747 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__neon_x8,
748 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__neon_x8,
749 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__neon_x8,
Marat Dukhanf7399262020-06-05 10:58:44 -0700750 .element_tile = 8,
751 };
Marat Dukhan3b745a42020-05-10 21:43:25 -0700752 xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
Marat Dukhan9531e9f2020-07-24 15:25:02 -0700753 .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c4__neon_2x,
Marat Dukhanc58453f2021-05-24 14:47:38 -0700754 .init.f32 = xnn_init_f32_minmax_scalar_params,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700755 .channel_tile = 4,
756 .row_tile = 2,
757 };
Artsiom Ablavatski97918102020-10-27 15:52:59 -0700758 #ifndef XNN_NO_NCHW_OPERATORS
Marat Dukhancfbed0a2020-12-08 10:01:51 -0800759 init_flags |= XNN_INIT_FLAG_CHW_OPT;
760
Marat Dukhan3e913382020-12-07 13:36:08 -0800761 xnn_params.f32.spmm = (struct spmm_parameters) {
762 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x1__neon,
763 .mr = 32,
764 .nr = 1,
765 };
Marat Dukhanc7634882020-12-07 15:11:12 -0800766 xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
767 .ukernel_with_symm_padding =
768 (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neon_2x2,
769 .output_channel_tile = 4,
770 .output_height_tile = 2,
771 .output_width_tile = 2,
772 };
Marat Dukhan3e913382020-12-07 13:36:08 -0800773 xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
774 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_2x4,
Marat Dukhan3e913382020-12-07 13:36:08 -0800775 .output_width_tile = 4,
776 .output_height_tile = 2,
777 };
778 xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
779 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neon_1x4,
Marat Dukhan3e913382020-12-07 13:36:08 -0800780 .output_width_tile = 4,
781 .output_height_tile = 1,
782 };
783 xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
784 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4,
Marat Dukhan3e913382020-12-07 13:36:08 -0800785 .output_width_tile = 4,
786 .output_height_tile = 1,
787 };
788 xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
789 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4,
Marat Dukhan3e913382020-12-07 13:36:08 -0800790 .output_width_tile = 4,
791 .output_height_tile = 1,
792 };
793 xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
794 .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__neon_x4,
795 .channel_tile = 4,
796 };
Artsiom Ablavatski97918102020-10-27 15:52:59 -0700797 xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
Artsiom Ablavatski2202c812021-01-22 14:16:43 -0800798 .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__neon_p8,
Artsiom Ablavatski97918102020-10-27 15:52:59 -0700799 .channel_tile = 1,
Artsiom Ablavatskib3ffd582021-03-31 13:00:08 -0700800 .pixel_tile = 8,
Artsiom Ablavatski97918102020-10-27 15:52:59 -0700801 };
802 #endif // XNN_NO_NCHW_OPERATORS
Marat Dukhan3b745a42020-05-10 21:43:25 -0700803 #endif // XNN_NO_F32_OPERATORS
804
Frank Barchardb40ee632021-12-30 11:10:02 -0800805 /*************************** VCVT AArch32 micro-kernels ***************************/
Marat Dukhanaf2ba002021-10-24 14:21:41 -0700806 #ifndef XNN_NO_VCVT_OPERATORS
807 init_flags |= XNN_INIT_FLAG_VCVT;
808
809 if (cpuinfo_has_arm_neon_fp16()) {
Marat Dukhan134f9842021-12-29 19:57:31 -0800810 xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
811 .ukernel = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__neonfp16_x16,
812 .element_tile = 16,
813 };
Marat Dukhanb7c1b712021-12-30 07:23:57 -0800814 xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
815 .ukernel = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__neonfp16_x16,
816 .element_tile = 16,
817 };
Marat Dukhanaf2ba002021-10-24 14:21:41 -0700818 } else {
Marat Dukhan134f9842021-12-29 19:57:31 -0800819 xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
820 .ukernel = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__neon_int16_x16,
821 .init.f16_f32_cvt = xnn_init_f16_f32_cvt_neon_params,
822 .element_tile = 16,
823 };
Marat Dukhanb7c1b712021-12-30 07:23:57 -0800824 xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
825 .ukernel = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__neon_x8,
826 .init.f32_f16_cvt = xnn_init_f32_f16_cvt_neon_params,
827 .element_tile = 8,
828 };
Marat Dukhanaf2ba002021-10-24 14:21:41 -0700829 }
Marat Dukhaned2d7762021-12-03 23:51:19 -0800830 if (cpuinfo_has_arm_neon_v8()) {
831 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
832 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__neonv8_x32,
833 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_neonv8_params,
834 .element_tile = 32,
835 };
836 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
837 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__neonv8_x32,
838 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_neonv8_params,
839 .element_tile = 32,
840 };
841 } else {
842 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
843 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__neon_x32,
844 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_neon_params,
845 .element_tile = 32,
846 };
847 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
848 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__neon_x32,
849 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_neon_params,
850 .element_tile = 32,
851 };
852 }
Marat Dukhanf92206b2021-12-10 17:02:07 -0800853 xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
854 .ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__neon_x32,
855 .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_neon_params,
856 .element_tile = 32,
857 };
858 xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
859 .ukernel = (xnn_univector_ukernel_function) xnn_qu8_f32_vcvt_ukernel__neon_x32,
860 .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_neon_params,
861 .element_tile = 32,
862 };
Marat Dukhanaf2ba002021-10-24 14:21:41 -0700863 #endif // XNN_NO_VCVT_OPERATORS
864
Frank Barchardb40ee632021-12-30 11:10:02 -0800865 /**************************** X32 AArch32 micro-kernels ****************************/
Marat Dukhan3b745a42020-05-10 21:43:25 -0700866 #ifndef XNN_NO_X32_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -0700867 init_flags |= XNN_INIT_FLAG_X32;
868
Marat Dukhan3b745a42020-05-10 21:43:25 -0700869 xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__neon;
870 xnn_params.x32.zip = (struct zip_parameters) {
871 .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__neon,
872 .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__neon,
873 .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__neon,
874 .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__neon,
875 };
Artsiom Ablavatskibbe85062020-11-05 14:07:37 -0800876 #ifndef XNN_NO_NCHW_OPERATORS
Marat Dukhanad71b9a2020-11-20 00:01:51 -0800877 xnn_params.x32.depthtospace2d_chw2hwc = (struct depthtospace2d_chw2hwc_parameters) {
878 .ukernel = (xnn_depthtospace2d_chw2hwc_ukernel_function) xnn_x32_depthtospace2d_chw2hwc_ukernel__scalar,
Artsiom Ablavatskibbe85062020-11-05 14:07:37 -0800879 .channel_tile = 1,
880 .pixel_tile = 1,
881 };
882 #endif // XNN_NO_NCHW_OPERATORS
Marat Dukhan3b745a42020-05-10 21:43:25 -0700883 #endif // XNN_NO_X32_OPERATORS
Marat Dukhan933051b2021-08-07 16:26:15 -0700884
Frank Barchardb40ee632021-12-30 11:10:02 -0800885 /**************************** XX AArch32 micro-kernels ****************************/
Marat Dukhan933051b2021-08-07 16:26:15 -0700886 #ifndef XNN_NO_XX_OPERATORS
887 init_flags |= XNN_INIT_FLAG_XX;
888
889 xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
890 xnn_params.xx.fill = (struct fill_parameters) {
891 .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__neon_x64,
892 .row_tile = 1,
893 };
Marat Dukhan0461f2d2021-08-08 12:36:29 -0700894 xnn_params.xx.pad = (struct pad_parameters) {
895 .ukernel = (xnn_pad_ukernel_function) xnn_xx_pad_ukernel__neon,
896 .row_tile = 1,
897 };
Marat Dukhan933051b2021-08-07 16:26:15 -0700898 #endif // XNN_NO_XX_OPERATORS
899
Marat Dukhan3b745a42020-05-10 21:43:25 -0700900 } else if (!XNN_PLATFORM_MOBILE) {
Marat Dukhan933051b2021-08-07 16:26:15 -0700901
Frank Barchardb40ee632021-12-30 11:10:02 -0800902 /*************************** QS8 AArch32 Pre-NEON micro-kernels ***************************/
Marat Dukhan66a3ca12021-08-06 18:24:19 -0700903 #ifndef XNN_NO_QS8_OPERATORS
904 init_flags |= XNN_INIT_FLAG_QS8;
905
Marat Dukhan2ac722e2022-01-04 01:54:20 -0800906 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_2x2__scalar_fmagic);
907 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_2x2__scalar_fmagic);
908 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic);
909 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x2__scalar_fmagic);
910 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params;
Marat Dukhan66a3ca12021-08-06 18:24:19 -0700911 xnn_params.qs8.gemm.mr = 2;
912 xnn_params.qs8.gemm.nr = 2;
913
Marat Dukhan2ac722e2022-01-04 01:54:20 -0800914 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up1x9__scalar_fmagic;
915 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params;
Marat Dukhan66a3ca12021-08-06 18:24:19 -0700916 xnn_params.qs8.dwconv[0].channel_tile = 1;
917 xnn_params.qs8.dwconv[0].primary_tile = 9;
Marat Dukhan2ac722e2022-01-04 01:54:20 -0800918 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up1x25__scalar_fmagic;
919 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params;
Marat Dukhan66a3ca12021-08-06 18:24:19 -0700920 xnn_params.qs8.dwconv[1].channel_tile = 1;
921 xnn_params.qs8.dwconv[1].primary_tile = 25;
922
923 xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
Marat Dukhan847ff5e2022-01-11 20:31:06 -0800924 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1,
925 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1,
Marat Dukhan53f41062022-01-11 19:44:57 -0800926 .init.qs8 = xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params,
927 .update.qs8 = xnn_update_qs8_avgpool_minmax_fp32_scalar_imagic_params,
Marat Dukhanda382d12022-01-07 19:51:20 -0800928 .row_tile = 7,
929 .channel_tile = 1,
Marat Dukhan66a3ca12021-08-06 18:24:19 -0700930 };
931 xnn_params.qs8.vadd = (struct vbinary_parameters) {
932 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__scalar_x1,
933 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__scalar_x1,
934 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__scalar_x1,
Marat Dukhan64287252021-09-07 16:20:03 -0700935 .init.qs8_addsub = xnn_init_qs8_add_minmax_scalar_params,
Marat Dukhan66a3ca12021-08-06 18:24:19 -0700936 .element_tile = 1,
937 };
938 xnn_params.qs8.vmul = (struct vbinary_parameters) {
939 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_fp32_ukernel__scalar_x4,
940 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_x4,
941 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_x4,
942 .init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_scalar_params,
943 .element_tile = 4,
944 };
945 #endif // XNN_NO_QS8_OPERATORS
946
Frank Barchardb40ee632021-12-30 11:10:02 -0800947 /*************************** QU8 AArch32 Pre-NEON micro-kernels ***************************/
Marat Dukhan08b7a972020-07-14 18:17:29 -0700948 #ifndef XNN_NO_QU8_OPERATORS
949 init_flags |= XNN_INIT_FLAG_QU8;
Marat Dukhan854fb6b2020-06-19 12:33:44 -0700950
Marat Dukhan2ac722e2022-01-04 01:54:20 -0800951 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_fmagic);
952 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_2x2__scalar_fmagic);
953 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic);
954 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_fmagic);
955 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params;
Marat Dukhan08b7a972020-07-14 18:17:29 -0700956 xnn_params.qu8.gemm.mr = 2;
957 xnn_params.qu8.gemm.nr = 2;
Marat Dukhan3b745a42020-05-10 21:43:25 -0700958
Marat Dukhan2ac722e2022-01-04 01:54:20 -0800959 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up1x9__scalar_fmagic;
960 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params;
Marat Dukhan08b7a972020-07-14 18:17:29 -0700961 xnn_params.qu8.dwconv[0].channel_tile = 1;
962 xnn_params.qu8.dwconv[0].primary_tile = 9;
Marat Dukhan2ac722e2022-01-04 01:54:20 -0800963 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up1x25__scalar_fmagic;
964 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params;
Marat Dukhan43b46ee2021-07-15 19:07:50 -0700965 xnn_params.qu8.dwconv[1].channel_tile = 1;
966 xnn_params.qu8.dwconv[1].primary_tile = 25;
Marat Dukhan3b745a42020-05-10 21:43:25 -0700967
Marat Dukhan08b7a972020-07-14 18:17:29 -0700968 xnn_params.qu8.avgpool = (struct avgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -0800969 .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__scalar_c1,
970 .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__scalar_c1,
971 .init.qu8 = xnn_init_qu8_avgpool_minmax_scalar_params,
972 .primary_tile = 9,
973 .incremental_tile = 8,
974 .channel_tile = 1,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700975 };
Marat Dukhan08b7a972020-07-14 18:17:29 -0700976 xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
Marat Dukhand1f53e42022-01-12 22:34:51 -0800977 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1,
978 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1,
979 .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params,
980 .update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_scalar_imagic_params,
Marat Dukhanda382d12022-01-07 19:51:20 -0800981 .row_tile = 7,
982 .channel_tile = 1,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700983 };
Marat Dukhandb007cd2021-07-20 23:42:39 -0700984 xnn_params.qu8.vadd = (struct vbinary_parameters) {
985 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__scalar_x1,
986 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__scalar_x1,
987 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__scalar_x1,
Marat Dukhan64287252021-09-07 16:20:03 -0700988 .init.qu8_addsub = xnn_init_qu8_add_minmax_scalar_params,
Marat Dukhandb007cd2021-07-20 23:42:39 -0700989 .element_tile = 1,
990 };
Marat Dukhan3c5e6622021-08-06 00:38:05 -0700991 xnn_params.qu8.vmul = (struct vbinary_parameters) {
992 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_fp32_ukernel__scalar_x4,
993 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_x4,
994 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_x4,
995 .init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_scalar_params,
996 .element_tile = 4,
997 };
Marat Dukhan08b7a972020-07-14 18:17:29 -0700998 #endif // XNN_NO_QU8_OPERATORS
Marat Dukhan3b745a42020-05-10 21:43:25 -0700999
Frank Barchardb40ee632021-12-30 11:10:02 -08001000 /**************************** S8 AArch32 Pre-NEON micro-kernels ****************************/
Marat Dukhan23147532021-08-16 07:26:56 -07001001 #ifndef XNN_NO_S8_OPERATORS
1002 init_flags |= XNN_INIT_FLAG_S8;
1003
Marat Dukhan61c0c9e2021-08-16 23:16:14 -07001004 xnn_params.s8.clamp = (struct vunary_parameters) {
1005 .ukernel = (xnn_univector_ukernel_function) xnn_s8_vclamp_ukernel__scalar_x4,
Marat Dukhan3b9b4bc2021-08-24 00:23:07 -07001006 .init.s8_minmax = xnn_init_s8_minmax_scalar_params,
Marat Dukhan61c0c9e2021-08-16 23:16:14 -07001007 .element_tile = 4,
1008 };
Marat Dukhan24abe6b2021-11-24 15:28:57 -08001009 xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
1010 .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__scalar_c1,
1011 .pixel_tile = 1,
1012 .channel_tile = 1,
1013 };
Marat Dukhan23147532021-08-16 07:26:56 -07001014 xnn_params.s8.maxpool = (struct maxpool_parameters) {
1015 .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__scalar_c1,
1016 .init.s8 = xnn_init_s8_minmax_scalar_params,
1017 .mr = 9,
1018 .qr = 8,
1019 };
1020 #endif // XNN_NO_S8_OPERATORS
1021
Frank Barchardb40ee632021-12-30 11:10:02 -08001022 /**************************** U8 AArch32 Pre-NEON micro-kernels ****************************/
Marat Dukhan3b745a42020-05-10 21:43:25 -07001023 #ifndef XNN_NO_U8_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07001024 init_flags |= XNN_INIT_FLAG_U8;
1025
Marat Dukhan94912792021-08-16 21:40:30 -07001026 xnn_params.u8.clamp = (struct vunary_parameters) {
1027 .ukernel = (xnn_univector_ukernel_function) xnn_u8_vclamp_ukernel__scalar_x4,
Marat Dukhan3b9b4bc2021-08-24 00:23:07 -07001028 .init.u8_minmax = xnn_init_u8_minmax_scalar_params,
Marat Dukhan94912792021-08-16 21:40:30 -07001029 .element_tile = 4,
1030 };
Marat Dukhan24abe6b2021-11-24 15:28:57 -08001031 xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
1032 .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__scalar_c1,
1033 .pixel_tile = 1,
1034 .channel_tile = 1,
1035 };
Marat Dukhan3b745a42020-05-10 21:43:25 -07001036 xnn_params.u8.maxpool = (struct maxpool_parameters) {
1037 .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__scalar_c1,
Marat Dukhan91ae1652021-08-15 19:19:49 -07001038 .init.u8 = xnn_init_u8_minmax_scalar_params,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001039 .mr = 9,
1040 .qr = 8,
1041 };
Marat Dukhan3b745a42020-05-10 21:43:25 -07001042 xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
1043 xnn_params.u8.rmax = xnn_u8_rmax_ukernel__scalar;
1044 #endif // XNN_NO_U8_OPERATORS
1045
Frank Barchardb40ee632021-12-30 11:10:02 -08001046 /**************************** X8 AArch32 Pre-NEON micro-kernels ****************************/
Marat Dukhan3b745a42020-05-10 21:43:25 -07001047 #ifndef XNN_NO_X8_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07001048 init_flags |= XNN_INIT_FLAG_X8;
1049
Marat Dukhand67539d2021-09-08 23:06:03 -07001050 xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar_x4;
Marat Dukhan3b745a42020-05-10 21:43:25 -07001051 xnn_params.x8.zip = (struct zip_parameters) {
1052 .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__scalar,
1053 .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__scalar,
1054 .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__scalar,
1055 .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__scalar,
1056 };
1057 #endif // XNN_NO_X8_OPERATORS
1058
Frank Barchardb40ee632021-12-30 11:10:02 -08001059 /**************************** F32 AArch32 Pre-NEON micro-kernels ****************************/
Marat Dukhan3b745a42020-05-10 21:43:25 -07001060 #ifndef XNN_NO_F32_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07001061 init_flags |= XNN_INIT_FLAG_F32;
1062
Marat Dukhan3b745a42020-05-10 21:43:25 -07001063 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x4__scalar);
1064 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x4__scalar);
1065 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x4__scalar);
1066 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x4__scalar);
Marat Dukhan467f6362020-05-22 23:21:55 -07001067 xnn_params.f32.gemm.relu.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_4x4__scalar);
1068 xnn_params.f32.gemm.relu.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_4x4__scalar);
1069 xnn_params.f32.gemm.relu.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x4__scalar);
1070 xnn_params.f32.gemm.relu.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x4__scalar);
Marat Dukhan3b745a42020-05-10 21:43:25 -07001071 xnn_params.f32.gemm.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x4__scalar);
1072 xnn_params.f32.gemm.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x4__scalar);
1073 xnn_params.f32.gemm.linear.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x4__scalar);
1074 xnn_params.f32.gemm.linear.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x4__scalar);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001075 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07001076 xnn_params.f32.gemm.mr = 4;
Marat Dukhan3b745a42020-05-10 21:43:25 -07001077 xnn_params.f32.gemm.nr = 4;
Marat Dukhanaefaef32020-04-09 07:09:34 -07001078
Marat Dukhan3b745a42020-05-10 21:43:25 -07001079 xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2__scalar);
1080 xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2__scalar),
1081 xnn_params.f32.gemm2.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x2__scalar);
1082 xnn_params.f32.gemm2.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2__scalar),
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001083 xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhan3b745a42020-05-10 21:43:25 -07001084 xnn_params.f32.gemm2.mr = 4;
1085 xnn_params.f32.gemm2.nr = 2;
Marat Dukhanaefaef32020-04-09 07:09:34 -07001086
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07001087 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x3__scalar_acc2;
1088 xnn_params.f32.dwconv[0].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x3__scalar_acc2;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001089 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhan3b745a42020-05-10 21:43:25 -07001090 xnn_params.f32.dwconv[0].channel_tile = 1;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07001091 xnn_params.f32.dwconv[0].primary_tile = 3;
Marat Dukhanaefaef32020-04-09 07:09:34 -07001092
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07001093 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x4__scalar_acc2;
1094 xnn_params.f32.dwconv[1].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x4__scalar_acc2;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001095 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhan3b745a42020-05-10 21:43:25 -07001096 xnn_params.f32.dwconv[1].channel_tile = 1;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07001097 xnn_params.f32.dwconv[1].primary_tile = 4;
Marat Dukhanaefaef32020-04-09 07:09:34 -07001098
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07001099 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x9__scalar_acc2;
1100 xnn_params.f32.dwconv[2].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x9__scalar_acc2;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001101 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhan3b745a42020-05-10 21:43:25 -07001102 xnn_params.f32.dwconv[2].channel_tile = 1;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07001103 xnn_params.f32.dwconv[2].primary_tile = 9;
1104
1105 xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x25__scalar_acc2;
1106 xnn_params.f32.dwconv[3].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x25__scalar_acc2;
1107 xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_scalar_params;
1108 xnn_params.f32.dwconv[3].channel_tile = 1;
1109 xnn_params.f32.dwconv[3].primary_tile = 25;
XNNPACK Teamb455b122019-09-27 18:10:33 -07001110
Marat Dukhan3b745a42020-05-10 21:43:25 -07001111 xnn_params.f32.avgpool = (struct avgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08001112 .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__scalar_c1,
1113 .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__scalar_c1,
1114 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
1115 .primary_tile = 9,
1116 .incremental_tile = 8,
1117 .channel_tile = 1,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001118 };
1119 xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08001120 .unipass = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__scalar_c1,
1121 .multipass = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__scalar_c1,
1122 .primary_tile = 9,
1123 .incremental_tile = 8,
1124 .channel_tile = 1,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001125 };
1126 xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08001127 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__scalar_c1,
1128 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__scalar_c1,
1129 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
1130 .update.f32 = xnn_update_f32_scaleminmax_scalar_params,
1131 .row_tile = 7,
1132 .channel_tile = 1,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001133 };
1134 xnn_params.f32.maxpool = (struct maxpool_parameters) {
1135 .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__scalar_c1,
Marat Dukhan91ae1652021-08-15 19:19:49 -07001136 .init.f32 = xnn_init_f32_minmax_scalar_params,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001137 .mr = 9,
1138 .qr = 8,
1139 };
1140 xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
1141 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__scalar_c1,
1142 .mr = 4,
1143 };
1144 xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
1145 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__scalar_c1,
1146 .mr = 9,
1147 };
1148 xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
1149 .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1,
1150 .mr = 9,
1151 .qr = 8,
1152 };
1153 xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
1154 .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__scalar_c2,
1155 .pixel_tile = 1,
1156 .channel_tile = 2,
1157 };
Marat Dukhane5efb162021-12-31 10:26:13 -08001158 xnn_params.f32.abs = (struct vunary_parameters) {
1159 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__scalar_x4,
1160 .element_tile = 4,
1161 };
Marat Dukhan94912792021-08-16 21:40:30 -07001162 xnn_params.f32.clamp = (struct vunary_parameters) {
1163 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vclamp_ukernel__scalar_x4,
1164 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
1165 .element_tile = 4,
1166 };
Marat Dukhan4a79ff22022-01-01 12:16:48 -08001167 xnn_params.f32.elu = (struct vunary_parameters) {
1168 .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x4,
1169 .init.f32_elu = xnn_init_f32_elu_scalar_rr2_lut16_p3_params,
1170 .element_tile = 4,
1171 };
Marat Dukhan561d0682021-12-23 16:12:35 -08001172 xnn_params.f32.hswish = (struct vunary_parameters) {
1173 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__scalar_x4,
Marat Dukhan0d10cc72021-12-23 19:49:19 -08001174 .init.f32_hswish = xnn_init_f32_hswish_scalar_params,
Marat Dukhan561d0682021-12-23 16:12:35 -08001175 .element_tile = 4,
1176 };
Marat Dukhan2894e992021-12-30 08:29:48 -08001177 xnn_params.f32.lrelu = (struct vunary_parameters) {
1178 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__scalar_x4,
1179 .init.f32_lrelu = xnn_init_f32_lrelu_scalar_params,
1180 .element_tile = 4,
1181 };
Marat Dukhane5efb162021-12-31 10:26:13 -08001182 xnn_params.f32.neg = (struct vunary_parameters) {
1183 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__scalar_x4,
1184 .element_tile = 4,
1185 };
Marat Dukhan0e801372022-01-04 00:10:41 -08001186 xnn_params.f32.rndne = (struct vunary_parameters) {
1187 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__scalar_libm_x1,
1188 .element_tile = 1,
1189 };
1190 xnn_params.f32.rndz = (struct vunary_parameters) {
1191 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__scalar_libm_x1,
1192 .element_tile = 1,
1193 };
1194 xnn_params.f32.rndu = (struct vunary_parameters) {
1195 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__scalar_libm_x1,
1196 .element_tile = 1,
1197 };
1198 xnn_params.f32.rndd = (struct vunary_parameters) {
1199 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__scalar_libm_x1,
1200 .element_tile = 1,
1201 };
Marat Dukhance834ad2022-01-03 00:22:01 -08001202 xnn_params.f32.sigmoid = (struct vunary_parameters) {
1203 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__scalar_rr2_lut64_p2_div_x2,
1204 .init.f32_sigmoid = xnn_init_f32_sigmoid_scalar_rr2_lut64_p2_params,
1205 .element_tile = 2,
1206 };
Marat Dukhane5efb162021-12-31 10:26:13 -08001207 xnn_params.f32.sqr = (struct vunary_parameters) {
1208 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__scalar_x4,
1209 .element_tile = 4,
1210 };
Marat Dukhane72b2822021-12-30 14:46:58 -08001211 xnn_params.f32.sqrt = (struct vunary_parameters) {
1212 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__scalar_sqrt_x1,
1213 .element_tile = 1,
1214 };
Marat Dukhan3b745a42020-05-10 21:43:25 -07001215 xnn_params.f32.prelu = (struct prelu_parameters) {
1216 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__scalar_2x4,
1217 .row_tile = 4,
1218 .channel_tile = 4,
1219 };
Marat Dukhan4a5c7712022-01-05 22:43:13 -08001220 xnn_params.f32.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
1221 .ukernel = xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_p5_x4_acc2,
1222 .init = xnn_init_f32_expminus_scalar_rr2_p5_params,
1223 .element_tile = 4,
1224 };
Marat Dukhan3b745a42020-05-10 21:43:25 -07001225 xnn_params.f32.rmax = xnn_f32_rmax_ukernel__scalar;
1226 xnn_params.f32.vadd = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07001227 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__scalar_x8,
1228 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__scalar_x8,
1229 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__scalar_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -08001230 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001231 .element_tile = 8,
1232 };
1233 xnn_params.f32.vdiv = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07001234 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__scalar_x2,
1235 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__scalar_x2,
1236 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__scalar_x2,
Marat Dukhanf6004972021-12-30 11:23:02 -08001237 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001238 .element_tile = 2,
1239 };
1240 xnn_params.f32.vmax = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07001241 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__scalar_x8,
1242 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__scalar_x8,
1243 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__scalar_x8,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001244 .element_tile = 8,
1245 };
1246 xnn_params.f32.vmin = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07001247 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__scalar_x8,
1248 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__scalar_x8,
1249 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__scalar_x8,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001250 .element_tile = 8,
1251 };
1252 xnn_params.f32.vmul = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07001253 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__scalar_x8,
1254 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__scalar_x8,
1255 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__scalar_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -08001256 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001257 .element_tile = 8,
1258 };
1259 xnn_params.f32.vsub = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07001260 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__scalar_x8,
1261 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__scalar_x8,
1262 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__scalar_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -08001263 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001264 .element_tile = 8,
1265 };
Marat Dukhanf7399262020-06-05 10:58:44 -07001266 xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07001267 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__scalar_x8,
1268 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x8,
1269 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x8,
Marat Dukhanf7399262020-06-05 10:58:44 -07001270 .element_tile = 8,
1271 };
Marat Dukhan3b745a42020-05-10 21:43:25 -07001272 xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
Marat Dukhan9531e9f2020-07-24 15:25:02 -07001273 .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c1__scalar_2x,
Marat Dukhanc58453f2021-05-24 14:47:38 -07001274 .init.f32 = xnn_init_f32_minmax_scalar_params,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001275 .channel_tile = 1,
1276 .row_tile = 2,
1277 };
1278 #ifndef XNN_NO_NCHW_OPERATORS
Marat Dukhancfbed0a2020-12-08 10:01:51 -08001279 init_flags |= XNN_INIT_FLAG_CHW_OPT;
1280
Marat Dukhan3b745a42020-05-10 21:43:25 -07001281 xnn_params.f32.spmm = (struct spmm_parameters) {
1282 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x1__scalar,
1283 .mr = 8,
1284 .nr = 1,
1285 };
1286 xnn_params.f32.spmm2 = (struct spmm_parameters) {
1287 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x2__scalar,
1288 .mr = 8,
1289 .nr = 2,
1290 };
1291 xnn_params.f32.spmm4 = (struct spmm_parameters) {
1292 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x4__scalar,
1293 .mr = 8,
1294 .nr = 4,
1295 };
Marat Dukhan1f29b802020-05-15 23:46:39 -07001296 xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
Marat Dukhan3b745a42020-05-10 21:43:25 -07001297 .ukernel_with_symm_padding =
Marat Dukhan1f29b802020-05-15 23:46:39 -07001298 (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__scalar_1x1,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001299 .output_channel_tile = 4,
1300 .output_height_tile = 1,
1301 .output_width_tile = 1,
1302 };
Marat Dukhanbf715f92020-10-23 20:17:00 -07001303 xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
Marat Dukhan4ddfab42020-12-07 17:33:11 -08001304 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1,
Marat Dukhan4ddfab42020-12-07 17:33:11 -08001305 .output_width_tile = 1,
1306 .output_height_tile = 4,
1307 };
1308 xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
1309 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_2x1_acc2,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001310 .output_width_tile = 1,
Marat Dukhan91249d22020-10-24 12:02:51 -07001311 .output_height_tile = 2,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001312 };
Marat Dukhanbf715f92020-10-23 20:17:00 -07001313 xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
Marat Dukhan4ddfab42020-12-07 17:33:11 -08001314 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001315 .output_width_tile = 1,
Marat Dukhan4ddfab42020-12-07 17:33:11 -08001316 .output_height_tile = 2,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001317 };
Marat Dukhanbf715f92020-10-23 20:17:00 -07001318 xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
Marat Dukhan4ddfab42020-12-07 17:33:11 -08001319 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc2,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001320 .output_width_tile = 1,
Marat Dukhan4ddfab42020-12-07 17:33:11 -08001321 .output_height_tile = 2,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001322 };
Marat Dukhan1f29b802020-05-15 23:46:39 -07001323 xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
1324 .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__scalar_x1,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001325 .channel_tile = 1,
1326 };
Artsiom Ablavatski97918102020-10-27 15:52:59 -07001327 xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
1328 .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__scalar_p4,
1329 .channel_tile = 1,
1330 .pixel_tile = 4,
1331 };
Marat Dukhan3b745a42020-05-10 21:43:25 -07001332 #endif // XNN_NO_NCHW_OPERATORS
1333 #endif // XNN_NO_F32_OPERATORS
1334
Frank Barchardb40ee632021-12-30 11:10:02 -08001335 /*************************** VCVT AArch32 Pre-NEON micro-kernels ***************************/
Marat Dukhanaf2ba002021-10-24 14:21:41 -07001336 #ifndef XNN_NO_VCVT_OPERATORS
1337 init_flags |= XNN_INIT_FLAG_VCVT;
1338
Marat Dukhan134f9842021-12-29 19:57:31 -08001339 xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
1340 .ukernel = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__scalar_x4,
1341 .init.f16_f32_cvt = xnn_init_f16_f32_cvt_scalar_params,
1342 .element_tile = 4,
1343 };
Marat Dukhanb7c1b712021-12-30 07:23:57 -08001344 xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
1345 .ukernel = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__scalar_fabsf_x2,
1346 .init.f32_f16_cvt = xnn_init_f32_f16_cvt_scalar_fabsf_params,
1347 .element_tile = 2,
1348 };
Marat Dukhaned2d7762021-12-03 23:51:19 -08001349 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
Marat Dukhanbdf10992022-01-04 09:20:14 -08001350 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__scalar_imagic_x4,
1351 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_scalar_imagic_params,
Marat Dukhaned2d7762021-12-03 23:51:19 -08001352 .element_tile = 4,
1353 };
1354 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
Marat Dukhanbdf10992022-01-04 09:20:14 -08001355 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__scalar_imagic_x4,
1356 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_scalar_imagic_params,
Marat Dukhaned2d7762021-12-03 23:51:19 -08001357 .element_tile = 4,
1358 };
Marat Dukhanf92206b2021-12-10 17:02:07 -08001359 xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
1360 .ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__scalar_x4,
1361 .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_scalar_params,
1362 .element_tile = 4,
1363 };
1364 xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
1365 .ukernel = (xnn_univector_ukernel_function) xnn_qu8_f32_vcvt_ukernel__scalar_x4,
1366 .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_scalar_params,
1367 .element_tile = 4,
1368 };
Marat Dukhanaf2ba002021-10-24 14:21:41 -07001369 #endif // XNN_NO_VCVT_OPERATORS
1370
Frank Barchardb40ee632021-12-30 11:10:02 -08001371 /**************************** X32 AArch32 Pre-NEON micro-kernels ****************************/
Marat Dukhan3b745a42020-05-10 21:43:25 -07001372 #ifndef XNN_NO_X32_OPERATORS
// X32 setup for this code path: every entry below is wired to a scalar micro-kernel.
// The whole section is compiled out when XNN_NO_X32_OPERATORS is defined.
// Set the X32 bit in init_flags.
Marat Dukhan854fb6b2020-06-19 12:33:44 -07001373 init_flags |= XNN_INIT_FLAG_X32;
1374
Marat Dukhan3b745a42020-05-10 21:43:25 -07001375 xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__scalar;
// Zip table: dedicated entries for 2, 3 and 4 inputs (x2/x3/x4) plus the xm entry, which uses a different function-pointer type (zipv rather than zipc).
1376 xnn_params.x32.zip = (struct zip_parameters) {
1377 .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__scalar,
1378 .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__scalar,
1379 .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__scalar,
1380 .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__scalar,
1381 };
// The CHW -> HWC depth-to-space kernel is only registered when NCHW operators are compiled in.
Artsiom Ablavatskibbe85062020-11-05 14:07:37 -08001382 #ifndef XNN_NO_NCHW_OPERATORS
Marat Dukhanad71b9a2020-11-20 00:01:51 -08001383 xnn_params.x32.depthtospace2d_chw2hwc = (struct depthtospace2d_chw2hwc_parameters) {
1384 .ukernel = (xnn_depthtospace2d_chw2hwc_ukernel_function) xnn_x32_depthtospace2d_chw2hwc_ukernel__scalar,
// Both tile sizes are 1 for the scalar kernel.
Artsiom Ablavatskibbe85062020-11-05 14:07:37 -08001385 .channel_tile = 1,
1386 .pixel_tile = 1,
1387 };
1388 #endif // XNN_NO_NCHW_OPERATORS
Marat Dukhan3b745a42020-05-10 21:43:25 -07001389 #endif // XNN_NO_X32_OPERATORS
Marat Dukhan933051b2021-08-07 16:26:15 -07001390
Frank Barchardb40ee632021-12-30 11:10:02 -08001391 /**************************** XX AArch32 Pre-NEON micro-kernels ****************************/
Marat Dukhan933051b2021-08-07 16:26:15 -07001392 #ifndef XNN_NO_XX_OPERATORS
// XX setup for this code path: registers the copy, fill and pad micro-kernels.
// The whole section is compiled out when XNN_NO_XX_OPERATORS is defined.
// Set the XX bit in init_flags.
1393 init_flags |= XNN_INIT_FLAG_XX;
1394
// Copy is registered with the memcpy-named micro-kernel, stored through the generic univector function-pointer type.
1395 xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
// Fill and pad both use scalar kernels registered with row_tile = 1.
1396 xnn_params.xx.fill = (struct fill_parameters) {
1397 .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__scalar_x16,
1398 .row_tile = 1,
1399 };
Marat Dukhan0461f2d2021-08-08 12:36:29 -07001400 xnn_params.xx.pad = (struct pad_parameters) {
1401 .ukernel = (xnn_pad_ukernel_function) xnn_xx_pad_ukernel__scalar,
1402 .row_tile = 1,
1403 };
Marat Dukhan933051b2021-08-07 16:26:15 -07001404 #endif // XNN_NO_XX_OPERATORS
Marat Dukhan3b745a42020-05-10 21:43:25 -07001405 }
XNNPACK Teamb455b122019-09-27 18:10:33 -07001406
Marat Dukhan1dadbf72019-10-01 10:46:20 -07001407#elif XNN_ARCH_ARM64
XNNPACK Teamb455b122019-09-27 18:10:33 -07001408
Frank Barchardb40ee632021-12-30 11:10:02 -08001409 /**************************** QC8 AArch64 micro-kernels ****************************/
Marat Dukhan898d5852021-06-30 21:18:34 -07001410 #ifndef XNN_NO_QC8_OPERATORS
// QC8 setup for AArch64. Sets the QC8 bit in init_flags, then fills xnn_params.qc8.gemm
// (gemm/igemm and their 1-row gemm1/igemm1 variants, the params initializer, mr, nr, log2_kr and,
// for the c2s4 kernels, log2_sr) and the two xnn_params.qc8.dwconv entries.
// The GEMM choice is keyed at compile time on XNN_PLATFORM_IOS/XNN_PLATFORM_MAC and XNN_ENABLE_ASSEMBLY,
// and at run time on cpuinfo_has_arm_neon_dot() and, on non-Apple assembly builds, the uarch of core 0.
// Every branch uses xnn_init_qs8_minmax_neonv8_params as the params initializer.
1411 init_flags |= XNN_INIT_FLAG_QC8;
1412
Marat Dukhan75d1b792021-07-01 13:00:28 -07001413 #if XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
// Apple platforms: the choice depends only on dot-product support; there is no per-uarch switch.
1414 #if XNN_ENABLE_ASSEMBLY
1415 if (cpuinfo_has_arm_neon_dot()) {
1416 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128);
1417 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot);
1418 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128);
1419 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16c4__neondot);
Marat Dukhan7988a182021-12-06 22:00:33 -08001420 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Marat Dukhan75d1b792021-07-01 13:00:28 -07001421 xnn_params.qc8.gemm.mr = 4;
1422 xnn_params.qc8.gemm.nr = 16;
1423 xnn_params.qc8.gemm.log2_kr = 2;
1424 } else {
Frank Barcharde22685a2021-11-12 11:36:58 -08001425 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal);
1426 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal);
1427 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal);
1428 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal);
Marat Dukhan7988a182021-12-06 22:00:33 -08001429 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Marat Dukhan75d1b792021-07-01 13:00:28 -07001430 xnn_params.qc8.gemm.mr = 2;
1431 xnn_params.qc8.gemm.nr = 8;
1432 xnn_params.qc8.gemm.log2_kr = 3;
1433 }
1434 #else // !XNN_ENABLE_ASSEMBLY
// Without assembly, the non-dot fallback is the 2x8c2s4 kernel; it is the only variant here that also sets log2_sr.
1435 if (cpuinfo_has_arm_neon_dot()) {
1436 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__neondot);
1437 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot);
1438 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__neondot);
1439 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16c4__neondot);
Marat Dukhan7988a182021-12-06 22:00:33 -08001440 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Marat Dukhan75d1b792021-07-01 13:00:28 -07001441 xnn_params.qc8.gemm.mr = 4;
1442 xnn_params.qc8.gemm.nr = 16;
1443 xnn_params.qc8.gemm.log2_kr = 2;
1444 } else {
Frank Barcharde22685a2021-11-12 11:36:58 -08001445 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
1446 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
1447 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
1448 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
Marat Dukhan7988a182021-12-06 22:00:33 -08001449 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Marat Dukhan75d1b792021-07-01 13:00:28 -07001450 xnn_params.qc8.gemm.mr = 2;
1451 xnn_params.qc8.gemm.nr = 8;
1452 xnn_params.qc8.gemm.log2_kr = 1;
Frank Barcharde7043ff2021-11-10 14:50:08 -08001453 xnn_params.qc8.gemm.log2_sr = 2;
Marat Dukhan75d1b792021-07-01 13:00:28 -07001454 }
1455 #endif // XNN_ENABLE_ASSEMBLY
1456 #else // !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
1457 #if XNN_ENABLE_ASSEMBLY
// Non-Apple, assembly enabled: the multi-row gemm/igemm kernel is picked from the uarch of core 0.
1458 if (cpuinfo_has_arm_neon_dot()) {
// With dot-product support only the multi-row kernel varies by uarch; gemm1/igemm1 and the tile parameters are set once after the switch.
1459 switch (cpuinfo_get_core(0)->uarch) {
1460 case cpuinfo_uarch_cortex_a55:
1461 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55);
1462 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55);
1463 break;
1464 case cpuinfo_uarch_cortex_x1:
1465 case cpuinfo_uarch_cortex_a78:
1466 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128);
1467 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128);
1468 break;
1469 default:
1470 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64);
1471 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64);
1472 break;
1473 }
1474 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot);
1475 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16c4__neondot);
Marat Dukhan7988a182021-12-06 22:00:33 -08001476 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Marat Dukhan75d1b792021-07-01 13:00:28 -07001477 xnn_params.qc8.gemm.mr = 4;
1478 xnn_params.qc8.gemm.nr = 16;
1479 xnn_params.qc8.gemm.log2_kr = 2;
1480 } else {
// Without dot-product support each case sets the full kernel set together with its own tile parameters.
1481 switch (cpuinfo_get_core(0)->uarch) {
// NOTE(review): this case and the cortex_a53/cortex_a55r0 case below do not assign log2_kr,
// yet the little-core loop further down reads it. Presumably it stays at a zero default — confirm xnn_params is zero-initialized.
Frank Barchard6c34dbf2021-11-22 16:14:53 -08001482 case cpuinfo_uarch_cortex_a35:
1483 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64);
1484 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64);
1485 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane);
1486 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane);
Marat Dukhan7988a182021-12-06 22:00:33 -08001487 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Frank Barchard6c34dbf2021-11-22 16:14:53 -08001488 xnn_params.qc8.gemm.mr = 4;
1489 xnn_params.qc8.gemm.nr = 16;
1490 break;
1491
Marat Dukhan75d1b792021-07-01 13:00:28 -07001492 case cpuinfo_uarch_cortex_a53:
1493 case cpuinfo_uarch_cortex_a55r0:
1494 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53);
1495 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53);
1496 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane);
1497 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane);
Marat Dukhan7988a182021-12-06 22:00:33 -08001498 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Marat Dukhan75d1b792021-07-01 13:00:28 -07001499 xnn_params.qc8.gemm.mr = 4;
1500 xnn_params.qc8.gemm.nr = 16;
1501 break;
1502
1503 case cpuinfo_uarch_cortex_a72:
1504 case cpuinfo_uarch_cortex_a73:
1505 case cpuinfo_uarch_kryo:
1506 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm);
1507 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm);
1508 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm);
1509 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm);
Marat Dukhan7988a182021-12-06 22:00:33 -08001510 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Marat Dukhan75d1b792021-07-01 13:00:28 -07001511 xnn_params.qc8.gemm.mr = 2;
1512 xnn_params.qc8.gemm.nr = 8;
1513 xnn_params.qc8.gemm.log2_kr = 3;
1514 break;
1515
1516 default:
Frank Barcharde22685a2021-11-12 11:36:58 -08001517 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal);
1518 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal);
1519 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal);
1520 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal);
Marat Dukhan7988a182021-12-06 22:00:33 -08001521 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Marat Dukhan75d1b792021-07-01 13:00:28 -07001522 xnn_params.qc8.gemm.mr = 2;
1523 xnn_params.qc8.gemm.nr = 8;
1524 xnn_params.qc8.gemm.log2_kr = 3;
1525 break;
1526 }
1527 }
1528 #if XNN_MAX_UARCH_TYPES > 1
1529 {
1530 /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
// Snapshot the tile parameters chosen above; the overrides below are applied only when they match,
// so a per-uarch replacement never changes mr/nr/log2_kr.
1531 const uint32_t mr = xnn_params.qc8.gemm.mr;
1532 const uint32_t nr = xnn_params.qc8.gemm.nr;
1533 const uint32_t log2_kr = xnn_params.qc8.gemm.log2_kr;
// The loop starts at index 1 and writes function[i] directly.
// NOTE(review): presumably xnn_init_hmp_*_ukernel has already filled every slot with the kernel chosen above — confirm.
1534 for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
1535 const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
1536 if (uarch_info == NULL) {
1537 /* No more microarchitectures in the system */
1538 break;
1539 }
1540
1541 switch (uarch_info->uarch) {
1542 case cpuinfo_uarch_cortex_a53:
1543 case cpuinfo_uarch_cortex_a55r0:
// Swap in the cortex_a53-specific 2x8c8 kernels only when the kernel chosen above is also 2x8c8.
1544 if (mr == 2 && nr == 8 && log2_kr == 3) {
Frank Barcharde22685a2021-11-12 11:36:58 -08001545 xnn_params.qc8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53;
1546 xnn_params.qc8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53;
1547 xnn_params.qc8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53;
1548 xnn_params.qc8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53;
Marat Dukhan75d1b792021-07-01 13:00:28 -07001549 }
1550 break;
1551
1552 case cpuinfo_uarch_cortex_a55:
// The cortex_a55 dot-product kernels are used only when the chosen tiling is 4x16c4 and dot-product support is reported.
Frank Barchardc37b8da2021-09-01 00:35:19 -07001553 if (mr == 4 && nr == 16 && log2_kr == 2 && cpuinfo_has_arm_neon_dot()) {
Marat Dukhan75d1b792021-07-01 13:00:28 -07001554 xnn_params.qc8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55;
1555 xnn_params.qc8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55;
1556 xnn_params.qc8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot;
1557 xnn_params.qc8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16c4__neondot;
1558 }
1559 break;
1560 default:
1561 break;
1562 }
1563 }
1564 }
1565 #endif // XNN_MAX_UARCH_TYPES > 1
1566 #else // !XNN_ENABLE_ASSEMBLY
// Non-Apple, no assembly: same two-way choice as the Apple non-assembly branch above.
1567 if (cpuinfo_has_arm_neon_dot()) {
1568 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__neondot);
1569 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot);
1570 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__neondot);
1571 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16c4__neondot);
Marat Dukhan7988a182021-12-06 22:00:33 -08001572 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Marat Dukhan75d1b792021-07-01 13:00:28 -07001573 xnn_params.qc8.gemm.mr = 4;
1574 xnn_params.qc8.gemm.nr = 16;
1575 xnn_params.qc8.gemm.log2_kr = 2;
1576 } else {
Frank Barcharde22685a2021-11-12 11:36:58 -08001577 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
1578 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
1579 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
1580 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
Marat Dukhan7988a182021-12-06 22:00:33 -08001581 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Marat Dukhan75d1b792021-07-01 13:00:28 -07001582 xnn_params.qc8.gemm.mr = 2;
1583 xnn_params.qc8.gemm.nr = 8;
1584 xnn_params.qc8.gemm.log2_kr = 1;
Frank Barcharde7043ff2021-11-10 14:50:08 -08001585 xnn_params.qc8.gemm.log2_sr = 2;
Marat Dukhan75d1b792021-07-01 13:00:28 -07001586 }
1587 #endif // XNN_ENABLE_ASSEMBLY
1588 #endif // XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
Marat Dukhan898d5852021-06-30 21:18:34 -07001589
// Depthwise convolution is set the same way on every platform/assembly combination:
// entry 0 is the up16x9 kernel (channel_tile 16, primary_tile 9), entry 1 the up16x25 kernel (channel_tile 16, primary_tile 25).
Frank Barchard0d065732021-08-31 00:01:40 -07001590 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__neonv8_mla8_ld64;
Marat Dukhan7988a182021-12-06 22:00:33 -08001591 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Frank Barchard0d065732021-08-31 00:01:40 -07001592 xnn_params.qc8.dwconv[0].channel_tile = 16;
Marat Dukhan898d5852021-06-30 21:18:34 -07001593 xnn_params.qc8.dwconv[0].primary_tile = 9;
Frank Barchard7da8b022021-08-31 09:49:10 -07001594 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x25__neonv8_mla8_ld64;
Marat Dukhan7988a182021-12-06 22:00:33 -08001595 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Frank Barchard7da8b022021-08-31 09:49:10 -07001596 xnn_params.qc8.dwconv[1].channel_tile = 16;
Marat Dukhan898d5852021-06-30 21:18:34 -07001597 xnn_params.qc8.dwconv[1].primary_tile = 25;
1598 #endif // XNN_NO_QC8_OPERATORS
1599
Frank Barchardb40ee632021-12-30 11:10:02 -08001600 /**************************** QS8 AArch64 micro-kernels ****************************/
Marat Dukhanf28cddf2020-08-10 14:05:02 -07001601 #ifndef XNN_NO_QS8_OPERATORS
1602 init_flags |= XNN_INIT_FLAG_QS8;
1603
Marat Dukhandfe47b92020-12-14 02:48:43 -08001604 #if XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
Frank Barchardbc0c7292020-10-06 13:36:54 -07001605 #if XNN_ENABLE_ASSEMBLY
Marat Dukhan31677ad2020-10-13 23:59:31 -07001606 if (cpuinfo_has_arm_neon_dot()) {
Frank Barchard22f9a9f2021-07-21 11:35:27 -07001607 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128);
1608 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
1609 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128);
1610 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
1611 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
Marat Dukhan31677ad2020-10-13 23:59:31 -07001612 xnn_params.qs8.gemm.mr = 4;
1613 xnn_params.qs8.gemm.nr = 16;
1614 xnn_params.qs8.gemm.log2_kr = 2;
1615 } else {
Frank Barcharde22685a2021-11-12 11:36:58 -08001616 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal);
1617 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal);
1618 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal);
1619 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal);
Frank Barchard22f9a9f2021-07-21 11:35:27 -07001620 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
Marat Dukhan31677ad2020-10-13 23:59:31 -07001621 xnn_params.qs8.gemm.mr = 2;
Frank Barcharda414daa2021-02-23 15:50:39 -08001622 xnn_params.qs8.gemm.nr = 8;
Frank Barchardbbf51822021-03-12 10:37:31 -08001623 xnn_params.qs8.gemm.log2_kr = 3;
Frank Barchard1e8590e2020-10-12 21:20:46 -07001624 }
Marat Dukhan31677ad2020-10-13 23:59:31 -07001625 #else // !XNN_ENABLE_ASSEMBLY
1626 if (cpuinfo_has_arm_neon_dot()) {
Frank Barchard22f9a9f2021-07-21 11:35:27 -07001627 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neondot);
1628 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
1629 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neondot);
1630 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
1631 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
Marat Dukhan31677ad2020-10-13 23:59:31 -07001632 xnn_params.qs8.gemm.mr = 4;
1633 xnn_params.qs8.gemm.nr = 16;
1634 xnn_params.qs8.gemm.log2_kr = 2;
1635 } else {
Frank Barcharde22685a2021-11-12 11:36:58 -08001636 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
1637 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
1638 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
1639 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
Frank Barchard22f9a9f2021-07-21 11:35:27 -07001640 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
Marat Dukhan31677ad2020-10-13 23:59:31 -07001641 xnn_params.qs8.gemm.mr = 2;
Frank Barcharda414daa2021-02-23 15:50:39 -08001642 xnn_params.qs8.gemm.nr = 8;
1643 xnn_params.qs8.gemm.log2_kr = 1;
Frank Barchard66ae2572021-11-02 17:36:21 -07001644 xnn_params.qs8.gemm.log2_sr = 2;
Marat Dukhan31677ad2020-10-13 23:59:31 -07001645 }
1646 #endif // XNN_ENABLE_ASSEMBLY
Marat Dukhandfe47b92020-12-14 02:48:43 -08001647 #else // !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
Marat Dukhan31677ad2020-10-13 23:59:31 -07001648 #if XNN_ENABLE_ASSEMBLY
1649 if (cpuinfo_has_arm_neon_dot()) {
1650 switch (cpuinfo_get_core(0)->uarch) {
1651 case cpuinfo_uarch_cortex_a55:
Frank Barchard22f9a9f2021-07-21 11:35:27 -07001652 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55);
1653 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55);
Marat Dukhan31677ad2020-10-13 23:59:31 -07001654 break;
Frank Barchard0ae35f22021-06-15 17:34:24 -07001655 case cpuinfo_uarch_cortex_x1:
1656 case cpuinfo_uarch_cortex_a78:
Frank Barchard22f9a9f2021-07-21 11:35:27 -07001657 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128);
1658 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128);
Frank Barchard0ae35f22021-06-15 17:34:24 -07001659 break;
Marat Dukhan31677ad2020-10-13 23:59:31 -07001660 default:
Frank Barchard22f9a9f2021-07-21 11:35:27 -07001661 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld64);
1662 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld64);
Marat Dukhan31677ad2020-10-13 23:59:31 -07001663 break;
1664 }
Frank Barchard22f9a9f2021-07-21 11:35:27 -07001665 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
1666 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
1667 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
Marat Dukhan31677ad2020-10-13 23:59:31 -07001668 xnn_params.qs8.gemm.mr = 4;
1669 xnn_params.qs8.gemm.nr = 16;
1670 xnn_params.qs8.gemm.log2_kr = 2;
1671 } else {
Frank Barchard2a995e72021-04-13 16:24:25 -07001672 switch (cpuinfo_get_core(0)->uarch) {
Frank Barchard6c34dbf2021-11-22 16:14:53 -08001673 case cpuinfo_uarch_cortex_a35:
1674 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64);
1675 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64);
1676 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
1677 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
1678 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
1679 xnn_params.qs8.gemm.mr = 4;
1680 xnn_params.qs8.gemm.nr = 16;
1681 break;
1682
Frank Barchard2a995e72021-04-13 16:24:25 -07001683 case cpuinfo_uarch_cortex_a53:
Frank Barchardfb5983d2021-04-20 14:09:08 -07001684 case cpuinfo_uarch_cortex_a55r0:
Frank Barchard22f9a9f2021-07-21 11:35:27 -07001685 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53);
1686 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53);
1687 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
1688 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
1689 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
Frank Barchardd4416d62021-05-17 15:51:37 -07001690 xnn_params.qs8.gemm.mr = 4;
1691 xnn_params.qs8.gemm.nr = 16;
Frank Barchard6ac1d182021-04-14 13:47:07 -07001692 break;
1693
Frank Barchard2a995e72021-04-13 16:24:25 -07001694 case cpuinfo_uarch_cortex_a72:
1695 case cpuinfo_uarch_cortex_a73:
1696 case cpuinfo_uarch_kryo:
Frank Barcharde22685a2021-11-12 11:36:58 -08001697 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm);
1698 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm);
1699 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm);
1700 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm);
Frank Barchard22f9a9f2021-07-21 11:35:27 -07001701 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
Frank Barchard2a995e72021-04-13 16:24:25 -07001702 xnn_params.qs8.gemm.mr = 2;
1703 xnn_params.qs8.gemm.nr = 8;
1704 xnn_params.qs8.gemm.log2_kr = 3;
Frank Barchardc77fc4c2021-04-14 13:28:01 -07001705 break;
Frank Barchard2a995e72021-04-13 16:24:25 -07001706
1707 default:
Frank Barcharde22685a2021-11-12 11:36:58 -08001708 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal);
1709 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal);
1710 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal);
1711 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal);
Frank Barchard22f9a9f2021-07-21 11:35:27 -07001712 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
Frank Barchard2a995e72021-04-13 16:24:25 -07001713 xnn_params.qs8.gemm.mr = 2;
1714 xnn_params.qs8.gemm.nr = 8;
1715 xnn_params.qs8.gemm.log2_kr = 3;
Frank Barchardc77fc4c2021-04-14 13:28:01 -07001716 break;
Frank Barchard2a995e72021-04-13 16:24:25 -07001717 }
Marat Dukhan31677ad2020-10-13 23:59:31 -07001718 }
1719 #if XNN_MAX_UARCH_TYPES > 1
1720 {
1721 /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
1722 const uint32_t mr = xnn_params.qs8.gemm.mr;
1723 const uint32_t nr = xnn_params.qs8.gemm.nr;
1724 const uint32_t log2_kr = xnn_params.qs8.gemm.log2_kr;
1725 for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
1726 const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
1727 if (uarch_info == NULL) {
1728 /* No more microarchitectures in the system */
1729 break;
1730 }
1731
1732 switch (uarch_info->uarch) {
Frank Barchard2a995e72021-04-13 16:24:25 -07001733 case cpuinfo_uarch_cortex_a53:
Frank Barchard90f520b2021-04-26 18:01:51 -07001734 case cpuinfo_uarch_cortex_a55r0:
Frank Barchard2a995e72021-04-13 16:24:25 -07001735 if (mr == 2 && nr == 8 && log2_kr == 3) {
Frank Barcharde22685a2021-11-12 11:36:58 -08001736 xnn_params.qs8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53;
1737 xnn_params.qs8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53;
1738 xnn_params.qs8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53;
1739 xnn_params.qs8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53;
Frank Barchard2a995e72021-04-13 16:24:25 -07001740 }
1741 break;
1742
Marat Dukhan31677ad2020-10-13 23:59:31 -07001743 case cpuinfo_uarch_cortex_a55:
Frank Barchardc37b8da2021-09-01 00:35:19 -07001744 if (mr == 4 && nr == 16 && log2_kr == 2 && cpuinfo_has_arm_neon_dot()) {
Frank Barchard22f9a9f2021-07-21 11:35:27 -07001745 xnn_params.qs8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55;
1746 xnn_params.qs8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55;
1747 xnn_params.qs8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot;
1748 xnn_params.qs8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot;
Marat Dukhan31677ad2020-10-13 23:59:31 -07001749 }
1750 break;
1751 default:
1752 break;
1753 }
1754 }
1755 }
1756 #endif // XNN_MAX_UARCH_TYPES > 1
1757 #else // !XNN_ENABLE_ASSEMBLY
1758 if (cpuinfo_has_arm_neon_dot()) {
Frank Barchard22f9a9f2021-07-21 11:35:27 -07001759 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neondot);
1760 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
1761 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neondot);
1762 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
1763 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
Marat Dukhan31677ad2020-10-13 23:59:31 -07001764 xnn_params.qs8.gemm.mr = 4;
1765 xnn_params.qs8.gemm.nr = 16;
1766 xnn_params.qs8.gemm.log2_kr = 2;
1767 } else {
Frank Barcharde22685a2021-11-12 11:36:58 -08001768 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
1769 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
1770 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
1771 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
Frank Barchard22f9a9f2021-07-21 11:35:27 -07001772 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
Marat Dukhan31677ad2020-10-13 23:59:31 -07001773 xnn_params.qs8.gemm.mr = 2;
Frank Barcharda414daa2021-02-23 15:50:39 -08001774 xnn_params.qs8.gemm.nr = 8;
1775 xnn_params.qs8.gemm.log2_kr = 1;
Frank Barcharde7043ff2021-11-10 14:50:08 -08001776 xnn_params.qs8.gemm.log2_sr = 2;
Marat Dukhan31677ad2020-10-13 23:59:31 -07001777 }
1778 #endif // XNN_ENABLE_ASSEMBLY
Marat Dukhandfe47b92020-12-14 02:48:43 -08001779 #endif // XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
Marat Dukhanf28cddf2020-08-10 14:05:02 -07001780
Frank Barchard0d065732021-08-31 00:01:40 -07001781 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mla8_ld64;
Marat Dukhan4ba70b72021-07-19 11:20:16 -07001782 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
Frank Barchard0d065732021-08-31 00:01:40 -07001783 xnn_params.qs8.dwconv[0].channel_tile = 16;
Marat Dukhanf28cddf2020-08-10 14:05:02 -07001784 xnn_params.qs8.dwconv[0].primary_tile = 9;
Frank Barchard7da8b022021-08-31 09:49:10 -07001785 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x25__neon_mla8_ld64;
Marat Dukhan4ba70b72021-07-19 11:20:16 -07001786 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
Frank Barchard7da8b022021-08-31 09:49:10 -07001787 xnn_params.qs8.dwconv[1].channel_tile = 16;
Marat Dukhan4ed14882021-05-12 17:50:40 -07001788 xnn_params.qs8.dwconv[1].primary_tile = 25;
Marat Dukhanf28cddf2020-08-10 14:05:02 -07001789
1790 xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
Marat Dukhan85755042022-01-13 01:46:05 -08001791 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8,
1792 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8,
1793 .init.qs8 = xnn_init_qs8_avgpool_minmax_rndnu_neon_params,
1794 .update.qs8 = xnn_update_qs8_avgpool_minmax_rndnu_neon_params,
Marat Dukhanda382d12022-01-07 19:51:20 -08001795 .row_tile = 7,
1796 .channel_tile = 8,
Marat Dukhanf28cddf2020-08-10 14:05:02 -07001797 };
Marat Dukhanff209482020-09-03 14:26:53 -07001798
1799 xnn_params.qs8.vadd = (struct vbinary_parameters) {
Marat Dukhan01debd92021-07-29 18:14:21 -07001800 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__neon_ld64_x32,
1801 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x32,
1802 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x32,
Marat Dukhan64287252021-09-07 16:20:03 -07001803 .init.qs8_addsub = xnn_init_qs8_add_minmax_neon_params,
Marat Dukhan01debd92021-07-29 18:14:21 -07001804 .element_tile = 32,
Marat Dukhanff209482020-09-03 14:26:53 -07001805 };
Marat Dukhan0853b8a2021-08-03 01:01:53 -07001806 xnn_params.qs8.vmul = (struct vbinary_parameters) {
Marat Dukhan33a98fa2022-01-13 00:08:57 -08001807 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_rndnu_ukernel__neon_ld64_x16,
1808 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16,
1809 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16,
1810 .init.qs8_mul = xnn_init_qs8_mul_minmax_rndnu_neon_params,
Marat Dukhan0853b8a2021-08-03 01:01:53 -07001811 .element_tile = 16,
1812 };
Marat Dukhanf28cddf2020-08-10 14:05:02 -07001813 #endif // XNN_NO_QS8_OPERATORS
1814
Frank Barchardb40ee632021-12-30 11:10:02 -08001815 /**************************** QU8 AArch64 micro-kernels ****************************/
Marat Dukhan08b7a972020-07-14 18:17:29 -07001816 #ifndef XNN_NO_QU8_OPERATORS
1817 init_flags |= XNN_INIT_FLAG_QU8;
Frank Barchard20255152021-08-11 14:01:45 -07001818
Frank Barcharda962f1e2021-08-02 13:52:15 -07001819 #if XNN_ENABLE_ASSEMBLY
Frank Barchard20255152021-08-11 14:01:45 -07001820 if (cpuinfo_has_arm_neon_dot()) {
Frank Barchard8b698022021-08-26 11:17:32 -07001821 switch (cpuinfo_get_core(0)->uarch) {
1822 case cpuinfo_uarch_cortex_a55:
Frank Barcharda49e41f2021-08-31 20:30:24 -07001823 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55);
1824 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55);
1825 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
1826 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
Frank Barchard8b698022021-08-26 11:17:32 -07001827 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
1828 xnn_params.qu8.gemm.mr = 4;
Frank Barcharda49e41f2021-08-31 20:30:24 -07001829 xnn_params.qu8.gemm.nr = 16;
Frank Barchard8b698022021-08-26 11:17:32 -07001830 xnn_params.qu8.gemm.log2_kr = 2;
1831 break;
1832 default:
1833 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128);
1834 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128);
1835 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
1836 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
1837 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
1838 xnn_params.qu8.gemm.mr = 4;
1839 xnn_params.qu8.gemm.nr = 16;
1840 xnn_params.qu8.gemm.log2_kr = 2;
1841 break;
1842 }
Frank Barchard20255152021-08-11 14:01:45 -07001843 } else {
1844 switch (cpuinfo_get_core(0)->uarch) {
1845 case cpuinfo_uarch_cortex_a53:
1846 case cpuinfo_uarch_cortex_a55r0:
Frank Barchard20255152021-08-11 14:01:45 -07001847 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53);
1848 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53);
1849 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
1850 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
1851 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
1852 xnn_params.qu8.gemm.mr = 4;
1853 xnn_params.qu8.gemm.nr = 16;
1854 break;
Frank Barchardf479a1c2021-08-03 10:20:30 -07001855
Frank Barchard20255152021-08-11 14:01:45 -07001856 case cpuinfo_uarch_cortex_a57:
1857 case cpuinfo_uarch_cortex_a72:
1858 case cpuinfo_uarch_cortex_a73:
1859 case cpuinfo_uarch_cortex_a75:
1860 case cpuinfo_uarch_cortex_a76:
1861 case cpuinfo_uarch_exynos_m1:
1862 case cpuinfo_uarch_exynos_m2:
1863 case cpuinfo_uarch_exynos_m3:
1864 case cpuinfo_uarch_exynos_m4:
1865 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75);
1866 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75);
1867 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
1868 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
1869 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
1870 xnn_params.qu8.gemm.mr = 4;
1871 xnn_params.qu8.gemm.nr = 16;
1872 break;
Frank Barchardf479a1c2021-08-03 10:20:30 -07001873
Frank Barchard20255152021-08-11 14:01:45 -07001874 case cpuinfo_uarch_kryo:
1875 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane);
1876 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane);
1877 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
1878 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
1879 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
1880 xnn_params.qu8.gemm.mr = 4;
1881 xnn_params.qu8.gemm.nr = 16;
1882 break;
1883
1884 default:
1885 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75);
1886 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75);
1887 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
1888 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
1889 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
1890 xnn_params.qu8.gemm.mr = 4;
1891 xnn_params.qu8.gemm.nr = 16;
1892 break;
1893 }
Frank Barchardf479a1c2021-08-03 10:20:30 -07001894 }
Frank Barchardc37b8da2021-09-01 00:35:19 -07001895 #if XNN_MAX_UARCH_TYPES > 1
1896 {
1897 /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
1898 const uint32_t mr = xnn_params.qu8.gemm.mr;
1899 const uint32_t nr = xnn_params.qu8.gemm.nr;
1900 const uint32_t log2_kr = xnn_params.qu8.gemm.log2_kr;
1901 for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
1902 const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
1903 if (uarch_info == NULL) {
1904 /* No more microarchitectures in the system */
1905 break;
1906 }
1907
1908 switch (uarch_info->uarch) {
1909 case cpuinfo_uarch_cortex_a53:
1910 case cpuinfo_uarch_cortex_a55r0:
1911 if (mr == 4 && nr == 16 && log2_kr == 0) {
1912 xnn_params.qu8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53;
1913 xnn_params.qu8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53;
1914 }
1915 break;
1916
1917 case cpuinfo_uarch_cortex_a55:
1918 if (mr == 4 && nr == 16 && log2_kr == 2 && cpuinfo_has_arm_neon_dot()) {
1919 xnn_params.qu8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55;
1920 xnn_params.qu8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55;
1921 }
1922 break;
1923 default:
1924 break;
1925 }
1926 }
1927 }
1928 #endif // XNN_MAX_UARCH_TYPES > 1
Frank Barchard20255152021-08-11 14:01:45 -07001929 #else // !XNN_ENABLE_ASSEMBLY
1930 if (cpuinfo_has_arm_neon_dot()) {
1931 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__neondot);
1932 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16c4__neondot);
1933 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
1934 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
1935 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
1936 xnn_params.qu8.gemm.mr = 4;
1937 xnn_params.qu8.gemm.nr = 16;
1938 xnn_params.qu8.gemm.log2_kr = 2;
1939 } else {
1940 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane);
1941 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane);
1942 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
1943 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
1944 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
1945 xnn_params.qu8.gemm.mr = 4;
1946 xnn_params.qu8.gemm.nr = 16;
Marat Dukhan947805b2021-12-07 14:32:09 -08001947 }
Frank Barchard20255152021-08-11 14:01:45 -07001948 #endif // XNN_ENABLE_ASSEMBLY
Marat Dukhanaefaef32020-04-09 07:09:34 -07001949
Frank Barchard354cbc62021-09-27 21:42:41 -07001950 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mul8;
Marat Dukhan73a899a2021-07-27 00:10:38 -07001951 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
Frank Barchard354cbc62021-09-27 21:42:41 -07001952 xnn_params.qu8.dwconv[0].channel_tile = 16;
Marat Dukhan08b7a972020-07-14 18:17:29 -07001953 xnn_params.qu8.dwconv[0].primary_tile = 9;
Frank Barchard354cbc62021-09-27 21:42:41 -07001954 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_rndnu_ukernel_up8x25__neon_mul8;
Marat Dukhan73a899a2021-07-27 00:10:38 -07001955 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
Frank Barchard354cbc62021-09-27 21:42:41 -07001956 xnn_params.qu8.dwconv[1].channel_tile = 8;
Marat Dukhan81721352021-07-15 18:26:08 -07001957 xnn_params.qu8.dwconv[1].primary_tile = 25;
Marat Dukhanaefaef32020-04-09 07:09:34 -07001958
Marat Dukhan08b7a972020-07-14 18:17:29 -07001959 xnn_params.qu8.avgpool = (struct avgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08001960 .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__neon_c8,
1961 .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__neon_c8,
Marat Dukhan3c949a32022-01-09 20:12:33 -08001962 .init.qu8 = xnn_init_qu8_avgpool_minmax_neon_params,
Marat Dukhanda382d12022-01-07 19:51:20 -08001963 .primary_tile = 9,
1964 .incremental_tile = 8,
1965 .channel_tile = 8,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07001966 };
Marat Dukhan08b7a972020-07-14 18:17:29 -07001967 xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
Marat Dukhan85755042022-01-13 01:46:05 -08001968 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8,
1969 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8,
1970 .init.qu8 = xnn_init_qu8_avgpool_minmax_rndnu_neon_params,
1971 .update.qu8 = xnn_update_qu8_avgpool_minmax_rndnu_neon_params,
Marat Dukhanda382d12022-01-07 19:51:20 -08001972 .row_tile = 7,
1973 .channel_tile = 8,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07001974 };
Marat Dukhandb007cd2021-07-20 23:42:39 -07001975 xnn_params.qu8.vadd = (struct vbinary_parameters) {
Frank Barchard0a3093c2021-08-31 09:58:11 -07001976 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__neon_ld64_x32,
1977 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__neon_ld64_x32,
1978 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__neon_ld64_x32,
Marat Dukhan64287252021-09-07 16:20:03 -07001979 .init.qu8_addsub = xnn_init_qu8_add_minmax_neon_params,
Marat Dukhandb007cd2021-07-20 23:42:39 -07001980 .element_tile = 8,
1981 };
Marat Dukhan0853b8a2021-08-03 01:01:53 -07001982 xnn_params.qu8.vmul = (struct vbinary_parameters) {
Marat Dukhan33a98fa2022-01-13 00:08:57 -08001983 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_rndnu_ukernel__neon_ld64_x16,
1984 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16,
1985 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16,
1986 .init.qu8_mul = xnn_init_qu8_mul_minmax_rndnu_neon_params,
Marat Dukhan0853b8a2021-08-03 01:01:53 -07001987 .element_tile = 16,
1988 };
Marat Dukhan08b7a972020-07-14 18:17:29 -07001989 #endif // XNN_NO_QU8_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07001990
Frank Barchardb40ee632021-12-30 11:10:02 -08001991 /**************************** S8 AArch64 micro-kernels ****************************/
Marat Dukhan23147532021-08-16 07:26:56 -07001992 #ifndef XNN_NO_S8_OPERATORS
1993 init_flags |= XNN_INIT_FLAG_S8;
1994
Marat Dukhan61c0c9e2021-08-16 23:16:14 -07001995 xnn_params.s8.clamp = (struct vunary_parameters) {
1996 .ukernel = (xnn_univector_ukernel_function) xnn_s8_vclamp_ukernel__neon_x64,
1997 .init.s8_minmax = xnn_init_s8_minmax_neon_params,
1998 .element_tile = 64,
1999 };
Marat Dukhan24abe6b2021-11-24 15:28:57 -08002000 xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
2001 .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__neon_c16,
2002 .pixel_tile = 1,
2003 .channel_tile = 16,
2004 };
Marat Dukhan23147532021-08-16 07:26:56 -07002005 xnn_params.s8.maxpool = (struct maxpool_parameters) {
2006 .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__neon_c16,
Marat Dukhandc5c1482021-08-16 09:03:15 -07002007 .init.s8 = xnn_init_s8_minmax_neon_params,
Marat Dukhan23147532021-08-16 07:26:56 -07002008 .mr = 9,
2009 .qr = 8,
2010 };
2011 #endif // XNN_NO_S8_OPERATORS
2012
Frank Barchardb40ee632021-12-30 11:10:02 -08002013 /**************************** U8 AArch64 micro-kernels ****************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002014 #ifndef XNN_NO_U8_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07002015 init_flags |= XNN_INIT_FLAG_U8;
2016
Marat Dukhan94912792021-08-16 21:40:30 -07002017 xnn_params.u8.clamp = (struct vunary_parameters) {
2018 .ukernel = (xnn_univector_ukernel_function) xnn_u8_vclamp_ukernel__neon_x64,
2019 .init.u8_minmax = xnn_init_u8_minmax_neon_params,
2020 .element_tile = 64,
2021 };
Marat Dukhan24abe6b2021-11-24 15:28:57 -08002022 xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
2023 .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__neon_c16,
2024 .pixel_tile = 1,
2025 .channel_tile = 16,
2026 };
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002027 xnn_params.u8.maxpool = (struct maxpool_parameters) {
Marat Dukhan99936602020-04-11 16:47:01 -07002028 .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__neon_c16,
Marat Dukhan2ea50a02021-08-16 12:59:19 -07002029 .init.u8 = xnn_init_u8_minmax_neon_params,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002030 .mr = 9,
2031 .qr = 8,
2032 };
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002033 xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
2034 xnn_params.u8.rmax = xnn_u8_rmax_ukernel__neon;
2035 #endif // XNN_NO_U8_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07002036
Frank Barchardb40ee632021-12-30 11:10:02 -08002037 /**************************** X8 AArch64 micro-kernels ****************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002038 #ifndef XNN_NO_X8_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07002039 init_flags |= XNN_INIT_FLAG_X8;
2040
Marat Dukhan98e054b2021-09-13 09:43:50 -07002041 xnn_params.x8.lut = xnn_x8_lut_ukernel__neon_tbx128x4_x64;
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002042 xnn_params.x8.zip = (struct zip_parameters) {
2043 .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__neon,
2044 .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__neon,
2045 .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__neon,
2046 .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__neon,
2047 };
2048 #endif // XNN_NO_X8_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07002049
Frank Barchardb40ee632021-12-30 11:10:02 -08002050 /**************************** F16 AArch64 micro-kernels ****************************/
Frank Barchard7e2cbb02020-06-12 01:22:13 -07002051 #ifndef XNN_NO_F16_OPERATORS
Marat Dukhan8d5d2592020-06-19 12:48:57 -07002052 if (cpuinfo_has_arm_neon_fp16_arith()) {
2053 init_flags |= XNN_INIT_FLAG_F16;
Frank Barchard7c3826e2021-06-07 15:14:16 -07002054 xnn_params.f16.gemm.mr = 6;
2055 xnn_params.f16.gemm.nr = 16;
Marat Dukhan854fb6b2020-06-19 12:33:44 -07002056
Frank Barchard6b73c4f2020-06-26 18:40:40 -07002057 #if XNN_ENABLE_ASSEMBLY
Frank Barchard7c3826e2021-06-07 15:14:16 -07002058 switch (cpuinfo_get_core(0)->uarch) {
2059 case cpuinfo_uarch_cortex_a55:
2060 xnn_params.f16.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55);
2061 break;
2062
Frank Barchard07f4a892021-06-07 18:26:08 -07002063 case cpuinfo_uarch_cortex_a75:
Frank Barchard7b48ddc2021-06-11 13:00:49 -07002064 case cpuinfo_uarch_cortex_x1:
Frank Barchard07f4a892021-06-07 18:26:08 -07002065 xnn_params.f16.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75);
2066 break;
2067
Frank Barchard7c3826e2021-06-07 15:14:16 -07002068 default:
2069 xnn_params.f16.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32);
2070 break;
2071 }
Frank Barchard6b73c4f2020-06-26 18:40:40 -07002072 xnn_params.f16.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32);
Frank Barchard7c3826e2021-06-07 15:14:16 -07002073
2074 #if XNN_MAX_UARCH_TYPES > 1
2075 {
2076 /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
2077 const uint32_t mr = xnn_params.f16.gemm.mr;
2078 const uint32_t nr = xnn_params.f16.gemm.nr;
2079 for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
2080 const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
2081 if (uarch_info == NULL) {
2082 /* No more microarchitectures in the system */
2083 break;
2084 }
2085
2086 switch (uarch_info->uarch) {
2087 case cpuinfo_uarch_cortex_a55:
2088 if (mr == 6 && nr == 16) {
2089 xnn_params.f16.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55;
2090 }
2091 break;
Frank Barchard07f4a892021-06-07 18:26:08 -07002092
Frank Barchardd2f454e2021-06-08 10:47:16 -07002093 case cpuinfo_uarch_cortex_a55r0:
2094 if (mr == 6 && nr == 16) {
2095 xnn_params.f16.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64;
2096 }
2097 break;
2098
Frank Barchard07f4a892021-06-07 18:26:08 -07002099 /* Cortex A75 is the medium core Exynos 9820 (M4) */
2100 case cpuinfo_uarch_cortex_a75:
2101 if (mr == 6 && nr == 16) {
2102 xnn_params.f16.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75;
2103 }
2104 break;
2105
Frank Barchard7c3826e2021-06-07 15:14:16 -07002106 default:
2107 break;
2108 }
2109 }
2110 }
2111 #endif // XNN_MAX_UARCH_TYPES > 1
2112 #else // XNN_ENABLE_ASSEMBLY
Frank Barchard6b73c4f2020-06-26 18:40:40 -07002113 xnn_params.f16.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64);
2114 xnn_params.f16.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64);
Frank Barchard7c3826e2021-06-07 15:14:16 -07002115 #endif // XNN_ENABLE_ASSEMBLY
Frank Barchard49b4dcc2020-06-26 14:07:19 -07002116 xnn_params.f16.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64);
Frank Barchard49b4dcc2020-06-26 14:07:19 -07002117 xnn_params.f16.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64);
Marat Dukhanc4302c22022-01-06 19:27:03 -08002118 xnn_params.f16.gemm.init.f16 = xnn_init_f16_scaleminmax_neon_params;
Frank Barchard49b4dcc2020-06-26 14:07:19 -07002119
2120 xnn_params.f16.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith;
Marat Dukhan645af972022-01-09 22:50:27 -08002121 xnn_params.f16.dwconv[0].init.f16 = xnn_init_f16_minmax_neon_params;
Frank Barchard49b4dcc2020-06-26 14:07:19 -07002122 xnn_params.f16.dwconv[0].channel_tile = 16;
2123 xnn_params.f16.dwconv[0].primary_tile = 4;
2124
2125 xnn_params.f16.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith;
Marat Dukhan645af972022-01-09 22:50:27 -08002126 xnn_params.f16.dwconv[1].init.f16 = xnn_init_f16_minmax_neon_params;
Frank Barchard49b4dcc2020-06-26 14:07:19 -07002127 xnn_params.f16.dwconv[1].channel_tile = 16;
2128 xnn_params.f16.dwconv[1].primary_tile = 9;
2129
2130 xnn_params.f16.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2;
Marat Dukhan645af972022-01-09 22:50:27 -08002131 xnn_params.f16.dwconv[2].init.f16 = xnn_init_f16_minmax_neon_params;
Frank Barchard49b4dcc2020-06-26 14:07:19 -07002132 xnn_params.f16.dwconv[2].channel_tile = 8;
2133 xnn_params.f16.dwconv[2].primary_tile = 25;
2134
Marat Dukhan8d5d2592020-06-19 12:48:57 -07002135 xnn_params.f16.gavgpool = (struct gavgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08002136 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c8,
2137 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c8,
2138 .init.f16 = xnn_init_f16_scaleminmax_neon_params,
2139 .update.f16 = xnn_update_f16_scaleminmax_neon_params,
2140 .row_tile = 7,
2141 .channel_tile = 8,
Marat Dukhan8d5d2592020-06-19 12:48:57 -07002142 };
Frank Barchard01898c02020-06-23 21:49:50 -07002143 xnn_params.f16.vadd = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07002144 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vadd_minmax_ukernel__neonfp16arith_x16,
2145 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vaddc_minmax_ukernel__neonfp16arith_x16,
2146 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vaddc_minmax_ukernel__neonfp16arith_x16,
Marat Dukhan645af972022-01-09 22:50:27 -08002147 .init.f16_minmax = xnn_init_f16_minmax_neon_params,
Frank Barchard01898c02020-06-23 21:49:50 -07002148 .element_tile = 16,
2149 };
Frank Barchard0ea6a772020-09-09 15:26:31 -07002150 xnn_params.f16.vmul = (struct vbinary_parameters) {
2151 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmul_minmax_ukernel__neonfp16arith_x16,
2152 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmulc_minmax_ukernel__neonfp16arith_x16,
2153 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmulc_minmax_ukernel__neonfp16arith_x16,
Marat Dukhan645af972022-01-09 22:50:27 -08002154 .init.f16_minmax = xnn_init_f16_minmax_neon_params,
Frank Barchard0ea6a772020-09-09 15:26:31 -07002155 .element_tile = 16,
2156 };
Frank Barchard49b4dcc2020-06-26 14:07:19 -07002157 xnn_params.f16.vmulcaddc = (struct vmulcaddc_parameters) {
Marat Dukhan9531e9f2020-07-24 15:25:02 -07002158 .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f16_vmulcaddc_minmax_ukernel_c8__neonfp16arith_2x,
Marat Dukhan645af972022-01-09 22:50:27 -08002159 .init.f16 = xnn_init_f16_minmax_neon_params,
Frank Barchard49b4dcc2020-06-26 14:07:19 -07002160 .channel_tile = 8,
2161 .row_tile = 2,
2162 };
Marat Dukhan561d0682021-12-23 16:12:35 -08002163 xnn_params.f16.hswish = (struct vunary_parameters) {
2164 .ukernel = (xnn_univector_ukernel_function) xnn_f16_vhswish_ukernel__neonfp16arith_x16,
Marat Dukhan751f6222022-01-09 23:10:04 -08002165 .init.f16_hswish = xnn_init_f16_hswish_neon_params,
Marat Dukhan561d0682021-12-23 16:12:35 -08002166 .element_tile = 16,
2167 };
Marat Dukhan8d5d2592020-06-19 12:48:57 -07002168 }
Frank Barchard7e2cbb02020-06-12 01:22:13 -07002169 #endif // XNN_NO_F16_OPERATORS
2170
Frank Barchardb40ee632021-12-30 11:10:02 -08002171 /**************************** F32 AArch64 micro-kernels ****************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002172 #ifndef XNN_NO_F32_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07002173 init_flags |= XNN_INIT_FLAG_F32;
2174
Marat Dukhandfe47b92020-12-14 02:48:43 -08002175 #if XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
Frank Barchard0d1052c2020-03-23 17:28:13 -07002176 #if XNN_ENABLE_ASSEMBLY
Frank Barchard143a1102021-06-15 09:15:34 -07002177 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75);
2178 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75);
2179 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
2180 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07002181 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07002182 xnn_params.f32.gemm.mr = 6;
2183 xnn_params.f32.gemm.nr = 8;
Frank Barchard0d1052c2020-03-23 17:28:13 -07002184 #else // !XNN_ENABLE_ASSEMBLY
Marat Dukhanaefaef32020-04-09 07:09:34 -07002185 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld64);
2186 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__neonfma_lane_ld64);
2187 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neonfma_lane_ld64);
2188 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07002189 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07002190 xnn_params.f32.gemm.mr = 6;
2191 xnn_params.f32.gemm.nr = 8;
Frank Barchard0d1052c2020-03-23 17:28:13 -07002192 #endif // XNN_ENABLE_ASSEMBLY
Marat Dukhandfe47b92020-12-14 02:48:43 -08002193 #else // !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
Frank Barchard0d1052c2020-03-23 17:28:13 -07002194 #if XNN_ENABLE_ASSEMBLY
2195 switch (cpuinfo_get_core(0)->uarch) {
2196 case cpuinfo_uarch_cortex_a57:
Frank Barchard143a1102021-06-15 09:15:34 -07002197 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75);
2198 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75);
2199 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75);
2200 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07002201 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07002202 xnn_params.f32.gemm.mr = 6;
2203 xnn_params.f32.gemm.nr = 8;
Frank Barchard0d1052c2020-03-23 17:28:13 -07002204 break;
2205 case cpuinfo_uarch_cortex_a72:
Frank Barchard143a1102021-06-15 09:15:34 -07002206 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75);
2207 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75);
2208 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
2209 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07002210 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07002211 xnn_params.f32.gemm.mr = 4;
2212 xnn_params.f32.gemm.nr = 8;
Frank Barchard0d1052c2020-03-23 17:28:13 -07002213 break;
2214 case cpuinfo_uarch_cortex_a75:
2215 case cpuinfo_uarch_cortex_a76:
2216 case cpuinfo_uarch_exynos_m3:
2217 case cpuinfo_uarch_exynos_m4:
Frank Barchard143a1102021-06-15 09:15:34 -07002218 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75);
2219 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75);
2220 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
2221 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07002222 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07002223 xnn_params.f32.gemm.mr = 6;
2224 xnn_params.f32.gemm.nr = 8;
Frank Barchard0d1052c2020-03-23 17:28:13 -07002225 break;
2226 case cpuinfo_uarch_exynos_m1:
2227 case cpuinfo_uarch_exynos_m2:
Marat Dukhanaefaef32020-04-09 07:09:34 -07002228 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma);
2229 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8s4__neonfma);
2230 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma);
2231 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8s4__neonfma);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07002232 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07002233 xnn_params.f32.gemm.mr = 6;
2234 xnn_params.f32.gemm.nr = 8;
2235 xnn_params.f32.gemm.log2_sr = 2;
Frank Barchard0d1052c2020-03-23 17:28:13 -07002236 break;
Frank Barchard0d1052c2020-03-23 17:28:13 -07002237 case cpuinfo_uarch_cortex_a53:
2238 case cpuinfo_uarch_cortex_a55r0:
Marat Dukhanaefaef32020-04-09 07:09:34 -07002239 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53);
2240 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53);
2241 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53);
2242 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07002243 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07002244 xnn_params.f32.gemm.mr = 6;
2245 xnn_params.f32.gemm.nr = 8;
Frank Barchard0d1052c2020-03-23 17:28:13 -07002246 break;
Frank Barchardf975ee02021-11-05 16:01:00 -07002247 case cpuinfo_uarch_cortex_a35:
Frank Barchard0d1052c2020-03-23 17:28:13 -07002248 case cpuinfo_uarch_cortex_a55:
Marat Dukhanaefaef32020-04-09 07:09:34 -07002249 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55);
2250 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55);
2251 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53);
2252 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07002253 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07002254 xnn_params.f32.gemm.mr = 6;
2255 xnn_params.f32.gemm.nr = 8;
Frank Barchard0d1052c2020-03-23 17:28:13 -07002256 break;
2257 case cpuinfo_uarch_cortex_a73:
Marat Dukhanaefaef32020-04-09 07:09:34 -07002258 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a73);
2259 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a73);
Frank Barchard143a1102021-06-15 09:15:34 -07002260 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
2261 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07002262 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07002263 xnn_params.f32.gemm.mr = 6;
2264 xnn_params.f32.gemm.nr = 8;
Frank Barchard0d1052c2020-03-23 17:28:13 -07002265 break;
Frank Barchard0d1052c2020-03-23 17:28:13 -07002266 case cpuinfo_uarch_cortex_a77:
2267 case cpuinfo_uarch_exynos_m5:
2268 case cpuinfo_uarch_kryo:
Frank Barchard143a1102021-06-15 09:15:34 -07002269 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75);
2270 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75);
2271 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75);
2272 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07002273 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07002274 xnn_params.f32.gemm.mr = 4;
2275 xnn_params.f32.gemm.nr = 8;
Frank Barchard0d1052c2020-03-23 17:28:13 -07002276 break;
Frank Barchard990b2af2021-06-14 11:49:15 -07002277 case cpuinfo_uarch_cortex_a78:
2278 case cpuinfo_uarch_cortex_x1:
2279 default:
2280 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld128);
Frank Barchard79cd5f92021-06-21 17:34:59 -07002281 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_ld128);
Frank Barchard990b2af2021-06-14 11:49:15 -07002282 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neonfma_lane_ld64);
2283 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64);
2284 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
2285 xnn_params.f32.gemm.mr = 6;
2286 xnn_params.f32.gemm.nr = 8;
2287 break;
Frank Barchard0d1052c2020-03-23 17:28:13 -07002288 }
Marat Dukhan05702cf2020-03-26 15:41:33 -07002289 #if XNN_MAX_UARCH_TYPES > 1
2290 {
2291 /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
2292 const uint32_t mr = xnn_params.f32.gemm.mr;
2293 const uint32_t nr = xnn_params.f32.gemm.nr;
2294 const uint32_t log2_sr = xnn_params.f32.gemm.log2_sr;
2295 for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
2296 const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
2297 if (uarch_info == NULL) {
2298 /* No more microarchitectures in the system */
2299 break;
2300 }
2301
2302 switch (uarch_info->uarch) {
2303 case cpuinfo_uarch_cortex_a53:
2304 case cpuinfo_uarch_cortex_a55r0:
2305 if (mr == 6 && nr == 8 && log2_sr == 0) {
Marat Dukhanaefaef32020-04-09 07:09:34 -07002306 xnn_params.f32.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53;
2307 xnn_params.f32.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53;
2308 xnn_params.f32.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
2309 xnn_params.f32.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
Marat Dukhan05702cf2020-03-26 15:41:33 -07002310 } else if (mr == 4 && nr == 8 && log2_sr == 0) {
Marat Dukhanaefaef32020-04-09 07:09:34 -07002311 xnn_params.f32.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53;
2312 xnn_params.f32.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53;
2313 xnn_params.f32.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
2314 xnn_params.f32.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
Marat Dukhan05702cf2020-03-26 15:41:33 -07002315 }
2316 break;
2317 case cpuinfo_uarch_cortex_a55:
2318 if (mr == 6 && nr == 8 && log2_sr == 0) {
Marat Dukhanaefaef32020-04-09 07:09:34 -07002319 xnn_params.f32.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55;
2320 xnn_params.f32.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55;
2321 xnn_params.f32.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
2322 xnn_params.f32.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
Marat Dukhan05702cf2020-03-26 15:41:33 -07002323 } else if (mr == 4 && nr == 8 && log2_sr == 0) {
Marat Dukhanaefaef32020-04-09 07:09:34 -07002324 xnn_params.f32.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55;
2325 xnn_params.f32.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55;
2326 xnn_params.f32.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
2327 xnn_params.f32.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
Marat Dukhan05702cf2020-03-26 15:41:33 -07002328 }
2329 break;
2330 default:
2331 break;
2332 }
2333 }
2334 }
2335 #endif // XNN_MAX_UARCH_TYPES > 1
Frank Barchard0d1052c2020-03-23 17:28:13 -07002336 #else // !XNN_ENABLE_ASSEMBLY
Marat Dukhanaefaef32020-04-09 07:09:34 -07002337 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld64);
2338 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__neonfma_lane_ld64);
2339 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neonfma_lane_ld64);
2340 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07002341 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07002342 xnn_params.f32.gemm.mr = 6;
2343 xnn_params.f32.gemm.nr = 8;
Marat Dukhan31677ad2020-10-13 23:59:31 -07002344 #endif // XNN_ENABLE_ASSEMBLY
Marat Dukhandfe47b92020-12-14 02:48:43 -08002345 #endif // XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
Marat Dukhanaefaef32020-04-09 07:09:34 -07002346 xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2__neonfma_lane_ld64);
2347 xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2__neonfma_lane_ld64);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07002348 xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07002349 xnn_params.f32.gemm2.mr = 4;
2350 xnn_params.f32.gemm2.nr = 2;
2351
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07002352 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x3__neonfma;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07002353 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanf5425ea2020-04-24 01:46:00 -07002354 xnn_params.f32.dwconv[0].channel_tile = 8;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07002355 xnn_params.f32.dwconv[0].primary_tile = 3;
2356
2357 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x4__neonfma;
2358 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_scalar_params;
2359 xnn_params.f32.dwconv[1].channel_tile = 8;
2360 xnn_params.f32.dwconv[1].primary_tile = 4;
Marat Dukhanaefaef32020-04-09 07:09:34 -07002361
Marat Dukhandfe47b92020-12-14 02:48:43 -08002362 #if XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07002363 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x9__neonfma;
2364 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
2365 xnn_params.f32.dwconv[2].channel_tile = 8;
2366 xnn_params.f32.dwconv[2].primary_tile = 9;
Marat Dukhandfe47b92020-12-14 02:48:43 -08002367 #else // !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
Frank Barchard0d1052c2020-03-23 17:28:13 -07002368 switch (cpuinfo_get_core(0)->uarch) {
2369 case cpuinfo_uarch_kryo:
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07002370 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x9__neonfma;
2371 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
2372 xnn_params.f32.dwconv[2].channel_tile = 8;
2373 xnn_params.f32.dwconv[2].primary_tile = 9;
Frank Barchard0d1052c2020-03-23 17:28:13 -07002374 break;
2375 #if XNN_ENABLE_ASSEMBLY
2376 case cpuinfo_uarch_cortex_a53:
2377 case cpuinfo_uarch_cortex_a55r0:
2378 case cpuinfo_uarch_cortex_a55:
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07002379 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up4x9__aarch64_neonfma_cortex_a55;
2380 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
2381 xnn_params.f32.dwconv[2].channel_tile = 4;
2382 xnn_params.f32.dwconv[2].primary_tile = 9;
Frank Barchard0d1052c2020-03-23 17:28:13 -07002383 break;
2384 #endif // XNN_ENABLE_ASSEMBLY
2385 default:
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07002386 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x9__neonfma;
2387 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
2388 xnn_params.f32.dwconv[2].channel_tile = 8;
2389 xnn_params.f32.dwconv[2].primary_tile = 9;
Frank Barchard0d1052c2020-03-23 17:28:13 -07002390 break;
2391 }
Marat Dukhandfe47b92020-12-14 02:48:43 -08002392 #endif // XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
Marat Dukhanaefaef32020-04-09 07:09:34 -07002393
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07002394 xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x25__neonfma_acc2;
2395 xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_scalar_params;
2396 xnn_params.f32.dwconv[3].channel_tile = 8;
2397 xnn_params.f32.dwconv[3].primary_tile = 25;
Marat Dukhanaefaef32020-04-09 07:09:34 -07002398
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002399 xnn_params.f32.avgpool = (struct avgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08002400 .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__neon_c4,
2401 .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__neon_c4,
2402 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
2403 .primary_tile = 9,
2404 .incremental_tile = 8,
2405 .channel_tile = 4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002406 };
2407 xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08002408 .unipass = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__neon_c4,
2409 .multipass = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__neon_c4,
2410 .primary_tile = 9,
2411 .incremental_tile = 8,
2412 .channel_tile = 4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002413 };
2414 xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08002415 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__neon_c4,
2416 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4,
2417 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
2418 .update.f32 = xnn_update_f32_scaleminmax_scalar_params,
2419 .row_tile = 7,
2420 .channel_tile = 4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002421 };
2422 xnn_params.f32.maxpool = (struct maxpool_parameters) {
Marat Dukhan99936602020-04-11 16:47:01 -07002423 .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__neon_c4,
Marat Dukhan91ae1652021-08-15 19:19:49 -07002424 .init.f32 = xnn_init_f32_minmax_scalar_params,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002425 .mr = 9,
2426 .qr = 8,
2427 };
2428 xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
Marat Dukhanef25c6d2020-07-24 00:59:40 -07002429 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__neon_c4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002430 .mr = 4,
2431 };
2432 xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
Marat Dukhanef25c6d2020-07-24 00:59:40 -07002433 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__neon_c4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002434 .mr = 9,
2435 };
2436 xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
Marat Dukhanef25c6d2020-07-24 00:59:40 -07002437 .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__neon_c4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002438 .mr = 9,
2439 .qr = 8,
2440 };
Marat Dukhan660fd192020-03-10 04:55:30 -07002441 xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
2442 .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__neonfma_c8,
Marat Dukhan69722492019-11-11 19:55:50 -08002443 .pixel_tile = 1,
2444 .channel_tile = 8,
2445 };
Marat Dukhane5efb162021-12-31 10:26:13 -08002446 xnn_params.f32.abs = (struct vunary_parameters) {
2447 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__neon_x8,
2448 .element_tile = 8,
2449 };
Marat Dukhan94912792021-08-16 21:40:30 -07002450 xnn_params.f32.clamp = (struct vunary_parameters) {
2451 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vclamp_ukernel__neon_x8,
2452 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
2453 .element_tile = 8,
2454 };
Marat Dukhan4a79ff22022-01-01 12:16:48 -08002455 xnn_params.f32.elu = (struct vunary_parameters) {
2456 .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__neonfma_rr1_lut16_p3_x16,
2457 .init.f32_elu = xnn_init_f32_elu_neonfma_rr1_lut16_p3_params,
2458 .element_tile = 16,
2459 };
Marat Dukhan561d0682021-12-23 16:12:35 -08002460 xnn_params.f32.hswish = (struct vunary_parameters) {
2461 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__neon_x16,
Marat Dukhan0d10cc72021-12-23 19:49:19 -08002462 .init.f32_hswish = xnn_init_f32_hswish_scalar_params,
Marat Dukhan561d0682021-12-23 16:12:35 -08002463 .element_tile = 16,
2464 };
Marat Dukhan2894e992021-12-30 08:29:48 -08002465 xnn_params.f32.lrelu = (struct vunary_parameters) {
2466 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__neon_x8,
2467 .init.f32_lrelu = xnn_init_f32_lrelu_scalar_params,
2468 .element_tile = 8,
2469 };
Marat Dukhane5efb162021-12-31 10:26:13 -08002470 xnn_params.f32.neg = (struct vunary_parameters) {
2471 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__neon_x8,
2472 .element_tile = 8,
2473 };
Marat Dukhan0e801372022-01-04 00:10:41 -08002474 xnn_params.f32.rndne = (struct vunary_parameters) {
2475 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__neonv8_x8,
2476 .element_tile = 8,
2477 };
2478 xnn_params.f32.rndz = (struct vunary_parameters) {
2479 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__neonv8_x8,
2480 .element_tile = 8,
2481 };
2482 xnn_params.f32.rndu = (struct vunary_parameters) {
2483 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__neonv8_x8,
2484 .element_tile = 8,
2485 };
2486 xnn_params.f32.rndd = (struct vunary_parameters) {
2487 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__neonv8_x8,
2488 .element_tile = 8,
2489 };
Marat Dukhance834ad2022-01-03 00:22:01 -08002490 xnn_params.f32.sigmoid = (struct vunary_parameters) {
2491 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut64_p2_nr2recps_x16,
2492 .init.f32_sigmoid = xnn_init_f32_sigmoid_neonfma_rr1_lut64_p2_params,
2493 .element_tile = 16,
2494 };
Marat Dukhane5efb162021-12-31 10:26:13 -08002495 xnn_params.f32.sqr = (struct vunary_parameters) {
2496 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__neon_x8,
2497 .element_tile = 8,
2498 };
Marat Dukhane72b2822021-12-30 14:46:58 -08002499 xnn_params.f32.sqrt = (struct vunary_parameters) {
2500 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__neon_sqrt_x4,
2501 .element_tile = 4,
2502 };
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002503 xnn_params.f32.prelu = (struct prelu_parameters) {
Marat Dukhan69c3f2c2019-11-06 12:30:01 -08002504 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__neon_2x8,
2505 .row_tile = 2,
2506 .channel_tile = 8,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002507 };
Marat Dukhan4a5c7712022-01-05 22:43:13 -08002508 xnn_params.f32.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
2509 .ukernel = xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_x16,
2510 .init = xnn_init_f32_expminus_neonfma_rr1_lut64_p2_params,
2511 .element_tile = 16,
2512 };
Marat Dukhan1edc4542020-01-27 12:40:13 -08002513 xnn_params.f32.rmax = xnn_f32_rmax_ukernel__neon;
Marat Dukhanb1a0fc32019-12-02 19:32:02 -08002514 xnn_params.f32.vadd = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07002515 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__neon_x8,
2516 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__neon_x8,
2517 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__neon_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -08002518 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhanb1a0fc32019-12-02 19:32:02 -08002519 .element_tile = 8,
2520 };
Marat Dukhan69180502019-12-06 15:00:31 -08002521 xnn_params.f32.vdiv = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07002522 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__neon_x8,
2523 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__neon_x8,
2524 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__neon_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -08002525 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhan69180502019-12-06 15:00:31 -08002526 .element_tile = 8,
2527 };
Marat Dukhan79e7f842019-12-05 14:35:50 -08002528 xnn_params.f32.vmax = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07002529 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__neon_x8,
2530 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__neon_x8,
2531 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__neon_x8,
Marat Dukhan79e7f842019-12-05 14:35:50 -08002532 .element_tile = 8,
2533 };
2534 xnn_params.f32.vmin = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07002535 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__neon_x8,
2536 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__neon_x8,
2537 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__neon_x8,
Marat Dukhan79e7f842019-12-05 14:35:50 -08002538 .element_tile = 8,
2539 };
Marat Dukhan1e782c42019-11-21 17:02:40 -08002540 xnn_params.f32.vmul = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07002541 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__neon_x8,
2542 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__neon_x8,
2543 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__neon_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -08002544 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhanca2733c2019-11-15 23:21:17 -08002545 .element_tile = 8,
2546 };
Marat Dukhan05f3f6d2019-12-03 15:13:53 -08002547 xnn_params.f32.vsub = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07002548 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__neon_x8,
2549 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__neon_x8,
2550 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__neon_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -08002551 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhan05f3f6d2019-12-03 15:13:53 -08002552 .element_tile = 8,
2553 };
Marat Dukhanf7399262020-06-05 10:58:44 -07002554 xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07002555 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__neon_x8,
2556 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__neon_x8,
2557 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__neon_x8,
Marat Dukhanf7399262020-06-05 10:58:44 -07002558 .element_tile = 8,
2559 };
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002560 xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
Marat Dukhan9531e9f2020-07-24 15:25:02 -07002561 .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c4__neonfma_2x,
Marat Dukhanc58453f2021-05-24 14:47:38 -07002562 .init.f32 = xnn_init_f32_minmax_scalar_params,
Marat Dukhan49e6ee92019-11-06 15:55:29 -08002563 .channel_tile = 4,
2564 .row_tile = 2,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002565 };
Marat Dukhanefc47b82019-11-18 09:25:38 -08002566 #ifndef XNN_NO_NCHW_OPERATORS
Marat Dukhancfbed0a2020-12-08 10:01:51 -08002567 init_flags |= XNN_INIT_FLAG_CHW_OPT;
2568
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002569 xnn_params.f32.spmm = (struct spmm_parameters) {
Marat Dukhan4baa2ac2021-02-11 10:04:09 -08002570 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x1__neonfma_pipelined,
2571 .mr = 32,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002572 .nr = 1,
XNNPACK Teamb455b122019-09-27 18:10:33 -07002573 };
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002574 xnn_params.f32.spmm2 = (struct spmm_parameters) {
Marat Dukhan4baa2ac2021-02-11 10:04:09 -08002575 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x2__neonfma,
2576 .mr = 32,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002577 .nr = 2,
2578 };
2579 xnn_params.f32.spmm4 = (struct spmm_parameters) {
Marat Dukhan4baa2ac2021-02-11 10:04:09 -08002580 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x4__neonfma,
2581 .mr = 32,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002582 .nr = 4,
2583 };
Marat Dukhan1f29b802020-05-15 23:46:39 -07002584 xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002585 .ukernel_with_symm_padding =
Marat Dukhan1f29b802020-05-15 23:46:39 -07002586 (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfma_2x2,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002587 .output_channel_tile = 4,
2588 .output_height_tile = 2,
2589 .output_width_tile = 2,
2590 };
Marat Dukhanbf715f92020-10-23 20:17:00 -07002591 xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
2592 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_3x4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002593 .output_width_tile = 4,
2594 .output_height_tile = 3,
2595 };
Marat Dukhanbf715f92020-10-23 20:17:00 -07002596 xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
Marat Dukhan82f0c322020-10-25 19:17:35 -07002597 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neonfma_2x4_acc2,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002598 .output_width_tile = 4,
Marat Dukhan82f0c322020-10-25 19:17:35 -07002599 .output_height_tile = 2,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002600 };
Marat Dukhanbf715f92020-10-23 20:17:00 -07002601 xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
Marat Dukhan149f0ea2020-10-26 12:50:33 -07002602 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4,
Marat Dukhana99918a2019-11-15 14:40:12 -08002603 .output_width_tile = 4,
Marat Dukhan149f0ea2020-10-26 12:50:33 -07002604 .output_height_tile = 4,
Marat Dukhana99918a2019-11-15 14:40:12 -08002605 };
Marat Dukhanbf715f92020-10-23 20:17:00 -07002606 xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
2607 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc2,
Marat Dukhana99918a2019-11-15 14:40:12 -08002608 .output_width_tile = 4,
2609 .output_height_tile = 1,
2610 };
Marat Dukhan1f29b802020-05-15 23:46:39 -07002611 xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
2612 .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__neon_x4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002613 .channel_tile = 4,
2614 };
Artsiom Ablavatski97918102020-10-27 15:52:59 -07002615 xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
Artsiom Ablavatski2202c812021-01-22 14:16:43 -08002616 .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__neonfma_p8,
Artsiom Ablavatski97918102020-10-27 15:52:59 -07002617 .channel_tile = 1,
Artsiom Ablavatskib3ffd582021-03-31 13:00:08 -07002618 .pixel_tile = 8,
Artsiom Ablavatski97918102020-10-27 15:52:59 -07002619 };
Marat Dukhanefc47b82019-11-18 09:25:38 -08002620 #endif // XNN_NO_NCHW_OPERATORS
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002621 #endif // XNN_NO_F32_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07002622
Frank Barchardb40ee632021-12-30 11:10:02 -08002623 /*************************** VCVT AArch64 micro-kernels ***************************/
Marat Dukhanaf2ba002021-10-24 14:21:41 -07002624 #ifndef XNN_NO_VCVT_OPERATORS
2625 init_flags |= XNN_INIT_FLAG_VCVT;
2626
Marat Dukhan134f9842021-12-29 19:57:31 -08002627 xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
2628 .ukernel = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__neonfp16_x16,
2629 .element_tile = 16,
2630 };
Marat Dukhanb7c1b712021-12-30 07:23:57 -08002631 xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
2632 .ukernel = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__neonfp16_x16,
2633 .element_tile = 16,
2634 };
Marat Dukhaned2d7762021-12-03 23:51:19 -08002635 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
2636 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__neonv8_x32,
2637 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_neonv8_params,
2638 .element_tile = 32,
2639 };
2640 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
2641 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__neonv8_x32,
2642 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_neonv8_params,
2643 .element_tile = 32,
2644 };
Marat Dukhanf92206b2021-12-10 17:02:07 -08002645 xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
2646 .ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__neon_x32,
2647 .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_neon_params,
2648 .element_tile = 32,
2649 };
2650 xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
2651 .ukernel = (xnn_univector_ukernel_function) xnn_qu8_f32_vcvt_ukernel__neon_x32,
2652 .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_neon_params,
2653 .element_tile = 32,
2654 };
Marat Dukhanaf2ba002021-10-24 14:21:41 -07002655 #endif // XNN_NO_VCVT_OPERATORS
2656
Frank Barchardb40ee632021-12-30 11:10:02 -08002657 /**************************** X32 AArch64 micro-kernels ****************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002658 #ifndef XNN_NO_X32_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07002659 init_flags |= XNN_INIT_FLAG_X32;
2660
Marat Dukhan57dccd82020-04-14 00:53:10 -07002661 xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__neon;
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002662 xnn_params.x32.zip = (struct zip_parameters) {
2663 .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__neon,
2664 .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__neon,
2665 .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__neon,
2666 .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__neon,
2667 };
Artsiom Ablavatskibbe85062020-11-05 14:07:37 -08002668 #ifndef XNN_NO_NCHW_OPERATORS
Marat Dukhanad71b9a2020-11-20 00:01:51 -08002669 xnn_params.x32.depthtospace2d_chw2hwc = (struct depthtospace2d_chw2hwc_parameters) {
2670 .ukernel = (xnn_depthtospace2d_chw2hwc_ukernel_function) xnn_x32_depthtospace2d_chw2hwc_ukernel__scalar,
Artsiom Ablavatskibbe85062020-11-05 14:07:37 -08002671 .channel_tile = 1,
2672 .pixel_tile = 1,
2673 };
2674 #endif // XNN_NO_NCHW_OPERATORS
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002675 #endif // XNN_NO_X32_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07002676
Frank Barchardb40ee632021-12-30 11:10:02 -08002677 /**************************** XX AArch64 micro-kernels ****************************/
Marat Dukhan048931b2020-11-24 20:53:54 -08002678 #ifndef XNN_NO_XX_OPERATORS
2679 init_flags |= XNN_INIT_FLAG_XX;
2680
2681 xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
Marat Dukhan933051b2021-08-07 16:26:15 -07002682 xnn_params.xx.fill = (struct fill_parameters) {
2683 .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__neon_x64,
2684 .row_tile = 1,
2685 };
Marat Dukhan0461f2d2021-08-08 12:36:29 -07002686 xnn_params.xx.pad = (struct pad_parameters) {
2687 .ukernel = (xnn_pad_ukernel_function) xnn_xx_pad_ukernel__neon,
2688 .row_tile = 1,
2689 };
Marat Dukhan048931b2020-11-24 20:53:54 -08002690 #endif
2691
Marat Dukhan933051b2021-08-07 16:26:15 -07002692#elif XNN_ARCH_X86 || XNN_ARCH_X86_64
2693 if (!cpuinfo_has_x86_sse2()) {
2694 xnn_log_error("XNNPACK initialization failed: SSE2 is not supported");
2695 return;
2696 }
2697
Frank Barchardb40ee632021-12-30 11:10:02 -08002698 /**************************** QC8 x86 micro-kernels ****************************/
Marat Dukhan5e353862021-06-15 09:03:25 -07002699 #ifndef XNN_NO_QC8_OPERATORS
2700 init_flags |= XNN_INIT_FLAG_QC8;
2701
2702 if (cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
2703 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx);
2704 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c8__avx512skx);
2705 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx);
2706 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx);
2707 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_avx512_params;
2708 xnn_params.qc8.gemm.mr = 4;
2709 xnn_params.qc8.gemm.nr = 16;
2710 xnn_params.qc8.gemm.log2_kr = 3;
2711 } else if (cpuinfo_has_x86_xop()) {
2712 // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
2713 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64);
2714 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64);
2715 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64);
2716 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64);
2717 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_sse4_params;
2718 xnn_params.qc8.gemm.mr = 2;
2719 xnn_params.qc8.gemm.nr = 4;
2720 xnn_params.qc8.gemm.log2_kr = 3;
2721 } else if (cpuinfo_has_x86_avx2()) {
2722 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_3x8c8__avx2);
2723 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_3x8c8__avx2);
2724 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__avx2);
2725 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__avx2);
2726 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_avx2_params;
2727 xnn_params.qc8.gemm.mr = 3;
2728 xnn_params.qc8.gemm.nr = 8;
2729 xnn_params.qc8.gemm.log2_kr = 3;
2730 } else if (cpuinfo_has_x86_avx()) {
2731 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128);
2732 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128);
2733 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128);
2734 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128);
2735 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_sse4_params;
2736 xnn_params.qc8.gemm.mr = 2;
2737 xnn_params.qc8.gemm.nr = 4;
2738 xnn_params.qc8.gemm.log2_kr = 3;
2739 } else if (cpuinfo_has_x86_sse4_1()) {
2740 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64);
2741 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64);
2742 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64);
2743 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64);
2744 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_sse4_params;
2745 xnn_params.qc8.gemm.mr = 3;
2746 xnn_params.qc8.gemm.nr = 4;
2747 xnn_params.qc8.gemm.log2_kr = 3;
2748 } else {
2749 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64);
2750 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64);
2751 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64);
2752 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64);
2753 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_sse2_params;
2754 xnn_params.qc8.gemm.mr = 3;
2755 xnn_params.qc8.gemm.nr = 4;
2756 xnn_params.qc8.gemm.log2_kr = 3;
2757 }
2758
2759 if (cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
2760 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up32x9__avx512skx_mul32;
2761 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_avx512_params;
2762 xnn_params.qc8.dwconv[0].channel_tile = 32;
2763 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up32x25__avx512skx_mul32;
2764 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_avx512_params;
2765 xnn_params.qc8.dwconv[1].channel_tile = 32;
2766 } else if (cpuinfo_has_x86_xop()) {
2767 // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
Marat Dukhan28480592021-07-27 23:52:27 -07002768 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__xop_mul16_add16;
Marat Dukhan5e353862021-06-15 09:03:25 -07002769 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_sse4_params;
2770 xnn_params.qc8.dwconv[0].channel_tile = 16;
Marat Dukhan28480592021-07-27 23:52:27 -07002771 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x25__xop_mul16_add16;
Marat Dukhan5e353862021-06-15 09:03:25 -07002772 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_sse4_params;
2773 xnn_params.qc8.dwconv[1].channel_tile = 16;
2774 } else if (cpuinfo_has_x86_avx2()) {
2775 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul32;
2776 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_avx2_params;
2777 xnn_params.qc8.dwconv[0].channel_tile = 16;
2778 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x25__avx2_mul32;
2779 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_avx2_params;
2780 xnn_params.qc8.dwconv[1].channel_tile = 16;
2781 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhan28480592021-07-27 23:52:27 -07002782 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul16_add16;
Marat Dukhan5e353862021-06-15 09:03:25 -07002783 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_sse4_params;
2784 xnn_params.qc8.dwconv[0].channel_tile = 16;
Marat Dukhan28480592021-07-27 23:52:27 -07002785 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x25__avx_mul16_add16;
Marat Dukhan5e353862021-06-15 09:03:25 -07002786 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_sse4_params;
2787 xnn_params.qc8.dwconv[1].channel_tile = 16;
2788 } else if (cpuinfo_has_x86_sse4_1()) {
2789 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul16;
2790 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_sse4_params;
2791 xnn_params.qc8.dwconv[0].channel_tile = 8;
2792 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up8x25__sse41_mul16;
2793 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_sse4_params;
2794 xnn_params.qc8.dwconv[1].channel_tile = 8;
2795 } else if (cpuinfo_has_x86_sse2()) {
2796 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up8x9__sse2_mul16;
2797 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_sse2_params;
2798 xnn_params.qc8.dwconv[0].channel_tile = 8;
2799 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up8x25__sse2_mul16;
2800 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_sse2_params;
2801 xnn_params.qc8.dwconv[1].channel_tile = 8;
2802 }
2803 xnn_params.qc8.dwconv[0].primary_tile = 9;
2804 xnn_params.qc8.dwconv[1].primary_tile = 25;
2805 #endif // XNN_NO_QC8_OPERATORS
2806
Frank Barchardb40ee632021-12-30 11:10:02 -08002807 /**************************** QS8 x86 micro-kernels ****************************/
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07002808 #ifndef XNN_NO_QS8_OPERATORS
2809 init_flags |= XNN_INIT_FLAG_QS8;
2810
Marat Dukhanbb00b1d2020-08-10 11:37:23 -07002811 if (cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
Marat Dukhan71855ee2021-05-25 19:05:06 -07002812 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx);
2813 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_4x16c8__avx512skx);
2814 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx);
2815 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx);
2816 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_avx512_params;
Marat Dukhanbb00b1d2020-08-10 11:37:23 -07002817 xnn_params.qs8.gemm.mr = 4;
2818 xnn_params.qs8.gemm.nr = 16;
2819 xnn_params.qs8.gemm.log2_kr = 3;
2820 } else if (cpuinfo_has_x86_xop()) {
Marat Dukhan75215d82020-08-07 23:08:03 -07002821 // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
Marat Dukhanc46e6712021-06-01 19:00:16 -07002822 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64);
2823 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64);
2824 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64);
2825 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64);
2826 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
Marat Dukhan75215d82020-08-07 23:08:03 -07002827 xnn_params.qs8.gemm.mr = 2;
2828 xnn_params.qs8.gemm.nr = 4;
2829 xnn_params.qs8.gemm.log2_kr = 3;
2830 } else if (cpuinfo_has_x86_avx2()) {
Marat Dukhan9b474cf2021-05-25 16:37:48 -07002831 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_3x8c8__avx2);
2832 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_3x8c8__avx2);
2833 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__avx2);
2834 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__avx2);
2835 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_avx2_params;
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07002836 xnn_params.qs8.gemm.mr = 3;
2837 xnn_params.qs8.gemm.nr = 8;
2838 xnn_params.qs8.gemm.log2_kr = 3;
Marat Dukhana3c16332021-04-02 15:03:27 -07002839 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhanc46e6712021-06-01 19:00:16 -07002840 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128);
2841 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128);
2842 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128);
2843 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128);
2844 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
Marat Dukhana3c16332021-04-02 15:03:27 -07002845 xnn_params.qs8.gemm.mr = 2;
2846 xnn_params.qs8.gemm.nr = 4;
2847 xnn_params.qs8.gemm.log2_kr = 3;
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07002848 } else if (cpuinfo_has_x86_sse4_1()) {
Marat Dukhanc46e6712021-06-01 19:00:16 -07002849 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64);
2850 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64);
2851 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64);
2852 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64);
2853 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07002854 xnn_params.qs8.gemm.mr = 3;
2855 xnn_params.qs8.gemm.nr = 4;
2856 xnn_params.qs8.gemm.log2_kr = 3;
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07002857 } else {
Marat Dukhanc46e6712021-06-01 19:00:16 -07002858 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64);
2859 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64);
2860 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64);
2861 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64);
2862 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse2_params;
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07002863 xnn_params.qs8.gemm.mr = 3;
2864 xnn_params.qs8.gemm.nr = 4;
2865 xnn_params.qs8.gemm.log2_kr = 3;
2866 }
2867
Marat Dukhan2ffc5e62020-09-06 22:33:38 -07002868 if (cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
Marat Dukhan71855ee2021-05-25 19:05:06 -07002869 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up32x9__avx512skx_mul32;
2870 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_avx512_params;
Marat Dukhan2ffc5e62020-09-06 22:33:38 -07002871 xnn_params.qs8.dwconv[0].channel_tile = 32;
Marat Dukhan71855ee2021-05-25 19:05:06 -07002872 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up32x25__avx512skx_mul32;
2873 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_avx512_params;
Marat Dukhan4ed14882021-05-12 17:50:40 -07002874 xnn_params.qs8.dwconv[1].channel_tile = 32;
Marat Dukhan3fd4e272021-04-10 11:16:42 -07002875 } else if (cpuinfo_has_x86_xop()) {
2876 // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
Marat Dukhan02f06e32021-07-27 14:33:47 -07002877 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__xop_mul16_add16;
Marat Dukhancaf48312021-06-01 20:20:58 -07002878 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
Marat Dukhan3fd4e272021-04-10 11:16:42 -07002879 xnn_params.qs8.dwconv[0].channel_tile = 16;
Marat Dukhan09668562021-07-26 16:52:20 -07002880 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x25__xop_mul16_add16;
Marat Dukhancaf48312021-06-01 20:20:58 -07002881 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
Marat Dukhan4ed14882021-05-12 17:50:40 -07002882 xnn_params.qs8.dwconv[1].channel_tile = 16;
Marat Dukhan2ffc5e62020-09-06 22:33:38 -07002883 } else if (cpuinfo_has_x86_avx2()) {
Marat Dukhan9b474cf2021-05-25 16:37:48 -07002884 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul32;
2885 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_avx2_params;
Marat Dukhand65a1522020-08-04 19:28:18 -07002886 xnn_params.qs8.dwconv[0].channel_tile = 16;
Marat Dukhan9b474cf2021-05-25 16:37:48 -07002887 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x25__avx2_mul32;
2888 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_avx2_params;
Marat Dukhan4ed14882021-05-12 17:50:40 -07002889 xnn_params.qs8.dwconv[1].channel_tile = 16;
Marat Dukhanfa0ab852021-04-02 17:30:49 -07002890 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhan09668562021-07-26 16:52:20 -07002891 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul16_add16;
Marat Dukhancaf48312021-06-01 20:20:58 -07002892 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
Marat Dukhanfa0ab852021-04-02 17:30:49 -07002893 xnn_params.qs8.dwconv[0].channel_tile = 16;
Marat Dukhan09668562021-07-26 16:52:20 -07002894 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x25__avx_mul16_add16;
Marat Dukhancaf48312021-06-01 20:20:58 -07002895 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
Marat Dukhan4ed14882021-05-12 17:50:40 -07002896 xnn_params.qs8.dwconv[1].channel_tile = 16;
Marat Dukhand65a1522020-08-04 19:28:18 -07002897 } else if (cpuinfo_has_x86_sse4_1()) {
Marat Dukhan09668562021-07-26 16:52:20 -07002898 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul16_add16;
Marat Dukhancaf48312021-06-01 20:20:58 -07002899 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
Marat Dukhand65a1522020-08-04 19:28:18 -07002900 xnn_params.qs8.dwconv[0].channel_tile = 8;
Marat Dukhan09668562021-07-26 16:52:20 -07002901 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up8x25__sse41_mul16_add16;
Marat Dukhancaf48312021-06-01 20:20:58 -07002902 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
Marat Dukhan4ed14882021-05-12 17:50:40 -07002903 xnn_params.qs8.dwconv[1].channel_tile = 8;
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07002904 } else if (cpuinfo_has_x86_sse2()) {
Marat Dukhan09668562021-07-26 16:52:20 -07002905 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__sse2_mul16_add16;
Marat Dukhancaf48312021-06-01 20:20:58 -07002906 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse2_params;
Marat Dukhand65a1522020-08-04 19:28:18 -07002907 xnn_params.qs8.dwconv[0].channel_tile = 8;
Marat Dukhan09668562021-07-26 16:52:20 -07002908 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up8x25__sse2_mul16_add16;
Marat Dukhancaf48312021-06-01 20:20:58 -07002909 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse2_params;
Marat Dukhan4ed14882021-05-12 17:50:40 -07002910 xnn_params.qs8.dwconv[1].channel_tile = 8;
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07002911 }
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07002912 xnn_params.qs8.dwconv[0].primary_tile = 9;
Marat Dukhan4ed14882021-05-12 17:50:40 -07002913 xnn_params.qs8.dwconv[1].primary_tile = 25;
Marat Dukhan9e0b5392020-08-07 02:29:34 -07002914
2915 if (cpuinfo_has_x86_sse4_1()) {
2916 xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
Marat Dukhan9e258d62022-01-12 10:50:51 -08002917 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8,
2918 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8,
Marat Dukhan53f41062022-01-11 19:44:57 -08002919 .init.qs8 = xnn_init_qs8_avgpool_minmax_fp32_sse4_params,
2920 .update.qs8 = xnn_update_qs8_avgpool_minmax_fp32_sse4_params,
Marat Dukhanda382d12022-01-07 19:51:20 -08002921 .row_tile = 7,
2922 .channel_tile = 8,
Marat Dukhan9e0b5392020-08-07 02:29:34 -07002923 };
Marat Dukhan53f41062022-01-11 19:44:57 -08002924 } else {
Marat Dukhan9e0b5392020-08-07 02:29:34 -07002925 xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
Marat Dukhan9e258d62022-01-12 10:50:51 -08002926 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c8,
2927 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8,
Marat Dukhan53f41062022-01-11 19:44:57 -08002928 .init.qs8 = xnn_init_qs8_avgpool_minmax_fp32_sse2_params,
2929 .update.qs8 = xnn_update_qs8_avgpool_minmax_fp32_sse2_params,
Marat Dukhanda382d12022-01-07 19:51:20 -08002930 .row_tile = 7,
2931 .channel_tile = 8,
Marat Dukhan9e0b5392020-08-07 02:29:34 -07002932 };
2933 }
Marat Dukhanff209482020-09-03 14:26:53 -07002934
Marat Dukhane76049a2021-07-22 14:48:59 -07002935 if (cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
2936 xnn_params.qs8.vadd = (struct vbinary_parameters) {
2937 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__avx512skx_mul32_ld128_x16,
2938 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_x16,
2939 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_x16,
Marat Dukhan64287252021-09-07 16:20:03 -07002940 .init.qs8_addsub = xnn_init_qs8_add_minmax_avx512_params,
Marat Dukhane76049a2021-07-22 14:48:59 -07002941 .element_tile = 16,
2942 };
2943 } else if (cpuinfo_has_x86_xop()) {
Marat Dukhanbb9225e2020-09-06 22:40:56 -07002944 xnn_params.qs8.vadd = (struct vbinary_parameters) {
2945 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__xop_mul32_ld32_x8,
2946 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__xop_mul32_ld32_x8,
2947 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__xop_mul32_ld32_x8,
Marat Dukhan64287252021-09-07 16:20:03 -07002948 .init.qs8_addsub = xnn_init_qs8_add_minmax_sse4_mul32_params,
Marat Dukhanbb9225e2020-09-06 22:40:56 -07002949 .element_tile = 8,
2950 };
Marat Dukhan3eac69c2021-07-21 01:42:29 -07002951 } else if (cpuinfo_has_x86_avx2()) {
Marat Dukhan7679b1e2021-07-20 18:32:23 -07002952 xnn_params.qs8.vadd = (struct vbinary_parameters) {
2953 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__avx2_mul32_ld64_x16,
2954 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__avx2_mul32_ld64_x16,
2955 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__avx2_mul32_ld64_x16,
Marat Dukhan64287252021-09-07 16:20:03 -07002956 .init.qs8_addsub = xnn_init_qs8_add_minmax_avx2_params,
Marat Dukhan7679b1e2021-07-20 18:32:23 -07002957 .element_tile = 16,
2958 };
Marat Dukhane9c4b962021-04-02 16:56:55 -07002959 } else if (cpuinfo_has_x86_avx()) {
2960 xnn_params.qs8.vadd = (struct vbinary_parameters) {
2961 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__avx_mul32_ld32_x8,
2962 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__avx_mul32_ld32_x8,
2963 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__avx_mul32_ld32_x8,
Marat Dukhan64287252021-09-07 16:20:03 -07002964 .init.qs8_addsub = xnn_init_qs8_add_minmax_sse4_mul32_params,
Marat Dukhane9c4b962021-04-02 16:56:55 -07002965 .element_tile = 8,
2966 };
Marat Dukhanbb9225e2020-09-06 22:40:56 -07002967 } else if (cpuinfo_has_x86_sse4_1()) {
Marat Dukhanff209482020-09-03 14:26:53 -07002968 xnn_params.qs8.vadd = (struct vbinary_parameters) {
2969 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x8,
2970 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x8,
2971 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x8,
Marat Dukhan64287252021-09-07 16:20:03 -07002972 .init.qs8_addsub = xnn_init_qs8_add_minmax_sse4_mul16_params,
Marat Dukhanff209482020-09-03 14:26:53 -07002973 .element_tile = 8,
2974 };
2975 } else {
2976 xnn_params.qs8.vadd = (struct vbinary_parameters) {
2977 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x8,
2978 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8,
2979 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8,
Marat Dukhan64287252021-09-07 16:20:03 -07002980 .init.qs8_addsub = xnn_init_qs8_add_minmax_sse2_params,
Marat Dukhanff209482020-09-03 14:26:53 -07002981 .element_tile = 8,
2982 };
2983 }
Marat Dukhan0853b8a2021-08-03 01:01:53 -07002984 if (cpuinfo_has_x86_avx()) {
2985 xnn_params.qs8.vmul = (struct vbinary_parameters) {
2986 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_fp32_ukernel__avx_mul16_ld64_x16,
2987 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__avx_mul16_ld64_x16,
2988 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__avx_mul16_ld64_x16,
2989 .init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_sse4_params,
2990 .element_tile = 16,
2991 };
2992 } else if (cpuinfo_has_x86_sse4_1()) {
2993 xnn_params.qs8.vmul = (struct vbinary_parameters) {
2994 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_fp32_ukernel__sse41_mul16_ld64_x16,
2995 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__sse41_mul16_ld64_x16,
2996 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__sse41_mul16_ld64_x16,
2997 .init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_sse4_params,
2998 .element_tile = 16,
2999 };
3000 } else {
3001 xnn_params.qs8.vmul = (struct vbinary_parameters) {
3002 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_fp32_ukernel__sse2_mul16_ld64_x8,
3003 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__sse2_mul16_ld64_x8,
3004 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__sse2_mul16_ld64_x8,
3005 .init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_sse2_params,
3006 .element_tile = 8,
3007 };
3008 }
Marat Dukhan07e50402020-08-05 17:16:53 -07003009 #endif // XNN_NO_QS8_OPERATORS
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07003010
Frank Barchardb40ee632021-12-30 11:10:02 -08003011 /**************************** QU8 x86 micro-kernels ****************************/
Marat Dukhan08b7a972020-07-14 18:17:29 -07003012 #ifndef XNN_NO_QU8_OPERATORS
3013 init_flags |= XNN_INIT_FLAG_QU8;
Marat Dukhan854fb6b2020-06-19 12:33:44 -07003014
Marat Dukhan3cf2e222021-07-08 11:38:45 -07003015 if (cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
3016 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx);
3017 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_4x16c8__avx512skx);
3018 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx);
3019 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx);
3020 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx512_params;
3021 xnn_params.qu8.gemm.mr = 4;
3022 xnn_params.qu8.gemm.nr = 16;
3023 xnn_params.qu8.gemm.log2_kr = 3;
3024 } else if (cpuinfo_has_x86_xop()) {
Marat Dukhan902ef7f2021-07-02 16:11:06 -07003025 // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
3026 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64);
3027 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64);
3028 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64);
3029 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64);
3030 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3031 xnn_params.qu8.gemm.mr = 2;
3032 xnn_params.qu8.gemm.nr = 4;
3033 xnn_params.qu8.gemm.log2_kr = 3;
3034 } else if (cpuinfo_has_x86_avx2()) {
3035 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_3x8c8__avx2);
3036 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_3x8c8__avx2);
3037 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2);
3038 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x8c8__avx2);
3039 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx2_params;
3040 xnn_params.qu8.gemm.mr = 3;
3041 xnn_params.qu8.gemm.nr = 8;
3042 xnn_params.qu8.gemm.log2_kr = 3;
3043 } else if (cpuinfo_has_x86_avx()) {
3044 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128);
3045 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128);
3046 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128);
3047 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128);
3048 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3049 xnn_params.qu8.gemm.mr = 2;
3050 xnn_params.qu8.gemm.nr = 4;
3051 xnn_params.qu8.gemm.log2_kr = 3;
3052 } else if (cpuinfo_has_x86_sse4_1()) {
3053 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64);
3054 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64);
3055 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64);
3056 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64);
3057 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3058 xnn_params.qu8.gemm.mr = 3;
3059 xnn_params.qu8.gemm.nr = 4;
3060 xnn_params.qu8.gemm.log2_kr = 3;
3061 } else {
3062 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64);
3063 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64);
3064 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64);
3065 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64);
3066 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3067 xnn_params.qu8.gemm.mr = 3;
3068 xnn_params.qu8.gemm.nr = 4;
3069 xnn_params.qu8.gemm.log2_kr = 3;
3070 }
Marat Dukhanaefaef32020-04-09 07:09:34 -07003071
Marat Dukhanabee3a72021-07-09 09:04:52 -07003072 if (cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
3073 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up32x9__avx512skx_mul32;
3074 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx512_params;
3075 xnn_params.qu8.dwconv[0].channel_tile = 32;
3076 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up32x25__avx512skx_mul32;
3077 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx512_params;
3078 xnn_params.qu8.dwconv[1].channel_tile = 32;
3079 } else if (cpuinfo_has_x86_xop()) {
3080 // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
3081 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__xop_mul32;
3082 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3083 xnn_params.qu8.dwconv[0].channel_tile = 16;
3084 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__xop_mul32;
3085 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3086 xnn_params.qu8.dwconv[1].channel_tile = 16;
3087 } else if (cpuinfo_has_x86_avx2()) {
3088 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul32;
3089 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx2_params;
3090 xnn_params.qu8.dwconv[0].channel_tile = 16;
3091 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__avx2_mul32;
3092 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx2_params;
3093 xnn_params.qu8.dwconv[1].channel_tile = 16;
3094 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhancaa7fc72021-07-27 07:48:24 -07003095 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul16;
Marat Dukhanabee3a72021-07-09 09:04:52 -07003096 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3097 xnn_params.qu8.dwconv[0].channel_tile = 16;
Marat Dukhancaa7fc72021-07-27 07:48:24 -07003098 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__avx_mul16;
Marat Dukhanabee3a72021-07-09 09:04:52 -07003099 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3100 xnn_params.qu8.dwconv[1].channel_tile = 16;
3101 } else if (cpuinfo_has_x86_sse4_1()) {
3102 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul16;
3103 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3104 xnn_params.qu8.dwconv[0].channel_tile = 8;
3105 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__sse41_mul16;
3106 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3107 xnn_params.qu8.dwconv[1].channel_tile = 8;
3108 } else if (cpuinfo_has_x86_sse2()) {
3109 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__sse2_mul16;
3110 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3111 xnn_params.qu8.dwconv[0].channel_tile = 8;
3112 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__sse2_mul16;
3113 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3114 xnn_params.qu8.dwconv[1].channel_tile = 8;
3115 }
Marat Dukhan08b7a972020-07-14 18:17:29 -07003116 xnn_params.qu8.dwconv[0].primary_tile = 9;
Marat Dukhanabee3a72021-07-09 09:04:52 -07003117 xnn_params.qu8.dwconv[1].primary_tile = 25;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003118
Marat Dukhan08b7a972020-07-14 18:17:29 -07003119 xnn_params.qu8.avgpool = (struct avgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08003120 .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__sse2_c8,
3121 .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__sse2_c8,
Marat Dukhan3c949a32022-01-09 20:12:33 -08003122 .init.qu8 = xnn_init_qu8_avgpool_minmax_sse2_params,
Marat Dukhanda382d12022-01-07 19:51:20 -08003123 .primary_tile = 9,
3124 .incremental_tile = 8,
3125 .channel_tile = 8,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003126 };
Marat Dukhand1f53e42022-01-12 22:34:51 -08003127 if (cpuinfo_has_x86_sse4_1()) {
3128 xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
3129 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8,
3130 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8,
3131 .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_sse4_params,
3132 .update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_sse4_params,
3133 .row_tile = 7,
3134 .channel_tile = 8,
3135 };
3136 } else {
3137 xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
3138 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c8,
3139 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8,
3140 .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_sse2_params,
3141 .update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_sse2_params,
3142 .row_tile = 7,
3143 .channel_tile = 8,
3144 };
3145 }
Marat Dukhan3eac69c2021-07-21 01:42:29 -07003146
Marat Dukhane76049a2021-07-22 14:48:59 -07003147 if (cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
3148 xnn_params.qu8.vadd = (struct vbinary_parameters) {
3149 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__avx512skx_mul32_ld128_x16,
3150 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_x16,
3151 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_x16,
Marat Dukhan64287252021-09-07 16:20:03 -07003152 .init.qu8_addsub = xnn_init_qu8_add_minmax_avx512_params,
Marat Dukhane76049a2021-07-22 14:48:59 -07003153 .element_tile = 16,
3154 };
3155 } else if (cpuinfo_has_x86_xop()) {
Marat Dukhan3eac69c2021-07-21 01:42:29 -07003156 xnn_params.qu8.vadd = (struct vbinary_parameters) {
3157 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__xop_mul32_ld32_x8,
3158 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__xop_mul32_ld32_x8,
3159 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__xop_mul32_ld32_x8,
Marat Dukhan64287252021-09-07 16:20:03 -07003160 .init.qu8_addsub = xnn_init_qu8_add_minmax_sse4_params,
Marat Dukhan3eac69c2021-07-21 01:42:29 -07003161 .element_tile = 8,
3162 };
3163 } else if (cpuinfo_has_x86_avx2()) {
3164 xnn_params.qu8.vadd = (struct vbinary_parameters) {
3165 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__avx2_mul32_ld64_x16,
3166 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__avx2_mul32_ld64_x16,
3167 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__avx2_mul32_ld64_x16,
Marat Dukhan64287252021-09-07 16:20:03 -07003168 .init.qu8_addsub = xnn_init_qu8_add_minmax_avx2_params,
Marat Dukhan3eac69c2021-07-21 01:42:29 -07003169 .element_tile = 16,
3170 };
3171 } else if (cpuinfo_has_x86_avx()) {
3172 xnn_params.qu8.vadd = (struct vbinary_parameters) {
3173 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__avx_mul32_ld32_x8,
3174 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__avx_mul32_ld32_x8,
3175 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__avx_mul32_ld32_x8,
Marat Dukhan64287252021-09-07 16:20:03 -07003176 .init.qu8_addsub = xnn_init_qu8_add_minmax_sse4_params,
Marat Dukhan3eac69c2021-07-21 01:42:29 -07003177 .element_tile = 8,
3178 };
3179 } else if (cpuinfo_has_x86_sse4_1()) {
3180 xnn_params.qu8.vadd = (struct vbinary_parameters) {
3181 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__sse41_mul16_ld64_x8,
3182 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__sse41_mul16_ld64_x8,
3183 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__sse41_mul16_ld64_x8,
Marat Dukhan64287252021-09-07 16:20:03 -07003184 .init.qu8_addsub = xnn_init_qu8_add_minmax_sse2_params,
Marat Dukhan3eac69c2021-07-21 01:42:29 -07003185 .element_tile = 8,
3186 };
3187 } else {
3188 xnn_params.qu8.vadd = (struct vbinary_parameters) {
3189 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__sse2_mul16_ld64_x8,
3190 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8,
3191 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8,
Marat Dukhan64287252021-09-07 16:20:03 -07003192 .init.qu8_addsub = xnn_init_qu8_add_minmax_sse2_params,
Marat Dukhan3eac69c2021-07-21 01:42:29 -07003193 .element_tile = 8,
3194 };
3195 }
Marat Dukhan0853b8a2021-08-03 01:01:53 -07003196 if (cpuinfo_has_x86_avx()) {
3197 xnn_params.qu8.vmul = (struct vbinary_parameters) {
3198 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_fp32_ukernel__avx_mul16_ld64_x16,
3199 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__avx_mul16_ld64_x16,
3200 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__avx_mul16_ld64_x16,
3201 .init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_sse2_params,
3202 .element_tile = 16,
3203 };
3204 } else if (cpuinfo_has_x86_sse4_1()) {
3205 xnn_params.qu8.vmul = (struct vbinary_parameters) {
3206 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_fp32_ukernel__sse41_mul16_ld64_x16,
3207 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__sse41_mul16_ld64_x16,
3208 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__sse41_mul16_ld64_x16,
3209 .init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_sse2_params,
3210 .element_tile = 16,
3211 };
3212 } else {
3213 xnn_params.qu8.vmul = (struct vbinary_parameters) {
3214 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_fp32_ukernel__sse2_mul16_ld64_x8,
3215 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__sse2_mul16_ld64_x8,
3216 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__sse2_mul16_ld64_x8,
3217 .init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_sse2_params,
3218 .element_tile = 8,
3219 };
3220 }
Marat Dukhan08b7a972020-07-14 18:17:29 -07003221 #endif // XNN_NO_QU8_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07003222
Frank Barchardb40ee632021-12-30 11:10:02 -08003223 /**************************** S8 x86 micro-kernels ****************************/
Marat Dukhan23147532021-08-16 07:26:56 -07003224 #ifndef XNN_NO_S8_OPERATORS
3225 init_flags |= XNN_INIT_FLAG_S8;
3226
3227 if (cpuinfo_has_x86_sse4_1()) {
Marat Dukhan61c0c9e2021-08-16 23:16:14 -07003228 xnn_params.s8.clamp = (struct vunary_parameters) {
3229 .ukernel = (xnn_univector_ukernel_function) xnn_s8_vclamp_ukernel__sse41_x64,
3230 .init.s8_minmax = xnn_init_s8_minmax_sse4_params,
3231 .element_tile = 64,
3232 };
Marat Dukhan24abe6b2021-11-24 15:28:57 -08003233 xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
3234 .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__sse41_c16,
3235 .pixel_tile = 1,
3236 .channel_tile = 16,
3237 };
Marat Dukhan23147532021-08-16 07:26:56 -07003238 xnn_params.s8.maxpool = (struct maxpool_parameters) {
3239 .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__sse41_c16,
3240 .init.s8 = xnn_init_s8_minmax_sse4_params,
3241 .mr = 9,
3242 .qr = 8,
3243 };
3244 } else {
Marat Dukhan61c0c9e2021-08-16 23:16:14 -07003245 xnn_params.s8.clamp = (struct vunary_parameters) {
3246 .ukernel = (xnn_univector_ukernel_function) xnn_s8_vclamp_ukernel__sse2_x64,
3247 .init.s8_minmax = xnn_init_s8_minmax_sse2_params,
3248 .element_tile = 64,
3249 };
Marat Dukhan24abe6b2021-11-24 15:28:57 -08003250 xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
3251 .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__sse2_c8,
3252 .pixel_tile = 1,
3253 .channel_tile = 8,
3254 };
Marat Dukhan23147532021-08-16 07:26:56 -07003255 xnn_params.s8.maxpool = (struct maxpool_parameters) {
3256 .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__sse2_c16,
3257 .init.s8 = xnn_init_s8_minmax_sse2_params,
3258 .mr = 9,
3259 .qr = 8,
3260 };
3261 }
Marat Dukhan94912792021-08-16 21:40:30 -07003262 #endif // XNN_NO_S8_OPERATORS
Marat Dukhan23147532021-08-16 07:26:56 -07003263
Frank Barchardb40ee632021-12-30 11:10:02 -08003264 /**************************** U8 x86 micro-kernels ****************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003265 #ifndef XNN_NO_U8_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07003266 init_flags |= XNN_INIT_FLAG_U8;
3267
Marat Dukhan94912792021-08-16 21:40:30 -07003268 xnn_params.u8.clamp = (struct vunary_parameters) {
3269 .ukernel = (xnn_univector_ukernel_function) xnn_u8_vclamp_ukernel__sse2_x64,
3270 .init.u8_minmax = xnn_init_u8_minmax_sse2_params,
3271 .element_tile = 64,
3272 };
Marat Dukhan24abe6b2021-11-24 15:28:57 -08003273 if (cpuinfo_has_x86_sse4_1()) {
3274 xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
3275 .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__sse41_c16,
3276 .pixel_tile = 1,
3277 .channel_tile = 16,
3278 };
3279 } else {
3280 xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
3281 .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__sse2_c8,
3282 .pixel_tile = 1,
3283 .channel_tile = 8,
3284 };
3285 }
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003286 xnn_params.u8.maxpool = (struct maxpool_parameters) {
Marat Dukhan99936602020-04-11 16:47:01 -07003287 .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__sse2_c16,
Marat Dukhan91ae1652021-08-15 19:19:49 -07003288 .init.u8 = xnn_init_u8_minmax_sse2_params,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003289 .mr = 9,
3290 .qr = 8,
3291 };
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003292 xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
3293 xnn_params.u8.rmax = xnn_u8_rmax_ukernel__sse2;
3294 #endif // XNN_NO_U8_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07003295
Frank Barchardb40ee632021-12-30 11:10:02 -08003296 /**************************** X8 x86 micro-kernels ****************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003297 #ifndef XNN_NO_X8_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07003298 init_flags |= XNN_INIT_FLAG_X8;
3299
Marat Dukhan98e054b2021-09-13 09:43:50 -07003300 if (cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
3301 xnn_params.x8.lut = xnn_x8_lut_ukernel__avx512skx_vpshufb_x64;
3302 } else if (cpuinfo_has_x86_avx2()) {
3303 xnn_params.x8.lut = xnn_x8_lut_ukernel__avx2_x128;
3304 } else if (cpuinfo_has_x86_avx()) {
3305 xnn_params.x8.lut = xnn_x8_lut_ukernel__avx_x64;
3306 } else {
3307 // Note: SSSE3 version is usually slower than scalar
3308 xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar_x4;
3309 }
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003310 xnn_params.x8.zip = (struct zip_parameters) {
3311 .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__sse2,
3312 .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__sse2,
3313 .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__sse2,
3314 .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__sse2,
3315 };
3316 #endif // XNN_NO_X8_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07003317
Marat Dukhan8f920a62022-01-19 14:56:23 -08003318 /**************************** F16 x86 micro-kernels ****************************/
3319 #ifndef XNN_NO_F16_OPERATORS
3320 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx2()) {
3321 init_flags |= XNN_INIT_FLAG_F16;
3322
3323 xnn_params.f16.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_4x16__avx2_broadcast);
3324 xnn_params.f16.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_4x16__avx2_broadcast);
3325 xnn_params.f16.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_1x16__avx2_broadcast);
3326 xnn_params.f16.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast);
3327 xnn_params.f16.gemm.init.f16 = xnn_init_f16_scaleminmax_avx_params;
3328 xnn_params.f16.gemm.mr = 4;
3329 xnn_params.f16.gemm.nr = 16;
3330
3331 xnn_params.f16.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up16x4__fma3;
3332 xnn_params.f16.dwconv[0].init.f16 = xnn_init_f16_minmax_avx_params;
3333 xnn_params.f16.dwconv[0].channel_tile = 16;
3334 xnn_params.f16.dwconv[0].primary_tile = 4;
3335
3336 xnn_params.f16.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up16x9__fma3;
3337 xnn_params.f16.dwconv[1].init.f16 = xnn_init_f16_minmax_avx_params;
3338 xnn_params.f16.dwconv[1].channel_tile = 16;
3339 xnn_params.f16.dwconv[1].primary_tile = 9;
3340
3341 xnn_params.f16.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up8x25__fma3_acc2;
3342 xnn_params.f16.dwconv[2].init.f16 = xnn_init_f16_minmax_avx_params;
3343 xnn_params.f16.dwconv[2].channel_tile = 8;
3344 xnn_params.f16.dwconv[2].primary_tile = 25;
3345
3346 xnn_params.f16.gavgpool = (struct gavgpool_parameters) {
3347 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c8,
3348 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c8,
3349 .init.f16 = xnn_init_f16_scaleminmax_avx_params,
3350 .update.f16 = xnn_update_f16_scaleminmax_avx_params,
3351 .row_tile = 7,
3352 .channel_tile = 8,
3353 };
3354 xnn_params.f16.vadd = (struct vbinary_parameters) {
3355 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vadd_minmax_ukernel__f16c_x16,
3356 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vaddc_minmax_ukernel__f16c_x16,
3357 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vaddc_minmax_ukernel__f16c_x16,
3358 .init.f16_minmax = xnn_init_f16_minmax_avx_params,
3359 .element_tile = 16,
3360 };
3361 xnn_params.f16.vmul = (struct vbinary_parameters) {
3362 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmul_minmax_ukernel__f16c_x16,
3363 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmulc_minmax_ukernel__f16c_x16,
3364 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmulc_minmax_ukernel__f16c_x16,
3365 .init.f16_minmax = xnn_init_f16_minmax_avx_params,
3366 .element_tile = 16,
3367 };
3368 xnn_params.f16.vmulcaddc = (struct vmulcaddc_parameters) {
3369 .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f16_vmulcaddc_minmax_ukernel_c8__fma3_2x,
3370 .init.f16 = xnn_init_f16_minmax_avx_params,
3371 .channel_tile = 8,
3372 .row_tile = 2,
3373 };
3374 xnn_params.f16.hswish = (struct vunary_parameters) {
3375 .ukernel = (xnn_univector_ukernel_function) xnn_f16_vhswish_ukernel__f16c_x16,
3376 .init.f16_hswish = xnn_init_f16_hswish_avx_params,
3377 .element_tile = 16,
3378 };
3379 }
3380 #endif // XNN_NO_F16_OPERATORS
3381
Frank Barchardb40ee632021-12-30 11:10:02 -08003382 /**************************** F32 x86 micro-kernels ****************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003383 #ifndef XNN_NO_F32_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07003384 init_flags |= XNN_INIT_FLAG_F32;
3385
Marat Dukhan0f349c42019-11-27 11:58:54 -08003386 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
Marat Dukhanaefaef32020-04-09 07:09:34 -07003387 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_7x16__avx512f_broadcast);
3388 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_7x16__avx512f_broadcast);
3389 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast);
3390 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x16__avx512f_broadcast);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003391 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003392 xnn_params.f32.gemm.mr = 7;
3393 xnn_params.f32.gemm.nr = 16;
Marat Dukhan48976702022-01-10 18:18:04 -08003394 } else if (cpuinfo_has_x86_fma3()) {
Marat Dukhan27121322019-12-09 14:57:40 -08003395 switch (cpuinfo_get_core(0)->uarch) {
3396 case cpuinfo_uarch_zen:
Marat Dukhanb3801eb2020-03-12 13:41:11 -07003397 case cpuinfo_uarch_dhyana:
Marat Dukhanaefaef32020-04-09 07:09:34 -07003398 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast);
3399 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast);
3400 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast);
3401 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003402 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_avx_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003403 xnn_params.f32.gemm.mr = 4;
3404 xnn_params.f32.gemm.nr = 16;
3405 xnn_params.f32.gemm.log2_sr = 2;
Marat Dukhan27121322019-12-09 14:57:40 -08003406 break;
3407 default:
Marat Dukhanaefaef32020-04-09 07:09:34 -07003408 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_5x16__fma3_broadcast);
3409 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_5x16__fma3_broadcast);
3410 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast);
3411 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x16__fma3_broadcast);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003412 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_avx_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003413 xnn_params.f32.gemm.mr = 5;
3414 xnn_params.f32.gemm.nr = 16;
Marat Dukhan27121322019-12-09 14:57:40 -08003415 break;
3416 }
Marat Dukhan48976702022-01-10 18:18:04 -08003417 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhanaefaef32020-04-09 07:09:34 -07003418 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_5x16__avx_broadcast);
3419 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_5x16__avx_broadcast);
3420 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x16__avx_broadcast);
3421 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x16__avx_broadcast);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003422 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_avx_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003423 xnn_params.f32.gemm.mr = 5;
3424 xnn_params.f32.gemm.nr = 16;
Marat Dukhan1025ea32019-11-21 16:01:08 -08003425 } else {
Marat Dukhanaefaef32020-04-09 07:09:34 -07003426 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__sse_load1);
3427 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__sse_load1);
3428 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__sse_load1);
3429 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__sse_load1);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003430 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_sse_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003431 xnn_params.f32.gemm.mr = 4;
3432 xnn_params.f32.gemm.nr = 8;
Marat Dukhan1025ea32019-11-21 16:01:08 -08003433 }
Marat Dukhanaefaef32020-04-09 07:09:34 -07003434 xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2c4__sse);
3435 xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2c4__sse);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003436 xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_sse_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003437 xnn_params.f32.gemm2.mr = 4;
3438 xnn_params.f32.gemm2.nr = 2;
3439 xnn_params.f32.gemm2.log2_kr = 2;
3440
Marat Dukhan479f87e2019-11-27 15:17:06 -08003441 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003442 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x3__avx512f;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003443 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003444 xnn_params.f32.dwconv[0].channel_tile = 16;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003445 xnn_params.f32.dwconv[0].primary_tile = 3;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003446
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003447 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x4__avx512f;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003448 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003449 xnn_params.f32.dwconv[1].channel_tile = 16;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003450 xnn_params.f32.dwconv[1].primary_tile = 4;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003451
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003452 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x9__avx512f;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003453 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003454 xnn_params.f32.dwconv[2].channel_tile = 16;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003455 xnn_params.f32.dwconv[2].primary_tile = 9;
3456
3457 xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x25__avx512f;
3458 xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_scalar_params;
3459 xnn_params.f32.dwconv[3].channel_tile = 16;
3460 xnn_params.f32.dwconv[3].primary_tile = 25;
Marat Dukhan48976702022-01-10 18:18:04 -08003461 } else if (cpuinfo_has_x86_fma3()) {
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003462 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x3__fma3;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003463 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_avx_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003464 xnn_params.f32.dwconv[0].channel_tile = 16;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003465 xnn_params.f32.dwconv[0].primary_tile = 3;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003466
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003467 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x4__fma3;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003468 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_avx_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003469 xnn_params.f32.dwconv[1].channel_tile = 16;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003470 xnn_params.f32.dwconv[1].primary_tile = 4;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003471
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003472 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x9__fma3;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003473 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_avx_params;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003474 xnn_params.f32.dwconv[2].channel_tile = 16;
3475 xnn_params.f32.dwconv[2].primary_tile = 9;
3476
3477 xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x25__fma3;
3478 xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_avx_params;
3479 xnn_params.f32.dwconv[3].channel_tile = 8;
3480 xnn_params.f32.dwconv[3].primary_tile = 25;
Marat Dukhan48976702022-01-10 18:18:04 -08003481 } else if (cpuinfo_has_x86_avx()) {
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003482 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x3__avx;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003483 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_avx_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003484 xnn_params.f32.dwconv[0].channel_tile = 16;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003485 xnn_params.f32.dwconv[0].primary_tile = 3;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003486
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003487 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x4__avx;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003488 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_avx_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003489 xnn_params.f32.dwconv[1].channel_tile = 16;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003490 xnn_params.f32.dwconv[1].primary_tile = 4;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003491
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003492 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x9__avx;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003493 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_avx_params;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003494 xnn_params.f32.dwconv[2].channel_tile = 16;
3495 xnn_params.f32.dwconv[2].primary_tile = 9;
3496
3497 xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x25__avx;
3498 xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_avx_params;
3499 xnn_params.f32.dwconv[3].channel_tile = 8;
3500 xnn_params.f32.dwconv[3].primary_tile = 25;
Marat Dukhan17ec5f32019-11-22 13:34:16 -08003501 } else {
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003502 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x3__sse;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003503 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_sse_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003504 xnn_params.f32.dwconv[0].channel_tile = 8;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003505 xnn_params.f32.dwconv[0].primary_tile = 3;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003506
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003507 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x4__sse;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003508 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_sse_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003509 xnn_params.f32.dwconv[1].channel_tile = 8;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003510 xnn_params.f32.dwconv[1].primary_tile = 4;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003511
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003512 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x9__sse;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003513 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_sse_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003514 xnn_params.f32.dwconv[2].channel_tile = 8;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003515 xnn_params.f32.dwconv[2].primary_tile = 9;
3516
3517 xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x25__sse;
3518 xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_sse_params;
3519 xnn_params.f32.dwconv[3].channel_tile = 8;
3520 xnn_params.f32.dwconv[3].primary_tile = 25;
Marat Dukhan17ec5f32019-11-22 13:34:16 -08003521 }
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003522 xnn_params.f32.avgpool = (struct avgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08003523 .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__sse_c4,
3524 .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__sse_c4,
Marat Dukhan84598222022-01-09 21:39:22 -08003525 .init.f32 = xnn_init_f32_scaleminmax_sse_params,
Marat Dukhanda382d12022-01-07 19:51:20 -08003526 .primary_tile = 9,
3527 .incremental_tile = 8,
3528 .channel_tile = 4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003529 };
3530 xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08003531 .unipass = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__sse_c4,
3532 .multipass = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__sse_c4,
3533 .primary_tile = 9,
3534 .incremental_tile = 8,
3535 .channel_tile = 4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003536 };
3537 xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08003538 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__sse_c4,
3539 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4,
Marat Dukhan84598222022-01-09 21:39:22 -08003540 .init.f32 = xnn_init_f32_scaleminmax_sse_params,
3541 .update.f32 = xnn_update_f32_scaleminmax_sse_params,
Marat Dukhanda382d12022-01-07 19:51:20 -08003542 .row_tile = 7,
3543 .channel_tile = 4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003544 };
3545 xnn_params.f32.maxpool = (struct maxpool_parameters) {
Marat Dukhan99936602020-04-11 16:47:01 -07003546 .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__sse_c4,
Marat Dukhan91ae1652021-08-15 19:19:49 -07003547 .init.f32 = xnn_init_f32_minmax_sse_params,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003548 .mr = 9,
3549 .qr = 8,
3550 };
3551 xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
Marat Dukhan99936602020-04-11 16:47:01 -07003552 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__sse2_c4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003553 .mr = 4,
3554 };
3555 xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
Marat Dukhan99936602020-04-11 16:47:01 -07003556 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__sse2_c4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003557 .mr = 9,
3558 };
3559 xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
Marat Dukhan99936602020-04-11 16:47:01 -07003560 .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003561 .mr = 9,
3562 .qr = 8,
3563 };
Marat Dukhan660fd192020-03-10 04:55:30 -07003564 xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
3565 .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__sse_c8,
Marat Dukhan69722492019-11-11 19:55:50 -08003566 .pixel_tile = 1,
3567 .channel_tile = 8,
3568 };
Marat Dukhane2c3f292019-11-27 15:40:54 -08003569 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
Marat Dukhane5efb162021-12-31 10:26:13 -08003570 xnn_params.f32.abs = (struct vunary_parameters) {
3571 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__avx512f_x16,
3572 .init.f32_abs = xnn_init_f32_abs_avx512_params,
3573 .element_tile = 16,
3574 };
Marat Dukhan48976702022-01-10 18:18:04 -08003575 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhane5efb162021-12-31 10:26:13 -08003576 xnn_params.f32.abs = (struct vunary_parameters) {
3577 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__avx_x16,
3578 .init.f32_abs = xnn_init_f32_abs_avx_params,
3579 .element_tile = 16,
3580 };
Marat Dukhan5020b962020-06-08 13:30:10 -07003581 } else {
Marat Dukhane5efb162021-12-31 10:26:13 -08003582 xnn_params.f32.abs = (struct vunary_parameters) {
3583 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__sse_x8,
3584 .init.f32_abs = xnn_init_f32_abs_sse_params,
3585 .element_tile = 8,
3586 };
Marat Dukhan5020b962020-06-08 13:30:10 -07003587 }
3588 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
Marat Dukhan94912792021-08-16 21:40:30 -07003589 xnn_params.f32.clamp = (struct vunary_parameters) {
3590 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vclamp_ukernel__avx512f_x16,
3591 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
3592 .element_tile = 16,
3593 };
Marat Dukhan48976702022-01-10 18:18:04 -08003594 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhan94912792021-08-16 21:40:30 -07003595 xnn_params.f32.clamp = (struct vunary_parameters) {
3596 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vclamp_ukernel__avx_x16,
3597 .init.f32_minmax = xnn_init_f32_minmax_avx_params,
3598 .element_tile = 16,
3599 };
Marat Dukhane2c3f292019-11-27 15:40:54 -08003600 } else {
Marat Dukhan94912792021-08-16 21:40:30 -07003601 xnn_params.f32.clamp = (struct vunary_parameters) {
3602 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vclamp_ukernel__sse_x8,
3603 .init.f32_minmax = xnn_init_f32_minmax_sse_params,
3604 .element_tile = 8,
3605 };
Marat Dukhane2c3f292019-11-27 15:40:54 -08003606 }
Marat Dukhan662faa02019-12-09 22:48:16 -08003607 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
Marat Dukhan4a79ff22022-01-01 12:16:48 -08003608 xnn_params.f32.elu = (struct vunary_parameters) {
3609 .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__avx512f_rr1_lut16_p3_perm_x64,
3610 .init.f32_elu = xnn_init_f32_elu_avx512_rr1_lut16_p3_params,
3611 .element_tile = 64,
3612 };
Marat Dukhan48976702022-01-10 18:18:04 -08003613 } else if (cpuinfo_has_x86_avx2()) {
Marat Dukhan4a79ff22022-01-01 12:16:48 -08003614 xnn_params.f32.elu = (struct vunary_parameters) {
3615 .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x56,
3616 .init.f32_elu = xnn_init_f32_elu_avx2_rr1_lut4_p4_params,
3617 .element_tile = 56,
3618 };
Marat Dukhan48976702022-01-10 18:18:04 -08003619 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhan4a79ff22022-01-01 12:16:48 -08003620 xnn_params.f32.elu = (struct vunary_parameters) {
3621 .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_x32,
3622 .init.f32_elu = xnn_init_f32_elu_avx_rr2_lut4_p4_params,
3623 .element_tile = 32,
3624 };
Marat Dukhanb6bd4bc2020-12-01 17:01:40 -08003625 } else {
Marat Dukhan4a79ff22022-01-01 12:16:48 -08003626 xnn_params.f32.elu = (struct vunary_parameters) {
3627 .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__sse2_rr2_lut16_p3_x12,
3628 .init.f32_elu = xnn_init_f32_elu_sse2_rr2_lut16_p3_params,
3629 .element_tile = 12,
3630 };
Marat Dukhanb6bd4bc2020-12-01 17:01:40 -08003631 }
3632 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
Marat Dukhan561d0682021-12-23 16:12:35 -08003633 xnn_params.f32.hswish = (struct vunary_parameters) {
3634 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__avx512f_x16,
Marat Dukhan0d10cc72021-12-23 19:49:19 -08003635 .init.f32_hswish = xnn_init_f32_hswish_avx512_params,
Marat Dukhan561d0682021-12-23 16:12:35 -08003636 .element_tile = 16,
3637 };
Marat Dukhan48976702022-01-10 18:18:04 -08003638 } else if (cpuinfo_has_x86_fma3()) {
Marat Dukhan561d0682021-12-23 16:12:35 -08003639 xnn_params.f32.hswish = (struct vunary_parameters) {
3640 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__fma3_x16,
Marat Dukhan0d10cc72021-12-23 19:49:19 -08003641 .init.f32_hswish = xnn_init_f32_hswish_avx_params,
Marat Dukhan561d0682021-12-23 16:12:35 -08003642 .element_tile = 16,
3643 };
Marat Dukhan48976702022-01-10 18:18:04 -08003644 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhan561d0682021-12-23 16:12:35 -08003645 xnn_params.f32.hswish = (struct vunary_parameters) {
3646 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__avx_x16,
Marat Dukhan0d10cc72021-12-23 19:49:19 -08003647 .init.f32_hswish = xnn_init_f32_hswish_avx_params,
Marat Dukhan561d0682021-12-23 16:12:35 -08003648 .element_tile = 16,
3649 };
Marat Dukhan662faa02019-12-09 22:48:16 -08003650 } else {
Marat Dukhan561d0682021-12-23 16:12:35 -08003651 xnn_params.f32.hswish = (struct vunary_parameters) {
3652 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__sse_x8,
Marat Dukhan0d10cc72021-12-23 19:49:19 -08003653 .init.f32_hswish = xnn_init_f32_hswish_sse_params,
Marat Dukhan561d0682021-12-23 16:12:35 -08003654 .element_tile = 8,
3655 };
Marat Dukhan662faa02019-12-09 22:48:16 -08003656 }
Marat Dukhan5020b962020-06-08 13:30:10 -07003657 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
Marat Dukhan2894e992021-12-30 08:29:48 -08003658 xnn_params.f32.lrelu = (struct vunary_parameters) {
3659 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__avx512f_x16,
3660 .init.f32_lrelu = xnn_init_f32_lrelu_scalar_params,
3661 .element_tile = 16,
3662 };
Marat Dukhan48976702022-01-10 18:18:04 -08003663 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhan2894e992021-12-30 08:29:48 -08003664 xnn_params.f32.lrelu = (struct vunary_parameters) {
3665 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__avx_x16,
3666 .init.f32_lrelu = xnn_init_f32_lrelu_avx_params,
3667 .element_tile = 16,
3668 };
Marat Dukhan0d3f4672020-06-25 16:42:58 -07003669 } else if (cpuinfo_has_x86_sse4_1()) {
Marat Dukhan2894e992021-12-30 08:29:48 -08003670 xnn_params.f32.lrelu = (struct vunary_parameters) {
3671 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__sse41_x8,
3672 .init.f32_lrelu = xnn_init_f32_lrelu_sse_params,
3673 .element_tile = 8,
3674 };
Marat Dukhan28813332020-06-10 18:05:38 -07003675 } else {
Marat Dukhan2894e992021-12-30 08:29:48 -08003676 xnn_params.f32.lrelu = (struct vunary_parameters) {
3677 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__sse_x8,
3678 .init.f32_lrelu = xnn_init_f32_lrelu_sse_params,
3679 .element_tile = 8,
3680 };
Marat Dukhan28813332020-06-10 18:05:38 -07003681 }
3682 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
Marat Dukhane5efb162021-12-31 10:26:13 -08003683 xnn_params.f32.neg = (struct vunary_parameters) {
3684 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__avx512f_x16,
3685 .init.f32_neg = xnn_init_f32_neg_avx512_params,
3686 .element_tile = 16,
3687 };
Marat Dukhan48976702022-01-10 18:18:04 -08003688 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhane5efb162021-12-31 10:26:13 -08003689 xnn_params.f32.neg = (struct vunary_parameters) {
3690 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__avx_x16,
3691 .init.f32_neg = xnn_init_f32_neg_avx_params,
3692 .element_tile = 16,
3693 };
Marat Dukhan5020b962020-06-08 13:30:10 -07003694 } else {
Marat Dukhane5efb162021-12-31 10:26:13 -08003695 xnn_params.f32.neg = (struct vunary_parameters) {
3696 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__sse_x8,
3697 .init.f32_neg = xnn_init_f32_neg_sse_params,
3698 .element_tile = 8,
3699 };
Marat Dukhan5020b962020-06-08 13:30:10 -07003700 }
Marat Dukhan64e52512020-06-09 13:41:16 -07003701 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
Marat Dukhan0e801372022-01-04 00:10:41 -08003702 xnn_params.f32.rndne = (struct vunary_parameters) {
3703 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__avx512f_x16,
3704 .element_tile = 16,
3705 };
3706 xnn_params.f32.rndz = (struct vunary_parameters) {
3707 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__avx512f_x16,
3708 .element_tile = 16,
3709 };
3710 xnn_params.f32.rndu = (struct vunary_parameters) {
3711 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__avx512f_x16,
3712 .element_tile = 16,
3713 };
3714 xnn_params.f32.rndd = (struct vunary_parameters) {
3715 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__avx512f_x16,
3716 .element_tile = 16,
3717 };
Marat Dukhan48976702022-01-10 18:18:04 -08003718 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhan0e801372022-01-04 00:10:41 -08003719 xnn_params.f32.rndne = (struct vunary_parameters) {
3720 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__avx_x16,
3721 .init.f32_rnd = xnn_init_f32_rnd_avx_params,
3722 .element_tile = 16,
3723 };
3724 xnn_params.f32.rndz = (struct vunary_parameters) {
3725 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__avx_x16,
3726 .init.f32_rnd = xnn_init_f32_rnd_avx_params,
3727 .element_tile = 16,
3728 };
3729 xnn_params.f32.rndu = (struct vunary_parameters) {
3730 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__avx_x16,
3731 .init.f32_rnd = xnn_init_f32_rnd_avx_params,
3732 .element_tile = 16,
3733 };
3734 xnn_params.f32.rndd = (struct vunary_parameters) {
3735 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__avx_x16,
3736 .init.f32_rnd = xnn_init_f32_rnd_avx_params,
3737 .element_tile = 16,
3738 };
Marat Dukhan64e52512020-06-09 13:41:16 -07003739 } else if (cpuinfo_has_x86_sse4_1()) {
Marat Dukhan0e801372022-01-04 00:10:41 -08003740 xnn_params.f32.rndne = (struct vunary_parameters) {
3741 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__sse41_x8,
3742 .element_tile = 8,
3743 };
3744 xnn_params.f32.rndz = (struct vunary_parameters) {
3745 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__sse41_x8,
3746 .element_tile = 8,
3747 };
3748 xnn_params.f32.rndu = (struct vunary_parameters) {
3749 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__sse41_x8,
3750 .element_tile = 8,
3751 };
3752 xnn_params.f32.rndd = (struct vunary_parameters) {
3753 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__sse41_x8,
3754 .element_tile = 8,
3755 };
Marat Dukhan64e52512020-06-09 13:41:16 -07003756 } else {
Marat Dukhan0e801372022-01-04 00:10:41 -08003757 xnn_params.f32.rndne = (struct vunary_parameters) {
3758 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__sse2_x8,
3759 .init.f32_rnd = xnn_init_f32_rnd_sse2_params,
3760 .element_tile = 8,
3761 };
3762 xnn_params.f32.rndz = (struct vunary_parameters) {
3763 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__sse2_x8,
3764 .init.f32_rnd = xnn_init_f32_rnd_sse2_params,
3765 .element_tile = 8,
3766 };
3767 xnn_params.f32.rndu = (struct vunary_parameters) {
3768 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__sse2_x8,
3769 .init.f32_rnd = xnn_init_f32_rnd_sse2_params,
3770 .element_tile = 8,
3771 };
3772 xnn_params.f32.rndd = (struct vunary_parameters) {
3773 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__sse2_x8,
3774 .init.f32_rnd = xnn_init_f32_rnd_sse2_params,
3775 .element_tile = 8,
3776 };
Marat Dukhan64e52512020-06-09 13:41:16 -07003777 }
Marat Dukhand9ca7e62020-09-23 23:45:29 -07003778 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
Marat Dukhance834ad2022-01-03 00:22:01 -08003779 xnn_params.f32.sigmoid = (struct vunary_parameters) {
3780 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__avx512f_rr2_lut32_p2_perm2_scalef_div_x64,
3781 .init.f32_sigmoid = xnn_init_f32_sigmoid_avx512_rr2_lut32_p2_params,
3782 .element_tile = 64,
3783 };
Marat Dukhan48976702022-01-10 18:18:04 -08003784 } else if (cpuinfo_has_x86_avx2()) {
Marat Dukhance834ad2022-01-03 00:22:01 -08003785 xnn_params.f32.sigmoid = (struct vunary_parameters) {
3786 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_div_x40,
3787 .init.f32_sigmoid = xnn_init_f32_sigmoid_avx2_rr1_p5_params,
3788 .element_tile = 40,
3789 };
Marat Dukhan48976702022-01-10 18:18:04 -08003790 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhance834ad2022-01-03 00:22:01 -08003791 xnn_params.f32.sigmoid = (struct vunary_parameters) {
3792 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__avx_rr2_p5_nr2_x40,
3793 .init.f32_sigmoid = xnn_init_f32_sigmoid_avx_rr2_p5_params,
3794 .element_tile = 40,
3795 };
Marat Dukhan6dd71362020-09-17 23:11:21 -07003796 } else if (cpuinfo_has_x86_sse4_1()) {
Marat Dukhance834ad2022-01-03 00:22:01 -08003797 xnn_params.f32.sigmoid = (struct vunary_parameters) {
3798 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__sse41_rr2_lut64_p2_div_x8,
3799 .init.f32_sigmoid = xnn_init_f32_sigmoid_sse2_rr2_lut64_p2_params,
3800 .element_tile = 8,
3801 };
Marat Dukhanfa0a4322020-01-06 16:14:29 -08003802 } else {
Marat Dukhance834ad2022-01-03 00:22:01 -08003803 xnn_params.f32.sigmoid = (struct vunary_parameters) {
3804 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__sse2_rr2_lut64_p2_div_x8,
3805 .init.f32_sigmoid = xnn_init_f32_sigmoid_sse2_rr2_lut64_p2_params,
3806 .element_tile = 8,
3807 };
Marat Dukhanfa0a4322020-01-06 16:14:29 -08003808 }
Marat Dukhan90eca0a2020-03-11 00:52:23 -07003809 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
Marat Dukhane5efb162021-12-31 10:26:13 -08003810 xnn_params.f32.sqr = (struct vunary_parameters) {
3811 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__avx512f_x16,
3812 .element_tile = 16,
3813 };
Marat Dukhan48976702022-01-10 18:18:04 -08003814 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhane5efb162021-12-31 10:26:13 -08003815 xnn_params.f32.sqr = (struct vunary_parameters) {
3816 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__avx_x16,
3817 .init.f32_default = xnn_init_f32_default_avx_params,
3818 .element_tile = 16,
3819 };
Marat Dukhan5020b962020-06-08 13:30:10 -07003820 } else {
Marat Dukhane5efb162021-12-31 10:26:13 -08003821 xnn_params.f32.sqr = (struct vunary_parameters) {
3822 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__sse_x8,
3823 .element_tile = 8,
3824 };
Marat Dukhan5020b962020-06-08 13:30:10 -07003825 }
Marat Dukhan48976702022-01-10 18:18:04 -08003826 if (cpuinfo_has_x86_avx()) {
Marat Dukhane72b2822021-12-30 14:46:58 -08003827 xnn_params.f32.sqrt = (struct vunary_parameters) {
3828 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__avx_sqrt_x8,
3829 .init.f32_sqrt = xnn_init_f32_sqrt_avx_params,
3830 .element_tile = 8,
3831 };
Marat Dukhan6804bbd2020-06-30 19:26:11 -07003832 } else {
Marat Dukhane72b2822021-12-30 14:46:58 -08003833 xnn_params.f32.sqrt = (struct vunary_parameters) {
3834 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__sse_sqrt_x4,
3835 .element_tile = 4,
3836 };
Marat Dukhan6804bbd2020-06-30 19:26:11 -07003837 }
Marat Dukhan5020b962020-06-08 13:30:10 -07003838 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
Marat Dukhan90eca0a2020-03-11 00:52:23 -07003839 xnn_params.f32.prelu = (struct prelu_parameters) {
3840 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__avx512f_2x16,
3841 .row_tile = 2,
3842 .channel_tile = 16,
3843 };
Marat Dukhan48976702022-01-10 18:18:04 -08003844 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhan90eca0a2020-03-11 00:52:23 -07003845 xnn_params.f32.prelu = (struct prelu_parameters) {
3846 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__avx_2x16,
3847 .row_tile = 2,
3848 .channel_tile = 16,
3849 };
Marat Dukhan39b5e942020-06-24 15:03:48 -07003850 } else if (cpuinfo_has_x86_sse4_1()) {
3851 xnn_params.f32.prelu = (struct prelu_parameters) {
3852 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__sse41_2x8,
3853 .row_tile = 2,
3854 .channel_tile = 8,
3855 };
Marat Dukhan90eca0a2020-03-11 00:52:23 -07003856 } else {
3857 xnn_params.f32.prelu = (struct prelu_parameters) {
3858 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__sse2_2x8,
3859 .row_tile = 2,
3860 .channel_tile = 8,
3861 };
3862 }
Marat Dukhan4a5c7712022-01-05 22:43:13 -08003863 xnn_params.f32.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
3864 .ukernel = xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_x20_acc2,
3865 .init = xnn_init_f32_expminus_sse2_rr2_p5_params,
3866 .element_tile = 20,
3867 };
Marat Dukhan1edc4542020-01-27 12:40:13 -08003868 xnn_params.f32.rmax = xnn_f32_rmax_ukernel__sse;
Marat Dukhan9a88efe2019-12-10 15:54:24 -08003869 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
3870 xnn_params.f32.vadd = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07003871 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__avx512f_x32,
3872 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__avx512f_x32,
3873 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__avx512f_x32,
Marat Dukhanf6004972021-12-30 11:23:02 -08003874 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08003875 .element_tile = 32,
3876 };
3877 xnn_params.f32.vdiv = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07003878 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__avx512f_x32,
3879 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__avx512f_x32,
3880 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__avx512f_x32,
Marat Dukhanf6004972021-12-30 11:23:02 -08003881 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08003882 .element_tile = 32,
3883 };
3884 xnn_params.f32.vmax = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07003885 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__avx512f_x32,
3886 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__avx512f_x32,
3887 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__avx512f_x32,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08003888 .element_tile = 32,
3889 };
3890 xnn_params.f32.vmin = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07003891 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__avx512f_x32,
3892 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__avx512f_x32,
3893 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__avx512f_x32,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08003894 .element_tile = 32,
3895 };
3896 xnn_params.f32.vmul = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07003897 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__avx512f_x32,
3898 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__avx512f_x32,
3899 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__avx512f_x32,
Marat Dukhanf6004972021-12-30 11:23:02 -08003900 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08003901 .element_tile = 32,
3902 };
3903 xnn_params.f32.vsub = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07003904 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__avx512f_x32,
3905 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__avx512f_x32,
3906 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__avx512f_x32,
Marat Dukhanf6004972021-12-30 11:23:02 -08003907 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08003908 .element_tile = 32,
3909 };
Marat Dukhanf7399262020-06-05 10:58:44 -07003910 xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07003911 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__avx512f_x32,
3912 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__avx512f_x32,
3913 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__avx512f_x32,
Marat Dukhanf7399262020-06-05 10:58:44 -07003914 .element_tile = 32,
3915 };
Marat Dukhan48976702022-01-10 18:18:04 -08003916 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhan9a88efe2019-12-10 15:54:24 -08003917 xnn_params.f32.vadd = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07003918 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__avx_x16,
3919 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__avx_x16,
3920 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__avx_x16,
Marat Dukhanf6004972021-12-30 11:23:02 -08003921 .init.f32_minmax = xnn_init_f32_minmax_avx_params,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08003922 .element_tile = 16,
3923 };
3924 xnn_params.f32.vdiv = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07003925 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__avx_x16,
3926 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__avx_x16,
3927 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__avx_x16,
Marat Dukhanf6004972021-12-30 11:23:02 -08003928 .init.f32_minmax = xnn_init_f32_minmax_avx_params,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08003929 .element_tile = 16,
3930 };
3931 xnn_params.f32.vmax = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07003932 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__avx_x16,
3933 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__avx_x16,
3934 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__avx_x16,
Marat Dukhan98c52152021-12-30 13:31:00 -08003935 .init.f32_default = xnn_init_f32_default_avx_params,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08003936 .element_tile = 16,
3937 };
3938 xnn_params.f32.vmin = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07003939 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__avx_x16,
3940 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__avx_x16,
3941 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__avx_x16,
Marat Dukhan98c52152021-12-30 13:31:00 -08003942 .init.f32_default = xnn_init_f32_default_avx_params,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08003943 .element_tile = 16,
3944 };
3945 xnn_params.f32.vmul = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07003946 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__avx_x16,
3947 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__avx_x16,
3948 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__avx_x16,
Marat Dukhanf6004972021-12-30 11:23:02 -08003949 .init.f32_minmax = xnn_init_f32_minmax_avx_params,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08003950 .element_tile = 16,
3951 };
3952 xnn_params.f32.vsub = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07003953 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__avx_x16,
3954 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__avx_x16,
3955 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__avx_x16,
Marat Dukhanf6004972021-12-30 11:23:02 -08003956 .init.f32_minmax = xnn_init_f32_minmax_avx_params,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08003957 .element_tile = 16,
3958 };
Marat Dukhanf7399262020-06-05 10:58:44 -07003959 xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07003960 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__avx_x16,
3961 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__avx_x16,
3962 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__avx_x16,
Marat Dukhan98c52152021-12-30 13:31:00 -08003963 .init.f32_default = xnn_init_f32_default_avx_params,
Marat Dukhanf7399262020-06-05 10:58:44 -07003964 .element_tile = 16,
3965 };
Marat Dukhan9a88efe2019-12-10 15:54:24 -08003966 } else {
3967 xnn_params.f32.vadd = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07003968 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__sse_x8,
3969 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__sse_x8,
3970 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__sse_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -08003971 .init.f32_minmax = xnn_init_f32_minmax_sse_params,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08003972 .element_tile = 8,
3973 };
3974 xnn_params.f32.vdiv = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07003975 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__sse_x8,
3976 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__sse_x8,
3977 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__sse_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -08003978 .init.f32_minmax = xnn_init_f32_minmax_sse_params,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08003979 .element_tile = 8,
3980 };
3981 xnn_params.f32.vmax = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07003982 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__sse_x8,
3983 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__sse_x8,
3984 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__sse_x8,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08003985 .element_tile = 8,
3986 };
3987 xnn_params.f32.vmin = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07003988 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__sse_x8,
3989 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__sse_x8,
3990 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__sse_x8,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08003991 .element_tile = 8,
3992 };
3993 xnn_params.f32.vmul = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07003994 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__sse_x8,
3995 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__sse_x8,
3996 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__sse_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -08003997 .init.f32_minmax = xnn_init_f32_minmax_sse_params,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08003998 .element_tile = 8,
3999 };
4000 xnn_params.f32.vsub = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07004001 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__sse_x8,
4002 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__sse_x8,
4003 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__sse_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -08004004 .init.f32_minmax = xnn_init_f32_minmax_sse_params,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08004005 .element_tile = 8,
4006 };
Marat Dukhanf7399262020-06-05 10:58:44 -07004007 xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07004008 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__sse_x8,
4009 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__sse_x8,
4010 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__sse_x8,
Marat Dukhanf7399262020-06-05 10:58:44 -07004011 .element_tile = 8,
4012 };
Marat Dukhan9a88efe2019-12-10 15:54:24 -08004013 }
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004014 xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
Marat Dukhan9531e9f2020-07-24 15:25:02 -07004015 .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c4__sse_2x,
Marat Dukhanc58453f2021-05-24 14:47:38 -07004016 .init.f32 = xnn_init_f32_minmax_sse_params,
Marat Dukhan49e6ee92019-11-06 15:55:29 -08004017 .channel_tile = 4,
4018 .row_tile = 2,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004019 };
Marat Dukhanefc47b82019-11-18 09:25:38 -08004020 #ifndef XNN_NO_NCHW_OPERATORS
Marat Dukhancfbed0a2020-12-08 10:01:51 -08004021 // Sparse microkernels on x86 currently target only SSE, and on processors
4022 // with AVX ISA dense inference is expected to be faster than sparse.
4023 if (!cpuinfo_has_x86_avx()) {
4024 init_flags |= XNN_INIT_FLAG_CHW_OPT;
4025 }
4026
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004027 xnn_params.f32.spmm = (struct spmm_parameters) {
Frank Barchard4fd38b22020-10-30 17:10:11 -07004028 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x1__sse,
4029 .mr = 32,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004030 .nr = 1,
4031 };
Erich Elsen5b2e07a2020-06-09 03:27:59 -07004032 xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
4033 .ukernel_with_symm_padding =
4034 (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2,
4035 .output_channel_tile = 4,
4036 .output_height_tile = 2,
4037 .output_width_tile = 2,
4038 };
Marat Dukhan48976702022-01-10 18:18:04 -08004039 if (cpuinfo_has_x86_ssse3()) {
Frank Barchard0b18cb32020-11-23 10:50:44 -08004040 xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
4041 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_2x4_acc2,
Frank Barchard0b18cb32020-11-23 10:50:44 -08004042 .output_width_tile = 4,
4043 .output_height_tile = 2,
4044 };
4045 } else {
4046 xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
4047 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_2x4_acc2,
Frank Barchard0b18cb32020-11-23 10:50:44 -08004048 .output_width_tile = 4,
4049 .output_height_tile = 2,
4050 };
4051 }
Marat Dukhanbf715f92020-10-23 20:17:00 -07004052 xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
4053 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__sse_1x4_acc3,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004054 .output_width_tile = 4,
4055 .output_height_tile = 1,
4056 };
Marat Dukhand0503892020-10-30 08:22:04 -07004057 xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
4058 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4,
Marat Dukhand0503892020-10-30 08:22:04 -07004059 .output_width_tile = 4,
4060 .output_height_tile = 4,
4061 };
Marat Dukhanccca2142020-10-30 17:32:45 -07004062 xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
4063 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4,
Marat Dukhanccca2142020-10-30 17:32:45 -07004064 .output_width_tile = 4,
4065 .output_height_tile = 2,
4066 };
Marat Dukhan1f29b802020-05-15 23:46:39 -07004067 xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
4068 .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__sse_x4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004069 .channel_tile = 4,
4070 };
Artsiom Ablavatski97918102020-10-27 15:52:59 -07004071 xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
Artsiom Ablavatskib3ffd582021-03-31 13:00:08 -07004072 .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__sse_p8,
Artsiom Ablavatski97918102020-10-27 15:52:59 -07004073 .channel_tile = 1,
Artsiom Ablavatskib3ffd582021-03-31 13:00:08 -07004074 .pixel_tile = 8,
Artsiom Ablavatski97918102020-10-27 15:52:59 -07004075 };
Marat Dukhanefc47b82019-11-18 09:25:38 -08004076 #endif // XNN_NO_NCHW_OPERATORS
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004077 #endif // XNN_NO_F32_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07004078
Frank Barchardb40ee632021-12-30 11:10:02 -08004079 /*************************** VCVT x86 micro-kernels ***************************/
Marat Dukhanaf2ba002021-10-24 14:21:41 -07004080 #ifndef XNN_NO_VCVT_OPERATORS
4081 init_flags |= XNN_INIT_FLAG_VCVT;
4082
4083 if (cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
Marat Dukhan134f9842021-12-29 19:57:31 -08004084 xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
4085 .ukernel = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__avx512skx_x16,
4086 .element_tile = 16,
4087 };
Marat Dukhanb7c1b712021-12-30 07:23:57 -08004088 xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
4089 .ukernel = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__avx512skx_x16,
4090 .element_tile = 16,
4091 };
Marat Dukhanaf2ba002021-10-24 14:21:41 -07004092 } else if (cpuinfo_has_x86_f16c()) {
Marat Dukhan134f9842021-12-29 19:57:31 -08004093 xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
4094 .ukernel = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__f16c_x16,
4095 .element_tile = 16,
4096 };
Marat Dukhanb7c1b712021-12-30 07:23:57 -08004097 xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
4098 .ukernel = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__f16c_x16,
4099 .init.f32_f16_cvt = xnn_init_f32_f16_cvt_f16c_params,
4100 .element_tile = 16,
4101 };
Marat Dukhanaf2ba002021-10-24 14:21:41 -07004102 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhan134f9842021-12-29 19:57:31 -08004103 xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
4104 .ukernel = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__avx_int16_x16,
4105 .init.f16_f32_cvt = xnn_init_f16_f32_cvt_sse_int16_params,
4106 .element_tile = 16,
4107 };
Marat Dukhanb7c1b712021-12-30 07:23:57 -08004108 xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
4109 .ukernel = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__avx_x24,
4110 .init.f32_f16_cvt = xnn_init_f32_f16_cvt_sse2_params,
4111 .element_tile = 24,
4112 };
Marat Dukhanaf2ba002021-10-24 14:21:41 -07004113 } else if (cpuinfo_has_x86_sse4_1()) {
Marat Dukhan134f9842021-12-29 19:57:31 -08004114 xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
4115 .ukernel = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__sse41_int16_x16,
4116 .init.f16_f32_cvt = xnn_init_f16_f32_cvt_sse_int16_params,
4117 .element_tile = 16,
4118 };
Marat Dukhanb7c1b712021-12-30 07:23:57 -08004119 xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
4120 .ukernel = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__sse41_x8,
4121 .init.f32_f16_cvt = xnn_init_f32_f16_cvt_sse2_params,
4122 .element_tile = 8,
4123 };
Marat Dukhanaf2ba002021-10-24 14:21:41 -07004124 } else {
Marat Dukhan134f9842021-12-29 19:57:31 -08004125 xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
4126 .ukernel = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__sse2_int16_x32,
4127 .init.f16_f32_cvt = xnn_init_f16_f32_cvt_sse_int16_params,
4128 .element_tile = 32,
4129 };
Marat Dukhanb7c1b712021-12-30 07:23:57 -08004130 xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
4131 .ukernel = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__sse2_x16,
4132 .init.f32_f16_cvt = xnn_init_f32_f16_cvt_sse2_params,
4133 .element_tile = 16,
4134 };
Marat Dukhanaf2ba002021-10-24 14:21:41 -07004135 }
Marat Dukhan2edf8632021-12-14 23:17:14 -08004136 if (cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
4137 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
4138 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__avx512skx_x128,
4139 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_avx512_params,
4140 .element_tile = 128,
4141 };
4142 } else if (cpuinfo_has_x86_avx2()) {
Marat Dukhan0d399ca2021-12-14 19:25:50 -08004143 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
4144 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__avx2_x64,
4145 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_avx2_params,
4146 .element_tile = 64,
4147 };
4148 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhanb91432c2021-12-14 16:52:09 -08004149 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
4150 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__avx_x32,
4151 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_avx_params,
4152 .element_tile = 32,
4153 };
4154 } else if (cpuinfo_has_x86_sse4_1()) {
Marat Dukhaned2d7762021-12-03 23:51:19 -08004155 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
4156 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__sse41_x32,
4157 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_sse4_params,
4158 .element_tile = 32,
4159 };
4160 } else {
4161 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
4162 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__sse2_x32,
4163 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_sse2_params,
4164 .element_tile = 32,
4165 };
4166 }
Marat Dukhan2edf8632021-12-14 23:17:14 -08004167 if (cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
4168 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
4169 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__avx512skx_x128,
4170 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_avx512_params,
4171 .element_tile = 128,
4172 };
4173 } else if (cpuinfo_has_x86_avx2()) {
Marat Dukhan0d399ca2021-12-14 19:25:50 -08004174 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
4175 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__avx2_x64,
4176 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_avx2_params,
4177 .element_tile = 64,
4178 };
4179 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhanb91432c2021-12-14 16:52:09 -08004180 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
4181 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__avx_x32,
4182 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_avx_params,
4183 .element_tile = 32,
4184 };
4185 } else {
4186 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
4187 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__sse2_x32,
4188 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_sse2_params,
4189 .element_tile = 32,
4190 };
4191 }
Marat Dukhan98393ad2021-12-15 11:07:40 -08004192 if (cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
4193 xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
4194 .ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__avx512skx_x32,
4195 .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_avx512_params,
4196 .element_tile = 32,
4197 };
4198 xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
4199 .ukernel = (xnn_univector_ukernel_function) xnn_qu8_f32_vcvt_ukernel__avx512skx_x32,
4200 .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_avx512_params,
4201 .element_tile = 32,
4202 };
4203 } else if (cpuinfo_has_x86_avx2()) {
Marat Dukhan7b5f7792021-12-15 00:29:39 -08004204 xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
4205 .ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__avx2_x16,
4206 .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_avx_params,
4207 .element_tile = 16,
4208 };
4209 xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
4210 .ukernel = (xnn_univector_ukernel_function) xnn_qu8_f32_vcvt_ukernel__avx2_x16,
4211 .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_avx_params,
4212 .element_tile = 16,
4213 };
4214 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhancd4089f2021-12-14 23:53:33 -08004215 xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
4216 .ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__avx_x32,
4217 .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_avx_params,
4218 .element_tile = 32,
4219 };
4220 xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
4221 .ukernel = (xnn_univector_ukernel_function) xnn_qu8_f32_vcvt_ukernel__avx_x32,
4222 .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_avx_params,
4223 .element_tile = 32,
4224 };
4225 } else if (cpuinfo_has_x86_sse4_1()) {
Marat Dukhanf92206b2021-12-10 17:02:07 -08004226 xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
4227 .ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__sse41_x16,
4228 .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_sse4_params,
4229 .element_tile = 16,
4230 };
4231 xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
4232 .ukernel = (xnn_univector_ukernel_function) xnn_qu8_f32_vcvt_ukernel__sse41_x16,
4233 .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_sse4_params,
4234 .element_tile = 16,
4235 };
4236 } else {
4237 xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
4238 .ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__sse2_x32,
4239 .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_sse2_params,
4240 .element_tile = 32,
4241 };
4242 xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
4243 .ukernel = (xnn_univector_ukernel_function) xnn_qu8_f32_vcvt_ukernel__sse2_x32,
4244 .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_sse2_params,
4245 .element_tile = 32,
4246 };
4247 }
Marat Dukhanaf2ba002021-10-24 14:21:41 -07004248 #endif // XNN_NO_VCVT_OPERATORS
4249
Frank Barchardb40ee632021-12-30 11:10:02 -08004250 /**************************** X32 x86 micro-kernels ****************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004251 #ifndef XNN_NO_X32_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07004252 init_flags |= XNN_INIT_FLAG_X32;
4253
Marat Dukhan57dccd82020-04-14 00:53:10 -07004254 xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__sse2;
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004255 xnn_params.x32.zip = (struct zip_parameters) {
4256 .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__sse2,
4257 .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__sse2,
4258 .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__sse2,
4259 .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__sse2,
4260 };
Artsiom Ablavatskibbe85062020-11-05 14:07:37 -08004261 #ifndef XNN_NO_NCHW_OPERATORS
Marat Dukhanad71b9a2020-11-20 00:01:51 -08004262 xnn_params.x32.depthtospace2d_chw2hwc = (struct depthtospace2d_chw2hwc_parameters) {
4263 .ukernel = (xnn_depthtospace2d_chw2hwc_ukernel_function) xnn_x32_depthtospace2d_chw2hwc_ukernel__scalar,
Artsiom Ablavatskibbe85062020-11-05 14:07:37 -08004264 .channel_tile = 1,
4265 .pixel_tile = 1,
4266 };
4267 #endif // XNN_NO_NCHW_OPERATORS
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004268 #endif // XNN_NO_X32_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07004269
Frank Barchardb40ee632021-12-30 11:10:02 -08004270 /**************************** XX x86 micro-kernels ****************************/
Marat Dukhan048931b2020-11-24 20:53:54 -08004271 #ifndef XNN_NO_XX_OPERATORS
4272 init_flags |= XNN_INIT_FLAG_XX;
4273
4274 xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
Marat Dukhan933051b2021-08-07 16:26:15 -07004275 xnn_params.xx.fill = (struct fill_parameters) {
4276 .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__sse2_x64,
4277 .row_tile = 1,
4278 };
Marat Dukhan0461f2d2021-08-08 12:36:29 -07004279 xnn_params.xx.pad = (struct pad_parameters) {
4280 .ukernel = (xnn_pad_ukernel_function) xnn_xx_pad_ukernel__sse2,
4281 .row_tile = 1,
4282 };
Marat Dukhan048931b2020-11-24 20:53:54 -08004283 #endif
4284
Marat Dukhan4c617792021-12-21 15:47:58 -08004285#elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
Marat Dukhan933051b2021-08-07 16:26:15 -07004286
Frank Barchardb40ee632021-12-30 11:10:02 -08004287 /**************************** QC8 WAsm SIMD micro-kernels****************************/
Marat Dukhan898d5852021-06-30 21:18:34 -07004288 #ifndef XNN_NO_QS8_OPERATORS
4289 init_flags |= XNN_INIT_FLAG_QC8;
4290
Marat Dukhan189c1d02021-09-03 15:39:54 -07004291 #if defined(XNN_WASMSIMD_VERSION) && (XNN_WASMSIMD_VERSION >= 88)
Marat Dukhan0f1ed942021-12-08 23:25:50 -08004292 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64);
4293 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64);
4294 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64);
4295 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64);
Marat Dukhan189c1d02021-09-03 15:39:54 -07004296 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_wasmsimd_params;
Marat Dukhan0f1ed942021-12-08 23:25:50 -08004297 xnn_params.qc8.gemm.mr = 4;
Marat Dukhan189c1d02021-09-03 15:39:54 -07004298 xnn_params.qc8.gemm.nr = 4;
4299 xnn_params.qc8.gemm.log2_kr = 1;
Marat Dukhan0f1ed942021-12-08 23:25:50 -08004300 xnn_params.qc8.gemm.log2_sr = 2;
Marat Dukhan189c1d02021-09-03 15:39:54 -07004301 #else
4302 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64);
4303 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64);
4304 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64);
4305 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64);
4306 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_wasmsimd_params;
4307 xnn_params.qc8.gemm.mr = 3;
4308 xnn_params.qc8.gemm.nr = 4;
4309 xnn_params.qc8.gemm.log2_kr = 3;
4310 #endif
Marat Dukhan898d5852021-06-30 21:18:34 -07004311
Marat Dukhan9cedb592021-08-17 17:25:24 -07004312 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__wasmsimd_mul16_add16;
Marat Dukhan898d5852021-06-30 21:18:34 -07004313 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_wasmsimd_params;
Marat Dukhan9cedb592021-08-17 17:25:24 -07004314 xnn_params.qc8.dwconv[0].channel_tile = 16;
Marat Dukhan898d5852021-06-30 21:18:34 -07004315 xnn_params.qc8.dwconv[0].primary_tile = 9;
Marat Dukhan9cedb592021-08-17 17:25:24 -07004316 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x25__wasmsimd_mul16_add16;
Marat Dukhan898d5852021-06-30 21:18:34 -07004317 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_wasmsimd_params;
Marat Dukhan9cedb592021-08-17 17:25:24 -07004318 xnn_params.qc8.dwconv[1].channel_tile = 16;
Marat Dukhan898d5852021-06-30 21:18:34 -07004319 xnn_params.qc8.dwconv[1].primary_tile = 25;
4320 #endif // XNN_NO_QC8_OPERATORS
4321
Frank Barchardb40ee632021-12-30 11:10:02 -08004322 /**************************** QS8 WAsm SIMD micro-kernels****************************/
Marat Dukhan07e50402020-08-05 17:16:53 -07004323 #ifndef XNN_NO_QS8_OPERATORS
4324 init_flags |= XNN_INIT_FLAG_QS8;
4325
Marat Dukhan189c1d02021-09-03 15:39:54 -07004326 #if defined(XNN_WASMSIMD_VERSION) && (XNN_WASMSIMD_VERSION >= 88)
Marat Dukhan0f1ed942021-12-08 23:25:50 -08004327 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64);
4328 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64);
4329 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64);
4330 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64);
Marat Dukhan189c1d02021-09-03 15:39:54 -07004331 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_wasmsimd_params;
Marat Dukhan0f1ed942021-12-08 23:25:50 -08004332 xnn_params.qs8.gemm.mr = 4;
Marat Dukhan189c1d02021-09-03 15:39:54 -07004333 xnn_params.qs8.gemm.nr = 4;
4334 xnn_params.qs8.gemm.log2_kr = 1;
Marat Dukhan0f1ed942021-12-08 23:25:50 -08004335 xnn_params.qs8.gemm.log2_sr = 2;
Marat Dukhan189c1d02021-09-03 15:39:54 -07004336 #else // XNN_WASMSIMD_VERSION >= 88
4337 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64);
4338 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64);
4339 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64);
4340 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64);
4341 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_wasmsimd_params;
4342 xnn_params.qs8.gemm.mr = 3;
4343 xnn_params.qs8.gemm.nr = 4;
4344 xnn_params.qs8.gemm.log2_kr = 3;
4345 #endif // XNN_WASMSIMD_VERSION >= 88
Marat Dukhan07e50402020-08-05 17:16:53 -07004346
Marat Dukhan9cedb592021-08-17 17:25:24 -07004347 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__wasmsimd_mul16_add16;
Marat Dukhan400e7cb2021-08-07 15:14:54 -07004348 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_wasmsimd_params;
Marat Dukhan9cedb592021-08-17 17:25:24 -07004349 xnn_params.qs8.dwconv[0].channel_tile = 16;
Marat Dukhan07e50402020-08-05 17:16:53 -07004350 xnn_params.qs8.dwconv[0].primary_tile = 9;
Marat Dukhan9cedb592021-08-17 17:25:24 -07004351 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x25__wasmsimd_mul16_add16;
Marat Dukhan400e7cb2021-08-07 15:14:54 -07004352 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_wasmsimd_params;
Marat Dukhan9cedb592021-08-17 17:25:24 -07004353 xnn_params.qs8.dwconv[1].channel_tile = 16;
Marat Dukhan4ed14882021-05-12 17:50:40 -07004354 xnn_params.qs8.dwconv[1].primary_tile = 25;
Marat Dukhan9e0b5392020-08-07 02:29:34 -07004355
4356 xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
Marat Dukhan9e258d62022-01-12 10:50:51 -08004357 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c16,
4358 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16,
Marat Dukhan53f41062022-01-11 19:44:57 -08004359 .init.qs8 = xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params,
4360 .update.qs8 = xnn_update_qs8_avgpool_minmax_fp32_wasmsimd_params,
Marat Dukhanda382d12022-01-07 19:51:20 -08004361 .row_tile = 7,
Marat Dukhan9e258d62022-01-12 10:50:51 -08004362 .channel_tile = 16,
Marat Dukhan9e0b5392020-08-07 02:29:34 -07004363 };
Marat Dukhanff209482020-09-03 14:26:53 -07004364
4365 xnn_params.qs8.vadd = (struct vbinary_parameters) {
Marat Dukhane20a8732021-12-07 17:11:37 -08004366 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__wasmsimd_x32,
4367 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__wasmsimd_x32,
4368 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__wasmsimd_x32,
Marat Dukhan64287252021-09-07 16:20:03 -07004369 .init.qs8_addsub = xnn_init_qs8_add_minmax_wasmsimd_params,
Marat Dukhane20a8732021-12-07 17:11:37 -08004370 .element_tile = 32,
Marat Dukhanff209482020-09-03 14:26:53 -07004371 };
Marat Dukhan0853b8a2021-08-03 01:01:53 -07004372 xnn_params.qs8.vmul = (struct vbinary_parameters) {
4373 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_fp32_ukernel__wasmsimd_mul32_ld64_x8,
4374 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__wasmsimd_mul32_ld64_x8,
4375 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__wasmsimd_mul32_ld64_x8,
4376 .init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_wasmsimd_params,
4377 .element_tile = 8,
4378 };
Marat Dukhan07e50402020-08-05 17:16:53 -07004379 #endif // XNN_NO_QS8_OPERATORS
4380
Frank Barchardb40ee632021-12-30 11:10:02 -08004381 /**************************** QU8 WAsm SIMD micro-kernels****************************/
// This section is compiled unless XNN_NO_QU8_OPERATORS is defined. It sets the
// XNN_INIT_FLAG_QU8 bit in init_flags and fills xnn_params.qu8 with the GEMM,
// depthwise convolution, average pooling, global average pooling, vadd and
// vmul entries.
Marat Dukhan08b7a972020-07-14 18:17:29 -07004382 #ifndef XNN_NO_QU8_OPERATORS
4383 init_flags |= XNN_INIT_FLAG_QU8;
Marat Dukhan854fb6b2020-06-19 12:33:44 -07004384
// GEMM/IGEMM: when XNN_WASMSIMD_VERSION is defined and >= 88 the dot16x2
// kernels (3x4c2, log2_kr = 1) are selected; otherwise the mul32 kernels
// (3x4c8, log2_kr = 3). Both variants use mr = 3, nr = 4 and the same
// params initializer.
Marat Dukhan189c1d02021-09-03 15:39:54 -07004385 #if defined(XNN_WASMSIMD_VERSION) && (XNN_WASMSIMD_VERSION >= 88)
4386 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128);
4387 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128);
4388 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128);
4389 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128);
4390 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_wasmsimd_params;
4391 xnn_params.qu8.gemm.mr = 3;
4392 xnn_params.qu8.gemm.nr = 4;
4393 xnn_params.qu8.gemm.log2_kr = 1;
4394 #else // XNN_WASMSIMD_VERSION >= 88
4395 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64);
4396 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64);
4397 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64);
4398 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64);
4399 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_wasmsimd_params;
4400 xnn_params.qu8.gemm.mr = 3;
4401 xnn_params.qu8.gemm.nr = 4;
4402 xnn_params.qu8.gemm.log2_kr = 3;
4403 #endif // XNN_WASMSIMD_VERSION >= 88
Marat Dukhanaefaef32020-04-09 07:09:34 -07004404
// Depthwise convolution: slot 0 has primary_tile 9 and slot 1 has primary_tile
// 25; both use channel_tile 8 and the same params initializer as the GEMM
// entries above.
Marat Dukhana97e9752021-07-15 16:30:41 -07004405 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__wasmsimd_mul16;
4406 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_wasmsimd_params;
4407 xnn_params.qu8.dwconv[0].channel_tile = 8;
Marat Dukhan08b7a972020-07-14 18:17:29 -07004408 xnn_params.qu8.dwconv[0].primary_tile = 9;
Marat Dukhana97e9752021-07-15 16:30:41 -07004409 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__wasmsimd_mul16;
4410 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_wasmsimd_params;
4411 xnn_params.qu8.dwconv[1].channel_tile = 8;
4412 xnn_params.qu8.dwconv[1].primary_tile = 25;
Marat Dukhanaefaef32020-04-09 07:09:34 -07004413
// Average pooling is the only entry in this section that uses scalar kernels
// (…__scalar_c1) and a scalar params initializer; channel_tile is 1 to match.
Marat Dukhan08b7a972020-07-14 18:17:29 -07004414 xnn_params.qu8.avgpool = (struct avgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08004415 .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__scalar_c1,
4416 .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__scalar_c1,
4417 .init.qu8 = xnn_init_qu8_avgpool_minmax_scalar_params,
4418 .primary_tile = 9,
4419 .incremental_tile = 8,
4420 .channel_tile = 1,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004421 };
// Global average pooling: 7-row unipass / 7p7x multipass kernels over 16
// channels, with both an init and an update params function registered.
Marat Dukhan08b7a972020-07-14 18:17:29 -07004422 xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
Marat Dukhand1f53e42022-01-12 22:34:51 -08004423 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c16,
4424 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16,
4425 .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params,
4426 .update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_wasmsimd_params,
Marat Dukhanda382d12022-01-07 19:51:20 -08004427 .row_tile = 7,
Marat Dukhand1f53e42022-01-12 22:34:51 -08004428 .channel_tile = 16,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004429 };
Marat Dukhandb007cd2021-07-20 23:42:39 -07004430
// Element-wise add: opc_ukernel and ropc_ukernel are deliberately the same
// vaddc function (presumably because the operation is commutative — confirm).
4431 xnn_params.qu8.vadd = (struct vbinary_parameters) {
Marat Dukhane20a8732021-12-07 17:11:37 -08004432 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__wasmsimd_x32,
4433 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__wasmsimd_x32,
4434 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__wasmsimd_x32,
Marat Dukhan64287252021-09-07 16:20:03 -07004435 .init.qu8_addsub = xnn_init_qu8_add_minmax_wasmsimd_params,
Marat Dukhane20a8732021-12-07 17:11:37 -08004436 .element_tile = 32,
Marat Dukhandb007cd2021-07-20 23:42:39 -07004437 };
// Element-wise multiply: same opc/ropc sharing as vadd, with the vmulc kernel.
Marat Dukhan0853b8a2021-08-03 01:01:53 -07004438 xnn_params.qu8.vmul = (struct vbinary_parameters) {
4439 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_fp32_ukernel__wasmsimd_mul32_ld64_x8,
4440 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__wasmsimd_mul32_ld64_x8,
4441 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__wasmsimd_mul32_ld64_x8,
4442 .init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_wasmsimd_params,
4443 .element_tile = 8,
4444 };
Marat Dukhan08b7a972020-07-14 18:17:29 -07004445 #endif // XNN_NO_QU8_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07004446
Frank Barchardb40ee632021-12-30 11:10:02 -08004447 /**************************** S8 WAsm SIMD micro-kernels****************************/
// This section is compiled unless XNN_NO_S8_OPERATORS is defined. It sets the
// XNN_INIT_FLAG_S8 bit in init_flags and fills the clamp, ibilinear and
// maxpool entries of xnn_params.s8.
Marat Dukhandc5c1482021-08-16 09:03:15 -07004448 #ifndef XNN_NO_S8_OPERATORS
4449 init_flags |= XNN_INIT_FLAG_S8;
4450
Marat Dukhan61c0c9e2021-08-16 23:16:14 -07004451 xnn_params.s8.clamp = (struct vunary_parameters) {
4452 .ukernel = (xnn_univector_ukernel_function) xnn_s8_vclamp_ukernel__wasmsimd_x64,
4453 .init.s8_minmax = xnn_init_s8_minmax_wasmsimd_params,
4454 .element_tile = 64,
4455 };
// Bilinear interpolation: the dot16x2 kernel is used when XNN_WASMSIMD_VERSION
// is defined and >= 88, otherwise the mul32 kernel. The tile sizes
// (pixel_tile = 1, channel_tile = 8) are the same in both branches.
Marat Dukhan24abe6b2021-11-24 15:28:57 -08004456 #if defined(XNN_WASMSIMD_VERSION) && (XNN_WASMSIMD_VERSION >= 88)
4457 xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
4458 .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__wasmsimd_dot16x2_c8,
4459 .pixel_tile = 1,
4460 .channel_tile = 8,
4461 };
4462 #else // XNN_WASMSIMD_VERSION >= 88
4463 xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
4464 .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__wasmsimd_mul32_c8,
4465 .pixel_tile = 1,
4466 .channel_tile = 8,
4467 };
4468 #endif // XNN_WASMSIMD_VERSION >= 88
// Max pooling registers the same params initializer as clamp above.
Marat Dukhandc5c1482021-08-16 09:03:15 -07004469 xnn_params.s8.maxpool = (struct maxpool_parameters) {
4470 .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__wasmsimd_c16,
4471 .init.s8 = xnn_init_s8_minmax_wasmsimd_params,
4472 .mr = 9,
4473 .qr = 8,
4474 };
4475 #endif // XNN_NO_S8_OPERATORS
4476
Frank Barchardb40ee632021-12-30 11:10:02 -08004477 /**************************** U8 WAsm SIMD micro-kernels****************************/
// This section is compiled unless XNN_NO_U8_OPERATORS is defined. It sets the
// XNN_INIT_FLAG_U8 bit in init_flags and fills the clamp, ibilinear, maxpool,
// lut32norm and rmax entries of xnn_params.u8.
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004478 #ifndef XNN_NO_U8_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07004479 init_flags |= XNN_INIT_FLAG_U8;
4480
Marat Dukhan94912792021-08-16 21:40:30 -07004481 xnn_params.u8.clamp = (struct vunary_parameters) {
Marat Dukhan61c0c9e2021-08-16 23:16:14 -07004482 .ukernel = (xnn_univector_ukernel_function) xnn_u8_vclamp_ukernel__wasmsimd_x64,
4483 .init.u8_minmax = xnn_init_u8_minmax_wasmsimd_params,
4484 .element_tile = 64,
Marat Dukhan94912792021-08-16 21:40:30 -07004485 };
// Bilinear interpolation: the dot16x2 kernel is used when XNN_WASMSIMD_VERSION
// is defined and >= 88, otherwise the mul32 kernel. The tile sizes
// (pixel_tile = 1, channel_tile = 8) are the same in both branches.
Marat Dukhan24abe6b2021-11-24 15:28:57 -08004486 #if defined(XNN_WASMSIMD_VERSION) && (XNN_WASMSIMD_VERSION >= 88)
4487 xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
4488 .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__wasmsimd_dot16x2_c8,
4489 .pixel_tile = 1,
4490 .channel_tile = 8,
4491 };
4492 #else // XNN_WASMSIMD_VERSION >= 88
4493 xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
4494 .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__wasmsimd_mul32_c8,
4495 .pixel_tile = 1,
4496 .channel_tile = 8,
4497 };
4498 #endif // XNN_WASMSIMD_VERSION >= 88
// Max pooling registers the same params initializer as clamp above.
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004499 xnn_params.u8.maxpool = (struct maxpool_parameters) {
Marat Dukhanf1589422021-08-15 20:37:06 -07004500 .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__wasmsimd_c16,
4501 .init.u8 = xnn_init_u8_minmax_wasmsimd_params,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004502 .mr = 9,
4503 .qr = 8,
4504 };
// lut32norm and rmax are assigned scalar kernels, not wasmsimd ones.
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004505 xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
4506 xnn_params.u8.rmax = xnn_u8_rmax_ukernel__scalar;
4507 #endif // XNN_NO_U8_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07004508
Frank Barchardb40ee632021-12-30 11:10:02 -08004509 /**************************** X8 WAsm SIMD micro-kernels****************************/
// This section is compiled unless XNN_NO_X8_OPERATORS is defined. It sets the
// XNN_INIT_FLAG_X8 bit in init_flags and fills the lut and zip entries of
// xnn_params.x8. Despite the banner above, every kernel assigned here is a
// scalar implementation.
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004510 #ifndef XNN_NO_X8_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07004511 init_flags |= XNN_INIT_FLAG_X8;
4512
Marat Dukhand67539d2021-09-08 23:06:03 -07004513 xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar_x4;
// The x2/x3/x4 zip entries are cast to xnn_zipc_ukernel_function, while xm is
// cast to xnn_zipv_ukernel_function.
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004514 xnn_params.x8.zip = (struct zip_parameters) {
4515 .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__scalar,
4516 .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__scalar,
4517 .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__scalar,
4518 .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__scalar,
4519 };
4520 #endif // XNN_NO_X8_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07004521
Frank Barchardb40ee632021-12-30 11:10:02 -08004522 /**************************** F32 WAsm SIMD micro-kernels****************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004523 #ifndef XNN_NO_F32_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07004524 init_flags |= XNN_INIT_FLAG_F32;
4525
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004526 if (is_wasm_x86) {
Frank Barchard0725b8d2020-12-07 11:07:35 -08004527 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_splat);
4528 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_x86_splat);
4529 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat);
4530 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_x86_splat);
Marat Dukhan688f6d82020-07-14 17:02:11 -07004531 xnn_params.f32.gemm.relu.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_4x8__wasmsimd_splat);
4532 xnn_params.f32.gemm.relu.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_4x8__wasmsimd_splat);
4533 xnn_params.f32.gemm.relu.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat);
4534 xnn_params.f32.gemm.relu.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat);
Marat Dukhan802808c2020-06-16 11:01:17 -07004535 xnn_params.f32.gemm.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x8__wasmsimd_splat);
4536 xnn_params.f32.gemm.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x8__wasmsimd_splat);
4537 xnn_params.f32.gemm.linear.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__wasmsimd_splat);
4538 xnn_params.f32.gemm.linear.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__wasmsimd_splat);
Marat Dukhanc83ef3b2021-12-30 09:47:07 -08004539 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_wasmsimd_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07004540 xnn_params.f32.gemm.mr = 4;
4541 xnn_params.f32.gemm.nr = 8;
Marat Dukhane39e6462020-07-09 01:33:36 -07004542
4543 xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86);
4544 xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_x86);
4545 xnn_params.f32.gemm2.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x2c4__wasmsimd);
4546 xnn_params.f32.gemm2.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2c4__wasmsimd);
Marat Dukhanc83ef3b2021-12-30 09:47:07 -08004547 xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_wasmsimd_params;
Marat Dukhane39e6462020-07-09 01:33:36 -07004548 xnn_params.f32.gemm2.mr = 4;
4549 xnn_params.f32.gemm2.nr = 2;
4550 xnn_params.f32.gemm2.log2_kr = 2;
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004551 } else {
Frank Barchard0725b8d2020-12-07 11:07:35 -08004552 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_splat);
4553 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_arm_splat);
4554 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat);
4555 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_arm_splat);
Marat Dukhan688f6d82020-07-14 17:02:11 -07004556 xnn_params.f32.gemm.relu.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_5x8__wasmsimd_splat);
4557 xnn_params.f32.gemm.relu.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_5x8__wasmsimd_splat);
4558 xnn_params.f32.gemm.relu.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat);
4559 xnn_params.f32.gemm.relu.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat);
Marat Dukhan802808c2020-06-16 11:01:17 -07004560 xnn_params.f32.gemm.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_5x8__wasmsimd_splat);
4561 xnn_params.f32.gemm.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_5x8__wasmsimd_splat);
4562 xnn_params.f32.gemm.linear.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__wasmsimd_splat);
4563 xnn_params.f32.gemm.linear.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__wasmsimd_splat);
Marat Dukhanc83ef3b2021-12-30 09:47:07 -08004564 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_wasmsimd_params;
Marat Dukhan1bbf96b2020-06-15 23:01:20 -07004565 xnn_params.f32.gemm.mr = 5;
Marat Dukhanaefaef32020-04-09 07:09:34 -07004566 xnn_params.f32.gemm.nr = 8;
Marat Dukhane39e6462020-07-09 01:33:36 -07004567
4568 xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm);
4569 xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_arm);
4570 xnn_params.f32.gemm2.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x2c4__wasmsimd);
4571 xnn_params.f32.gemm2.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2c4__wasmsimd);
Marat Dukhanc83ef3b2021-12-30 09:47:07 -08004572 xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_wasmsimd_params;
Marat Dukhane39e6462020-07-09 01:33:36 -07004573 xnn_params.f32.gemm2.mr = 4;
4574 xnn_params.f32.gemm2.nr = 2;
4575 xnn_params.f32.gemm2.log2_kr = 2;
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004576 }
Marat Dukhanaefaef32020-04-09 07:09:34 -07004577
Marat Dukhanac014d72020-06-16 08:36:47 -07004578 if (is_wasm_x86) {
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07004579 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x3__wasmsimd_x86;
4580 xnn_params.f32.dwconv[0].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up8x3__wasmsimd;
Marat Dukhanc83ef3b2021-12-30 09:47:07 -08004581 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
Marat Dukhanac014d72020-06-16 08:36:47 -07004582 xnn_params.f32.dwconv[0].channel_tile = 8;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07004583 xnn_params.f32.dwconv[0].primary_tile = 3;
Marat Dukhanaefaef32020-04-09 07:09:34 -07004584
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07004585 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x4__wasmsimd_x86;
4586 xnn_params.f32.dwconv[1].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up8x4__wasmsimd;
Marat Dukhanc83ef3b2021-12-30 09:47:07 -08004587 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
Marat Dukhanac014d72020-06-16 08:36:47 -07004588 xnn_params.f32.dwconv[1].channel_tile = 8;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07004589 xnn_params.f32.dwconv[1].primary_tile = 4;
4590
4591 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x9__wasmsimd_x86;
4592 xnn_params.f32.dwconv[2].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up8x9__wasmsimd;
Marat Dukhanc83ef3b2021-12-30 09:47:07 -08004593 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07004594 xnn_params.f32.dwconv[2].channel_tile = 8;
4595 xnn_params.f32.dwconv[2].primary_tile = 9;
Marat Dukhanac014d72020-06-16 08:36:47 -07004596 } else {
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07004597 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up4x3__wasmsimd_arm;
4598 xnn_params.f32.dwconv[0].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up4x3__wasmsimd;
Marat Dukhanc83ef3b2021-12-30 09:47:07 -08004599 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
Marat Dukhanac014d72020-06-16 08:36:47 -07004600 xnn_params.f32.dwconv[0].channel_tile = 4;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07004601 xnn_params.f32.dwconv[0].primary_tile = 3;
Marat Dukhanaefaef32020-04-09 07:09:34 -07004602
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07004603 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up4x4__wasmsimd_arm;
4604 xnn_params.f32.dwconv[1].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up4x4__wasmsimd;
Marat Dukhanc83ef3b2021-12-30 09:47:07 -08004605 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
Marat Dukhanac014d72020-06-16 08:36:47 -07004606 xnn_params.f32.dwconv[1].channel_tile = 4;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07004607 xnn_params.f32.dwconv[1].primary_tile = 4;
4608
4609 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up4x9__wasmsimd_arm;
4610 xnn_params.f32.dwconv[2].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up4x9__wasmsimd;
Marat Dukhanc83ef3b2021-12-30 09:47:07 -08004611 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07004612 xnn_params.f32.dwconv[2].channel_tile = 4;
4613 xnn_params.f32.dwconv[2].primary_tile = 9;
Marat Dukhanac014d72020-06-16 08:36:47 -07004614 }
4615
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07004616 xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up4x25__wasmsimd_arm;
4617 xnn_params.f32.dwconv[3].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up4x25__wasmsimd;
Marat Dukhanc83ef3b2021-12-30 09:47:07 -08004618 xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07004619 xnn_params.f32.dwconv[3].channel_tile = 4;
4620 xnn_params.f32.dwconv[3].primary_tile = 25;
Marat Dukhanaefaef32020-04-09 07:09:34 -07004621
Marat Dukhan3b7432d2020-07-16 17:46:32 -07004622 if (is_wasm_x86) {
4623 xnn_params.f32.avgpool = (struct avgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08004624 .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__wasmsimd_x86_c4,
4625 .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__wasmsimd_x86_c4,
Marat Dukhan84598222022-01-09 21:39:22 -08004626 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
Marat Dukhanda382d12022-01-07 19:51:20 -08004627 .primary_tile = 9,
4628 .incremental_tile = 8,
4629 .channel_tile = 4,
Marat Dukhan3b7432d2020-07-16 17:46:32 -07004630 };
Marat Dukhan1483c532020-07-16 18:08:19 -07004631 xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08004632 .unipass = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__wasmsimd_x86_c4,
4633 .multipass = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__wasmsimd_x86_c4,
4634 .primary_tile = 9,
4635 .incremental_tile = 8,
4636 .channel_tile = 4,
Marat Dukhan1483c532020-07-16 18:08:19 -07004637 };
Marat Dukhanc6016802020-07-16 18:51:28 -07004638 xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08004639 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_x86_c4,
4640 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_x86_c4,
Marat Dukhan84598222022-01-09 21:39:22 -08004641 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
4642 .update.f32 = xnn_update_f32_scaleminmax_scalar_params,
Marat Dukhanda382d12022-01-07 19:51:20 -08004643 .row_tile = 7,
4644 .channel_tile = 4,
Marat Dukhanc6016802020-07-16 18:51:28 -07004645 };
Marat Dukhan3b7432d2020-07-16 17:46:32 -07004646 } else {
4647 xnn_params.f32.avgpool = (struct avgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08004648 .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__wasmsimd_arm_c4,
4649 .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__wasmsimd_arm_c4,
Marat Dukhan84598222022-01-09 21:39:22 -08004650 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
Marat Dukhanda382d12022-01-07 19:51:20 -08004651 .primary_tile = 9,
4652 .incremental_tile = 8,
4653 .channel_tile = 4,
Marat Dukhan3b7432d2020-07-16 17:46:32 -07004654 };
Marat Dukhan1483c532020-07-16 18:08:19 -07004655 xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08004656 .unipass = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__wasmsimd_arm_c4,
4657 .multipass = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__wasmsimd_arm_c4,
4658 .primary_tile = 9,
4659 .incremental_tile = 8,
4660 .channel_tile = 4,
Marat Dukhan1483c532020-07-16 18:08:19 -07004661 };
Marat Dukhanc6016802020-07-16 18:51:28 -07004662 xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08004663 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_arm_c4,
4664 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_arm_c4,
Marat Dukhan84598222022-01-09 21:39:22 -08004665 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
4666 .update.f32 = xnn_update_f32_scaleminmax_scalar_params,
Marat Dukhanda382d12022-01-07 19:51:20 -08004667 .row_tile = 7,
4668 .channel_tile = 4,
Marat Dukhanc6016802020-07-16 18:51:28 -07004669 };
Marat Dukhan3b7432d2020-07-16 17:46:32 -07004670 }
Marat Dukhanf6e24802020-07-08 22:20:40 -07004671 if (is_wasm_x86) {
4672 xnn_params.f32.maxpool = (struct maxpool_parameters) {
4673 .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__wasmsimd_x86_c4,
Marat Dukhanc83ef3b2021-12-30 09:47:07 -08004674 .init.f32 = xnn_init_f32_minmax_wasmsimd_params,
Marat Dukhanf6e24802020-07-08 22:20:40 -07004675 .mr = 9,
4676 .qr = 8,
4677 };
4678 } else {
4679 xnn_params.f32.maxpool = (struct maxpool_parameters) {
4680 .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__wasmsimd_arm_c4,
Marat Dukhanc83ef3b2021-12-30 09:47:07 -08004681 .init.f32 = xnn_init_f32_minmax_wasmsimd_params,
Marat Dukhanf6e24802020-07-08 22:20:40 -07004682 .mr = 9,
4683 .qr = 8,
4684 };
4685 }
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004686 xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
Marat Dukhan40f05522020-07-16 22:33:12 -07004687 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__wasmsimd_c4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004688 .mr = 4,
4689 };
4690 xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
Marat Dukhan40f05522020-07-16 22:33:12 -07004691 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__wasmsimd_c4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004692 .mr = 9,
4693 };
4694 xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
Marat Dukhan40f05522020-07-16 22:33:12 -07004695 .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__wasmsimd_c4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004696 .mr = 9,
4697 .qr = 8,
4698 };
Marat Dukhan660fd192020-03-10 04:55:30 -07004699 xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
Marat Dukhan00d1d6e2020-07-09 01:37:27 -07004700 .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__wasmsimd_c8,
Marat Dukhan69722492019-11-11 19:55:50 -08004701 .pixel_tile = 1,
4702 .channel_tile = 8,
4703 };
Marat Dukhane5efb162021-12-31 10:26:13 -08004704 xnn_params.f32.abs = (struct vunary_parameters) {
4705 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__wasmsimd_x8,
4706 .init.f32_abs = xnn_init_f32_abs_wasmsimd_params,
// NOTE(review): the kernel name above ends in _x8, but element_tile is 16. The
// other vunary entries in this section (clamp, elu, hswish, lrelu) keep the
// name suffix and element_tile equal — confirm whether 16 is intended here.
4707 .element_tile = 16,
4708 };
Marat Dukhanc303fe62020-06-26 10:09:25 -07004709 if (is_wasm_x86) {
Marat Dukhan94912792021-08-16 21:40:30 -07004710 xnn_params.f32.clamp = (struct vunary_parameters) {
4711 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vclamp_ukernel__wasmsimd_x86_x8,
Marat Dukhanc83ef3b2021-12-30 09:47:07 -08004712 .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
Marat Dukhan94912792021-08-16 21:40:30 -07004713 .element_tile = 8,
4714 };
Marat Dukhanc303fe62020-06-26 10:09:25 -07004715 } else {
Marat Dukhan94912792021-08-16 21:40:30 -07004716 xnn_params.f32.clamp = (struct vunary_parameters) {
4717 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vclamp_ukernel__wasmsimd_arm_x8,
Marat Dukhanc83ef3b2021-12-30 09:47:07 -08004718 .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
Marat Dukhan94912792021-08-16 21:40:30 -07004719 .element_tile = 8,
4720 };
Marat Dukhanc303fe62020-06-26 10:09:25 -07004721 }
Marat Dukhanb6bd4bc2020-12-01 17:01:40 -08004722 if (is_wasm_x86) {
Marat Dukhan4a79ff22022-01-01 12:16:48 -08004723 xnn_params.f32.elu = (struct vunary_parameters) {
4724 .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__wasmsimd_x86_rr2_p6_x20,
4725 .init.f32_elu = xnn_init_f32_elu_wasmsimd_rr2_p6_params,
4726 .element_tile = 20,
4727 };
Marat Dukhanb6bd4bc2020-12-01 17:01:40 -08004728 } else {
Marat Dukhan4a79ff22022-01-01 12:16:48 -08004729 xnn_params.f32.elu = (struct vunary_parameters) {
4730 .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__wasmsimd_arm_rr2_p6_x20,
4731 .init.f32_elu = xnn_init_f32_elu_wasmsimd_rr2_p6_params,
4732 .element_tile = 20,
4733 };
Marat Dukhanb6bd4bc2020-12-01 17:01:40 -08004734 }
Marat Dukhan561d0682021-12-23 16:12:35 -08004735 xnn_params.f32.hswish = (struct vunary_parameters) {
4736 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__wasmsimd_x16,
Marat Dukhan0d10cc72021-12-23 19:49:19 -08004737 .init.f32_hswish = xnn_init_f32_hswish_wasmsimd_params,
Marat Dukhan561d0682021-12-23 16:12:35 -08004738 .element_tile = 16,
4739 };
Marat Dukhanf4935a22020-07-16 15:59:10 -07004740 if (is_wasm_x86) {
Marat Dukhan2894e992021-12-30 08:29:48 -08004741 xnn_params.f32.lrelu = (struct vunary_parameters) {
4742 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__wasmsimd_minmax_x8,
4743 .init.f32_lrelu = xnn_init_f32_lrelu_wasmsimd_params,
4744 .element_tile = 8,
4745 };
Marat Dukhanf4935a22020-07-16 15:59:10 -07004746 } else {
Marat Dukhan2894e992021-12-30 08:29:48 -08004747 xnn_params.f32.lrelu = (struct vunary_parameters) {
4748 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__wasmsimd_bitselect_x8,
4749 .init.f32_lrelu = xnn_init_f32_lrelu_wasmsimd_params,
4750 .element_tile = 8,
4751 };
Marat Dukhanf4935a22020-07-16 15:59:10 -07004752 }
// Populates the xnn_params.f32.neg entry with the WAsm SIMD vneg micro-kernel and its params initializer.
Marat Dukhane5efb162021-12-31 10:26:13 -08004753 xnn_params.f32.neg = (struct vunary_parameters) {
4754 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__wasmsimd_x8,
4755 .init.f32_neg = xnn_init_f32_neg_wasmsimd_params,
// NOTE(review): the kernel name ends in _x8 but element_tile is 16; the neighbouring entries
// (clamp, elu, hswish, lrelu, sigmoid, sqrt) use a tile equal to their _xN suffix.
// TODO confirm whether 16 is intentional here.
4756 .element_tile = 16,
4757 };
Marat Dukhan6674d692021-05-05 22:27:00 -07004758 xnn_params.f32.relu = (xnn_univector_ukernel_function) xnn_f32_vrelu_ukernel__wasmsimd_x16;
Marat Dukhan189c1d02021-09-03 15:39:54 -07004759 #if defined(XNN_WASMSIMD_VERSION) && (XNN_WASMSIMD_VERSION >= 91)
Marat Dukhan0e801372022-01-04 00:10:41 -08004760 xnn_params.f32.rndne = (struct vunary_parameters) {
4761 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__wasmsimd_native_x8,
4762 .element_tile = 8,
4763 };
4764 xnn_params.f32.rndz = (struct vunary_parameters) {
4765 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__wasmsimd_native_x8,
4766 .element_tile = 8,
4767 };
4768 xnn_params.f32.rndu = (struct vunary_parameters) {
4769 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__wasmsimd_native_x8,
4770 .element_tile = 8,
4771 };
4772 xnn_params.f32.rndd = (struct vunary_parameters) {
4773 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__wasmsimd_native_x8,
4774 .element_tile = 8,
4775 };
Marat Dukhan189c1d02021-09-03 15:39:54 -07004776 #else // XNN_WASMSIMD_VERSION >= 91
Marat Dukhan0e801372022-01-04 00:10:41 -08004777 xnn_params.f32.rndne = (struct vunary_parameters) {
4778 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__wasmsimd_addsub_x8,
4779 .init.f32_rnd = xnn_init_f32_rnd_wasmsimd_params,
4780 .element_tile = 8,
4781 };
Marat Dukhan189c1d02021-09-03 15:39:54 -07004782 if (is_wasm_x86) {
Marat Dukhan0e801372022-01-04 00:10:41 -08004783 xnn_params.f32.rndz = (struct vunary_parameters) {
4784 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__wasmsimd_addsub_x8,
4785 .init.f32_rnd = xnn_init_f32_rnd_wasmsimd_params,
4786 .element_tile = 8,
4787 };
Marat Dukhan189c1d02021-09-03 15:39:54 -07004788 } else {
Marat Dukhan0e801372022-01-04 00:10:41 -08004789 xnn_params.f32.rndz = (struct vunary_parameters) {
4790 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__wasmsimd_cvt_x8,
4791 .init.f32_rnd = xnn_init_f32_rnd_wasmsimd_params,
4792 .element_tile = 8,
4793 };
Marat Dukhan189c1d02021-09-03 15:39:54 -07004794 }
Marat Dukhan0e801372022-01-04 00:10:41 -08004795 xnn_params.f32.rndu = (struct vunary_parameters) {
4796 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__wasmsimd_addsub_x8,
4797 .init.f32_rnd = xnn_init_f32_rnd_wasmsimd_params,
4798 .element_tile = 8,
4799 };
4800 xnn_params.f32.rndd = (struct vunary_parameters) {
4801 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__wasmsimd_addsub_x8,
4802 .init.f32_rnd = xnn_init_f32_rnd_wasmsimd_params,
4803 .element_tile = 8,
4804 };
Marat Dukhan189c1d02021-09-03 15:39:54 -07004805 #endif // XNN_WASMSIMD_VERSION >= 91
Marat Dukhance834ad2022-01-03 00:22:01 -08004806 xnn_params.f32.sigmoid = (struct vunary_parameters) {
4807 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__wasmsimd_rr2_p5_div_x16,
4808 .init.f32_sigmoid = xnn_init_f32_sigmoid_wasmsimd_rr2_p5_params,
4809 .element_tile = 16,
4810 };
// Populates the xnn_params.f32.sqr entry with the WAsm SIMD vsqr micro-kernel (no params initializer is set).
Marat Dukhane5efb162021-12-31 10:26:13 -08004811 xnn_params.f32.sqr = (struct vunary_parameters) {
4812 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__wasmsimd_x8,
// NOTE(review): the kernel name ends in _x8 but element_tile is 16; the f32.sqrt entry that
// follows pairs an _x8 kernel with element_tile = 8. TODO confirm whether 16 is intentional here.
4813 .element_tile = 16,
4814 };
Marat Dukhane72b2822021-12-30 14:46:58 -08004815 xnn_params.f32.sqrt = (struct vunary_parameters) {
4816 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__wasmsimd_sqrt_x8,
4817 .element_tile = 8,
4818 };
Marat Dukhan195f8eb2020-06-25 12:50:57 -07004819 if (is_wasm_x86) {
4820 xnn_params.f32.prelu = (struct prelu_parameters) {
Marat Dukhan78299282020-07-15 17:38:06 -07004821 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__wasmsimd_minmax_2x8,
Marat Dukhan195f8eb2020-06-25 12:50:57 -07004822 .row_tile = 2,
4823 .channel_tile = 8,
4824 };
4825 } else {
4826 xnn_params.f32.prelu = (struct prelu_parameters) {
Marat Dukhan78299282020-07-15 17:38:06 -07004827 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__wasmsimd_bitselect_2x8,
Marat Dukhan195f8eb2020-06-25 12:50:57 -07004828 .row_tile = 2,
4829 .channel_tile = 8,
4830 };
4831 }
Marat Dukhan4a5c7712022-01-05 22:43:13 -08004832 xnn_params.f32.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
4833 .ukernel = xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_x16_acc2,
4834 .init = xnn_init_f32_expminus_wasmsimd_rr2_p5_params,
4835 .element_tile = 16,
4836 };
Marat Dukhancdc56552020-06-26 19:49:41 -07004837 if (is_wasm_x86) {
Marat Dukhan0bf8afa2021-09-20 10:02:18 -07004838 xnn_params.f32.rmax = xnn_f32_rmax_ukernel__wasmsimd_x86;
Marat Dukhancdc56552020-06-26 19:49:41 -07004839 xnn_params.f32.vadd = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07004840 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__wasmsimd_x86_x16,
4841 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasmsimd_x86_x16,
4842 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasmsimd_x86_x16,
4843 .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_ukernel__wasmsimd_x16,
4844 .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__wasmsimd_x16,
4845 .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__wasmsimd_x16,
Marat Dukhanf6004972021-12-30 11:23:02 -08004846 .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
Frank Barchard9c7308f2020-08-31 17:03:01 -07004847 .element_tile = 16,
Marat Dukhancdc56552020-06-26 19:49:41 -07004848 };
4849 xnn_params.f32.vdiv = (struct vbinary_parameters) {
Frank Barchardb392f8e2020-10-27 10:46:44 -07004850 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__wasmsimd_x86_x16,
4851 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__wasmsimd_x86_x16,
4852 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__wasmsimd_x86_x16,
4853 .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_ukernel__wasmsimd_x16,
4854 .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_ukernel__wasmsimd_x16,
4855 .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_ukernel__wasmsimd_x16,
Marat Dukhanf6004972021-12-30 11:23:02 -08004856 .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
Frank Barchardb392f8e2020-10-27 10:46:44 -07004857 .element_tile = 16,
Marat Dukhancdc56552020-06-26 19:49:41 -07004858 };
4859 xnn_params.f32.vmax = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07004860 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__wasmsimd_x86_x16,
4861 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasmsimd_x86_x16,
4862 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasmsimd_x86_x16,
4863 .element_tile = 16,
Marat Dukhancdc56552020-06-26 19:49:41 -07004864 };
4865 xnn_params.f32.vmin = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07004866 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__wasmsimd_x86_x16,
4867 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasmsimd_x86_x16,
4868 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasmsimd_x86_x16,
Frank Barchard9c7308f2020-08-31 17:03:01 -07004869 .element_tile = 16,
Marat Dukhancdc56552020-06-26 19:49:41 -07004870 };
4871 xnn_params.f32.vmul = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07004872 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__wasmsimd_x86_x16,
4873 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasmsimd_x86_x16,
4874 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasmsimd_x86_x16,
4875 .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_ukernel__wasmsimd_x16,
4876 .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__wasmsimd_x16,
4877 .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__wasmsimd_x16,
Marat Dukhanf6004972021-12-30 11:23:02 -08004878 .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
Frank Barchard9c7308f2020-08-31 17:03:01 -07004879 .element_tile = 16,
Marat Dukhancdc56552020-06-26 19:49:41 -07004880 };
4881 xnn_params.f32.vsub = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07004882 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__wasmsimd_x86_x16,
4883 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__wasmsimd_x86_x16,
4884 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__wasmsimd_x86_x16,
4885 .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_ukernel__wasmsimd_x16,
4886 .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_ukernel__wasmsimd_x16,
4887 .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_ukernel__wasmsimd_x16,
Marat Dukhanf6004972021-12-30 11:23:02 -08004888 .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
Frank Barchard9c7308f2020-08-31 17:03:01 -07004889 .element_tile = 16,
Marat Dukhancdc56552020-06-26 19:49:41 -07004890 };
4891 } else {
Marat Dukhan0bf8afa2021-09-20 10:02:18 -07004892 xnn_params.f32.rmax = xnn_f32_rmax_ukernel__wasmsimd_arm;
Marat Dukhancdc56552020-06-26 19:49:41 -07004893 xnn_params.f32.vadd = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07004894 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__wasmsimd_arm_x16,
4895 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasmsimd_arm_x16,
4896 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasmsimd_arm_x16,
4897 .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_ukernel__wasmsimd_x16,
4898 .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__wasmsimd_x16,
4899 .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__wasmsimd_x16,
Marat Dukhanf6004972021-12-30 11:23:02 -08004900 .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
Frank Barchard9c7308f2020-08-31 17:03:01 -07004901 .element_tile = 16,
Marat Dukhancdc56552020-06-26 19:49:41 -07004902 };
4903 xnn_params.f32.vdiv = (struct vbinary_parameters) {
Frank Barchardb392f8e2020-10-27 10:46:44 -07004904 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__wasmsimd_arm_x16,
4905 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__wasmsimd_arm_x16,
4906 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__wasmsimd_arm_x16,
4907 .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_ukernel__wasmsimd_x16,
4908 .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_ukernel__wasmsimd_x16,
4909 .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_ukernel__wasmsimd_x16,
Marat Dukhanf6004972021-12-30 11:23:02 -08004910 .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
Frank Barchardb392f8e2020-10-27 10:46:44 -07004911 .element_tile = 16,
Marat Dukhancdc56552020-06-26 19:49:41 -07004912 };
4913 xnn_params.f32.vmax = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07004914 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__wasmsimd_arm_x16,
4915 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasmsimd_arm_x16,
4916 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasmsimd_arm_x16,
4917 .element_tile = 16,
Marat Dukhancdc56552020-06-26 19:49:41 -07004918 };
4919 xnn_params.f32.vmin = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07004920 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__wasmsimd_arm_x16,
4921 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasmsimd_arm_x16,
4922 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasmsimd_arm_x16,
4923 .element_tile = 16,
Marat Dukhancdc56552020-06-26 19:49:41 -07004924 };
4925 xnn_params.f32.vmul = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07004926 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__wasmsimd_arm_x16,
4927 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasmsimd_arm_x16,
4928 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasmsimd_arm_x16,
4929 .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_ukernel__wasmsimd_x16,
4930 .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__wasmsimd_x16,
4931 .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__wasmsimd_x16,
Marat Dukhanf6004972021-12-30 11:23:02 -08004932 .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
Frank Barchard9c7308f2020-08-31 17:03:01 -07004933 .element_tile = 16,
Marat Dukhancdc56552020-06-26 19:49:41 -07004934 };
4935 xnn_params.f32.vsub = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07004936 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__wasmsimd_arm_x16,
4937 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__wasmsimd_arm_x16,
4938 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__wasmsimd_arm_x16,
4939 .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_ukernel__wasmsimd_x16,
4940 .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_ukernel__wasmsimd_x16,
4941 .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_ukernel__wasmsimd_x16,
Marat Dukhanf6004972021-12-30 11:23:02 -08004942 .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
Frank Barchard9c7308f2020-08-31 17:03:01 -07004943 .element_tile = 16,
Marat Dukhancdc56552020-06-26 19:49:41 -07004944 };
4945 }
Marat Dukhanf7399262020-06-05 10:58:44 -07004946 xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07004947 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__wasmsimd_x16,
4948 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__wasmsimd_x16,
4949 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__wasmsimd_x16,
4950 .element_tile = 16,
Marat Dukhanf7399262020-06-05 10:58:44 -07004951 };
Marat Dukhand816f622020-07-15 10:14:39 -07004952 if (is_wasm_x86) {
4953 xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
Marat Dukhan9531e9f2020-07-24 15:25:02 -07004954 .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c4__wasmsimd_x86_2x,
Marat Dukhand57186a2021-12-30 11:37:24 -08004955 .init.f32 = xnn_init_f32_minmax_wasmsimd_params,
Marat Dukhand816f622020-07-15 10:14:39 -07004956 .channel_tile = 4,
4957 .row_tile = 2,
4958 };
4959 } else {
4960 xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
Marat Dukhan9531e9f2020-07-24 15:25:02 -07004961 .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c4__wasmsimd_arm_2x,
Marat Dukhand57186a2021-12-30 11:37:24 -08004962 .init.f32 = xnn_init_f32_minmax_wasmsimd_params,
Marat Dukhand816f622020-07-15 10:14:39 -07004963 .channel_tile = 4,
4964 .row_tile = 2,
4965 };
4966 }
Erich Elsen6e80fdc2020-06-09 15:35:37 -07004967 #ifndef XNN_NO_NCHW_OPERATORS
Marat Dukhancfbed0a2020-12-08 10:01:51 -08004968 init_flags |= XNN_INIT_FLAG_CHW_OPT;
4969
Frank Barchard498cb502020-11-16 23:50:04 -08004970 if (is_wasm_x86) {
4971 xnn_params.f32.spmm = (struct spmm_parameters) {
4972 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x1__wasmsimd_x86,
4973 .mr = 32,
4974 .nr = 1,
4975 };
4976 } else {
4977 xnn_params.f32.spmm = (struct spmm_parameters) {
4978 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x1__wasmsimd_arm,
4979 .mr = 32,
4980 .nr = 1,
4981 };
4982 }
Erich Elsen0a1970e2020-06-10 09:24:59 -07004983 xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
4984 .ukernel_with_symm_padding =
Frank Barchard22136062020-11-24 18:44:46 -08004985 (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2,
Erich Elsen0a1970e2020-06-10 09:24:59 -07004986 .output_channel_tile = 4,
4987 .output_height_tile = 2,
4988 .output_width_tile = 2,
4989 };
Frank Barcharddb5c32d2020-11-16 23:58:42 -08004990 if (is_wasm_x86) {
4991 xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
Frank Barchard412e2f42020-12-11 11:40:50 -08004992 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_2x4,
Frank Barcharddb5c32d2020-11-16 23:58:42 -08004993 .output_width_tile = 4,
Frank Barchard97883b82020-11-23 13:01:03 -08004994 .output_height_tile = 2,
Frank Barcharddb5c32d2020-11-16 23:58:42 -08004995 };
4996 xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
Frank Barchard412e2f42020-12-11 11:40:50 -08004997 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_1x4_acc2,
Frank Barcharddb5c32d2020-11-16 23:58:42 -08004998 .output_width_tile = 4,
4999 .output_height_tile = 1,
5000 };
5001 xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
Frank Barchard412e2f42020-12-11 11:40:50 -08005002 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4,
Frank Barcharddb5c32d2020-11-16 23:58:42 -08005003 .output_width_tile = 4,
5004 .output_height_tile = 3,
5005 };
5006 xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
Frank Barchard412e2f42020-12-11 11:40:50 -08005007 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4_acc2,
Frank Barcharddb5c32d2020-11-16 23:58:42 -08005008 .output_width_tile = 4,
5009 .output_height_tile = 1,
5010 };
5011 } else {
5012 xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
Frank Barchard412e2f42020-12-11 11:40:50 -08005013 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_2x4,
Frank Barcharddb5c32d2020-11-16 23:58:42 -08005014 .output_width_tile = 4,
Frank Barchard97883b82020-11-23 13:01:03 -08005015 .output_height_tile = 2,
Frank Barcharddb5c32d2020-11-16 23:58:42 -08005016 };
5017 xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
Frank Barchard412e2f42020-12-11 11:40:50 -08005018 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_1x4_acc4,
Frank Barcharddb5c32d2020-11-16 23:58:42 -08005019 .output_width_tile = 4,
5020 .output_height_tile = 1,
5021 };
5022 xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
Frank Barchard412e2f42020-12-11 11:40:50 -08005023 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4,
Frank Barcharddb5c32d2020-11-16 23:58:42 -08005024 .output_width_tile = 4,
5025 .output_height_tile = 3,
5026 };
5027 xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
Frank Barchard412e2f42020-12-11 11:40:50 -08005028 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4_acc2,
Frank Barcharddb5c32d2020-11-16 23:58:42 -08005029 .output_width_tile = 4,
5030 .output_height_tile = 1,
5031 };
5032 }
Marat Dukhanc5045bf2020-07-27 18:16:35 -07005033 if (is_wasm_x86) {
5034 xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
5035 .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__wasmsimd_x86_x4,
5036 .channel_tile = 4,
5037 };
5038 } else {
5039 xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
5040 .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__wasmsimd_arm_x4,
5041 .channel_tile = 4,
5042 };
5043 }
Artsiom Ablavatski97918102020-10-27 15:52:59 -07005044 xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
5045 .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__wasmsimd_p8,
5046 .channel_tile = 1,
Artsiom Ablavatskib3ffd582021-03-31 13:00:08 -07005047 .pixel_tile = 8,
Artsiom Ablavatski97918102020-10-27 15:52:59 -07005048 };
Erich Elsen6e80fdc2020-06-09 15:35:37 -07005049 #endif // XNN_NO_NCHW_OPERATORS
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005050 #endif // XNN_NO_F32_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07005051
Frank Barchardb40ee632021-12-30 11:10:02 -08005052 /*************************** VCVT WAsm SIMD micro-kernels ***************************/
Marat Dukhanaf2ba002021-10-24 14:21:41 -07005053 #ifndef XNN_NO_VCVT_OPERATORS
5054 init_flags |= XNN_INIT_FLAG_VCVT;
5055
Marat Dukhan134f9842021-12-29 19:57:31 -08005056 xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
5057 .ukernel = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_x16,
5058 .init.f16_f32_cvt = xnn_init_f16_f32_cvt_wasmsimd_int16_params,
5059 .element_tile = 16,
5060 };
Marat Dukhanb7c1b712021-12-30 07:23:57 -08005061 xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
5062 .ukernel = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__wasmsimd_x24,
5063 .init.f32_f16_cvt = xnn_init_f32_f16_cvt_wasmsimd_params,
5064 .element_tile = 24,
5065 };
Marat Dukhand52d20b2021-12-05 09:50:25 -08005066 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
5067 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__wasmsimd_magic_x32,
5068 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_wasmsimd_magic_params,
5069 .element_tile = 32,
5070 };
5071 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
5072 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__wasmsimd_magic_x32,
5073 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_wasmsimd_magic_params,
5074 .element_tile = 32,
5075 };
Marat Dukhanf92206b2021-12-10 17:02:07 -08005076 xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
5077 .ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__wasmsimd_x32,
5078 .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_wasmsimd_params,
5079 .element_tile = 32,
5080 };
5081 xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
5082 .ukernel = (xnn_univector_ukernel_function) xnn_qu8_f32_vcvt_ukernel__wasmsimd_x32,
5083 .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_wasmsimd_params,
5084 .element_tile = 32,
5085 };
Marat Dukhanaf2ba002021-10-24 14:21:41 -07005086 #endif // XNN_NO_VCVT_OPERATORS
5087
Frank Barchardb40ee632021-12-30 11:10:02 -08005088 /**************************** X32 WAsm SIMD micro-kernels ****************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005089 #ifndef XNN_NO_X32_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07005090 init_flags |= XNN_INIT_FLAG_X32;
5091
Marat Dukhan9d4bfa22020-07-16 19:07:04 -07005092 xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__wasmsimd;
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005093 xnn_params.x32.zip = (struct zip_parameters) {
Marat Dukhane3b78762020-07-16 20:02:58 -07005094 .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__wasmsimd,
5095 .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__wasmsimd,
5096 .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__wasmsimd,
5097 .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__wasmsimd,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005098 };
Artsiom Ablavatskibbe85062020-11-05 14:07:37 -08005099 #ifndef XNN_NO_NCHW_OPERATORS
Marat Dukhanad71b9a2020-11-20 00:01:51 -08005100 xnn_params.x32.depthtospace2d_chw2hwc = (struct depthtospace2d_chw2hwc_parameters) {
5101 .ukernel = (xnn_depthtospace2d_chw2hwc_ukernel_function) xnn_x32_depthtospace2d_chw2hwc_ukernel__scalar,
Artsiom Ablavatskibbe85062020-11-05 14:07:37 -08005102 .channel_tile = 1,
5103 .pixel_tile = 1,
5104 };
5105 #endif // XNN_NO_NCHW_OPERATORS
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005106 #endif // XNN_NO_X32_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07005107
Frank Barchardb40ee632021-12-30 11:10:02 -08005108 /**************************** XX WAsm SIMD micro-kernels ****************************/
Marat Dukhan048931b2020-11-24 20:53:54 -08005109 #ifndef XNN_NO_XX_OPERATORS
// Unless XNN_NO_XX_OPERATORS is defined: set XNN_INIT_FLAG_XX and fill the
// xnn_params.xx entries (copy, fill, pad).
5110 init_flags |= XNN_INIT_FLAG_XX;
5111
// copy is assigned the memcpy-named kernel; fill and pad are assigned wasmsimd-named kernels.
5112 xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
Marat Dukhan933051b2021-08-07 16:26:15 -07005113 xnn_params.xx.fill = (struct fill_parameters) {
5114 .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__wasmsimd_x64,
5115 .row_tile = 1,
5116 };
Marat Dukhan0461f2d2021-08-08 12:36:29 -07005117 xnn_params.xx.pad = (struct pad_parameters) {
5118 .ukernel = (xnn_pad_ukernel_function) xnn_xx_pad_ukernel__wasmsimd,
5119 .row_tile = 1,
5120 };
Marat Dukhan048931b2020-11-24 20:53:54 -08005121 #endif // XNN_NO_XX_OPERATORS
5122
Marat Dukhan933051b2021-08-07 16:26:15 -07005123#elif XNN_ARCH_WASM
5124
Frank Barchardb40ee632021-12-30 11:10:02 -08005125 /**************************** QC8 WAsm micro-kernels ****************************/
Marat Dukhan898d5852021-06-30 21:18:34 -07005126 #ifndef XNN_NO_QC8_OPERATORS
5127 init_flags |= XNN_INIT_FLAG_QC8;
5128
5129 if (is_wasm_x86) {
Marat Dukhan7c1115f2022-01-04 17:18:41 -08005130 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic);
5131 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x2__scalar_imagic);
5132 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic);
5133 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x2__scalar_imagic);
5134 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_scalar_imagic_params;
Marat Dukhan898d5852021-06-30 21:18:34 -07005135 xnn_params.qc8.gemm.mr = 2;
5136 xnn_params.qc8.gemm.nr = 2;
5137 } else {
Marat Dukhan7c1115f2022-01-04 17:18:41 -08005138 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic);
5139 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x4__wasm_fmagic);
5140 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic);
5141 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4__wasm_fmagic);
Marat Dukhan2ac722e2022-01-04 01:54:20 -08005142 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_scalar_fmagic_params;
Marat Dukhan898d5852021-06-30 21:18:34 -07005143 xnn_params.qc8.gemm.mr = 4;
5144 xnn_params.qc8.gemm.nr = 4;
5145 }
5146
Marat Dukhan7c1115f2022-01-04 17:18:41 -08005147 if (is_wasm_x86) {
5148 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up2x9__scalar_imagic;
5149 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_scalar_imagic_params;
5150 xnn_params.qc8.dwconv[0].channel_tile = 2;
5151 xnn_params.qc8.dwconv[0].primary_tile = 9;
5152 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up1x25__scalar_imagic;
5153 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_scalar_imagic_params;
5154 xnn_params.qc8.dwconv[1].channel_tile = 1;
5155 xnn_params.qc8.dwconv[1].primary_tile = 25;
5156 } else {
5157 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up2x9__wasm_fmagic;
5158 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_scalar_fmagic_params;
5159 xnn_params.qc8.dwconv[0].channel_tile = 2;
5160 xnn_params.qc8.dwconv[0].primary_tile = 9;
5161 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up2x25__wasm_fmagic;
5162 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_scalar_fmagic_params;
5163 xnn_params.qc8.dwconv[1].channel_tile = 2;
5164 xnn_params.qc8.dwconv[1].primary_tile = 25;
5165 }
Marat Dukhan898d5852021-06-30 21:18:34 -07005166 #endif // XNN_NO_QC8_OPERATORS
5167
Frank Barchardb40ee632021-12-30 11:10:02 -08005168 /**************************** QS8 WAsm micro-kernels ****************************/
Marat Dukhan803c1f82021-05-12 00:13:37 -07005169 #ifndef XNN_NO_QS8_OPERATORS
5170 init_flags |= XNN_INIT_FLAG_QS8;
5171
5172 if (is_wasm_x86) {
Marat Dukhan7c1115f2022-01-04 17:18:41 -08005173 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic);
5174 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_2x2__scalar_imagic);
5175 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic);
5176 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x2__scalar_imagic);
5177 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params;
Marat Dukhan803c1f82021-05-12 00:13:37 -07005178 xnn_params.qs8.gemm.mr = 2;
5179 xnn_params.qs8.gemm.nr = 2;
5180 } else {
Marat Dukhan7c1115f2022-01-04 17:18:41 -08005181 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic);
5182 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_4x4__wasm_fmagic);
5183 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic);
5184 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4__wasm_fmagic);
Marat Dukhan2ac722e2022-01-04 01:54:20 -08005185 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params;
Marat Dukhan803c1f82021-05-12 00:13:37 -07005186 xnn_params.qs8.gemm.mr = 4;
5187 xnn_params.qs8.gemm.nr = 4;
5188 }
5189
Marat Dukhan7c1115f2022-01-04 17:18:41 -08005190 if (is_wasm_x86) {
5191 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up2x9__scalar_imagic;
5192 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params;
5193 xnn_params.qs8.dwconv[0].channel_tile = 2;
5194 xnn_params.qs8.dwconv[0].primary_tile = 9;
5195 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up1x25__scalar_imagic;
5196 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params;
5197 xnn_params.qs8.dwconv[1].channel_tile = 1;
5198 xnn_params.qs8.dwconv[1].primary_tile = 25;
5199 } else {
5200 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up2x9__wasm_fmagic;
5201 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params;
5202 xnn_params.qs8.dwconv[0].channel_tile = 2;
5203 xnn_params.qs8.dwconv[0].primary_tile = 9;
5204 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up2x25__wasm_fmagic;
5205 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params;
5206 xnn_params.qs8.dwconv[1].channel_tile = 2;
5207 xnn_params.qs8.dwconv[1].primary_tile = 25;
5208 }
Marat Dukhan803c1f82021-05-12 00:13:37 -07005209
5210 xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
Marat Dukhan847ff5e2022-01-11 20:31:06 -08005211 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c4,
5212 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4,
Marat Dukhan53f41062022-01-11 19:44:57 -08005213 .init.qs8 = xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params,
5214 .update.qs8 = xnn_update_qs8_avgpool_minmax_fp32_scalar_imagic_params,
Marat Dukhanda382d12022-01-07 19:51:20 -08005215 .row_tile = 7,
5216 .channel_tile = 4,
Marat Dukhan803c1f82021-05-12 00:13:37 -07005217 };
5218
5219 xnn_params.qs8.vadd = (struct vbinary_parameters) {
5220 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__scalar_x4,
5221 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__scalar_x4,
5222 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__scalar_x4,
Marat Dukhan64287252021-09-07 16:20:03 -07005223 .init.qs8_addsub = xnn_init_qs8_add_minmax_scalar_params,
Marat Dukhan803c1f82021-05-12 00:13:37 -07005224 .element_tile = 4,
5225 };
Marat Dukhan0853b8a2021-08-03 01:01:53 -07005226 xnn_params.qs8.vmul = (struct vbinary_parameters) {
5227 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_fp32_ukernel__scalar_x4,
5228 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_x4,
5229 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_x4,
5230 .init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_scalar_params,
5231 .element_tile = 4,
5232 };
Marat Dukhan803c1f82021-05-12 00:13:37 -07005233 #endif // XNN_NO_QS8_OPERATORS
5234
Frank Barchardb40ee632021-12-30 11:10:02 -08005235 /**************************** QU8 WAsm micro-kernels****************************/
Marat Dukhan08b7a972020-07-14 18:17:29 -07005236 #ifndef XNN_NO_QU8_OPERATORS
5237 init_flags |= XNN_INIT_FLAG_QU8;
Marat Dukhan854fb6b2020-06-19 12:33:44 -07005238
Marat Dukhan3d76e552021-07-15 18:54:01 -07005239 if (is_wasm_x86) {
Marat Dukhan7c1115f2022-01-04 17:18:41 -08005240 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic);
5241 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_2x2__scalar_imagic);
5242 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic);
5243 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_imagic);
5244 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params;
Marat Dukhan3d76e552021-07-15 18:54:01 -07005245 xnn_params.qu8.gemm.mr = 2;
5246 xnn_params.qu8.gemm.nr = 2;
5247 } else {
Marat Dukhan7c1115f2022-01-04 17:18:41 -08005248 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic);
5249 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_4x4__wasm_fmagic);
5250 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic);
5251 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4__wasm_fmagic);
Marat Dukhan2ac722e2022-01-04 01:54:20 -08005252 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params;
Marat Dukhan3d76e552021-07-15 18:54:01 -07005253 xnn_params.qu8.gemm.mr = 4;
5254 xnn_params.qu8.gemm.nr = 4;
5255 }
Marat Dukhanaefaef32020-04-09 07:09:34 -07005256
Marat Dukhan7c1115f2022-01-04 17:18:41 -08005257 if (is_wasm_x86) {
5258 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up2x9__scalar_imagic;
5259 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params;
5260 xnn_params.qu8.dwconv[0].channel_tile = 2;
5261 xnn_params.qu8.dwconv[0].primary_tile = 9;
5262 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up1x25__scalar_imagic;
5263 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params;
5264 xnn_params.qu8.dwconv[1].channel_tile = 1;
5265 xnn_params.qu8.dwconv[1].primary_tile = 25;
5266 } else {
5267 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up2x9__wasm_fmagic;
5268 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params;
5269 xnn_params.qu8.dwconv[0].channel_tile = 2;
5270 xnn_params.qu8.dwconv[0].primary_tile = 9;
5271 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up2x25__wasm_fmagic;
5272 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params;
5273 xnn_params.qu8.dwconv[1].channel_tile = 2;
5274 xnn_params.qu8.dwconv[1].primary_tile = 25;
5275 }
Marat Dukhanaefaef32020-04-09 07:09:34 -07005276
Marat Dukhan08b7a972020-07-14 18:17:29 -07005277 xnn_params.qu8.avgpool = (struct avgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08005278 .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__scalar_c1,
5279 .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__scalar_c1,
5280 .init.qu8 = xnn_init_qu8_avgpool_minmax_scalar_params,
5281 .primary_tile = 9,
5282 .incremental_tile = 8,
5283 .channel_tile = 1,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005284 };
Marat Dukhan08b7a972020-07-14 18:17:29 -07005285 xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
Marat Dukhand1f53e42022-01-12 22:34:51 -08005286 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c4,
5287 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4,
5288 .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params,
5289 .update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_scalar_imagic_params,
Marat Dukhanda382d12022-01-07 19:51:20 -08005290 .row_tile = 7,
Marat Dukhand1f53e42022-01-12 22:34:51 -08005291 .channel_tile = 4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005292 };
Marat Dukhandb007cd2021-07-20 23:42:39 -07005293
5294 xnn_params.qu8.vadd = (struct vbinary_parameters) {
5295 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__scalar_x4,
5296 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__scalar_x4,
5297 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__scalar_x4,
Marat Dukhan64287252021-09-07 16:20:03 -07005298 .init.qu8_addsub = xnn_init_qu8_add_minmax_scalar_params,
Marat Dukhandb007cd2021-07-20 23:42:39 -07005299 .element_tile = 4,
5300 };
Marat Dukhan0853b8a2021-08-03 01:01:53 -07005301 xnn_params.qu8.vmul = (struct vbinary_parameters) {
5302 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_fp32_ukernel__scalar_x4,
5303 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_x4,
5304 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_x4,
5305 .init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_scalar_params,
5306 .element_tile = 4,
5307 };
Marat Dukhan08b7a972020-07-14 18:17:29 -07005308 #endif // XNN_NO_QU8_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07005309
Frank Barchardb40ee632021-12-30 11:10:02 -08005310 /**************************** S8 WAsm micro-kernels****************************/
Marat Dukhandc5c1482021-08-16 09:03:15 -07005311 #ifndef XNN_NO_S8_OPERATORS
5312 init_flags |= XNN_INIT_FLAG_S8;
5313
Marat Dukhan61c0c9e2021-08-16 23:16:14 -07005314 xnn_params.s8.clamp = (struct vunary_parameters) {
5315 .ukernel = (xnn_univector_ukernel_function) xnn_s8_vclamp_ukernel__scalar_x4,
5316 .init.s8_minmax = xnn_init_s8_minmax_scalar_params,
5317 .element_tile = 4,
5318 };
Marat Dukhan24abe6b2021-11-24 15:28:57 -08005319 xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
5320 .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__scalar_c1,
5321 .pixel_tile = 1,
5322 .channel_tile = 1,
5323 };
Marat Dukhandc5c1482021-08-16 09:03:15 -07005324 xnn_params.s8.maxpool = (struct maxpool_parameters) {
5325 .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__scalar_c1,
5326 .init.s8 = xnn_init_s8_minmax_scalar_params,
5327 .mr = 9,
5328 .qr = 8,
5329 };
5330 #endif // XNN_NO_S8_OPERATORS
5331
Frank Barchardb40ee632021-12-30 11:10:02 -08005332 /**************************** U8 WAsm micro-kernels****************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005333 #ifndef XNN_NO_U8_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07005334 init_flags |= XNN_INIT_FLAG_U8;
5335
Marat Dukhan94912792021-08-16 21:40:30 -07005336 xnn_params.u8.clamp = (struct vunary_parameters) {
5337 .ukernel = (xnn_univector_ukernel_function) xnn_u8_vclamp_ukernel__scalar_x4,
5338 .init.u8_minmax = xnn_init_u8_minmax_scalar_params,
5339 .element_tile = 4,
5340 };
Marat Dukhan24abe6b2021-11-24 15:28:57 -08005341 xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
5342 .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__scalar_c1,
5343 .pixel_tile = 1,
5344 .channel_tile = 1,
5345 };
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005346 xnn_params.u8.maxpool = (struct maxpool_parameters) {
Marat Dukhan99936602020-04-11 16:47:01 -07005347 .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__scalar_c1,
Marat Dukhan91ae1652021-08-15 19:19:49 -07005348 .init.u8 = xnn_init_u8_minmax_scalar_params,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005349 .mr = 9,
5350 .qr = 8,
5351 };
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005352 xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
5353 xnn_params.u8.rmax = xnn_u8_rmax_ukernel__scalar;
5354 #endif // XNN_NO_U8_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07005355
Frank Barchardb40ee632021-12-30 11:10:02 -08005356 /**************************** X8 WAsm micro-kernels****************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005357 #ifndef XNN_NO_X8_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07005358 init_flags |= XNN_INIT_FLAG_X8;
5359
Marat Dukhand67539d2021-09-08 23:06:03 -07005360 xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar_x4;
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005361 xnn_params.x8.zip = (struct zip_parameters) {
5362 .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__scalar,
5363 .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__scalar,
5364 .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__scalar,
5365 .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__scalar,
5366 };
5367 #endif // XNN_NO_X8_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07005368
Frank Barchardb40ee632021-12-30 11:10:02 -08005369 /**************************** F32 WAsm micro-kernels****************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005370 #ifndef XNN_NO_F32_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07005371 init_flags |= XNN_INIT_FLAG_F32;
5372
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005373 if (is_wasm_x86) {
Marat Dukhanaefaef32020-04-09 07:09:34 -07005374 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_2x4__scalar);
5375 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_2x4__scalar);
5376 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x4__wasm);
5377 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x4__wasm);
Marat Dukhan467f6362020-05-22 23:21:55 -07005378 xnn_params.f32.gemm.relu.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_2x4__scalar);
5379 xnn_params.f32.gemm.relu.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_2x4__scalar);
5380 xnn_params.f32.gemm.relu.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x4__wasm);
5381 xnn_params.f32.gemm.relu.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x4__wasm);
Marat Dukhan869c62d2020-04-09 17:17:55 -07005382 xnn_params.f32.gemm.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_2x4__scalar);
5383 xnn_params.f32.gemm.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_2x4__scalar);
5384 xnn_params.f32.gemm.linear.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x4__wasm);
5385 xnn_params.f32.gemm.linear.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x4__wasm);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07005386 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07005387 xnn_params.f32.gemm.mr = 2;
5388 xnn_params.f32.gemm.nr = 4;
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005389 } else {
Marat Dukhanaefaef32020-04-09 07:09:34 -07005390 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x4__wasm);
5391 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x4__wasm);
5392 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x4__wasm);
5393 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x4__wasm);
Marat Dukhan467f6362020-05-22 23:21:55 -07005394 xnn_params.f32.gemm.relu.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_4x4__wasm);
5395 xnn_params.f32.gemm.relu.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_4x4__wasm);
5396 xnn_params.f32.gemm.relu.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x4__wasm);
5397 xnn_params.f32.gemm.relu.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x4__wasm);
Marat Dukhan869c62d2020-04-09 17:17:55 -07005398 xnn_params.f32.gemm.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x4__wasm);
5399 xnn_params.f32.gemm.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x4__wasm);
5400 xnn_params.f32.gemm.linear.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x4__wasm);
5401 xnn_params.f32.gemm.linear.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x4__wasm);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07005402 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07005403 xnn_params.f32.gemm.mr = 4;
5404 xnn_params.f32.gemm.nr = 4;
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005405 }
Marat Dukhanaefaef32020-04-09 07:09:34 -07005406 xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2__wasm);
5407 xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2__wasm),
Marat Dukhan869c62d2020-04-09 17:17:55 -07005408 xnn_params.f32.gemm2.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x2__wasm);
5409 xnn_params.f32.gemm2.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2__wasm),
Marat Dukhan104ae5e2021-05-24 13:41:57 -07005410 xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07005411 xnn_params.f32.gemm2.mr = 4;
5412 xnn_params.f32.gemm2.nr = 2;
5413
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07005414 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x3__wasm_acc2;
5415 xnn_params.f32.dwconv[0].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x3__wasm_acc2;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07005416 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07005417 xnn_params.f32.dwconv[0].channel_tile = 1;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07005418 xnn_params.f32.dwconv[0].primary_tile = 3;
Marat Dukhanaefaef32020-04-09 07:09:34 -07005419
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07005420 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x4__wasm_acc2;
5421 xnn_params.f32.dwconv[1].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x4__wasm_acc2;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07005422 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07005423 xnn_params.f32.dwconv[1].channel_tile = 1;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07005424 xnn_params.f32.dwconv[1].primary_tile = 4;
Marat Dukhanaefaef32020-04-09 07:09:34 -07005425
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07005426 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x9__wasm_acc2;
5427 xnn_params.f32.dwconv[2].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x9__wasm_acc2;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07005428 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07005429 xnn_params.f32.dwconv[2].channel_tile = 1;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07005430 xnn_params.f32.dwconv[2].primary_tile = 9;
5431
5432 xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x25__wasm_acc2;
5433 xnn_params.f32.dwconv[3].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x25__wasm_acc2;
5434 xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_scalar_params;
5435 xnn_params.f32.dwconv[3].channel_tile = 1;
5436 xnn_params.f32.dwconv[3].primary_tile = 25;
Marat Dukhanaefaef32020-04-09 07:09:34 -07005437
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005438 xnn_params.f32.avgpool = (struct avgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08005439 .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__wasm_c1,
5440 .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__wasm_c1,
5441 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
5442 .primary_tile = 9,
5443 .incremental_tile = 8,
5444 .channel_tile = 1,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005445 };
5446 xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08005447 .unipass = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__wasm_c1,
5448 .multipass = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__wasm_c1,
5449 .primary_tile = 9,
5450 .incremental_tile = 8,
5451 .channel_tile = 1,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005452 };
5453 xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08005454 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__wasm_c1,
5455 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__wasm_c1,
5456 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
5457 .update.f32 = xnn_update_f32_scaleminmax_scalar_params,
5458 .row_tile = 7,
5459 .channel_tile = 1,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005460 };
5461 xnn_params.f32.maxpool = (struct maxpool_parameters) {
Marat Dukhan99936602020-04-11 16:47:01 -07005462 .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__wasm_c1,
Marat Dukhan91ae1652021-08-15 19:19:49 -07005463 .init.f32 = xnn_init_f32_minmax_scalar_params,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005464 .mr = 9,
5465 .qr = 8,
5466 };
5467 xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
Marat Dukhan99936602020-04-11 16:47:01 -07005468 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__scalar_c1,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005469 .mr = 4,
5470 };
5471 xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
Marat Dukhan99936602020-04-11 16:47:01 -07005472 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__scalar_c1,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005473 .mr = 9,
5474 };
5475 xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
Marat Dukhan99936602020-04-11 16:47:01 -07005476 .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005477 .mr = 9,
5478 .qr = 8,
5479 };
Marat Dukhan660fd192020-03-10 04:55:30 -07005480 xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
5481 .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__scalar_c2,
Marat Dukhan69722492019-11-11 19:55:50 -08005482 .pixel_tile = 1,
5483 .channel_tile = 2,
5484 };
Marat Dukhane5efb162021-12-31 10:26:13 -08005485 xnn_params.f32.abs = (struct vunary_parameters) {
5486 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__scalar_x4,
5487 .element_tile = 4,
5488 };
Marat Dukhan94912792021-08-16 21:40:30 -07005489 xnn_params.f32.clamp = (struct vunary_parameters) {
5490 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vclamp_ukernel__wasm_x4,
5491 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
5492 .element_tile = 4,
5493 };
Marat Dukhanc303fe62020-06-26 10:09:25 -07005494 if (is_wasm_x86) {
Marat Dukhan561d0682021-12-23 16:12:35 -08005495 xnn_params.f32.hswish = (struct vunary_parameters) {
5496 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__scalar_x4,
Marat Dukhan0d10cc72021-12-23 19:49:19 -08005497 .init.f32_hswish = xnn_init_f32_hswish_scalar_params,
Marat Dukhan561d0682021-12-23 16:12:35 -08005498 .element_tile = 4,
5499 };
Marat Dukhanc303fe62020-06-26 10:09:25 -07005500 } else {
Marat Dukhan561d0682021-12-23 16:12:35 -08005501 xnn_params.f32.hswish = (struct vunary_parameters) {
5502 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__wasm_x4,
Marat Dukhan0d10cc72021-12-23 19:49:19 -08005503 .init.f32_hswish = xnn_init_f32_hswish_scalar_params,
Marat Dukhan561d0682021-12-23 16:12:35 -08005504 .element_tile = 4,
5505 };
Marat Dukhanc303fe62020-06-26 10:09:25 -07005506 }
Marat Dukhanb6bd4bc2020-12-01 17:01:40 -08005507 if (is_wasm_x86) {
Marat Dukhan4a79ff22022-01-01 12:16:48 -08005508 xnn_params.f32.elu = (struct vunary_parameters) {
5509 .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x2,
5510 .init.f32_elu = xnn_init_f32_elu_scalar_rr2_lut16_p3_params,
5511 .element_tile = 2,
5512 };
Marat Dukhanb6bd4bc2020-12-01 17:01:40 -08005513 } else {
Marat Dukhan4a79ff22022-01-01 12:16:48 -08005514 xnn_params.f32.elu = (struct vunary_parameters) {
5515 .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__wasm_rr2_p6_x6,
5516 .init.f32_elu = xnn_init_f32_elu_scalar_rr2_p6_params,
5517 .element_tile = 6,
5518 };
Marat Dukhanb6bd4bc2020-12-01 17:01:40 -08005519 }
Marat Dukhan2894e992021-12-30 08:29:48 -08005520 xnn_params.f32.lrelu = (struct vunary_parameters) {
5521 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__scalar_x4,
5522 .init.f32_lrelu = xnn_init_f32_lrelu_scalar_params,
5523 .element_tile = 4,
5524 };
Marat Dukhane5efb162021-12-31 10:26:13 -08005525 xnn_params.f32.neg = (struct vunary_parameters) {
5526 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__scalar_x4,
5527 .element_tile = 4,
5528 };
Frank Barchard62c5e232020-07-21 17:42:19 -07005529 if (is_wasm_x86) {
Marat Dukhan6674d692021-05-05 22:27:00 -07005530 xnn_params.f32.relu = (xnn_univector_ukernel_function) xnn_f32_vrelu_ukernel__scalar_x8;
Frank Barchard62c5e232020-07-21 17:42:19 -07005531 } else {
Marat Dukhan6674d692021-05-05 22:27:00 -07005532 xnn_params.f32.relu = (xnn_univector_ukernel_function) xnn_f32_vrelu_ukernel__wasm_x8;
Frank Barchard62c5e232020-07-21 17:42:19 -07005533 }
Marat Dukhan0e801372022-01-04 00:10:41 -08005534 xnn_params.f32.rndne = (struct vunary_parameters) {
5535 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__scalar_libm_x4,
5536 .element_tile = 4,
5537 };
5538 xnn_params.f32.rndz = (struct vunary_parameters) {
5539 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__scalar_libm_x4,
5540 .element_tile = 4,
5541 };
5542 xnn_params.f32.rndu = (struct vunary_parameters) {
5543 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__scalar_libm_x4,
5544 .element_tile = 4,
5545 };
5546 xnn_params.f32.rndd = (struct vunary_parameters) {
5547 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__scalar_libm_x4,
5548 .element_tile = 4,
5549 };
Marat Dukhance834ad2022-01-03 00:22:01 -08005550 xnn_params.f32.sigmoid = (struct vunary_parameters) {
5551 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__scalar_rr2_lut64_p2_div_x2,
5552 .init.f32_sigmoid = xnn_init_f32_sigmoid_scalar_rr2_lut64_p2_params,
5553 .element_tile = 2,
5554 };
Marat Dukhane5efb162021-12-31 10:26:13 -08005555 xnn_params.f32.sqr = (struct vunary_parameters) {
5556 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__scalar_x4,
5557 .element_tile = 4,
5558 };
Marat Dukhane72b2822021-12-30 14:46:58 -08005559 xnn_params.f32.sqrt = (struct vunary_parameters) {
5560 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__scalar_sqrt_x1,
5561 .element_tile = 1,
5562 };
Marat Dukhan7c1f8082020-06-25 13:26:20 -07005563 if (is_wasm_x86) {
5564 xnn_params.f32.prelu = (struct prelu_parameters) {
5565 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__scalar_2x4,
5566 .row_tile = 2,
5567 .channel_tile = 4,
5568 };
5569 } else {
5570 xnn_params.f32.prelu = (struct prelu_parameters) {
5571 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__wasm_2x4,
5572 .row_tile = 2,
5573 .channel_tile = 4,
5574 };
5575 }
Marat Dukhan4a5c7712022-01-05 22:43:13 -08005576 xnn_params.f32.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
5577 .ukernel = xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_p5_x4_acc2,
5578 .init = xnn_init_f32_expminus_scalar_rr2_p5_params,
5579 .element_tile = 4,
5580 };
Marat Dukhan1edc4542020-01-27 12:40:13 -08005581 xnn_params.f32.rmax = xnn_f32_rmax_ukernel__scalar;
Marat Dukhanb1a0fc32019-12-02 19:32:02 -08005582 xnn_params.f32.vadd = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07005583 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__wasm_x8,
5584 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasm_x8,
5585 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasm_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -08005586 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhanb1a0fc32019-12-02 19:32:02 -08005587 .element_tile = 8,
5588 };
Marat Dukhan69180502019-12-06 15:00:31 -08005589 xnn_params.f32.vdiv = (struct vbinary_parameters) {
Frank Barchardb392f8e2020-10-27 10:46:44 -07005590 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__wasm_x8,
5591 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__wasm_x8,
5592 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__wasm_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -08005593 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Frank Barchardb392f8e2020-10-27 10:46:44 -07005594 .element_tile = 8,
Marat Dukhan69180502019-12-06 15:00:31 -08005595 };
Marat Dukhan79e7f842019-12-05 14:35:50 -08005596 xnn_params.f32.vmax = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07005597 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__wasm_x8,
5598 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasm_x8,
5599 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasm_x8,
Marat Dukhan79e7f842019-12-05 14:35:50 -08005600 .element_tile = 8,
5601 };
5602 xnn_params.f32.vmin = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07005603 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__wasm_x8,
5604 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasm_x8,
5605 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasm_x8,
Marat Dukhan79e7f842019-12-05 14:35:50 -08005606 .element_tile = 8,
5607 };
Marat Dukhan1e782c42019-11-21 17:02:40 -08005608 xnn_params.f32.vmul = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07005609 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__wasm_x8,
5610 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasm_x8,
5611 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasm_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -08005612 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhanca2733c2019-11-15 23:21:17 -08005613 .element_tile = 8,
5614 };
Marat Dukhan05f3f6d2019-12-03 15:13:53 -08005615 xnn_params.f32.vsub = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07005616 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__wasm_x8,
5617 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__wasm_x8,
5618 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__wasm_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -08005619 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhan05f3f6d2019-12-03 15:13:53 -08005620 .element_tile = 8,
5621 };
Marat Dukhanf7399262020-06-05 10:58:44 -07005622 xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07005623 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__scalar_x8,
5624 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x8,
5625 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x8,
Marat Dukhanf7399262020-06-05 10:58:44 -07005626 .element_tile = 8,
5627 };
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005628 xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
Marat Dukhan9531e9f2020-07-24 15:25:02 -07005629 .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c1__wasm_2x,
Marat Dukhanc58453f2021-05-24 14:47:38 -07005630 .init.f32 = xnn_init_f32_minmax_scalar_params,
Marat Dukhan49e6ee92019-11-06 15:55:29 -08005631 .channel_tile = 1,
5632 .row_tile = 2,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005633 };
Marat Dukhanefc47b82019-11-18 09:25:38 -08005634 #ifndef XNN_NO_NCHW_OPERATORS
Marat Dukhancfbed0a2020-12-08 10:01:51 -08005635 init_flags |= XNN_INIT_FLAG_CHW_OPT;
5636
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005637 xnn_params.f32.spmm = (struct spmm_parameters) {
Marat Dukhan355ab432020-04-09 19:01:52 -07005638 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x1__scalar,
Marat Dukhanbff791e2019-10-24 11:05:37 -07005639 .mr = 8,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005640 .nr = 1,
5641 };
Erich Elsenc6afd9b2019-10-24 16:10:53 -07005642 xnn_params.f32.spmm2 = (struct spmm_parameters) {
Marat Dukhan355ab432020-04-09 19:01:52 -07005643 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x2__scalar,
Erich Elsenc6afd9b2019-10-24 16:10:53 -07005644 .mr = 8,
5645 .nr = 2,
5646 };
5647 xnn_params.f32.spmm4 = (struct spmm_parameters) {
Marat Dukhan355ab432020-04-09 19:01:52 -07005648 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x4__scalar,
Erich Elsenc6afd9b2019-10-24 16:10:53 -07005649 .mr = 8,
5650 .nr = 4,
5651 };
Marat Dukhan1f29b802020-05-15 23:46:39 -07005652 xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
Marat Dukhan14fe0b22019-10-23 21:20:07 -07005653 .ukernel_with_symm_padding =
Marat Dukhan1f29b802020-05-15 23:46:39 -07005654 (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__scalar_1x1,
Marat Dukhan14fe0b22019-10-23 21:20:07 -07005655 .output_channel_tile = 4,
5656 .output_height_tile = 1,
5657 .output_width_tile = 1,
5658 };
Marat Dukhanbf715f92020-10-23 20:17:00 -07005659 xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
Marat Dukhan91249d22020-10-24 12:02:51 -07005660 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1_acc2,
Marat Dukhan14fe0b22019-10-23 21:20:07 -07005661 .output_width_tile = 1,
Marat Dukhan91249d22020-10-24 12:02:51 -07005662 .output_height_tile = 2,
Marat Dukhan14fe0b22019-10-23 21:20:07 -07005663 };
Marat Dukhanbf715f92020-10-23 20:17:00 -07005664 xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
Marat Dukhancf5b3c32020-10-25 19:21:10 -07005665 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_1x1_acc2,
Marat Dukhan14fe0b22019-10-23 21:20:07 -07005666 .output_width_tile = 1,
5667 .output_height_tile = 1,
5668 };
Marat Dukhanbf715f92020-10-23 20:17:00 -07005669 xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
5670 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc5,
Marat Dukhana99918a2019-11-15 14:40:12 -08005671 .output_width_tile = 1,
5672 .output_height_tile = 1,
5673 };
Marat Dukhanbf715f92020-10-23 20:17:00 -07005674 xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
5675 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc5,
Marat Dukhana99918a2019-11-15 14:40:12 -08005676 .output_width_tile = 1,
5677 .output_height_tile = 1,
5678 };
Marat Dukhan1f29b802020-05-15 23:46:39 -07005679 xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
5680 .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__scalar_x1,
Marat Dukhan14fe0b22019-10-23 21:20:07 -07005681 .channel_tile = 1,
5682 };
Artsiom Ablavatski97918102020-10-27 15:52:59 -07005683 xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
5684 .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__scalar_p4,
5685 .channel_tile = 1,
5686 .pixel_tile = 4,
5687 };
Marat Dukhanefc47b82019-11-18 09:25:38 -08005688 #endif // XNN_NO_NCHW_OPERATORS
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005689 #endif // XNN_NO_F32_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07005690
Frank Barchardb40ee632021-12-30 11:10:02 -08005691 /*************************** VCVT WAsm micro-kernels **************************/
Marat Dukhanaf2ba002021-10-24 14:21:41 -07005692 #ifndef XNN_NO_VCVT_OPERATORS
5693 init_flags |= XNN_INIT_FLAG_VCVT;
5694
Marat Dukhan134f9842021-12-29 19:57:31 -08005695 xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
5696 .ukernel = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__scalar_x1,
5697 .init.f16_f32_cvt = xnn_init_f16_f32_cvt_scalar_params,
5698 .element_tile = 1,
5699 };
Marat Dukhanb7c1b712021-12-30 07:23:57 -08005700 xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
5701 .ukernel = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__scalar_bitcast_x4,
5702 .init.f32_f16_cvt = xnn_init_f32_f16_cvt_scalar_bitcast_params,
5703 .element_tile = 4,
5704 };
Marat Dukhan430b1732021-12-04 02:53:12 -08005705 if (is_wasm_x86) {
5706 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
Marat Dukhanbdf10992022-01-04 09:20:14 -08005707 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__scalar_imagic_x1,
5708 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_scalar_imagic_params,
Marat Dukhan430b1732021-12-04 02:53:12 -08005709 .element_tile = 1,
5710 };
5711 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
Marat Dukhanbdf10992022-01-04 09:20:14 -08005712 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__scalar_imagic_x1,
5713 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_scalar_imagic_params,
Marat Dukhan430b1732021-12-04 02:53:12 -08005714 .element_tile = 1,
5715 };
5716 } else {
5717 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
Marat Dukhanbdf10992022-01-04 09:20:14 -08005718 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__wasm_fmagic_x4,
5719 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_scalar_fmagic_params,
Marat Dukhan430b1732021-12-04 02:53:12 -08005720 .element_tile = 4,
5721 };
5722 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
Marat Dukhanbdf10992022-01-04 09:20:14 -08005723 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__wasm_fmagic_x4,
5724 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_scalar_fmagic_params,
Marat Dukhan430b1732021-12-04 02:53:12 -08005725 .element_tile = 4,
5726 };
5727 }
Marat Dukhanf92206b2021-12-10 17:02:07 -08005728 xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
5729 .ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__scalar_x1,
5730 .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_scalar_params,
5731 .element_tile = 1,
5732 };
5733 xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
5734 .ukernel = (xnn_univector_ukernel_function) xnn_qu8_f32_vcvt_ukernel__scalar_x1,
5735 .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_scalar_params,
5736 .element_tile = 1,
5737 };
Marat Dukhanaf2ba002021-10-24 14:21:41 -07005738 #endif // XNN_NO_VCVT_OPERATORS
5739
Frank Barchardb40ee632021-12-30 11:10:02 -08005740 /**************************** X32 WAsm micro-kernels ***************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005741 #ifndef XNN_NO_X32_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07005742 init_flags |= XNN_INIT_FLAG_X32;
5743
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005744 xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__scalar;
5745 xnn_params.x32.zip = (struct zip_parameters) {
5746 .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__scalar,
5747 .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__scalar,
5748 .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__scalar,
5749 .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__scalar,
5750 };
Artsiom Ablavatskibbe85062020-11-05 14:07:37 -08005751 #ifndef XNN_NO_NCHW_OPERATORS
Marat Dukhanad71b9a2020-11-20 00:01:51 -08005752 xnn_params.x32.depthtospace2d_chw2hwc = (struct depthtospace2d_chw2hwc_parameters) {
5753 .ukernel = (xnn_depthtospace2d_chw2hwc_ukernel_function) xnn_x32_depthtospace2d_chw2hwc_ukernel__scalar,
Artsiom Ablavatskibbe85062020-11-05 14:07:37 -08005754 .channel_tile = 1,
5755 .pixel_tile = 1,
5756 };
5757 #endif // XNN_NO_NCHW_OPERATORS
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005758 #endif // XNN_NO_X32_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07005759
Frank Barchardb40ee632021-12-30 11:10:02 -08005760 /**************************** XX WAsm micro-kernels ***************************/
Marat Dukhan933051b2021-08-07 16:26:15 -07005761 #ifndef XNN_NO_XX_OPERATORS
5762 init_flags |= XNN_INIT_FLAG_XX;
5763
5764 xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
5765 xnn_params.xx.fill = (struct fill_parameters) {
5766 .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__scalar_x16,
5767 .row_tile = 1,
5768 };
Marat Dukhan0461f2d2021-08-08 12:36:29 -07005769 xnn_params.xx.pad = (struct pad_parameters) {
5770 .ukernel = (xnn_pad_ukernel_function) xnn_xx_pad_ukernel__scalar,
5771 .row_tile = 1,
5772 };
Marat Dukhan933051b2021-08-07 16:26:15 -07005773 #endif // XNN_NO_XX_OPERATORS
5774
Marat Dukhane3cb19b2021-05-07 03:38:49 -07005775#elif XNN_ARCH_RISCV
5776
Marat Dukhana198f002022-01-04 18:45:11 -08005777 /************************** QC8 RISC-V micro-kernels **************************/
5778 #ifndef XNN_NO_QC8_OPERATORS
5779 init_flags |= XNN_INIT_FLAG_QC8;
5780
5781 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf);
5782 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_3x4__scalar_lrintf);
5783 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf);
5784 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4__scalar_lrintf);
5785 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_scalar_lrintf_params;
5786 xnn_params.qc8.gemm.mr = 3;
5787 xnn_params.qc8.gemm.nr = 4;
5788
5789 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up2x9__scalar_lrintf;
5790 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_scalar_lrintf_params;
5791 xnn_params.qc8.dwconv[0].channel_tile = 2;
5792 xnn_params.qc8.dwconv[0].primary_tile = 9;
5793 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up2x25__scalar_lrintf;
5794 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_scalar_lrintf_params;
5795 xnn_params.qc8.dwconv[1].channel_tile = 2;
5796 xnn_params.qc8.dwconv[1].primary_tile = 25;
5797 #endif // XNN_NO_QC8_OPERATORS
5798
5799 /************************** QS8 RISC-V micro-kernels **************************/
Marat Dukhan803c1f82021-05-12 00:13:37 -07005800 #ifndef XNN_NO_QS8_OPERATORS
5801 init_flags |= XNN_INIT_FLAG_QS8;
5802
Marat Dukhana198f002022-01-04 18:45:11 -08005803 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf);
5804 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_3x4__scalar_lrintf);
5805 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf);
5806 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4__scalar_lrintf);
5807 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params;
Marat Dukhan15a35c02021-05-12 11:40:03 -07005808 xnn_params.qs8.gemm.mr = 3;
Marat Dukhan803c1f82021-05-12 00:13:37 -07005809 xnn_params.qs8.gemm.nr = 4;
5810
Marat Dukhana198f002022-01-04 18:45:11 -08005811 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up2x9__scalar_lrintf;
5812 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params;
Marat Dukhan803c1f82021-05-12 00:13:37 -07005813 xnn_params.qs8.dwconv[0].channel_tile = 2;
5814 xnn_params.qs8.dwconv[0].primary_tile = 9;
Marat Dukhana198f002022-01-04 18:45:11 -08005815 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up2x25__scalar_lrintf;
5816 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params;
Marat Dukhan4ed14882021-05-12 17:50:40 -07005817 xnn_params.qs8.dwconv[1].channel_tile = 2;
5818 xnn_params.qs8.dwconv[1].primary_tile = 25;
Marat Dukhan803c1f82021-05-12 00:13:37 -07005819
5820 xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
Marat Dukhan847ff5e2022-01-11 20:31:06 -08005821 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1,
5822 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1,
Marat Dukhan53f41062022-01-11 19:44:57 -08005823 .init.qs8 = xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params,
5824 .update.qs8 = xnn_update_qs8_avgpool_minmax_fp32_scalar_imagic_params,
Marat Dukhanda382d12022-01-07 19:51:20 -08005825 .row_tile = 7,
5826 .channel_tile = 1,
Marat Dukhan803c1f82021-05-12 00:13:37 -07005827 };
5828
5829 xnn_params.qs8.vadd = (struct vbinary_parameters) {
5830 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__scalar_x4,
5831 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__scalar_x4,
5832 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__scalar_x4,
Marat Dukhan64287252021-09-07 16:20:03 -07005833 .init.qs8_addsub = xnn_init_qs8_add_minmax_scalar_params,
Marat Dukhan803c1f82021-05-12 00:13:37 -07005834 .element_tile = 4,
5835 };
Marat Dukhana198f002022-01-04 18:45:11 -08005836 xnn_params.qs8.vmul = (struct vbinary_parameters) {
5837 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_fp32_ukernel__scalar_x4,
5838 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_x4,
5839 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_x4,
5840 .init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_scalar_params,
5841 .element_tile = 4,
5842 };
Marat Dukhan803c1f82021-05-12 00:13:37 -07005843 #endif // XNN_NO_QS8_OPERATORS
5844
Marat Dukhana198f002022-01-04 18:45:11 -08005845 /************************** QU8 RISC-V micro-kernels **************************/
Marat Dukhane3cb19b2021-05-07 03:38:49 -07005846 #ifndef XNN_NO_QU8_OPERATORS
5847 init_flags |= XNN_INIT_FLAG_QU8;
5848
Marat Dukhana198f002022-01-04 18:45:11 -08005849 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf);
5850 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_3x4__scalar_lrintf);
5851 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf);
5852 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4__scalar_lrintf);
5853 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params;
5854 xnn_params.qu8.gemm.mr = 3;
5855 xnn_params.qu8.gemm.nr = 4;
Marat Dukhane3cb19b2021-05-07 03:38:49 -07005856
Marat Dukhana198f002022-01-04 18:45:11 -08005857 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up2x9__scalar_lrintf;
5858 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params;
5859 xnn_params.qu8.dwconv[0].channel_tile = 2;
Marat Dukhane3cb19b2021-05-07 03:38:49 -07005860 xnn_params.qu8.dwconv[0].primary_tile = 9;
Marat Dukhana198f002022-01-04 18:45:11 -08005861 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up2x25__scalar_lrintf;
5862 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params;
5863 xnn_params.qu8.dwconv[1].channel_tile = 2;
5864 xnn_params.qu8.dwconv[1].primary_tile = 25;
Marat Dukhane3cb19b2021-05-07 03:38:49 -07005865
5866 xnn_params.qu8.avgpool = (struct avgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08005867 .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__scalar_c1,
5868 .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__scalar_c1,
5869 .init.qu8 = xnn_init_qu8_avgpool_minmax_scalar_params,
5870 .primary_tile = 9,
5871 .incremental_tile = 8,
5872 .channel_tile = 1,
Marat Dukhane3cb19b2021-05-07 03:38:49 -07005873 };
5874 xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
Marat Dukhand1f53e42022-01-12 22:34:51 -08005875 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1,
5876 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1,
5877 .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params,
5878 .update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_scalar_imagic_params,
Marat Dukhanda382d12022-01-07 19:51:20 -08005879 .row_tile = 7,
5880 .channel_tile = 1,
Marat Dukhane3cb19b2021-05-07 03:38:49 -07005881 };
Marat Dukhandb007cd2021-07-20 23:42:39 -07005882
5883 xnn_params.qu8.vadd = (struct vbinary_parameters) {
5884 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__scalar_x4,
5885 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__scalar_x4,
5886 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__scalar_x4,
Marat Dukhan64287252021-09-07 16:20:03 -07005887 .init.qu8_addsub = xnn_init_qu8_add_minmax_scalar_params,
Marat Dukhandb007cd2021-07-20 23:42:39 -07005888 .element_tile = 4,
5889 };
Marat Dukhana198f002022-01-04 18:45:11 -08005890 xnn_params.qu8.vmul = (struct vbinary_parameters) {
5891 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_fp32_ukernel__scalar_x4,
5892 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_x4,
5893 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_x4,
5894 .init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_scalar_params,
5895 .element_tile = 4,
5896 };
Marat Dukhane3cb19b2021-05-07 03:38:49 -07005897 #endif // XNN_NO_QU8_OPERATORS
5898
Marat Dukhana198f002022-01-04 18:45:11 -08005899 /************************** S8 RISC-V micro-kernels ***************************/
5900 #ifndef XNN_NO_S8_OPERATORS
5901 init_flags |= XNN_INIT_FLAG_S8;
5902
5903 xnn_params.s8.clamp = (struct vunary_parameters) {
5904 .ukernel = (xnn_univector_ukernel_function) xnn_s8_vclamp_ukernel__scalar_x4,
5905 .init.s8_minmax = xnn_init_s8_minmax_scalar_params,
5906 .element_tile = 4,
5907 };
5908 xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
5909 .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__scalar_c1,
5910 .pixel_tile = 1,
5911 .channel_tile = 1,
5912 };
5913 xnn_params.s8.maxpool = (struct maxpool_parameters) {
5914 .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__scalar_c1,
5915 .init.s8 = xnn_init_s8_minmax_scalar_params,
5916 .mr = 9,
5917 .qr = 8,
5918 };
5919 #endif // XNN_NO_S8_OPERATORS
5920
5921 /************************** U8 RISC-V micro-kernels ***************************/
Marat Dukhane3cb19b2021-05-07 03:38:49 -07005922 #ifndef XNN_NO_U8_OPERATORS
5923 init_flags |= XNN_INIT_FLAG_U8;
5924
Marat Dukhan94912792021-08-16 21:40:30 -07005925 xnn_params.u8.clamp = (struct vunary_parameters) {
5926 .ukernel = (xnn_univector_ukernel_function) xnn_u8_vclamp_ukernel__scalar_x4,
5927 .init.u8_minmax = xnn_init_u8_minmax_scalar_params,
5928 .element_tile = 4,
5929 };
Marat Dukhana198f002022-01-04 18:45:11 -08005930 xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
5931 .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__scalar_c1,
5932 .pixel_tile = 1,
5933 .channel_tile = 1,
5934 };
Marat Dukhane3cb19b2021-05-07 03:38:49 -07005935 xnn_params.u8.maxpool = (struct maxpool_parameters) {
5936 .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__scalar_c1,
Marat Dukhan91ae1652021-08-15 19:19:49 -07005937 .init.u8 = xnn_init_u8_minmax_scalar_params,
Marat Dukhane3cb19b2021-05-07 03:38:49 -07005938 .mr = 9,
5939 .qr = 8,
5940 };
Marat Dukhane3cb19b2021-05-07 03:38:49 -07005941 xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
5942 xnn_params.u8.rmax = xnn_u8_rmax_ukernel__scalar;
5943 #endif // XNN_NO_U8_OPERATORS
5944
Marat Dukhana198f002022-01-04 18:45:11 -08005945 /************************** X8 RISC-V micro-kernels ***************************/
Marat Dukhane3cb19b2021-05-07 03:38:49 -07005946 #ifndef XNN_NO_X8_OPERATORS
5947 init_flags |= XNN_INIT_FLAG_X8;
5948
Marat Dukhand67539d2021-09-08 23:06:03 -07005949 xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar_x4;
Marat Dukhane3cb19b2021-05-07 03:38:49 -07005950 xnn_params.x8.zip = (struct zip_parameters) {
5951 .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__scalar,
5952 .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__scalar,
5953 .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__scalar,
5954 .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__scalar,
5955 };
5956 #endif // XNN_NO_X8_OPERATORS
5957
Marat Dukhana198f002022-01-04 18:45:11 -08005958 /************************** F32 RISC-V micro-kernels **************************/
Marat Dukhane3cb19b2021-05-07 03:38:49 -07005959 #ifndef XNN_NO_F32_OPERATORS
5960 init_flags |= XNN_INIT_FLAG_F32;
5961
5962 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x4__scalar);
5963 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x4__scalar);
5964 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x4__scalar);
5965 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x4__scalar);
5966 xnn_params.f32.gemm.relu.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_4x4__scalar);
5967 xnn_params.f32.gemm.relu.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_4x4__scalar);
5968 xnn_params.f32.gemm.relu.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x4__scalar);
5969 xnn_params.f32.gemm.relu.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x4__scalar);
5970 xnn_params.f32.gemm.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x4__scalar);
5971 xnn_params.f32.gemm.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x4__scalar);
5972 xnn_params.f32.gemm.linear.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x4__scalar);
5973 xnn_params.f32.gemm.linear.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x4__scalar);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07005974 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhane3cb19b2021-05-07 03:38:49 -07005975 xnn_params.f32.gemm.mr = 4;
5976 xnn_params.f32.gemm.nr = 4;
5977
5978 xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2__scalar);
5979 xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2__scalar),
5980 xnn_params.f32.gemm2.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x2__scalar);
5981 xnn_params.f32.gemm2.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2__scalar),
Marat Dukhan104ae5e2021-05-24 13:41:57 -07005982 xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhane3cb19b2021-05-07 03:38:49 -07005983 xnn_params.f32.gemm2.mr = 4;
5984 xnn_params.f32.gemm2.nr = 2;
5985
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07005986 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x3__scalar_acc2;
5987 xnn_params.f32.dwconv[0].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x3__scalar_acc2;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07005988 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhane3cb19b2021-05-07 03:38:49 -07005989 xnn_params.f32.dwconv[0].channel_tile = 1;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07005990 xnn_params.f32.dwconv[0].primary_tile = 3;
Marat Dukhane3cb19b2021-05-07 03:38:49 -07005991
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07005992 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x4__scalar_acc2;
5993 xnn_params.f32.dwconv[1].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x4__scalar_acc2;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07005994 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhane3cb19b2021-05-07 03:38:49 -07005995 xnn_params.f32.dwconv[1].channel_tile = 1;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07005996 xnn_params.f32.dwconv[1].primary_tile = 4;
Marat Dukhane3cb19b2021-05-07 03:38:49 -07005997
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07005998 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x9__scalar_acc2;
5999 xnn_params.f32.dwconv[2].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x9__scalar_acc2;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07006000 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006001 xnn_params.f32.dwconv[2].channel_tile = 1;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07006002 xnn_params.f32.dwconv[2].primary_tile = 9;
6003
6004 xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x25__scalar_acc2;
6005 xnn_params.f32.dwconv[3].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x25__scalar_acc2;
6006 xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_scalar_params;
6007 xnn_params.f32.dwconv[3].channel_tile = 1;
6008 xnn_params.f32.dwconv[3].primary_tile = 25;
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006009
6010 xnn_params.f32.avgpool = (struct avgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08006011 .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__scalar_c1,
6012 .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__scalar_c1,
6013 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
6014 .primary_tile = 9,
6015 .incremental_tile = 8,
6016 .channel_tile = 1,
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006017 };
6018 xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08006019 .unipass = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__scalar_c1,
6020 .multipass = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__scalar_c1,
6021 .primary_tile = 9,
6022 .incremental_tile = 8,
6023 .channel_tile = 1,
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006024 };
6025 xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08006026 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__scalar_c1,
6027 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__scalar_c1,
6028 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
6029 .update.f32 = xnn_update_f32_scaleminmax_scalar_params,
6030 .row_tile = 7,
6031 .channel_tile = 1,
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006032 };
6033 xnn_params.f32.maxpool = (struct maxpool_parameters) {
6034 .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__scalar_c1,
Marat Dukhan91ae1652021-08-15 19:19:49 -07006035 .init.f32 = xnn_init_f32_minmax_scalar_params,
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006036 .mr = 9,
6037 .qr = 8,
6038 };
6039 xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
6040 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__scalar_c1,
6041 .mr = 4,
6042 };
6043 xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
6044 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__scalar_c1,
6045 .mr = 9,
6046 };
6047 xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
6048 .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1,
6049 .mr = 9,
6050 .qr = 8,
6051 };
6052 xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
6053 .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__scalar_c2,
6054 .pixel_tile = 1,
6055 .channel_tile = 2,
6056 };
Marat Dukhane5efb162021-12-31 10:26:13 -08006057 xnn_params.f32.abs = (struct vunary_parameters) {
6058 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__scalar_x4,
6059 .element_tile = 4,
6060 };
Marat Dukhana198f002022-01-04 18:45:11 -08006061 xnn_params.f32.clamp = (struct vunary_parameters) {
6062 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vclamp_ukernel__scalar_x4,
6063 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
6064 .element_tile = 4,
6065 };
6066 xnn_params.f32.elu = (struct vunary_parameters) {
6067 .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x4,
6068 .init.f32_elu = xnn_init_f32_elu_scalar_rr2_lut16_p3_params,
6069 .element_tile = 4,
6070 };
Marat Dukhan561d0682021-12-23 16:12:35 -08006071 xnn_params.f32.hswish = (struct vunary_parameters) {
6072 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__scalar_x4,
Marat Dukhan0d10cc72021-12-23 19:49:19 -08006073 .init.f32_hswish = xnn_init_f32_hswish_scalar_params,
Marat Dukhan561d0682021-12-23 16:12:35 -08006074 .element_tile = 4,
6075 };
Marat Dukhana198f002022-01-04 18:45:11 -08006076 xnn_params.f32.lrelu = (struct vunary_parameters) {
6077 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__scalar_x4,
6078 .init.f32_lrelu = xnn_init_f32_lrelu_scalar_params,
6079 .element_tile = 4,
Marat Dukhan4a79ff22022-01-01 12:16:48 -08006080 };
Marat Dukhane5efb162021-12-31 10:26:13 -08006081 xnn_params.f32.neg = (struct vunary_parameters) {
6082 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__scalar_x4,
6083 .element_tile = 4,
6084 };
Marat Dukhan0e801372022-01-04 00:10:41 -08006085 xnn_params.f32.rndne = (struct vunary_parameters) {
Marat Dukhana198f002022-01-04 18:45:11 -08006086 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__scalar_libm_x1,
6087 .element_tile = 1,
Marat Dukhan0e801372022-01-04 00:10:41 -08006088 };
6089 xnn_params.f32.rndz = (struct vunary_parameters) {
Marat Dukhana198f002022-01-04 18:45:11 -08006090 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__scalar_libm_x1,
6091 .element_tile = 1,
Marat Dukhan0e801372022-01-04 00:10:41 -08006092 };
6093 xnn_params.f32.rndu = (struct vunary_parameters) {
Marat Dukhana198f002022-01-04 18:45:11 -08006094 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__scalar_libm_x1,
6095 .element_tile = 1,
Marat Dukhan0e801372022-01-04 00:10:41 -08006096 };
6097 xnn_params.f32.rndd = (struct vunary_parameters) {
Marat Dukhana198f002022-01-04 18:45:11 -08006098 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__scalar_libm_x1,
6099 .element_tile = 1,
Marat Dukhan0e801372022-01-04 00:10:41 -08006100 };
Marat Dukhance834ad2022-01-03 00:22:01 -08006101 xnn_params.f32.sigmoid = (struct vunary_parameters) {
6102 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__scalar_rr2_lut64_p2_div_x2,
6103 .init.f32_sigmoid = xnn_init_f32_sigmoid_scalar_rr2_lut64_p2_params,
6104 .element_tile = 2,
6105 };
Marat Dukhane5efb162021-12-31 10:26:13 -08006106 xnn_params.f32.sqr = (struct vunary_parameters) {
6107 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__scalar_x4,
6108 .element_tile = 4,
6109 };
Marat Dukhane72b2822021-12-30 14:46:58 -08006110 xnn_params.f32.sqrt = (struct vunary_parameters) {
6111 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__scalar_sqrt_x1,
6112 .element_tile = 1,
6113 };
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006114 xnn_params.f32.prelu = (struct prelu_parameters) {
6115 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__scalar_2x4,
Marat Dukhana198f002022-01-04 18:45:11 -08006116 .row_tile = 4,
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006117 .channel_tile = 4,
6118 };
Marat Dukhan4a5c7712022-01-05 22:43:13 -08006119 xnn_params.f32.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
6120 .ukernel = xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_p5_x4_acc2,
6121 .init = xnn_init_f32_expminus_scalar_rr2_p5_params,
6122 .element_tile = 4,
6123 };
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006124 xnn_params.f32.rmax = xnn_f32_rmax_ukernel__scalar;
6125 xnn_params.f32.vadd = (struct vbinary_parameters) {
6126 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__scalar_x8,
6127 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__scalar_x8,
6128 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__scalar_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -08006129 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006130 .element_tile = 8,
6131 };
6132 xnn_params.f32.vdiv = (struct vbinary_parameters) {
Marat Dukhana198f002022-01-04 18:45:11 -08006133 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__scalar_x2,
6134 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__scalar_x2,
6135 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__scalar_x2,
Marat Dukhanf6004972021-12-30 11:23:02 -08006136 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhana198f002022-01-04 18:45:11 -08006137 .element_tile = 2,
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006138 };
6139 xnn_params.f32.vmax = (struct vbinary_parameters) {
6140 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__scalar_x8,
6141 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__scalar_x8,
6142 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__scalar_x8,
6143 .element_tile = 8,
6144 };
6145 xnn_params.f32.vmin = (struct vbinary_parameters) {
6146 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__scalar_x8,
6147 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__scalar_x8,
6148 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__scalar_x8,
6149 .element_tile = 8,
6150 };
6151 xnn_params.f32.vmul = (struct vbinary_parameters) {
6152 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__scalar_x8,
6153 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__scalar_x8,
6154 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__scalar_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -08006155 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006156 .element_tile = 8,
6157 };
6158 xnn_params.f32.vsub = (struct vbinary_parameters) {
6159 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__scalar_x8,
6160 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__scalar_x8,
6161 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__scalar_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -08006162 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006163 .element_tile = 8,
6164 };
6165 xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
6166 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__scalar_x8,
6167 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x8,
6168 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x8,
6169 .element_tile = 8,
6170 };
6171 xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
6172 .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c1__scalar_2x,
Marat Dukhanc58453f2021-05-24 14:47:38 -07006173 .init.f32 = xnn_init_f32_minmax_scalar_params,
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006174 .channel_tile = 1,
6175 .row_tile = 2,
6176 };
6177 #ifndef XNN_NO_NCHW_OPERATORS
6178 init_flags |= XNN_INIT_FLAG_CHW_OPT;
6179
6180 xnn_params.f32.spmm = (struct spmm_parameters) {
6181 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x1__scalar,
6182 .mr = 8,
6183 .nr = 1,
6184 };
6185 xnn_params.f32.spmm2 = (struct spmm_parameters) {
6186 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x2__scalar,
6187 .mr = 8,
6188 .nr = 2,
6189 };
6190 xnn_params.f32.spmm4 = (struct spmm_parameters) {
6191 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x4__scalar,
6192 .mr = 8,
6193 .nr = 4,
6194 };
6195 xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
6196 .ukernel_with_symm_padding =
6197 (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__scalar_1x1,
6198 .output_channel_tile = 4,
6199 .output_height_tile = 1,
6200 .output_width_tile = 1,
6201 };
6202 xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
6203 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1_acc2,
6204 .output_width_tile = 1,
6205 .output_height_tile = 2,
6206 };
6207 xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
6208 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_1x1_acc2,
6209 .output_width_tile = 1,
6210 .output_height_tile = 1,
6211 };
6212 xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
6213 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc5,
6214 .output_width_tile = 1,
6215 .output_height_tile = 1,
6216 };
6217 xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
6218 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc5,
6219 .output_width_tile = 1,
6220 .output_height_tile = 1,
6221 };
6222 xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
6223 .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__scalar_x1,
6224 .channel_tile = 1,
6225 };
6226 xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
6227 .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__scalar_p4,
6228 .channel_tile = 1,
6229 .pixel_tile = 4,
6230 };
6231 #endif // XNN_NO_NCHW_OPERATORS
6232 #endif // XNN_NO_F32_OPERATORS
6233
Marat Dukhana198f002022-01-04 18:45:11 -08006234 /************************** VCVT RISC-V micro-kernels *************************/
6235 #ifndef XNN_NO_VCVT_OPERATORS
6236 init_flags |= XNN_INIT_FLAG_VCVT;
6237
6238 xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
6239 .ukernel = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__scalar_x4,
6240 .init.f16_f32_cvt = xnn_init_f16_f32_cvt_scalar_params,
6241 .element_tile = 4,
6242 };
6243 xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
6244 .ukernel = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__scalar_fabsf_x2,
6245 .init.f32_f16_cvt = xnn_init_f32_f16_cvt_scalar_fabsf_params,
6246 .element_tile = 2,
6247 };
6248 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
6249 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__scalar_lrintf_x4,
6250 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_scalar_lrintf_params,
6251 .element_tile = 4,
6252 };
6253 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
6254 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__scalar_lrintf_x4,
6255 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_scalar_lrintf_params,
6256 .element_tile = 4,
6257 };
6258 xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
6259 .ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__scalar_x4,
6260 .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_scalar_params,
6261 .element_tile = 4,
6262 };
6263 xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
6264 .ukernel = (xnn_univector_ukernel_function) xnn_qu8_f32_vcvt_ukernel__scalar_x4,
6265 .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_scalar_params,
6266 .element_tile = 4,
6267 };
6268 #endif // XNN_NO_VCVT_OPERATORS
6269
6270 /************************** X32 RISC-V micro-kernels **************************/
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006271 #ifndef XNN_NO_X32_OPERATORS
6272 init_flags |= XNN_INIT_FLAG_X32;
6273
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006274 xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__scalar;
6275 xnn_params.x32.zip = (struct zip_parameters) {
6276 .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__scalar,
6277 .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__scalar,
6278 .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__scalar,
6279 .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__scalar,
6280 };
6281 #ifndef XNN_NO_NCHW_OPERATORS
6282 xnn_params.x32.depthtospace2d_chw2hwc = (struct depthtospace2d_chw2hwc_parameters) {
6283 .ukernel = (xnn_depthtospace2d_chw2hwc_ukernel_function) xnn_x32_depthtospace2d_chw2hwc_ukernel__scalar,
6284 .channel_tile = 1,
6285 .pixel_tile = 1,
6286 };
6287 #endif // XNN_NO_NCHW_OPERATORS
6288 #endif // XNN_NO_X32_OPERATORS
6289
Marat Dukhana198f002022-01-04 18:45:11 -08006290 /************************** XX RISC-V micro-kernels ***************************/
Marat Dukhan0461f2d2021-08-08 12:36:29 -07006291 #ifndef XNN_NO_XX_OPERATORS
6292 init_flags |= XNN_INIT_FLAG_XX;
6293
6294 xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
6295 xnn_params.xx.fill = (struct fill_parameters) {
6296 .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__scalar_x16,
6297 .row_tile = 1,
6298 };
6299 xnn_params.xx.pad = (struct pad_parameters) {
6300 .ukernel = (xnn_pad_ukernel_function) xnn_xx_pad_ukernel__scalar,
6301 .row_tile = 1,
6302 };
Marat Dukhana198f002022-01-04 18:45:11 -08006303 #endif // XNN_NO_XX_OPERATORS
Marat Dukhan0461f2d2021-08-08 12:36:29 -07006304
XNNPACK Teamb455b122019-09-27 18:10:33 -07006305#else
6306 #error "Unsupported architecture"
6307#endif
Marat Dukhan496389f2021-04-07 15:47:12 -07006308
6309 memcpy(&xnn_params.allocator, init_allocator, sizeof(struct xnn_allocator));
Marat Dukhan854fb6b2020-06-19 12:33:44 -07006310 xnn_params.init_flags = init_flags;
XNNPACK Teamb455b122019-09-27 18:10:33 -07006311}
6312
#if XNN_PLATFORM_WINDOWS
  // Callback handed to InitOnceExecuteOnce() by xnn_initialize(): it adapts the
  // argument-less init() routine to the Win32 one-time-initialization callback
  // signature. None of the callback arguments are consulted, and the callback
  // always reports success by returning TRUE.
  static BOOL CALLBACK init_windows(PINIT_ONCE init_once, PVOID parameter, PVOID* context) {
    (void) init_once;
    (void) parameter;
    (void) context;

    init();
    return TRUE;
  }
#endif  // XNN_PLATFORM_WINDOWS
6319
Marat Dukhan04f03be2019-11-19 12:36:47 -08006320enum xnn_status xnn_initialize(const struct xnn_allocator* allocator) {
Marat Dukhana198f002022-01-04 18:45:11 -08006321 #if !XNN_PLATFORM_WEB && !XNN_ARCH_RISCV
Marat Dukhand343c222019-10-07 09:22:14 -07006322 if (!cpuinfo_initialize()) {
6323 return xnn_status_out_of_memory;
6324 }
Marat Dukhana198f002022-01-04 18:45:11 -08006325 #endif // !XNN_PLATFORM_WEB && !XNN_ARCH_RISCV
Marat Dukhan496389f2021-04-07 15:47:12 -07006326 if (allocator == NULL) {
6327 allocator = &xnn_default_allocator;
6328 }
6329 #ifdef _MSC_VER
Marat Dukhandf94d982021-06-01 12:21:33 -07006330 _InterlockedCompareExchangePointer((PVOID volatile*) &init_allocator, (PVOID) allocator, NULL);
Marat Dukhan496389f2021-04-07 15:47:12 -07006331 #else
6332 __sync_bool_compare_and_swap(&init_allocator, NULL, allocator);
6333 #endif
Zhi An Ng0db15d32021-12-10 16:45:06 -08006334 #if XNN_PLATFORM_WINDOWS
Marat Dukhan57133c02020-04-13 00:54:59 -07006335 InitOnceExecuteOnce(&init_guard, &init_windows, NULL, NULL);
6336 #else
6337 pthread_once(&init_guard, &init);
6338 #endif
Marat Dukhan854fb6b2020-06-19 12:33:44 -07006339 if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) != 0) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07006340 return xnn_status_success;
6341 } else {
6342 return xnn_status_unsupported_hardware;
6343 }
6344}
6345
6346enum xnn_status xnn_deinitialize(void) {
Marat Dukhana198f002022-01-04 18:45:11 -08006347 #if !XNN_PLATFORM_WEB && !XNN_ARCH_RISCV
Marat Dukhand343c222019-10-07 09:22:14 -07006348 cpuinfo_deinitialize();
Marat Dukhana198f002022-01-04 18:45:11 -08006349 #endif // !XNN_PLATFORM_WEB && !XNN_ARCH_RISCV
XNNPACK Teamb455b122019-09-27 18:10:33 -07006350 return xnn_status_success;
Marat Dukhan3b9b4bc2021-08-24 00:23:07 -07006351}