blob: da6d2ccde168619d771bec0038ed94f24cee67fa [file] [log] [blame]
XNNPACK Teamb455b122019-09-27 18:10:33 -07001// Copyright (c) Facebook, Inc. and its affiliates.
2// All rights reserved.
3//
4// Copyright 2019 Google LLC
5//
6// This source code is licensed under the BSD-style license found in the
7// LICENSE file in the root directory of this source tree.
8
Marat Dukhan01849012020-04-27 19:28:32 -07009#include <math.h>
XNNPACK Teamb455b122019-09-27 18:10:33 -070010#include <stdbool.h>
11#include <stddef.h>
12#include <stdint.h>
Marat Dukhan04f03be2019-11-19 12:36:47 -080013#include <string.h>
XNNPACK Teamb455b122019-09-27 18:10:33 -070014
Marat Dukhan57133c02020-04-13 00:54:59 -070015#ifdef _WIN32
16 #include <windows.h>
17#else
18 #include <pthread.h>
19#endif
XNNPACK Teamb455b122019-09-27 18:10:33 -070020
Marat Dukhan496389f2021-04-07 15:47:12 -070021#ifdef _MSC_VER
22 #include <intrin.h>
23#endif
24
Marat Dukhand343c222019-10-07 09:22:14 -070025#ifndef __EMSCRIPTEN__
26 #include <cpuinfo.h>
27#endif
XNNPACK Teamb455b122019-09-27 18:10:33 -070028
29#include <xnnpack.h>
Marat Dukhan496389f2021-04-07 15:47:12 -070030#include <xnnpack/allocator.h>
XNNPACK Teamb455b122019-09-27 18:10:33 -070031#include <xnnpack/argmaxpool.h>
32#include <xnnpack/avgpool.h>
Marat Dukhan1dadbf72019-10-01 10:46:20 -070033#include <xnnpack/common.h>
XNNPACK Teamb455b122019-09-27 18:10:33 -070034#include <xnnpack/conv.h>
35#include <xnnpack/dwconv.h>
Artsiom Ablavatskibbe85062020-11-05 14:07:37 -080036#include <xnnpack/depthtospace.h>
XNNPACK Teamb455b122019-09-27 18:10:33 -070037#include <xnnpack/gavgpool.h>
38#include <xnnpack/gemm.h>
Marat Dukhan4662b192020-05-21 15:52:03 -070039#include <xnnpack/fill.h>
Marat Dukhan660fd192020-03-10 04:55:30 -070040#include <xnnpack/ibilinear.h>
XNNPACK Teamb455b122019-09-27 18:10:33 -070041#include <xnnpack/igemm.h>
42#include <xnnpack/log.h>
43#include <xnnpack/lut.h>
44#include <xnnpack/maxpool.h>
45#include <xnnpack/pad.h>
46#include <xnnpack/params.h>
Marat Dukhanc5a7a392021-05-21 16:04:31 -070047#include <xnnpack/params-init.h>
XNNPACK Teamb455b122019-09-27 18:10:33 -070048#include <xnnpack/pavgpool.h>
49#include <xnnpack/prelu.h>
Marat Dukhan1edc4542020-01-27 12:40:13 -080050#include <xnnpack/raddstoreexpminusmax.h>
XNNPACK Teamb455b122019-09-27 18:10:33 -070051#include <xnnpack/rmax.h>
52#include <xnnpack/spmm.h>
53#include <xnnpack/unpool.h>
Marat Dukhan64287252021-09-07 16:20:03 -070054#include <xnnpack/vaddsub.h>
Marat Dukhan1e782c42019-11-21 17:02:40 -080055#include <xnnpack/vbinary.h>
Marat Dukhanaf2ba002021-10-24 14:21:41 -070056#include <xnnpack/vcvt.h>
Marat Dukhan0853b8a2021-08-03 01:01:53 -070057#include <xnnpack/vmul.h>
XNNPACK Teamb455b122019-09-27 18:10:33 -070058#include <xnnpack/vmulcaddc.h>
Marat Dukhan1e782c42019-11-21 17:02:40 -080059#include <xnnpack/vunary.h>
XNNPACK Teamb455b122019-09-27 18:10:33 -070060#include <xnnpack/zip.h>
61
// Default XNN_ENABLE_ASSEMBLY to 1 when nothing has defined it before this point.
// init() tests it with #if: the true branch picks per-uarch "__aarch32_*"
// micro-kernels for the main GEMM/IGEMM entries, the #else branch picks
// kernels without that suffix.
62#ifndef XNN_ENABLE_ASSEMBLY
63 #define XNN_ENABLE_ASSEMBLY 1
64#endif
65
// Guard object for one-time initialization: an INIT_ONCE on Windows builds,
// a pthread_once_t everywhere else, each set to its static initializer.
// NOTE(review): the code that consumes this guard is outside this view --
// presumably InitOnceExecuteOnce()/pthread_once() running init(); confirm.
// NOTE(review): the headers above are selected with `#ifdef _WIN32`, while this
// declaration uses `#if XNN_PLATFORM_WINDOWS`; verify the two always agree.
Zhi An Ng0db15d32021-12-10 16:45:06 -080066#if XNN_PLATFORM_WINDOWS
Marat Dukhan57133c02020-04-13 00:54:59 -070067 static INIT_ONCE init_guard = INIT_ONCE_STATIC_INIT;
68#else
69 static pthread_once_t init_guard = PTHREAD_ONCE_INIT;
70#endif
XNNPACK Teamb455b122019-09-27 18:10:33 -070071
// File-local allocator pointer, NULL until something assigns it. The pointer
// itself is volatile; the struct xnn_allocator it points to is const.
// NOTE(review): no reader or writer appears in this part of the file --
// presumably set during library initialization; confirm against the caller.
Marat Dukhan496389f2021-04-07 15:47:12 -070072static const struct xnn_allocator* volatile init_allocator = NULL;
73
// Global XNNPACK parameter table (external linkage) that init() fills in.
// Only init_flags is spelled out; C zero-initializes every member that is not
// named, and init() depends on that: it reads qc8/qs8 gemm.log2_kr and compares
// it against 0 on code paths that never assign it.
XNNPACK Teamb455b122019-09-27 18:10:33 -070074struct xnn_parameters xnn_params = {
// No XNN_INIT_FLAG_* bits are set at load time.
Marat Dukhan854fb6b2020-06-19 12:33:44 -070075 .init_flags = 0
XNNPACK Teamb455b122019-09-27 18:10:33 -070076};
77
Marat Dukhan01849012020-04-27 19:28:32 -070078static void init(void) {
Marat Dukhan4c617792021-12-21 15:47:58 -080079#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
Marat Dukhan01849012020-04-27 19:28:32 -070080 // Unlike most other architectures, on x86/x86-64 when floating-point instructions
81 // have no NaN arguments, but produce NaN output, the output NaN has sign bit set.
82 // We use it to distinguish x86/x86-64 from other architectures, by doing subtraction
83 // of two infinities (must produce NaN per IEEE 754 standard).
84 static const volatile float inf = INFINITY;
85 const bool is_wasm_x86 = signbit(inf - inf);
XNNPACK Teamb455b122019-09-27 18:10:33 -070086#endif
Marat Dukhan854fb6b2020-06-19 12:33:44 -070087 uint32_t init_flags = XNN_INIT_FLAG_XNNPACK;
XNNPACK Teamb455b122019-09-27 18:10:33 -070088
Marat Dukhan1dadbf72019-10-01 10:46:20 -070089#if XNN_ARCH_ARM
Frank Barchardbcdb1c12020-05-11 14:13:20 -070090 #if XNN_PLATFORM_MOBILE
Marat Dukhan3b745a42020-05-10 21:43:25 -070091 if (!cpuinfo_has_arm_neon()) {
92 xnn_log_error("XNNPACK initialization failed: NEON is not supported");
93 return;
94 }
95 #else
96 if (!cpuinfo_has_arm_vfpv2() && !cpuinfo_has_arm_vfpv3()) {
97 xnn_log_error("XNNPACK initialization failed: VFP is not supported");
98 return;
99 }
100 #endif
XNNPACK Teamb455b122019-09-27 18:10:33 -0700101
Marat Dukhan3b745a42020-05-10 21:43:25 -0700102 if (cpuinfo_has_arm_neon()) {
Frank Barchardb40ee632021-12-30 11:10:02 -0800103 /**************************** QC8 AArch32 micro-kernels ****************************/
Marat Dukhan898d5852021-06-30 21:18:34 -0700104 #ifndef XNN_NO_QC8_OPERATORS
105 init_flags |= XNN_INIT_FLAG_QC8;
106
Frank Barchardf290a142022-01-05 01:08:37 -0800107 #if XNN_ENABLE_ASSEMBLY
108 if (!XNN_PLATFORM_IOS && cpuinfo_has_arm_neon_dot()) {
Frank Barchardba5091f2022-01-25 13:31:26 -0800109 switch (cpuinfo_get_uarch(0)->uarch) {
110 case cpuinfo_uarch_cortex_a55:
111 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_cortex_a55);
Frank Barchard6cc5b482022-01-26 17:01:41 -0800112 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_cortex_a55);
Frank Barchardba5091f2022-01-25 13:31:26 -0800113 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neondot);
114 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neondot);
115 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
116 xnn_params.qc8.gemm.mr = 4;
117 xnn_params.qc8.gemm.nr = 8;
118 xnn_params.qc8.gemm.log2_kr = 2;
119 break;
120 default:
121 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_ld64);
122 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_ld64);
123 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neondot);
124 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neondot);
125 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
126 xnn_params.qc8.gemm.mr = 4;
127 xnn_params.qc8.gemm.nr = 8;
128 xnn_params.qc8.gemm.log2_kr = 2;
129 break;
130 }
Frank Barchardf290a142022-01-05 01:08:37 -0800131 } else {
132 switch (cpuinfo_get_uarch(0)->uarch) {
Frank Barchard101271e2022-02-02 01:49:54 -0800133 case cpuinfo_uarch_cortex_a7:
Frank Barchard2991acf2022-02-02 20:34:57 -0800134 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7);
Frank Barchard101271e2022-02-02 01:49:54 -0800135 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64);
136 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane);
137 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__neon_mlal_lane);
138 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neon_params;
139 xnn_params.qc8.gemm.mr = 4;
140 xnn_params.qc8.gemm.nr = 8;
141 break;
Frank Barchard2991acf2022-02-02 20:34:57 -0800142 case cpuinfo_uarch_cortex_a35:
143 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7);
144 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
145 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane);
146 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__neon_mlal_lane);
147 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neon_params;
148 xnn_params.qc8.gemm.mr = 4;
149 xnn_params.qc8.gemm.nr = 8;
150 break;
Frank Barchardf290a142022-01-05 01:08:37 -0800151 case cpuinfo_uarch_cortex_a53:
Frank Barchard0455acf2022-02-02 00:51:40 -0800152 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_cortex_a53);
153 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64);
154 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane);
155 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane);
156 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
157 xnn_params.qc8.gemm.mr = 4;
158 xnn_params.qc8.gemm.nr = 8;
159 break;
Frank Barchard101271e2022-02-02 01:49:54 -0800160 case cpuinfo_uarch_cortex_a55r0:
161 case cpuinfo_uarch_kryo:
162 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_cortex_a53);
163 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64);
164 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane);
165 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane);
166 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
167 xnn_params.qc8.gemm.mr = 4;
168 xnn_params.qc8.gemm.nr = 8;
169 break;
Frank Barchardf290a142022-01-05 01:08:37 -0800170 case cpuinfo_uarch_cortex_a72:
171 case cpuinfo_uarch_exynos_m1:
172 case cpuinfo_uarch_exynos_m2:
173 case cpuinfo_uarch_exynos_m3:
Frank Barchardf290a142022-01-05 01:08:37 -0800174 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64);
175 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64);
176 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane);
177 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane);
178 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
179 xnn_params.qc8.gemm.mr = 4;
180 xnn_params.qc8.gemm.nr = 8;
181 break;
182
183 default:
184 if (cpuinfo_has_arm_neon_v8()) {
185 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64);
186 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64);
187 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane);
188 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane);
189 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
190 xnn_params.qc8.gemm.mr = 4;
191 xnn_params.qc8.gemm.nr = 8;
192 } else {
Frank Barchardd2e8d4d2022-01-14 17:18:53 -0800193 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
194 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
195 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane);
196 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__neon_mlal_lane);
Frank Barchardf290a142022-01-05 01:08:37 -0800197 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neon_params;
Frank Barchardd2e8d4d2022-01-14 17:18:53 -0800198 xnn_params.qc8.gemm.mr = 4;
Frank Barchardf290a142022-01-05 01:08:37 -0800199 xnn_params.qc8.gemm.nr = 8;
Frank Barchardf290a142022-01-05 01:08:37 -0800200 }
201 break;
202 }
203 }
Frank Barchardba5091f2022-01-25 13:31:26 -0800204 #if XNN_MAX_UARCH_TYPES > 1
205 {
206 /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
207 const uint32_t mr = xnn_params.qc8.gemm.mr;
208 const uint32_t nr = xnn_params.qc8.gemm.nr;
209 const uint32_t log2_kr = xnn_params.qc8.gemm.log2_kr;
210 for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
211 const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
212 if (uarch_info == NULL) {
213 /* No more microarchitectures in the system */
214 break;
215 }
216
217 switch (uarch_info->uarch) {
218 case cpuinfo_uarch_cortex_a55:
219 if (mr == 4 && nr == 8 && log2_kr == 2 && cpuinfo_has_arm_neon_dot()) {
220 xnn_params.qc8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_cortex_a55;
Frank Barchard6cc5b482022-01-26 17:01:41 -0800221 xnn_params.qc8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_cortex_a55;
Frank Barchardba5091f2022-01-25 13:31:26 -0800222 xnn_params.qc8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neondot;
223 xnn_params.qc8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neondot;
224 }
225 break;
226 case cpuinfo_uarch_cortex_a53:
Frank Barchardba5091f2022-01-25 13:31:26 -0800227 if (mr == 4 && nr == 8 && log2_kr == 0) {
Frank Barchard0455acf2022-02-02 00:51:40 -0800228 xnn_params.qc8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_cortex_a53;
Frank Barchardba5091f2022-01-25 13:31:26 -0800229 xnn_params.qc8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64;
230 xnn_params.qc8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane;
231 xnn_params.qc8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane;
232 }
233 break;
Frank Barchard101271e2022-02-02 01:49:54 -0800234 case cpuinfo_uarch_cortex_a55r0:
235 if (mr == 4 && nr == 8 && log2_kr == 0) {
236 xnn_params.qc8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_cortex_a53;
237 xnn_params.qc8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64;
238 xnn_params.qc8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane;
239 xnn_params.qc8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane;
240 }
241 break;
242
Frank Barchardba5091f2022-01-25 13:31:26 -0800243 default:
244 break;
245 }
246 }
247 }
248 #endif // XNN_MAX_UARCH_TYPES > 1
Frank Barchardf290a142022-01-05 01:08:37 -0800249 #else // XNN_ENABLE_ASSEMBLY
250 if (!XNN_PLATFORM_IOS && cpuinfo_has_arm_neon_dot()) {
Frank Barchard70137e42021-12-28 15:40:18 -0800251 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__neondot);
Frank Barchard70137e42021-12-28 15:40:18 -0800252 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8c4__neondot);
Frank Barchardf290a142022-01-05 01:08:37 -0800253 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neondot);
Frank Barchard70137e42021-12-28 15:40:18 -0800254 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neondot);
255 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
256 xnn_params.qc8.gemm.mr = 4;
257 xnn_params.qc8.gemm.nr = 8;
258 xnn_params.qc8.gemm.log2_kr = 2;
Frank Barchardf290a142022-01-05 01:08:37 -0800259 } else if (cpuinfo_has_arm_v8()) {
260 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
261 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
262 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
263 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
264 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
265 xnn_params.qc8.gemm.mr = 2;
266 xnn_params.qc8.gemm.nr = 8;
267 xnn_params.qc8.gemm.log2_kr = 1;
268 xnn_params.qc8.gemm.log2_sr = 2;
269 } else {
270 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal);
271 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal);
272 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal);
273 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal);
274 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neon_params;
275 xnn_params.qc8.gemm.mr = 2;
276 xnn_params.qc8.gemm.nr = 8;
277 xnn_params.qc8.gemm.log2_kr = 1;
278 xnn_params.qc8.gemm.log2_sr = 2;
279 }
280 #endif // XNN_ENABLE_ASSEMBLY
Marat Dukhan898d5852021-06-30 21:18:34 -0700281
Frank Barchard0bc58012021-11-22 18:12:05 -0800282 if (cpuinfo_has_arm_neon_v8()) {
283 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__neonv8_mla8_ld64;
Marat Dukhan7988a182021-12-06 22:00:33 -0800284 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Frank Barchard0bc58012021-11-22 18:12:05 -0800285 xnn_params.qc8.dwconv[0].channel_tile = 16;
286 xnn_params.qc8.dwconv[0].primary_tile = 9;
287 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up8x25__neonv8_mla8_ld64;
Marat Dukhan7988a182021-12-06 22:00:33 -0800288 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Frank Barchard0bc58012021-11-22 18:12:05 -0800289 xnn_params.qc8.dwconv[1].channel_tile = 8;
290 xnn_params.qc8.dwconv[1].primary_tile = 25;
291 } else {
292 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__neon_mla8_ld64;
Marat Dukhan7988a182021-12-06 22:00:33 -0800293 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_neon_params;
Frank Barchard0bc58012021-11-22 18:12:05 -0800294 xnn_params.qc8.dwconv[0].channel_tile = 16;
295 xnn_params.qc8.dwconv[0].primary_tile = 9;
296 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up8x25__neon_mla8_ld64;
Marat Dukhan7988a182021-12-06 22:00:33 -0800297 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_neon_params;
Frank Barchard0bc58012021-11-22 18:12:05 -0800298 xnn_params.qc8.dwconv[1].channel_tile = 8;
299 xnn_params.qc8.dwconv[1].primary_tile = 25;
300 }
Marat Dukhan898d5852021-06-30 21:18:34 -0700301 #endif // XNN_NO_QC8_OPERATORS
302
Frank Barchardb40ee632021-12-30 11:10:02 -0800303 /**************************** QS8 AArch32 micro-kernels ****************************/
Marat Dukhanf28cddf2020-08-10 14:05:02 -0700304 #ifndef XNN_NO_QS8_OPERATORS
305 init_flags |= XNN_INIT_FLAG_QS8;
306
Frank Barchard95198162021-12-21 17:29:10 -0800307 #if XNN_ENABLE_ASSEMBLY
308 if (!XNN_PLATFORM_IOS && cpuinfo_has_arm_neon_dot()) {
Frank Barchard1228b3e2022-01-24 11:57:19 -0800309 switch (cpuinfo_get_uarch(0)->uarch) {
310 case cpuinfo_uarch_cortex_a55:
311 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_cortex_a55);
Frank Barchard6cc5b482022-01-26 17:01:41 -0800312 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_cortex_a55);
Frank Barchard1228b3e2022-01-24 11:57:19 -0800313 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neondot);
314 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot);
315 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
316 xnn_params.qs8.gemm.mr = 4;
317 xnn_params.qs8.gemm.nr = 8;
318 xnn_params.qs8.gemm.log2_kr = 2;
319 break;
320 default:
321 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_ld64);
322 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_ld64);
323 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neondot);
324 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot);
325 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
326 xnn_params.qs8.gemm.mr = 4;
327 xnn_params.qs8.gemm.nr = 8;
328 xnn_params.qs8.gemm.log2_kr = 2;
329 break;
330 }
Frank Barchard95198162021-12-21 17:29:10 -0800331 } else {
Frank Barchard1c852c92021-12-23 13:10:20 -0800332 switch (cpuinfo_get_uarch(0)->uarch) {
Frank Barcharda312e9a2022-02-02 11:27:50 -0800333 case cpuinfo_uarch_cortex_a7:
Frank Barchard2991acf2022-02-02 20:34:57 -0800334 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7);
335 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64);
336 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
337 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
338 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
339 xnn_params.qs8.gemm.mr = 4;
340 xnn_params.qs8.gemm.nr = 8;
341 break;
342 case cpuinfo_uarch_cortex_a35:
343 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7);
344 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
345 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
346 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
347 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
348 xnn_params.qs8.gemm.mr = 4;
349 xnn_params.qs8.gemm.nr = 8;
350 break;
Frank Barchard1c852c92021-12-23 13:10:20 -0800351 case cpuinfo_uarch_cortex_a53:
Frank Barchard77a3b5f2022-02-02 00:37:10 -0800352 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53);
353 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64);
354 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
355 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
356 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
357 xnn_params.qs8.gemm.mr = 4;
358 xnn_params.qs8.gemm.nr = 8;
359 break;
Frank Barcharda312e9a2022-02-02 11:27:50 -0800360 case cpuinfo_uarch_cortex_a55r0:
361 case cpuinfo_uarch_kryo:
362 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53);
363 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
364 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
365 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
366 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
367 xnn_params.qs8.gemm.mr = 4;
368 xnn_params.qs8.gemm.nr = 8;
369 break;
Frank Barchard1c852c92021-12-23 13:10:20 -0800370 case cpuinfo_uarch_cortex_a72:
371 case cpuinfo_uarch_exynos_m1:
372 case cpuinfo_uarch_exynos_m2:
373 case cpuinfo_uarch_exynos_m3:
Frank Barchard1c852c92021-12-23 13:10:20 -0800374 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64);
375 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64);
376 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
377 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
378 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
379 xnn_params.qs8.gemm.mr = 4;
380 xnn_params.qs8.gemm.nr = 8;
381 break;
382 default:
383 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
384 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
385 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
386 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
387 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
388 xnn_params.qs8.gemm.mr = 4;
389 xnn_params.qs8.gemm.nr = 8;
390 break;
391 }
Frank Barchard95198162021-12-21 17:29:10 -0800392 }
Frank Barchard364598a2022-01-24 20:39:26 -0800393 #if XNN_MAX_UARCH_TYPES > 1
394 {
395 /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
Frank Barchardba5091f2022-01-25 13:31:26 -0800396 const uint32_t mr = xnn_params.qs8.gemm.mr;
397 const uint32_t nr = xnn_params.qs8.gemm.nr;
398 const uint32_t log2_kr = xnn_params.qs8.gemm.log2_kr;
Frank Barchard364598a2022-01-24 20:39:26 -0800399 for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
400 const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
401 if (uarch_info == NULL) {
402 /* No more microarchitectures in the system */
403 break;
404 }
405
406 switch (uarch_info->uarch) {
Frank Barchardba5091f2022-01-25 13:31:26 -0800407 case cpuinfo_uarch_cortex_a55:
408 if (mr == 4 && nr == 8 && log2_kr == 2 && cpuinfo_has_arm_neon_dot()) {
409 xnn_params.qs8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_cortex_a55;
Frank Barchard6cc5b482022-01-26 17:01:41 -0800410 xnn_params.qs8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_cortex_a55;
Frank Barchardba5091f2022-01-25 13:31:26 -0800411 xnn_params.qs8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neondot;
412 xnn_params.qs8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot;
413 }
414 break;
Frank Barchard364598a2022-01-24 20:39:26 -0800415 case cpuinfo_uarch_cortex_a53:
Frank Barchard364598a2022-01-24 20:39:26 -0800416 if (mr == 4 && nr == 8 && log2_kr == 0) {
Frank Barchard77a3b5f2022-02-02 00:37:10 -0800417 xnn_params.qs8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53;
Frank Barchard364598a2022-01-24 20:39:26 -0800418 xnn_params.qs8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64;
419 xnn_params.qs8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane;
420 xnn_params.qs8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane;
421 }
422 break;
Frank Barcharda312e9a2022-02-02 11:27:50 -0800423 case cpuinfo_uarch_cortex_a55r0:
424 if (mr == 4 && nr == 8 && log2_kr == 0) {
425 xnn_params.qs8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53;
426 xnn_params.qs8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64;
427 xnn_params.qs8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane;
428 xnn_params.qs8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane;
429 }
430 break;
Frank Barchard364598a2022-01-24 20:39:26 -0800431 default:
432 break;
433 }
434 }
435 }
436 #endif // XNN_MAX_UARCH_TYPES > 1
Frank Barchard95198162021-12-21 17:29:10 -0800437 #else // XNN_ENABLE_ASSEMBLY
438 if (!XNN_PLATFORM_IOS && cpuinfo_has_arm_neon_dot()) {
439 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neondot);
440 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neondot);
441 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neondot);
442 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot);
443 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
444 xnn_params.qs8.gemm.mr = 4;
445 xnn_params.qs8.gemm.nr = 8;
446 xnn_params.qs8.gemm.log2_kr = 2;
447 } else {
448 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
449 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
450 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
451 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
452 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
453 xnn_params.qs8.gemm.mr = 2;
454 xnn_params.qs8.gemm.nr = 8;
455 xnn_params.qs8.gemm.log2_kr = 1;
456 xnn_params.qs8.gemm.log2_sr = 2;
457 }
458 #endif // XNN_ENABLE_ASSEMBLY
Marat Dukhanf28cddf2020-08-10 14:05:02 -0700459
Frank Barchard0d065732021-08-31 00:01:40 -0700460 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mla8_ld64;
Marat Dukhanbe18f5c2021-07-16 18:46:39 -0700461 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
Frank Barchard0d065732021-08-31 00:01:40 -0700462 xnn_params.qs8.dwconv[0].channel_tile = 16;
Marat Dukhanf28cddf2020-08-10 14:05:02 -0700463 xnn_params.qs8.dwconv[0].primary_tile = 9;
Marat Dukhan5f2939f2021-07-23 13:38:32 -0700464 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_rndnu_ukernel_up8x25__neon_mla8_ld64;
Marat Dukhanbe18f5c2021-07-16 18:46:39 -0700465 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
Marat Dukhan4ed14882021-05-12 17:50:40 -0700466 xnn_params.qs8.dwconv[1].channel_tile = 8;
467 xnn_params.qs8.dwconv[1].primary_tile = 25;
Marat Dukhanf28cddf2020-08-10 14:05:02 -0700468
469 xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
Marat Dukhan85755042022-01-13 01:46:05 -0800470 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8,
471 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8,
472 .init.qs8 = xnn_init_qs8_avgpool_minmax_rndnu_neon_params,
473 .update.qs8 = xnn_update_qs8_avgpool_minmax_rndnu_neon_params,
Marat Dukhanda382d12022-01-07 19:51:20 -0800474 .row_tile = 7,
475 .channel_tile = 8,
Marat Dukhanf28cddf2020-08-10 14:05:02 -0700476 };
Marat Dukhanff209482020-09-03 14:26:53 -0700477
478 xnn_params.qs8.vadd = (struct vbinary_parameters) {
Marat Dukhan01debd92021-07-29 18:14:21 -0700479 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__neon_ld64_x16,
480 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x16,
481 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x16,
Marat Dukhan64287252021-09-07 16:20:03 -0700482 .init.qs8_addsub = xnn_init_qs8_add_minmax_neon_params,
Marat Dukhan01debd92021-07-29 18:14:21 -0700483 .element_tile = 16,
Marat Dukhanff209482020-09-03 14:26:53 -0700484 };
Marat Dukhan33a98fa2022-01-13 00:08:57 -0800485 xnn_params.qs8.vmul = (struct vbinary_parameters) {
486 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_rndnu_ukernel__neon_ld64_x16,
487 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16,
488 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16,
489 .init.qs8_mul = xnn_init_qs8_mul_minmax_rndnu_neon_params,
490 .element_tile = 16,
491 };
Marat Dukhanf28cddf2020-08-10 14:05:02 -0700492 #endif // XNN_NO_QS8_OPERATORS
493
Frank Barchardb40ee632021-12-30 11:10:02 -0800494 /*************************** QU8 AArch32 micro-kernels ***************************/
Marat Dukhan08b7a972020-07-14 18:17:29 -0700495 #ifndef XNN_NO_QU8_OPERATORS
496 init_flags |= XNN_INIT_FLAG_QU8;
Marat Dukhan854fb6b2020-06-19 12:33:44 -0700497
Frank Barchard1d5c6162022-02-03 02:21:50 -0800498 #if XNN_ENABLE_ASSEMBLY
499 if (!XNN_PLATFORM_IOS && cpuinfo_has_arm_neon_dot()) {
500 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__neondot);
501 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8c4__neondot);
502 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot);
503 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c4__neondot);
504 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
505 xnn_params.qu8.gemm.mr = 4;
506 xnn_params.qu8.gemm.nr = 8;
507 xnn_params.qu8.gemm.log2_kr = 2;
508 } else {
509 switch (cpuinfo_get_uarch(0)->uarch) {
510 case cpuinfo_uarch_cortex_a7:
511 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7);
512 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64);
513 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
514 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
515 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
516 xnn_params.qu8.gemm.mr = 4;
517 xnn_params.qu8.gemm.nr = 8;
518 break;
519 case cpuinfo_uarch_cortex_a35:
520 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7);
521 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
522 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
523 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
524 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
525 xnn_params.qu8.gemm.mr = 4;
526 xnn_params.qu8.gemm.nr = 8;
527 break;
528 case cpuinfo_uarch_cortex_a53:
529 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53);
530 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64);
531 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
532 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
533 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
534 xnn_params.qu8.gemm.mr = 4;
535 xnn_params.qu8.gemm.nr = 8;
536 break;
537 case cpuinfo_uarch_cortex_a55r0:
538 case cpuinfo_uarch_kryo:
539 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53);
540 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
541 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
542 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
543 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
544 xnn_params.qu8.gemm.mr = 4;
545 xnn_params.qu8.gemm.nr = 8;
546 break;
547 case cpuinfo_uarch_cortex_a72:
548 case cpuinfo_uarch_exynos_m1:
549 case cpuinfo_uarch_exynos_m2:
550 case cpuinfo_uarch_exynos_m3:
551 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64);
552 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64);
553 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
554 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
555 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
556 xnn_params.qu8.gemm.mr = 4;
557 xnn_params.qu8.gemm.nr = 8;
558 break;
559 default:
560 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
561 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
562 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
563 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
564 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
565 xnn_params.qu8.gemm.mr = 4;
566 xnn_params.qu8.gemm.nr = 8;
567 break;
568 }
569 }
570 #if XNN_MAX_UARCH_TYPES > 1
571 {
572 /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
573 const uint32_t mr = xnn_params.qu8.gemm.mr;
574 const uint32_t nr = xnn_params.qu8.gemm.nr;
575 const uint32_t log2_kr = xnn_params.qu8.gemm.log2_kr;
576 for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
577 const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
578 if (uarch_info == NULL) {
579 /* No more microarchitectures in the system */
580 break;
581 }
582
583 switch (uarch_info->uarch) {
584 case cpuinfo_uarch_cortex_a53:
585 if (mr == 4 && nr == 8 && log2_kr == 0) {
586 xnn_params.qu8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53;
587 xnn_params.qu8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64;
588 xnn_params.qu8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane;
589 xnn_params.qu8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane;
590 }
591 break;
592 case cpuinfo_uarch_cortex_a55r0:
593 if (mr == 4 && nr == 8 && log2_kr == 0) {
594 xnn_params.qu8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53;
595 xnn_params.qu8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64;
596 xnn_params.qu8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane;
597 xnn_params.qu8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane;
598 }
599 break;
600 default:
601 break;
602 }
603 }
604 }
605 #endif // XNN_MAX_UARCH_TYPES > 1
606 #else // XNN_ENABLE_ASSEMBLY
607 if (!XNN_PLATFORM_IOS && cpuinfo_has_arm_neon_dot()) {
608 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__neondot);
609 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8c4__neondot);
610 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot);
611 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c4__neondot);
612 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
613 xnn_params.qu8.gemm.mr = 4;
614 xnn_params.qu8.gemm.nr = 8;
615 xnn_params.qu8.gemm.log2_kr = 2;
616 } else {
617 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
618 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
619 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
620 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
621 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
622 xnn_params.qu8.gemm.mr = 2;
623 xnn_params.qu8.gemm.nr = 8;
624 xnn_params.qu8.gemm.log2_kr = 1;
625 xnn_params.qu8.gemm.log2_sr = 2;
626 }
627 #endif // XNN_ENABLE_ASSEMBLY
628
Frank Barchard354cbc62021-09-27 21:42:41 -0700629 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mul8;
Marat Dukhan73a899a2021-07-27 00:10:38 -0700630 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
Frank Barchard0d065732021-08-31 00:01:40 -0700631 xnn_params.qu8.dwconv[0].channel_tile = 16;
Marat Dukhan08b7a972020-07-14 18:17:29 -0700632 xnn_params.qu8.dwconv[0].primary_tile = 9;
Frank Barchard354cbc62021-09-27 21:42:41 -0700633 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_rndnu_ukernel_up8x25__neon_mul8;
Marat Dukhan73a899a2021-07-27 00:10:38 -0700634 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
Marat Dukhan43b46ee2021-07-15 19:07:50 -0700635 xnn_params.qu8.dwconv[1].channel_tile = 8;
636 xnn_params.qu8.dwconv[1].primary_tile = 25;
637
Marat Dukhan08b7a972020-07-14 18:17:29 -0700638 xnn_params.qu8.avgpool = (struct avgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -0800639 .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__neon_c8,
640 .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__neon_c8,
Marat Dukhan3c949a32022-01-09 20:12:33 -0800641 .init.qu8 = xnn_init_qu8_avgpool_minmax_neon_params,
Marat Dukhanda382d12022-01-07 19:51:20 -0800642 .primary_tile = 9,
643 .incremental_tile = 8,
644 .channel_tile = 8,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700645 };
Marat Dukhan08b7a972020-07-14 18:17:29 -0700646 xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
Marat Dukhan85755042022-01-13 01:46:05 -0800647 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8,
648 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8,
649 .init.qu8 = xnn_init_qu8_avgpool_minmax_rndnu_neon_params,
650 .update.qu8 = xnn_update_qu8_avgpool_minmax_rndnu_neon_params,
Marat Dukhanda382d12022-01-07 19:51:20 -0800651 .row_tile = 7,
652 .channel_tile = 8,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700653 };
Marat Dukhandb007cd2021-07-20 23:42:39 -0700654 xnn_params.qu8.vadd = (struct vbinary_parameters) {
Frank Barchard0a3093c2021-08-31 09:58:11 -0700655 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__neon_ld64_x16,
656 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__neon_ld64_x16,
657 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__neon_ld64_x16,
Marat Dukhan64287252021-09-07 16:20:03 -0700658 .init.qu8_addsub = xnn_init_qu8_add_minmax_neon_params,
Marat Dukhandb007cd2021-07-20 23:42:39 -0700659 .element_tile = 8,
660 };
Marat Dukhan33a98fa2022-01-13 00:08:57 -0800661 xnn_params.qu8.vmul = (struct vbinary_parameters) {
662 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_rndnu_ukernel__neon_ld64_x16,
663 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16,
664 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16,
665 .init.qu8_mul = xnn_init_qu8_mul_minmax_rndnu_neon_params,
666 .element_tile = 16,
667 };
Marat Dukhan08b7a972020-07-14 18:17:29 -0700668 #endif // XNN_NO_QU8_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -0700669
Frank Barchardb40ee632021-12-30 11:10:02 -0800670 /**************************** S8 AArch32 micro-kernels ****************************/
Marat Dukhan23147532021-08-16 07:26:56 -0700671 #ifndef XNN_NO_S8_OPERATORS
672 init_flags |= XNN_INIT_FLAG_S8;
673
Marat Dukhan61c0c9e2021-08-16 23:16:14 -0700674 xnn_params.s8.clamp = (struct vunary_parameters) {
675 .ukernel = (xnn_univector_ukernel_function) xnn_s8_vclamp_ukernel__neon_x64,
676 .init.s8_minmax = xnn_init_s8_minmax_neon_params,
677 .element_tile = 64,
678 };
Marat Dukhan24abe6b2021-11-24 15:28:57 -0800679 xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
680 .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__neon_c8,
681 .pixel_tile = 1,
682 .channel_tile = 8,
683 };
Marat Dukhan23147532021-08-16 07:26:56 -0700684 xnn_params.s8.maxpool = (struct maxpool_parameters) {
685 .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__neon_c16,
Marat Dukhandc5c1482021-08-16 09:03:15 -0700686 .init.s8 = xnn_init_s8_minmax_neon_params,
Marat Dukhan23147532021-08-16 07:26:56 -0700687 .mr = 9,
688 .qr = 8,
689 };
690 #endif // XNN_NO_S8_OPERATORS
691
Frank Barchardb40ee632021-12-30 11:10:02 -0800692 /**************************** U8 AArch32 micro-kernels ****************************/
Marat Dukhan3b745a42020-05-10 21:43:25 -0700693 #ifndef XNN_NO_U8_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -0700694 init_flags |= XNN_INIT_FLAG_U8;
695
Marat Dukhan94912792021-08-16 21:40:30 -0700696 xnn_params.u8.clamp = (struct vunary_parameters) {
697 .ukernel = (xnn_univector_ukernel_function) xnn_u8_vclamp_ukernel__neon_x64,
698 .init.u8_minmax = xnn_init_u8_minmax_neon_params,
699 .element_tile = 64,
700 };
Marat Dukhan24abe6b2021-11-24 15:28:57 -0800701 xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
702 .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__neon_c8,
703 .pixel_tile = 1,
704 .channel_tile = 8,
705 };
Marat Dukhan3b745a42020-05-10 21:43:25 -0700706 xnn_params.u8.maxpool = (struct maxpool_parameters) {
707 .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__neon_c16,
Marat Dukhan2ea50a02021-08-16 12:59:19 -0700708 .init.u8 = xnn_init_u8_minmax_neon_params,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700709 .mr = 9,
710 .qr = 8,
711 };
Marat Dukhan3b745a42020-05-10 21:43:25 -0700712 xnn_params.u8.rmax = xnn_u8_rmax_ukernel__neon;
713 xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
714 #endif // XNN_NO_U8_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -0700715
Frank Barchardb40ee632021-12-30 11:10:02 -0800716 /**************************** X8 AArch32 micro-kernels ****************************/
Marat Dukhan3b745a42020-05-10 21:43:25 -0700717 #ifndef XNN_NO_X8_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -0700718 init_flags |= XNN_INIT_FLAG_X8;
719
Marat Dukhand67539d2021-09-08 23:06:03 -0700720 xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar_x4;
Marat Dukhan3b745a42020-05-10 21:43:25 -0700721 xnn_params.x8.zip = (struct zip_parameters) {
722 .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__neon,
723 .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__neon,
724 .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__neon,
725 .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__neon,
726 };
727 #endif // XNN_NO_X8_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -0700728
Frank Barchardb40ee632021-12-30 11:10:02 -0800729 /**************************** F32 AArch32 micro-kernels ****************************/
Marat Dukhan3b745a42020-05-10 21:43:25 -0700730 #ifndef XNN_NO_F32_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -0700731 init_flags |= XNN_INIT_FLAG_F32;
732
Marat Dukhan3b745a42020-05-10 21:43:25 -0700733 #if XNN_ENABLE_ASSEMBLY
734 switch (cpuinfo_get_uarch(0)->uarch) {
735 case cpuinfo_uarch_cortex_a5:
736 case cpuinfo_uarch_cortex_a7:
Frank Barchard490febe2020-07-16 18:42:17 -0700737 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a7);
738 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a7);
Marat Dukhan3b745a42020-05-10 21:43:25 -0700739 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
740 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700741 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhan3b745a42020-05-10 21:43:25 -0700742 xnn_params.f32.gemm.mr = 4;
743 xnn_params.f32.gemm.nr = 8;
Marat Dukhan05702cf2020-03-26 15:41:33 -0700744 break;
Marat Dukhan05702cf2020-03-26 15:41:33 -0700745
Marat Dukhan3b745a42020-05-10 21:43:25 -0700746 case cpuinfo_uarch_cortex_a53:
747 case cpuinfo_uarch_cortex_a55r0:
748 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53);
749 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53);
750 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
751 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700752 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhan3b745a42020-05-10 21:43:25 -0700753 xnn_params.f32.gemm.mr = 4;
754 xnn_params.f32.gemm.nr = 8;
755 break;
756
Frank Barchardf975ee02021-11-05 16:01:00 -0700757 case cpuinfo_uarch_cortex_a35:
Marat Dukhan3b745a42020-05-10 21:43:25 -0700758 case cpuinfo_uarch_cortex_a55:
759 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55);
760 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55);
761 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
762 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700763 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhan3b745a42020-05-10 21:43:25 -0700764 xnn_params.f32.gemm.mr = 4;
765 xnn_params.f32.gemm.nr = 8;
766 break;
767
768 case cpuinfo_uarch_cortex_a57:
769 case cpuinfo_uarch_cortex_a72:
770 case cpuinfo_uarch_cortex_a73:
Frank Barchard78735862022-01-04 16:47:44 -0800771 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a75);
772 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a75);
Marat Dukhan3b745a42020-05-10 21:43:25 -0700773 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
774 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700775 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhan3b745a42020-05-10 21:43:25 -0700776 xnn_params.f32.gemm.mr = 4;
777 xnn_params.f32.gemm.nr = 8;
778 break;
779
780 case cpuinfo_uarch_krait:
781 default:
782 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75);
783 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75);
784 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
785 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700786 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhan3b745a42020-05-10 21:43:25 -0700787 xnn_params.f32.gemm.mr = 4;
788 xnn_params.f32.gemm.nr = 8;
789 break;
790 }
791 #if XNN_MAX_UARCH_TYPES > 1
792 {
793 /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
794 const uint32_t mr = xnn_params.f32.gemm.mr;
795 const uint32_t nr = xnn_params.f32.gemm.nr;
796 const uint32_t log2_sr = xnn_params.f32.gemm.log2_sr;
797 for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
798 const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
799 if (uarch_info == NULL) {
800 /* No more microarchitectures in the system */
Marat Dukhan05702cf2020-03-26 15:41:33 -0700801 break;
Marat Dukhan3b745a42020-05-10 21:43:25 -0700802 }
803
804 switch (uarch_info->uarch) {
805 case cpuinfo_uarch_cortex_a53:
806 case cpuinfo_uarch_cortex_a55r0:
807 if (mr == 4 && nr == 8 && log2_sr == 0) {
808 xnn_params.f32.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53;
809 xnn_params.f32.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53;
810 xnn_params.f32.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64;
811 xnn_params.f32.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64;
812 }
813 break;
814 case cpuinfo_uarch_cortex_a55:
815 if (mr == 4 && nr == 8 && log2_sr == 0) {
816 xnn_params.f32.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55;
817 xnn_params.f32.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55;
818 xnn_params.f32.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64;
819 xnn_params.f32.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64;
820 }
821 break;
822 default:
823 break;
824 }
Marat Dukhan05702cf2020-03-26 15:41:33 -0700825 }
826 }
Marat Dukhan3b745a42020-05-10 21:43:25 -0700827 #endif // XNN_MAX_UARCH_TYPES > 1
828 #else // XNN_ENABLE_ASSEMBLY
829 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128);
830 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__neon_lane_ld128);
831 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
832 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700833 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhan3b745a42020-05-10 21:43:25 -0700834 xnn_params.f32.gemm.mr = 4;
835 xnn_params.f32.gemm.nr = 8;
836 #endif // XNN_ENABLE_ASSEMBLY
837 xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2__neon_lane_ld64);
838 xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2__neon_lane_ld64);
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700839 xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhan3b745a42020-05-10 21:43:25 -0700840 xnn_params.f32.gemm2.mr = 4;
841 xnn_params.f32.gemm2.nr = 2;
842
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -0700843 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x3__neon;
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700844 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_scalar_params;
Frank Barcharddbe781b2021-10-18 10:29:52 -0700845 xnn_params.f32.dwconv[0].channel_tile = 8,
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -0700846 xnn_params.f32.dwconv[0].primary_tile = 3,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700847
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -0700848 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x4__neon;
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700849 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_scalar_params;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -0700850 xnn_params.f32.dwconv[1].channel_tile = 8,
851 xnn_params.f32.dwconv[1].primary_tile = 4,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700852
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -0700853 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x9__neon;
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700854 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
Frank Barcharddbe781b2021-10-18 10:29:52 -0700855 xnn_params.f32.dwconv[2].channel_tile = 8;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -0700856 xnn_params.f32.dwconv[2].primary_tile = 9;
857
858 xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x25__neon_acc2;
859 xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_scalar_params;
860 xnn_params.f32.dwconv[3].channel_tile = 8;
861 xnn_params.f32.dwconv[3].primary_tile = 25;
Marat Dukhan3b745a42020-05-10 21:43:25 -0700862
863 xnn_params.f32.avgpool = (struct avgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -0800864 .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__neon_c4,
865 .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__neon_c4,
866 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
867 .primary_tile = 9,
868 .incremental_tile = 8,
869 .channel_tile = 4,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700870 };
871 xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -0800872 .unipass = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__neon_c4,
873 .multipass = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__neon_c4,
874 .primary_tile = 9,
875 .incremental_tile = 8,
876 .channel_tile = 4,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700877 };
878 xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -0800879 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__neon_c4,
880 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4,
881 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
882 .update.f32 = xnn_update_f32_scaleminmax_scalar_params,
883 .row_tile = 7,
884 .channel_tile = 4,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700885 };
886 xnn_params.f32.maxpool = (struct maxpool_parameters) {
887 .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__neon_c4,
Marat Dukhan91ae1652021-08-15 19:19:49 -0700888 .init.f32 = xnn_init_f32_minmax_scalar_params,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700889 .mr = 9,
890 .qr = 8,
891 };
892 xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
Marat Dukhanef25c6d2020-07-24 00:59:40 -0700893 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__neon_c4,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700894 .mr = 4,
895 };
896 xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
Marat Dukhanef25c6d2020-07-24 00:59:40 -0700897 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__neon_c4,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700898 .mr = 9,
899 };
900 xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
Marat Dukhanef25c6d2020-07-24 00:59:40 -0700901 .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__neon_c4,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700902 .mr = 9,
903 .qr = 8,
904 };
905 xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
906 .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__neon_c8,
907 .pixel_tile = 1,
908 .channel_tile = 8,
909 };
Marat Dukhane5efb162021-12-31 10:26:13 -0800910 xnn_params.f32.abs = (struct vunary_parameters) {
911 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__neon_x8,
912 .element_tile = 8,
913 };
Marat Dukhan94912792021-08-16 21:40:30 -0700914 xnn_params.f32.clamp = (struct vunary_parameters) {
915 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vclamp_ukernel__neon_x8,
916 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
917 .element_tile = 8,
918 };
Marat Dukhanb6bd4bc2020-12-01 17:01:40 -0800919 if (cpuinfo_has_arm_neon_fma()) {
Marat Dukhan4a79ff22022-01-01 12:16:48 -0800920 xnn_params.f32.elu = (struct vunary_parameters) {
921 .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__neonfma_rr1_p6_x8,
922 .init.f32_elu = xnn_init_f32_elu_neonfma_rr1_p6_params,
923 .element_tile = 8,
924 };
Marat Dukhanb6bd4bc2020-12-01 17:01:40 -0800925 } else {
Marat Dukhan4a79ff22022-01-01 12:16:48 -0800926 xnn_params.f32.elu = (struct vunary_parameters) {
927 .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__neon_rr2_lut16_p3_x8,
928 .init.f32_elu = xnn_init_f32_elu_neon_rr2_lut16_p3_params,
929 .element_tile = 8,
930 };
Marat Dukhanb6bd4bc2020-12-01 17:01:40 -0800931 }
Marat Dukhan561d0682021-12-23 16:12:35 -0800932 xnn_params.f32.hswish = (struct vunary_parameters) {
933 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__neon_x16,
Marat Dukhan0d10cc72021-12-23 19:49:19 -0800934 .init.f32_hswish = xnn_init_f32_hswish_scalar_params,
Marat Dukhan561d0682021-12-23 16:12:35 -0800935 .element_tile = 16,
936 };
Marat Dukhan2894e992021-12-30 08:29:48 -0800937 xnn_params.f32.lrelu = (struct vunary_parameters) {
938 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__neon_x8,
939 .init.f32_lrelu = xnn_init_f32_lrelu_scalar_params,
940 .element_tile = 8,
941 };
Marat Dukhane5efb162021-12-31 10:26:13 -0800942 xnn_params.f32.neg = (struct vunary_parameters) {
943 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__neon_x8,
944 .element_tile = 8,
945 };
Marat Dukhan64e52512020-06-09 13:41:16 -0700946 if (cpuinfo_has_arm_neon_v8()) {
Marat Dukhan0e801372022-01-04 00:10:41 -0800947 xnn_params.f32.rndne = (struct vunary_parameters) {
948 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__neonv8_x8,
949 .element_tile = 8,
950 };
951 xnn_params.f32.rndz = (struct vunary_parameters) {
952 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__neonv8_x8,
953 .element_tile = 8,
954 };
955 xnn_params.f32.rndu = (struct vunary_parameters) {
956 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__neonv8_x8,
957 .element_tile = 8,
958 };
959 xnn_params.f32.rndd = (struct vunary_parameters) {
960 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__neonv8_x8,
961 .element_tile = 8,
962 };
Marat Dukhan64e52512020-06-09 13:41:16 -0700963 } else {
Marat Dukhan0e801372022-01-04 00:10:41 -0800964 xnn_params.f32.rndne = (struct vunary_parameters) {
965 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__neon_x8,
966 .element_tile = 8,
967 };
968 xnn_params.f32.rndz = (struct vunary_parameters) {
969 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__neon_x8,
970 .element_tile = 8,
971 };
972 xnn_params.f32.rndu = (struct vunary_parameters) {
973 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__neon_x8,
974 .element_tile = 8,
975 };
976 xnn_params.f32.rndd = (struct vunary_parameters) {
977 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__neon_x8,
978 .element_tile = 8,
979 };
Marat Dukhan64e52512020-06-09 13:41:16 -0700980 }
Marat Dukhance834ad2022-01-03 00:22:01 -0800981 xnn_params.f32.sigmoid = (struct vunary_parameters) {
982 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__neon_rr2_lut64_p2_nr2recps_x8,
Marat Dukhanbbfc27d2022-01-03 13:47:00 -0800983 .init.f32_sigmoid = xnn_init_f32_sigmoid_neon_rr2_lut64_p2_params,
Marat Dukhance834ad2022-01-03 00:22:01 -0800984 .element_tile = 8,
985 };
Marat Dukhane5efb162021-12-31 10:26:13 -0800986 xnn_params.f32.sqr = (struct vunary_parameters) {
987 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__neon_x8,
988 .element_tile = 8,
989 };
Marat Dukhane72b2822021-12-30 14:46:58 -0800990 xnn_params.f32.sqrt = (struct vunary_parameters) {
991 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__scalar_sqrt_x1,
992 .element_tile = 1,
993 };
Marat Dukhan3b745a42020-05-10 21:43:25 -0700994 xnn_params.f32.prelu = (struct prelu_parameters) {
995 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__neon_2x8,
996 .row_tile = 2,
997 .channel_tile = 8,
998 };
Marat Dukhan4a5c7712022-01-05 22:43:13 -0800999 xnn_params.f32.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
1000 .ukernel = xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_x8,
1001 .init = xnn_init_f32_expminus_neon_rr2_lut64_p2_params,
1002 .element_tile = 8,
1003 };
Marat Dukhan3b745a42020-05-10 21:43:25 -07001004 xnn_params.f32.rmax = xnn_f32_rmax_ukernel__neon;
1005 xnn_params.f32.vadd = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07001006 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__neon_x8,
1007 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__neon_x8,
1008 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__neon_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -08001009 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001010 .element_tile = 8,
1011 };
1012 xnn_params.f32.vdiv = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07001013 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__scalar_x2,
1014 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__scalar_x2,
1015 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__scalar_x2,
Marat Dukhanf6004972021-12-30 11:23:02 -08001016 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001017 .element_tile = 2,
1018 };
1019 xnn_params.f32.vmax = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07001020 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__neon_x8,
1021 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__neon_x8,
1022 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__neon_x8,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001023 .element_tile = 8,
1024 };
1025 xnn_params.f32.vmin = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07001026 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__neon_x8,
1027 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__neon_x8,
1028 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__neon_x8,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001029 .element_tile = 8,
1030 };
1031 xnn_params.f32.vmul = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07001032 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__neon_x8,
1033 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__neon_x8,
1034 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__neon_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -08001035 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001036 .element_tile = 8,
1037 };
1038 xnn_params.f32.vsub = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07001039 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__neon_x8,
1040 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__neon_x8,
1041 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__neon_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -08001042 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001043 .element_tile = 8,
1044 };
Marat Dukhanf7399262020-06-05 10:58:44 -07001045 xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07001046 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__neon_x8,
1047 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__neon_x8,
1048 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__neon_x8,
Marat Dukhanf7399262020-06-05 10:58:44 -07001049 .element_tile = 8,
1050 };
Marat Dukhan3b745a42020-05-10 21:43:25 -07001051 xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
Marat Dukhan9531e9f2020-07-24 15:25:02 -07001052 .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c4__neon_2x,
Marat Dukhanc58453f2021-05-24 14:47:38 -07001053 .init.f32 = xnn_init_f32_minmax_scalar_params,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001054 .channel_tile = 4,
1055 .row_tile = 2,
1056 };
Artsiom Ablavatski97918102020-10-27 15:52:59 -07001057 #ifndef XNN_NO_NCHW_OPERATORS
Marat Dukhancfbed0a2020-12-08 10:01:51 -08001058 init_flags |= XNN_INIT_FLAG_CHW_OPT;
1059
Marat Dukhan3e913382020-12-07 13:36:08 -08001060 xnn_params.f32.spmm = (struct spmm_parameters) {
1061 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x1__neon,
1062 .mr = 32,
1063 .nr = 1,
1064 };
Marat Dukhanc7634882020-12-07 15:11:12 -08001065 xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
1066 .ukernel_with_symm_padding =
1067 (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neon_2x2,
1068 .output_channel_tile = 4,
1069 .output_height_tile = 2,
1070 .output_width_tile = 2,
1071 };
Marat Dukhan3e913382020-12-07 13:36:08 -08001072 xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
1073 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_2x4,
Marat Dukhan3e913382020-12-07 13:36:08 -08001074 .output_width_tile = 4,
1075 .output_height_tile = 2,
1076 };
1077 xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
1078 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neon_1x4,
Marat Dukhan3e913382020-12-07 13:36:08 -08001079 .output_width_tile = 4,
1080 .output_height_tile = 1,
1081 };
1082 xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
1083 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4,
Marat Dukhan3e913382020-12-07 13:36:08 -08001084 .output_width_tile = 4,
1085 .output_height_tile = 1,
1086 };
1087 xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
1088 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4,
Marat Dukhan3e913382020-12-07 13:36:08 -08001089 .output_width_tile = 4,
1090 .output_height_tile = 1,
1091 };
1092 xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
1093 .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__neon_x4,
1094 .channel_tile = 4,
1095 };
Artsiom Ablavatski97918102020-10-27 15:52:59 -07001096 xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
Artsiom Ablavatski2202c812021-01-22 14:16:43 -08001097 .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__neon_p8,
Artsiom Ablavatski97918102020-10-27 15:52:59 -07001098 .channel_tile = 1,
Artsiom Ablavatskib3ffd582021-03-31 13:00:08 -07001099 .pixel_tile = 8,
Artsiom Ablavatski97918102020-10-27 15:52:59 -07001100 };
1101 #endif // XNN_NO_NCHW_OPERATORS
Marat Dukhan3b745a42020-05-10 21:43:25 -07001102 #endif // XNN_NO_F32_OPERATORS
1103
Frank Barchardb40ee632021-12-30 11:10:02 -08001104 /*************************** VCVT AArch32 micro-kernels ***************************/
Marat Dukhanaf2ba002021-10-24 14:21:41 -07001105 #ifndef XNN_NO_VCVT_OPERATORS
1106 init_flags |= XNN_INIT_FLAG_VCVT;
1107
1108 if (cpuinfo_has_arm_neon_fp16()) {
Marat Dukhan134f9842021-12-29 19:57:31 -08001109 xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
1110 .ukernel = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__neonfp16_x16,
1111 .element_tile = 16,
1112 };
Marat Dukhanb7c1b712021-12-30 07:23:57 -08001113 xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
1114 .ukernel = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__neonfp16_x16,
1115 .element_tile = 16,
1116 };
Marat Dukhanaf2ba002021-10-24 14:21:41 -07001117 } else {
Marat Dukhan134f9842021-12-29 19:57:31 -08001118 xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
1119 .ukernel = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__neon_int16_x16,
1120 .init.f16_f32_cvt = xnn_init_f16_f32_cvt_neon_params,
1121 .element_tile = 16,
1122 };
Marat Dukhanb7c1b712021-12-30 07:23:57 -08001123 xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
1124 .ukernel = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__neon_x8,
1125 .init.f32_f16_cvt = xnn_init_f32_f16_cvt_neon_params,
1126 .element_tile = 8,
1127 };
Marat Dukhanaf2ba002021-10-24 14:21:41 -07001128 }
Marat Dukhaned2d7762021-12-03 23:51:19 -08001129 if (cpuinfo_has_arm_neon_v8()) {
1130 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
1131 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__neonv8_x32,
1132 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_neonv8_params,
1133 .element_tile = 32,
1134 };
1135 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
1136 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__neonv8_x32,
1137 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_neonv8_params,
1138 .element_tile = 32,
1139 };
1140 } else {
1141 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
1142 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__neon_x32,
1143 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_neon_params,
1144 .element_tile = 32,
1145 };
1146 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
1147 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__neon_x32,
1148 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_neon_params,
1149 .element_tile = 32,
1150 };
1151 }
Marat Dukhanf92206b2021-12-10 17:02:07 -08001152 xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
1153 .ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__neon_x32,
1154 .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_neon_params,
1155 .element_tile = 32,
1156 };
1157 xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
1158 .ukernel = (xnn_univector_ukernel_function) xnn_qu8_f32_vcvt_ukernel__neon_x32,
1159 .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_neon_params,
1160 .element_tile = 32,
1161 };
Marat Dukhanaf2ba002021-10-24 14:21:41 -07001162 #endif // XNN_NO_VCVT_OPERATORS
1163
Frank Barchardb40ee632021-12-30 11:10:02 -08001164 /**************************** X32 AArch32 micro-kernels ****************************/
Marat Dukhan3b745a42020-05-10 21:43:25 -07001165 #ifndef XNN_NO_X32_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07001166 init_flags |= XNN_INIT_FLAG_X32;
1167
Marat Dukhan3b745a42020-05-10 21:43:25 -07001168 xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__neon;
1169 xnn_params.x32.zip = (struct zip_parameters) {
1170 .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__neon,
1171 .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__neon,
1172 .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__neon,
1173 .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__neon,
1174 };
Artsiom Ablavatskibbe85062020-11-05 14:07:37 -08001175 #ifndef XNN_NO_NCHW_OPERATORS
Marat Dukhanad71b9a2020-11-20 00:01:51 -08001176 xnn_params.x32.depthtospace2d_chw2hwc = (struct depthtospace2d_chw2hwc_parameters) {
1177 .ukernel = (xnn_depthtospace2d_chw2hwc_ukernel_function) xnn_x32_depthtospace2d_chw2hwc_ukernel__scalar,
Artsiom Ablavatskibbe85062020-11-05 14:07:37 -08001178 .channel_tile = 1,
1179 .pixel_tile = 1,
1180 };
1181 #endif // XNN_NO_NCHW_OPERATORS
Marat Dukhan3b745a42020-05-10 21:43:25 -07001182 #endif // XNN_NO_X32_OPERATORS
Marat Dukhan933051b2021-08-07 16:26:15 -07001183
Frank Barchardb40ee632021-12-30 11:10:02 -08001184 /**************************** XX AArch32 micro-kernels ****************************/
Marat Dukhan933051b2021-08-07 16:26:15 -07001185 #ifndef XNN_NO_XX_OPERATORS
1186 init_flags |= XNN_INIT_FLAG_XX;
1187
1188 xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
1189 xnn_params.xx.fill = (struct fill_parameters) {
1190 .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__neon_x64,
1191 .row_tile = 1,
1192 };
Marat Dukhan0461f2d2021-08-08 12:36:29 -07001193 xnn_params.xx.pad = (struct pad_parameters) {
1194 .ukernel = (xnn_pad_ukernel_function) xnn_xx_pad_ukernel__neon,
1195 .row_tile = 1,
1196 };
Marat Dukhan933051b2021-08-07 16:26:15 -07001197 #endif // XNN_NO_XX_OPERATORS
1198
Marat Dukhan3b745a42020-05-10 21:43:25 -07001199 } else if (!XNN_PLATFORM_MOBILE) {
Marat Dukhan933051b2021-08-07 16:26:15 -07001200
Frank Barchardb40ee632021-12-30 11:10:02 -08001201 /*************************** QS8 AArch32 Pre-NEON micro-kernels ***************************/
Marat Dukhan66a3ca12021-08-06 18:24:19 -07001202 #ifndef XNN_NO_QS8_OPERATORS
1203 init_flags |= XNN_INIT_FLAG_QS8;
1204
Marat Dukhan2ac722e2022-01-04 01:54:20 -08001205 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_2x2__scalar_fmagic);
1206 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_2x2__scalar_fmagic);
1207 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic);
1208 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x2__scalar_fmagic);
1209 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params;
Marat Dukhan66a3ca12021-08-06 18:24:19 -07001210 xnn_params.qs8.gemm.mr = 2;
1211 xnn_params.qs8.gemm.nr = 2;
1212
Marat Dukhan2ac722e2022-01-04 01:54:20 -08001213 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up1x9__scalar_fmagic;
1214 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params;
Marat Dukhan66a3ca12021-08-06 18:24:19 -07001215 xnn_params.qs8.dwconv[0].channel_tile = 1;
1216 xnn_params.qs8.dwconv[0].primary_tile = 9;
Marat Dukhan2ac722e2022-01-04 01:54:20 -08001217 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up1x25__scalar_fmagic;
1218 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params;
Marat Dukhan66a3ca12021-08-06 18:24:19 -07001219 xnn_params.qs8.dwconv[1].channel_tile = 1;
1220 xnn_params.qs8.dwconv[1].primary_tile = 25;
1221
1222 xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
Marat Dukhan847ff5e2022-01-11 20:31:06 -08001223 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1,
1224 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1,
Marat Dukhan53f41062022-01-11 19:44:57 -08001225 .init.qs8 = xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params,
1226 .update.qs8 = xnn_update_qs8_avgpool_minmax_fp32_scalar_imagic_params,
Marat Dukhanda382d12022-01-07 19:51:20 -08001227 .row_tile = 7,
1228 .channel_tile = 1,
Marat Dukhan66a3ca12021-08-06 18:24:19 -07001229 };
1230 xnn_params.qs8.vadd = (struct vbinary_parameters) {
1231 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__scalar_x1,
1232 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__scalar_x1,
1233 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__scalar_x1,
Marat Dukhan64287252021-09-07 16:20:03 -07001234 .init.qs8_addsub = xnn_init_qs8_add_minmax_scalar_params,
Marat Dukhan66a3ca12021-08-06 18:24:19 -07001235 .element_tile = 1,
1236 };
1237 xnn_params.qs8.vmul = (struct vbinary_parameters) {
1238 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_fp32_ukernel__scalar_x4,
1239 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_x4,
1240 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_x4,
1241 .init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_scalar_params,
1242 .element_tile = 4,
1243 };
1244 #endif // XNN_NO_QS8_OPERATORS
1245
Frank Barchardb40ee632021-12-30 11:10:02 -08001246 /*************************** QU8 AArch32 Pre-NEON micro-kernels ***************************/
Marat Dukhan08b7a972020-07-14 18:17:29 -07001247 #ifndef XNN_NO_QU8_OPERATORS
1248 init_flags |= XNN_INIT_FLAG_QU8;
Marat Dukhan854fb6b2020-06-19 12:33:44 -07001249
Marat Dukhan2ac722e2022-01-04 01:54:20 -08001250 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_fmagic);
1251 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_2x2__scalar_fmagic);
1252 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic);
1253 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_fmagic);
1254 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params;
Marat Dukhan08b7a972020-07-14 18:17:29 -07001255 xnn_params.qu8.gemm.mr = 2;
1256 xnn_params.qu8.gemm.nr = 2;
Marat Dukhan3b745a42020-05-10 21:43:25 -07001257
Marat Dukhan2ac722e2022-01-04 01:54:20 -08001258 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up1x9__scalar_fmagic;
1259 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params;
Marat Dukhan08b7a972020-07-14 18:17:29 -07001260 xnn_params.qu8.dwconv[0].channel_tile = 1;
1261 xnn_params.qu8.dwconv[0].primary_tile = 9;
Marat Dukhan2ac722e2022-01-04 01:54:20 -08001262 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up1x25__scalar_fmagic;
1263 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params;
Marat Dukhan43b46ee2021-07-15 19:07:50 -07001264 xnn_params.qu8.dwconv[1].channel_tile = 1;
1265 xnn_params.qu8.dwconv[1].primary_tile = 25;
Marat Dukhan3b745a42020-05-10 21:43:25 -07001266
Marat Dukhan08b7a972020-07-14 18:17:29 -07001267 xnn_params.qu8.avgpool = (struct avgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08001268 .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__scalar_c1,
1269 .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__scalar_c1,
1270 .init.qu8 = xnn_init_qu8_avgpool_minmax_scalar_params,
1271 .primary_tile = 9,
1272 .incremental_tile = 8,
1273 .channel_tile = 1,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001274 };
Marat Dukhan08b7a972020-07-14 18:17:29 -07001275 xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
Marat Dukhand1f53e42022-01-12 22:34:51 -08001276 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1,
1277 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1,
1278 .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params,
1279 .update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_scalar_imagic_params,
Marat Dukhanda382d12022-01-07 19:51:20 -08001280 .row_tile = 7,
1281 .channel_tile = 1,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001282 };
Marat Dukhandb007cd2021-07-20 23:42:39 -07001283 xnn_params.qu8.vadd = (struct vbinary_parameters) {
1284 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__scalar_x1,
1285 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__scalar_x1,
1286 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__scalar_x1,
Marat Dukhan64287252021-09-07 16:20:03 -07001287 .init.qu8_addsub = xnn_init_qu8_add_minmax_scalar_params,
Marat Dukhandb007cd2021-07-20 23:42:39 -07001288 .element_tile = 1,
1289 };
Marat Dukhan3c5e6622021-08-06 00:38:05 -07001290 xnn_params.qu8.vmul = (struct vbinary_parameters) {
1291 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_fp32_ukernel__scalar_x4,
1292 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_x4,
1293 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_x4,
1294 .init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_scalar_params,
1295 .element_tile = 4,
1296 };
Marat Dukhan08b7a972020-07-14 18:17:29 -07001297 #endif // XNN_NO_QU8_OPERATORS
Marat Dukhan3b745a42020-05-10 21:43:25 -07001298
Frank Barchardb40ee632021-12-30 11:10:02 -08001299 /**************************** S8 AArch32 Pre-NEON micro-kernels ****************************/
Marat Dukhan23147532021-08-16 07:26:56 -07001300 #ifndef XNN_NO_S8_OPERATORS
1301 init_flags |= XNN_INIT_FLAG_S8;
1302
Marat Dukhan61c0c9e2021-08-16 23:16:14 -07001303 xnn_params.s8.clamp = (struct vunary_parameters) {
1304 .ukernel = (xnn_univector_ukernel_function) xnn_s8_vclamp_ukernel__scalar_x4,
Marat Dukhan3b9b4bc2021-08-24 00:23:07 -07001305 .init.s8_minmax = xnn_init_s8_minmax_scalar_params,
Marat Dukhan61c0c9e2021-08-16 23:16:14 -07001306 .element_tile = 4,
1307 };
Marat Dukhan24abe6b2021-11-24 15:28:57 -08001308 xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
1309 .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__scalar_c1,
1310 .pixel_tile = 1,
1311 .channel_tile = 1,
1312 };
Marat Dukhan23147532021-08-16 07:26:56 -07001313 xnn_params.s8.maxpool = (struct maxpool_parameters) {
1314 .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__scalar_c1,
1315 .init.s8 = xnn_init_s8_minmax_scalar_params,
1316 .mr = 9,
1317 .qr = 8,
1318 };
1319 #endif // XNN_NO_S8_OPERATORS
1320
Frank Barchardb40ee632021-12-30 11:10:02 -08001321 /**************************** U8 AArch32 Pre-NEON micro-kernels ****************************/
Marat Dukhan3b745a42020-05-10 21:43:25 -07001322 #ifndef XNN_NO_U8_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07001323 init_flags |= XNN_INIT_FLAG_U8;
1324
Marat Dukhan94912792021-08-16 21:40:30 -07001325 xnn_params.u8.clamp = (struct vunary_parameters) {
1326 .ukernel = (xnn_univector_ukernel_function) xnn_u8_vclamp_ukernel__scalar_x4,
Marat Dukhan3b9b4bc2021-08-24 00:23:07 -07001327 .init.u8_minmax = xnn_init_u8_minmax_scalar_params,
Marat Dukhan94912792021-08-16 21:40:30 -07001328 .element_tile = 4,
1329 };
Marat Dukhan24abe6b2021-11-24 15:28:57 -08001330 xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
1331 .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__scalar_c1,
1332 .pixel_tile = 1,
1333 .channel_tile = 1,
1334 };
Marat Dukhan3b745a42020-05-10 21:43:25 -07001335 xnn_params.u8.maxpool = (struct maxpool_parameters) {
1336 .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__scalar_c1,
Marat Dukhan91ae1652021-08-15 19:19:49 -07001337 .init.u8 = xnn_init_u8_minmax_scalar_params,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001338 .mr = 9,
1339 .qr = 8,
1340 };
Marat Dukhan3b745a42020-05-10 21:43:25 -07001341 xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
1342 xnn_params.u8.rmax = xnn_u8_rmax_ukernel__scalar;
1343 #endif // XNN_NO_U8_OPERATORS
1344
Frank Barchardb40ee632021-12-30 11:10:02 -08001345 /**************************** X8 AArch32 Pre-NEON micro-kernels ****************************/
Marat Dukhan3b745a42020-05-10 21:43:25 -07001346 #ifndef XNN_NO_X8_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07001347 init_flags |= XNN_INIT_FLAG_X8;
1348
Marat Dukhand67539d2021-09-08 23:06:03 -07001349 xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar_x4;
Marat Dukhan3b745a42020-05-10 21:43:25 -07001350 xnn_params.x8.zip = (struct zip_parameters) {
1351 .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__scalar,
1352 .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__scalar,
1353 .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__scalar,
1354 .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__scalar,
1355 };
1356 #endif // XNN_NO_X8_OPERATORS
1357
Frank Barchardb40ee632021-12-30 11:10:02 -08001358 /**************************** F32 AArch32 Pre-NEON micro-kernels ****************************/
Marat Dukhan3b745a42020-05-10 21:43:25 -07001359 #ifndef XNN_NO_F32_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07001360 init_flags |= XNN_INIT_FLAG_F32;
1361
Marat Dukhan3b745a42020-05-10 21:43:25 -07001362 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x4__scalar);
1363 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x4__scalar);
1364 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x4__scalar);
1365 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x4__scalar);
Marat Dukhan467f6362020-05-22 23:21:55 -07001366 xnn_params.f32.gemm.relu.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_4x4__scalar);
1367 xnn_params.f32.gemm.relu.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_4x4__scalar);
1368 xnn_params.f32.gemm.relu.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x4__scalar);
1369 xnn_params.f32.gemm.relu.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x4__scalar);
Marat Dukhan3b745a42020-05-10 21:43:25 -07001370 xnn_params.f32.gemm.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x4__scalar);
1371 xnn_params.f32.gemm.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x4__scalar);
1372 xnn_params.f32.gemm.linear.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x4__scalar);
1373 xnn_params.f32.gemm.linear.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x4__scalar);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001374 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07001375 xnn_params.f32.gemm.mr = 4;
Marat Dukhan3b745a42020-05-10 21:43:25 -07001376 xnn_params.f32.gemm.nr = 4;
Marat Dukhanaefaef32020-04-09 07:09:34 -07001377
Marat Dukhan3b745a42020-05-10 21:43:25 -07001378 xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2__scalar);
1379 xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2__scalar),
1380 xnn_params.f32.gemm2.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x2__scalar);
1381 xnn_params.f32.gemm2.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2__scalar),
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001382 xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhan3b745a42020-05-10 21:43:25 -07001383 xnn_params.f32.gemm2.mr = 4;
1384 xnn_params.f32.gemm2.nr = 2;
Marat Dukhanaefaef32020-04-09 07:09:34 -07001385
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07001386 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x3__scalar_acc2;
1387 xnn_params.f32.dwconv[0].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x3__scalar_acc2;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001388 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhan3b745a42020-05-10 21:43:25 -07001389 xnn_params.f32.dwconv[0].channel_tile = 1;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07001390 xnn_params.f32.dwconv[0].primary_tile = 3;
Marat Dukhanaefaef32020-04-09 07:09:34 -07001391
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07001392 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x4__scalar_acc2;
1393 xnn_params.f32.dwconv[1].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x4__scalar_acc2;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001394 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhan3b745a42020-05-10 21:43:25 -07001395 xnn_params.f32.dwconv[1].channel_tile = 1;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07001396 xnn_params.f32.dwconv[1].primary_tile = 4;
Marat Dukhanaefaef32020-04-09 07:09:34 -07001397
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07001398 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x9__scalar_acc2;
1399 xnn_params.f32.dwconv[2].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x9__scalar_acc2;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001400 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhan3b745a42020-05-10 21:43:25 -07001401 xnn_params.f32.dwconv[2].channel_tile = 1;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07001402 xnn_params.f32.dwconv[2].primary_tile = 9;
1403
1404 xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x25__scalar_acc2;
1405 xnn_params.f32.dwconv[3].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x25__scalar_acc2;
1406 xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_scalar_params;
1407 xnn_params.f32.dwconv[3].channel_tile = 1;
1408 xnn_params.f32.dwconv[3].primary_tile = 25;
XNNPACK Teamb455b122019-09-27 18:10:33 -07001409
Marat Dukhan3b745a42020-05-10 21:43:25 -07001410 xnn_params.f32.avgpool = (struct avgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08001411 .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__scalar_c1,
1412 .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__scalar_c1,
1413 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
1414 .primary_tile = 9,
1415 .incremental_tile = 8,
1416 .channel_tile = 1,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001417 };
1418 xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08001419 .unipass = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__scalar_c1,
1420 .multipass = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__scalar_c1,
1421 .primary_tile = 9,
1422 .incremental_tile = 8,
1423 .channel_tile = 1,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001424 };
1425 xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08001426 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__scalar_c1,
1427 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__scalar_c1,
1428 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
1429 .update.f32 = xnn_update_f32_scaleminmax_scalar_params,
1430 .row_tile = 7,
1431 .channel_tile = 1,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001432 };
1433 xnn_params.f32.maxpool = (struct maxpool_parameters) {
1434 .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__scalar_c1,
Marat Dukhan91ae1652021-08-15 19:19:49 -07001435 .init.f32 = xnn_init_f32_minmax_scalar_params,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001436 .mr = 9,
1437 .qr = 8,
1438 };
1439 xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
1440 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__scalar_c1,
1441 .mr = 4,
1442 };
1443 xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
1444 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__scalar_c1,
1445 .mr = 9,
1446 };
1447 xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
1448 .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1,
1449 .mr = 9,
1450 .qr = 8,
1451 };
1452 xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
1453 .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__scalar_c2,
1454 .pixel_tile = 1,
1455 .channel_tile = 2,
1456 };
Marat Dukhane5efb162021-12-31 10:26:13 -08001457 xnn_params.f32.abs = (struct vunary_parameters) {
1458 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__scalar_x4,
1459 .element_tile = 4,
1460 };
Marat Dukhan94912792021-08-16 21:40:30 -07001461 xnn_params.f32.clamp = (struct vunary_parameters) {
1462 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vclamp_ukernel__scalar_x4,
1463 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
1464 .element_tile = 4,
1465 };
Marat Dukhan4a79ff22022-01-01 12:16:48 -08001466 xnn_params.f32.elu = (struct vunary_parameters) {
1467 .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x4,
1468 .init.f32_elu = xnn_init_f32_elu_scalar_rr2_lut16_p3_params,
1469 .element_tile = 4,
1470 };
Marat Dukhan561d0682021-12-23 16:12:35 -08001471 xnn_params.f32.hswish = (struct vunary_parameters) {
1472 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__scalar_x4,
Marat Dukhan0d10cc72021-12-23 19:49:19 -08001473 .init.f32_hswish = xnn_init_f32_hswish_scalar_params,
Marat Dukhan561d0682021-12-23 16:12:35 -08001474 .element_tile = 4,
1475 };
Marat Dukhan2894e992021-12-30 08:29:48 -08001476 xnn_params.f32.lrelu = (struct vunary_parameters) {
1477 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__scalar_x4,
1478 .init.f32_lrelu = xnn_init_f32_lrelu_scalar_params,
1479 .element_tile = 4,
1480 };
Marat Dukhane5efb162021-12-31 10:26:13 -08001481 xnn_params.f32.neg = (struct vunary_parameters) {
1482 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__scalar_x4,
1483 .element_tile = 4,
1484 };
Marat Dukhan0e801372022-01-04 00:10:41 -08001485 xnn_params.f32.rndne = (struct vunary_parameters) {
1486 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__scalar_libm_x1,
1487 .element_tile = 1,
1488 };
1489 xnn_params.f32.rndz = (struct vunary_parameters) {
1490 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__scalar_libm_x1,
1491 .element_tile = 1,
1492 };
1493 xnn_params.f32.rndu = (struct vunary_parameters) {
1494 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__scalar_libm_x1,
1495 .element_tile = 1,
1496 };
1497 xnn_params.f32.rndd = (struct vunary_parameters) {
1498 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__scalar_libm_x1,
1499 .element_tile = 1,
1500 };
Marat Dukhance834ad2022-01-03 00:22:01 -08001501 xnn_params.f32.sigmoid = (struct vunary_parameters) {
1502 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__scalar_rr2_lut64_p2_div_x2,
1503 .init.f32_sigmoid = xnn_init_f32_sigmoid_scalar_rr2_lut64_p2_params,
1504 .element_tile = 2,
1505 };
Marat Dukhane5efb162021-12-31 10:26:13 -08001506 xnn_params.f32.sqr = (struct vunary_parameters) {
1507 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__scalar_x4,
1508 .element_tile = 4,
1509 };
Marat Dukhane72b2822021-12-30 14:46:58 -08001510 xnn_params.f32.sqrt = (struct vunary_parameters) {
1511 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__scalar_sqrt_x1,
1512 .element_tile = 1,
1513 };
Marat Dukhan3b745a42020-05-10 21:43:25 -07001514 xnn_params.f32.prelu = (struct prelu_parameters) {
1515 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__scalar_2x4,
1516 .row_tile = 4,
1517 .channel_tile = 4,
1518 };
Marat Dukhan4a5c7712022-01-05 22:43:13 -08001519 xnn_params.f32.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
1520 .ukernel = xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_p5_x4_acc2,
1521 .init = xnn_init_f32_expminus_scalar_rr2_p5_params,
1522 .element_tile = 4,
1523 };
Marat Dukhan3b745a42020-05-10 21:43:25 -07001524 xnn_params.f32.rmax = xnn_f32_rmax_ukernel__scalar;
1525 xnn_params.f32.vadd = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07001526 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__scalar_x8,
1527 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__scalar_x8,
1528 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__scalar_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -08001529 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001530 .element_tile = 8,
1531 };
1532 xnn_params.f32.vdiv = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07001533 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__scalar_x2,
1534 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__scalar_x2,
1535 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__scalar_x2,
Marat Dukhanf6004972021-12-30 11:23:02 -08001536 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001537 .element_tile = 2,
1538 };
1539 xnn_params.f32.vmax = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07001540 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__scalar_x8,
1541 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__scalar_x8,
1542 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__scalar_x8,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001543 .element_tile = 8,
1544 };
1545 xnn_params.f32.vmin = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07001546 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__scalar_x8,
1547 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__scalar_x8,
1548 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__scalar_x8,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001549 .element_tile = 8,
1550 };
1551 xnn_params.f32.vmul = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07001552 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__scalar_x8,
1553 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__scalar_x8,
1554 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__scalar_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -08001555 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001556 .element_tile = 8,
1557 };
1558 xnn_params.f32.vsub = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07001559 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__scalar_x8,
1560 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__scalar_x8,
1561 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__scalar_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -08001562 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001563 .element_tile = 8,
1564 };
Marat Dukhanf7399262020-06-05 10:58:44 -07001565 xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07001566 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__scalar_x8,
1567 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x8,
1568 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x8,
Marat Dukhanf7399262020-06-05 10:58:44 -07001569 .element_tile = 8,
1570 };
Marat Dukhan3b745a42020-05-10 21:43:25 -07001571 xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
Marat Dukhan9531e9f2020-07-24 15:25:02 -07001572 .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c1__scalar_2x,
Marat Dukhanc58453f2021-05-24 14:47:38 -07001573 .init.f32 = xnn_init_f32_minmax_scalar_params,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001574 .channel_tile = 1,
1575 .row_tile = 2,
1576 };
1577 #ifndef XNN_NO_NCHW_OPERATORS
Marat Dukhancfbed0a2020-12-08 10:01:51 -08001578 init_flags |= XNN_INIT_FLAG_CHW_OPT;
1579
Marat Dukhan3b745a42020-05-10 21:43:25 -07001580 xnn_params.f32.spmm = (struct spmm_parameters) {
1581 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x1__scalar,
1582 .mr = 8,
1583 .nr = 1,
1584 };
1585 xnn_params.f32.spmm2 = (struct spmm_parameters) {
1586 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x2__scalar,
1587 .mr = 8,
1588 .nr = 2,
1589 };
1590 xnn_params.f32.spmm4 = (struct spmm_parameters) {
1591 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x4__scalar,
1592 .mr = 8,
1593 .nr = 4,
1594 };
Marat Dukhan1f29b802020-05-15 23:46:39 -07001595 xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
Marat Dukhan3b745a42020-05-10 21:43:25 -07001596 .ukernel_with_symm_padding =
Marat Dukhan1f29b802020-05-15 23:46:39 -07001597 (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__scalar_1x1,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001598 .output_channel_tile = 4,
1599 .output_height_tile = 1,
1600 .output_width_tile = 1,
1601 };
Marat Dukhanbf715f92020-10-23 20:17:00 -07001602 xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
Marat Dukhan4ddfab42020-12-07 17:33:11 -08001603 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1,
Marat Dukhan4ddfab42020-12-07 17:33:11 -08001604 .output_width_tile = 1,
1605 .output_height_tile = 4,
1606 };
1607 xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
1608 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_2x1_acc2,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001609 .output_width_tile = 1,
Marat Dukhan91249d22020-10-24 12:02:51 -07001610 .output_height_tile = 2,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001611 };
Marat Dukhanbf715f92020-10-23 20:17:00 -07001612 xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
Marat Dukhan4ddfab42020-12-07 17:33:11 -08001613 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001614 .output_width_tile = 1,
Marat Dukhan4ddfab42020-12-07 17:33:11 -08001615 .output_height_tile = 2,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001616 };
Marat Dukhanbf715f92020-10-23 20:17:00 -07001617 xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
Marat Dukhan4ddfab42020-12-07 17:33:11 -08001618 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc2,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001619 .output_width_tile = 1,
Marat Dukhan4ddfab42020-12-07 17:33:11 -08001620 .output_height_tile = 2,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001621 };
Marat Dukhan1f29b802020-05-15 23:46:39 -07001622 xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
1623 .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__scalar_x1,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001624 .channel_tile = 1,
1625 };
Artsiom Ablavatski97918102020-10-27 15:52:59 -07001626 xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
1627 .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__scalar_p4,
1628 .channel_tile = 1,
1629 .pixel_tile = 4,
1630 };
Marat Dukhan3b745a42020-05-10 21:43:25 -07001631 #endif // XNN_NO_NCHW_OPERATORS
1632 #endif // XNN_NO_F32_OPERATORS
1633
Frank Barchardb40ee632021-12-30 11:10:02 -08001634 /*************************** VCVT AArch32 Pre-NEON micro-kernels ***************************/
Marat Dukhanaf2ba002021-10-24 14:21:41 -07001635 #ifndef XNN_NO_VCVT_OPERATORS
1636 init_flags |= XNN_INIT_FLAG_VCVT;
1637
Marat Dukhan134f9842021-12-29 19:57:31 -08001638 xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
1639 .ukernel = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__scalar_x4,
1640 .init.f16_f32_cvt = xnn_init_f16_f32_cvt_scalar_params,
1641 .element_tile = 4,
1642 };
Marat Dukhanb7c1b712021-12-30 07:23:57 -08001643 xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
1644 .ukernel = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__scalar_fabsf_x2,
1645 .init.f32_f16_cvt = xnn_init_f32_f16_cvt_scalar_fabsf_params,
1646 .element_tile = 2,
1647 };
Marat Dukhaned2d7762021-12-03 23:51:19 -08001648 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
Marat Dukhanbdf10992022-01-04 09:20:14 -08001649 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__scalar_imagic_x4,
1650 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_scalar_imagic_params,
Marat Dukhaned2d7762021-12-03 23:51:19 -08001651 .element_tile = 4,
1652 };
1653 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
Marat Dukhanbdf10992022-01-04 09:20:14 -08001654 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__scalar_imagic_x4,
1655 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_scalar_imagic_params,
Marat Dukhaned2d7762021-12-03 23:51:19 -08001656 .element_tile = 4,
1657 };
Marat Dukhanf92206b2021-12-10 17:02:07 -08001658 xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
1659 .ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__scalar_x4,
1660 .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_scalar_params,
1661 .element_tile = 4,
1662 };
1663 xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
1664 .ukernel = (xnn_univector_ukernel_function) xnn_qu8_f32_vcvt_ukernel__scalar_x4,
1665 .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_scalar_params,
1666 .element_tile = 4,
1667 };
Marat Dukhanaf2ba002021-10-24 14:21:41 -07001668 #endif // XNN_NO_VCVT_OPERATORS
1669
Frank Barchardb40ee632021-12-30 11:10:02 -08001670 /**************************** X32 AArch32 Pre-NEON micro-kernels ****************************/
Marat Dukhan3b745a42020-05-10 21:43:25 -07001671 #ifndef XNN_NO_X32_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07001672 init_flags |= XNN_INIT_FLAG_X32;
1673
Marat Dukhan3b745a42020-05-10 21:43:25 -07001674 xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__scalar;
1675 xnn_params.x32.zip = (struct zip_parameters) {
1676 .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__scalar,
1677 .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__scalar,
1678 .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__scalar,
1679 .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__scalar,
1680 };
Artsiom Ablavatskibbe85062020-11-05 14:07:37 -08001681 #ifndef XNN_NO_NCHW_OPERATORS
Marat Dukhanad71b9a2020-11-20 00:01:51 -08001682 xnn_params.x32.depthtospace2d_chw2hwc = (struct depthtospace2d_chw2hwc_parameters) {
1683 .ukernel = (xnn_depthtospace2d_chw2hwc_ukernel_function) xnn_x32_depthtospace2d_chw2hwc_ukernel__scalar,
Artsiom Ablavatskibbe85062020-11-05 14:07:37 -08001684 .channel_tile = 1,
1685 .pixel_tile = 1,
1686 };
1687 #endif // XNN_NO_NCHW_OPERATORS
Marat Dukhan3b745a42020-05-10 21:43:25 -07001688 #endif // XNN_NO_X32_OPERATORS
Marat Dukhan933051b2021-08-07 16:26:15 -07001689
Frank Barchardb40ee632021-12-30 11:10:02 -08001690 /**************************** XX AArch32 Pre-NEON micro-kernels ****************************/
Marat Dukhan933051b2021-08-07 16:26:15 -07001691 #ifndef XNN_NO_XX_OPERATORS
1692 init_flags |= XNN_INIT_FLAG_XX;
1693
1694 xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
1695 xnn_params.xx.fill = (struct fill_parameters) {
1696 .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__scalar_x16,
1697 .row_tile = 1,
1698 };
Marat Dukhan0461f2d2021-08-08 12:36:29 -07001699 xnn_params.xx.pad = (struct pad_parameters) {
1700 .ukernel = (xnn_pad_ukernel_function) xnn_xx_pad_ukernel__scalar,
1701 .row_tile = 1,
1702 };
Marat Dukhan933051b2021-08-07 16:26:15 -07001703 #endif // XNN_NO_XX_OPERATORS
Marat Dukhan3b745a42020-05-10 21:43:25 -07001704 }
XNNPACK Teamb455b122019-09-27 18:10:33 -07001705
Marat Dukhan1dadbf72019-10-01 10:46:20 -07001706#elif XNN_ARCH_ARM64
XNNPACK Teamb455b122019-09-27 18:10:33 -07001707
Frank Barchardb40ee632021-12-30 11:10:02 -08001708 /**************************** QC8 AArch64 micro-kernels ****************************/
Marat Dukhan898d5852021-06-30 21:18:34 -07001709 #ifndef XNN_NO_QC8_OPERATORS
1710 init_flags |= XNN_INIT_FLAG_QC8;
1711
Marat Dukhan75d1b792021-07-01 13:00:28 -07001712 #if XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
1713 #if XNN_ENABLE_ASSEMBLY
1714 if (cpuinfo_has_arm_neon_dot()) {
1715 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128);
1716 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot);
1717 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128);
1718 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16c4__neondot);
Marat Dukhan7988a182021-12-06 22:00:33 -08001719 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Marat Dukhan75d1b792021-07-01 13:00:28 -07001720 xnn_params.qc8.gemm.mr = 4;
1721 xnn_params.qc8.gemm.nr = 16;
1722 xnn_params.qc8.gemm.log2_kr = 2;
1723 } else {
Frank Barcharde22685a2021-11-12 11:36:58 -08001724 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal);
1725 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal);
1726 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal);
1727 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal);
Marat Dukhan7988a182021-12-06 22:00:33 -08001728 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Marat Dukhan75d1b792021-07-01 13:00:28 -07001729 xnn_params.qc8.gemm.mr = 2;
1730 xnn_params.qc8.gemm.nr = 8;
1731 xnn_params.qc8.gemm.log2_kr = 3;
1732 }
1733 #else // !XNN_ENABLE_ASSEMBLY
1734 if (cpuinfo_has_arm_neon_dot()) {
1735 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__neondot);
1736 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot);
1737 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__neondot);
1738 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16c4__neondot);
Marat Dukhan7988a182021-12-06 22:00:33 -08001739 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Marat Dukhan75d1b792021-07-01 13:00:28 -07001740 xnn_params.qc8.gemm.mr = 4;
1741 xnn_params.qc8.gemm.nr = 16;
1742 xnn_params.qc8.gemm.log2_kr = 2;
1743 } else {
Frank Barcharde22685a2021-11-12 11:36:58 -08001744 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
1745 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
1746 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
1747 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
Marat Dukhan7988a182021-12-06 22:00:33 -08001748 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Marat Dukhan75d1b792021-07-01 13:00:28 -07001749 xnn_params.qc8.gemm.mr = 2;
1750 xnn_params.qc8.gemm.nr = 8;
1751 xnn_params.qc8.gemm.log2_kr = 1;
Frank Barcharde7043ff2021-11-10 14:50:08 -08001752 xnn_params.qc8.gemm.log2_sr = 2;
Marat Dukhan75d1b792021-07-01 13:00:28 -07001753 }
1754 #endif // XNN_ENABLE_ASSEMBLY
1755 #else // !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
1756 #if XNN_ENABLE_ASSEMBLY
1757 if (cpuinfo_has_arm_neon_dot()) {
1758 switch (cpuinfo_get_core(0)->uarch) {
1759 case cpuinfo_uarch_cortex_a55:
1760 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55);
1761 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55);
1762 break;
1763 case cpuinfo_uarch_cortex_x1:
1764 case cpuinfo_uarch_cortex_a78:
1765 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128);
1766 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128);
1767 break;
1768 default:
1769 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64);
1770 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64);
1771 break;
1772 }
1773 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot);
1774 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16c4__neondot);
Marat Dukhan7988a182021-12-06 22:00:33 -08001775 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Marat Dukhan75d1b792021-07-01 13:00:28 -07001776 xnn_params.qc8.gemm.mr = 4;
1777 xnn_params.qc8.gemm.nr = 16;
1778 xnn_params.qc8.gemm.log2_kr = 2;
1779 } else {
1780 switch (cpuinfo_get_core(0)->uarch) {
Frank Barchard6c34dbf2021-11-22 16:14:53 -08001781 case cpuinfo_uarch_cortex_a35:
1782 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64);
1783 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64);
1784 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane);
1785 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane);
Marat Dukhan7988a182021-12-06 22:00:33 -08001786 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Frank Barchard6c34dbf2021-11-22 16:14:53 -08001787 xnn_params.qc8.gemm.mr = 4;
1788 xnn_params.qc8.gemm.nr = 16;
1789 break;
1790
Marat Dukhan75d1b792021-07-01 13:00:28 -07001791 case cpuinfo_uarch_cortex_a53:
1792 case cpuinfo_uarch_cortex_a55r0:
1793 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53);
1794 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53);
1795 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane);
1796 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane);
Marat Dukhan7988a182021-12-06 22:00:33 -08001797 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Marat Dukhan75d1b792021-07-01 13:00:28 -07001798 xnn_params.qc8.gemm.mr = 4;
1799 xnn_params.qc8.gemm.nr = 16;
1800 break;
1801
1802 case cpuinfo_uarch_cortex_a72:
1803 case cpuinfo_uarch_cortex_a73:
1804 case cpuinfo_uarch_kryo:
Frank Barcharde22685a2021-11-12 11:36:58 -08001805 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm);
1806 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm);
1807 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm);
1808 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm);
Marat Dukhan7988a182021-12-06 22:00:33 -08001809 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Marat Dukhan75d1b792021-07-01 13:00:28 -07001810 xnn_params.qc8.gemm.mr = 2;
1811 xnn_params.qc8.gemm.nr = 8;
1812 xnn_params.qc8.gemm.log2_kr = 3;
1813 break;
1814
1815 default:
Frank Barcharde22685a2021-11-12 11:36:58 -08001816 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal);
1817 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal);
1818 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal);
1819 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal);
Marat Dukhan7988a182021-12-06 22:00:33 -08001820 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Marat Dukhan75d1b792021-07-01 13:00:28 -07001821 xnn_params.qc8.gemm.mr = 2;
1822 xnn_params.qc8.gemm.nr = 8;
1823 xnn_params.qc8.gemm.log2_kr = 3;
1824 break;
1825 }
1826 }
1827 #if XNN_MAX_UARCH_TYPES > 1
1828 {
1829 /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
1830 const uint32_t mr = xnn_params.qc8.gemm.mr;
1831 const uint32_t nr = xnn_params.qc8.gemm.nr;
1832 const uint32_t log2_kr = xnn_params.qc8.gemm.log2_kr;
1833 for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
1834 const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
1835 if (uarch_info == NULL) {
1836 /* No more microarchitectures in the system */
1837 break;
1838 }
1839
1840 switch (uarch_info->uarch) {
1841 case cpuinfo_uarch_cortex_a53:
1842 case cpuinfo_uarch_cortex_a55r0:
1843 if (mr == 2 && nr == 8 && log2_kr == 3) {
Frank Barcharde22685a2021-11-12 11:36:58 -08001844 xnn_params.qc8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53;
1845 xnn_params.qc8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53;
1846 xnn_params.qc8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53;
1847 xnn_params.qc8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53;
Marat Dukhan75d1b792021-07-01 13:00:28 -07001848 }
1849 break;
1850
1851 case cpuinfo_uarch_cortex_a55:
Frank Barchardc37b8da2021-09-01 00:35:19 -07001852 if (mr == 4 && nr == 16 && log2_kr == 2 && cpuinfo_has_arm_neon_dot()) {
Marat Dukhan75d1b792021-07-01 13:00:28 -07001853 xnn_params.qc8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55;
1854 xnn_params.qc8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55;
1855 xnn_params.qc8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot;
1856 xnn_params.qc8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16c4__neondot;
1857 }
1858 break;
1859 default:
1860 break;
1861 }
1862 }
1863 }
1864 #endif // XNN_MAX_UARCH_TYPES > 1
1865 #else // !XNN_ENABLE_ASSEMBLY
1866 if (cpuinfo_has_arm_neon_dot()) {
1867 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__neondot);
1868 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot);
1869 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__neondot);
1870 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16c4__neondot);
Marat Dukhan7988a182021-12-06 22:00:33 -08001871 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Marat Dukhan75d1b792021-07-01 13:00:28 -07001872 xnn_params.qc8.gemm.mr = 4;
1873 xnn_params.qc8.gemm.nr = 16;
1874 xnn_params.qc8.gemm.log2_kr = 2;
1875 } else {
Frank Barcharde22685a2021-11-12 11:36:58 -08001876 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
1877 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
1878 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
1879 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
Marat Dukhan7988a182021-12-06 22:00:33 -08001880 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Marat Dukhan75d1b792021-07-01 13:00:28 -07001881 xnn_params.qc8.gemm.mr = 2;
1882 xnn_params.qc8.gemm.nr = 8;
1883 xnn_params.qc8.gemm.log2_kr = 1;
Frank Barcharde7043ff2021-11-10 14:50:08 -08001884 xnn_params.qc8.gemm.log2_sr = 2;
Marat Dukhan75d1b792021-07-01 13:00:28 -07001885 }
1886 #endif // XNN_ENABLE_ASSEMBLY
1887 #endif // XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
Marat Dukhan898d5852021-06-30 21:18:34 -07001888
Frank Barchard0d065732021-08-31 00:01:40 -07001889 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__neonv8_mla8_ld64;
Marat Dukhan7988a182021-12-06 22:00:33 -08001890 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Frank Barchard0d065732021-08-31 00:01:40 -07001891 xnn_params.qc8.dwconv[0].channel_tile = 16;
Marat Dukhan898d5852021-06-30 21:18:34 -07001892 xnn_params.qc8.dwconv[0].primary_tile = 9;
Frank Barchard7da8b022021-08-31 09:49:10 -07001893 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x25__neonv8_mla8_ld64;
Marat Dukhan7988a182021-12-06 22:00:33 -08001894 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Frank Barchard7da8b022021-08-31 09:49:10 -07001895 xnn_params.qc8.dwconv[1].channel_tile = 16;
Marat Dukhan898d5852021-06-30 21:18:34 -07001896 xnn_params.qc8.dwconv[1].primary_tile = 25;
1897 #endif // XNN_NO_QC8_OPERATORS
1898
Frank Barchardb40ee632021-12-30 11:10:02 -08001899 /**************************** QS8 AArch64 micro-kernels ****************************/
Marat Dukhanf28cddf2020-08-10 14:05:02 -07001900 #ifndef XNN_NO_QS8_OPERATORS
1901 init_flags |= XNN_INIT_FLAG_QS8;
1902
Marat Dukhandfe47b92020-12-14 02:48:43 -08001903 #if XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
Frank Barchardbc0c7292020-10-06 13:36:54 -07001904 #if XNN_ENABLE_ASSEMBLY
Marat Dukhan31677ad2020-10-13 23:59:31 -07001905 if (cpuinfo_has_arm_neon_dot()) {
Frank Barchard22f9a9f2021-07-21 11:35:27 -07001906 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128);
1907 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
1908 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128);
1909 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
1910 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
Marat Dukhan31677ad2020-10-13 23:59:31 -07001911 xnn_params.qs8.gemm.mr = 4;
1912 xnn_params.qs8.gemm.nr = 16;
1913 xnn_params.qs8.gemm.log2_kr = 2;
1914 } else {
Frank Barcharde22685a2021-11-12 11:36:58 -08001915 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal);
1916 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal);
1917 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal);
1918 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal);
Frank Barchard22f9a9f2021-07-21 11:35:27 -07001919 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
Marat Dukhan31677ad2020-10-13 23:59:31 -07001920 xnn_params.qs8.gemm.mr = 2;
Frank Barcharda414daa2021-02-23 15:50:39 -08001921 xnn_params.qs8.gemm.nr = 8;
Frank Barchardbbf51822021-03-12 10:37:31 -08001922 xnn_params.qs8.gemm.log2_kr = 3;
Frank Barchard1e8590e2020-10-12 21:20:46 -07001923 }
Marat Dukhan31677ad2020-10-13 23:59:31 -07001924 #else // !XNN_ENABLE_ASSEMBLY
1925 if (cpuinfo_has_arm_neon_dot()) {
Frank Barchard22f9a9f2021-07-21 11:35:27 -07001926 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neondot);
1927 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
1928 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neondot);
1929 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
1930 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
Marat Dukhan31677ad2020-10-13 23:59:31 -07001931 xnn_params.qs8.gemm.mr = 4;
1932 xnn_params.qs8.gemm.nr = 16;
1933 xnn_params.qs8.gemm.log2_kr = 2;
1934 } else {
Frank Barcharde22685a2021-11-12 11:36:58 -08001935 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
1936 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
1937 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
1938 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
Frank Barchard22f9a9f2021-07-21 11:35:27 -07001939 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
Marat Dukhan31677ad2020-10-13 23:59:31 -07001940 xnn_params.qs8.gemm.mr = 2;
Frank Barcharda414daa2021-02-23 15:50:39 -08001941 xnn_params.qs8.gemm.nr = 8;
1942 xnn_params.qs8.gemm.log2_kr = 1;
Frank Barchard66ae2572021-11-02 17:36:21 -07001943 xnn_params.qs8.gemm.log2_sr = 2;
Marat Dukhan31677ad2020-10-13 23:59:31 -07001944 }
1945 #endif // XNN_ENABLE_ASSEMBLY
Marat Dukhandfe47b92020-12-14 02:48:43 -08001946 #else // !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
Marat Dukhan31677ad2020-10-13 23:59:31 -07001947 #if XNN_ENABLE_ASSEMBLY
1948 if (cpuinfo_has_arm_neon_dot()) {
1949 switch (cpuinfo_get_core(0)->uarch) {
1950 case cpuinfo_uarch_cortex_a55:
Frank Barchard22f9a9f2021-07-21 11:35:27 -07001951 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55);
1952 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55);
Marat Dukhan31677ad2020-10-13 23:59:31 -07001953 break;
Frank Barchard0ae35f22021-06-15 17:34:24 -07001954 case cpuinfo_uarch_cortex_x1:
1955 case cpuinfo_uarch_cortex_a78:
Frank Barchard22f9a9f2021-07-21 11:35:27 -07001956 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128);
1957 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128);
Frank Barchard0ae35f22021-06-15 17:34:24 -07001958 break;
Marat Dukhan31677ad2020-10-13 23:59:31 -07001959 default:
Frank Barchard22f9a9f2021-07-21 11:35:27 -07001960 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld64);
1961 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld64);
Marat Dukhan31677ad2020-10-13 23:59:31 -07001962 break;
1963 }
Frank Barchard22f9a9f2021-07-21 11:35:27 -07001964 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
1965 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
1966 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
Marat Dukhan31677ad2020-10-13 23:59:31 -07001967 xnn_params.qs8.gemm.mr = 4;
1968 xnn_params.qs8.gemm.nr = 16;
1969 xnn_params.qs8.gemm.log2_kr = 2;
1970 } else {
Frank Barchard2a995e72021-04-13 16:24:25 -07001971 switch (cpuinfo_get_core(0)->uarch) {
Frank Barchard6c34dbf2021-11-22 16:14:53 -08001972 case cpuinfo_uarch_cortex_a35:
1973 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64);
1974 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64);
1975 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
1976 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
1977 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
1978 xnn_params.qs8.gemm.mr = 4;
1979 xnn_params.qs8.gemm.nr = 16;
1980 break;
1981
Frank Barchard2a995e72021-04-13 16:24:25 -07001982 case cpuinfo_uarch_cortex_a53:
Frank Barchardfb5983d2021-04-20 14:09:08 -07001983 case cpuinfo_uarch_cortex_a55r0:
Frank Barchard22f9a9f2021-07-21 11:35:27 -07001984 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53);
1985 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53);
1986 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
1987 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
1988 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
Frank Barchardd4416d62021-05-17 15:51:37 -07001989 xnn_params.qs8.gemm.mr = 4;
1990 xnn_params.qs8.gemm.nr = 16;
Frank Barchard6ac1d182021-04-14 13:47:07 -07001991 break;
1992
Frank Barchard2a995e72021-04-13 16:24:25 -07001993 case cpuinfo_uarch_cortex_a72:
1994 case cpuinfo_uarch_cortex_a73:
1995 case cpuinfo_uarch_kryo:
Frank Barcharde22685a2021-11-12 11:36:58 -08001996 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm);
1997 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm);
1998 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm);
1999 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm);
Frank Barchard22f9a9f2021-07-21 11:35:27 -07002000 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
Frank Barchard2a995e72021-04-13 16:24:25 -07002001 xnn_params.qs8.gemm.mr = 2;
2002 xnn_params.qs8.gemm.nr = 8;
2003 xnn_params.qs8.gemm.log2_kr = 3;
Frank Barchardc77fc4c2021-04-14 13:28:01 -07002004 break;
Frank Barchard2a995e72021-04-13 16:24:25 -07002005
2006 default:
Frank Barcharde22685a2021-11-12 11:36:58 -08002007 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal);
2008 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal);
2009 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal);
2010 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal);
Frank Barchard22f9a9f2021-07-21 11:35:27 -07002011 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
Frank Barchard2a995e72021-04-13 16:24:25 -07002012 xnn_params.qs8.gemm.mr = 2;
2013 xnn_params.qs8.gemm.nr = 8;
2014 xnn_params.qs8.gemm.log2_kr = 3;
Frank Barchardc77fc4c2021-04-14 13:28:01 -07002015 break;
Frank Barchard2a995e72021-04-13 16:24:25 -07002016 }
Marat Dukhan31677ad2020-10-13 23:59:31 -07002017 }
2018 #if XNN_MAX_UARCH_TYPES > 1
2019 {
2020 /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
2021 const uint32_t mr = xnn_params.qs8.gemm.mr;
2022 const uint32_t nr = xnn_params.qs8.gemm.nr;
2023 const uint32_t log2_kr = xnn_params.qs8.gemm.log2_kr;
2024 for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
2025 const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
2026 if (uarch_info == NULL) {
2027 /* No more microarchitectures in the system */
2028 break;
2029 }
2030
2031 switch (uarch_info->uarch) {
Frank Barchard2a995e72021-04-13 16:24:25 -07002032 case cpuinfo_uarch_cortex_a53:
Frank Barchard90f520b2021-04-26 18:01:51 -07002033 case cpuinfo_uarch_cortex_a55r0:
Frank Barchard2a995e72021-04-13 16:24:25 -07002034 if (mr == 2 && nr == 8 && log2_kr == 3) {
Frank Barcharde22685a2021-11-12 11:36:58 -08002035 xnn_params.qs8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53;
2036 xnn_params.qs8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53;
2037 xnn_params.qs8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53;
2038 xnn_params.qs8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53;
Frank Barchard2a995e72021-04-13 16:24:25 -07002039 }
2040 break;
2041
Marat Dukhan31677ad2020-10-13 23:59:31 -07002042 case cpuinfo_uarch_cortex_a55:
Frank Barchardc37b8da2021-09-01 00:35:19 -07002043 if (mr == 4 && nr == 16 && log2_kr == 2 && cpuinfo_has_arm_neon_dot()) {
Frank Barchard22f9a9f2021-07-21 11:35:27 -07002044 xnn_params.qs8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55;
2045 xnn_params.qs8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55;
2046 xnn_params.qs8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot;
2047 xnn_params.qs8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot;
Marat Dukhan31677ad2020-10-13 23:59:31 -07002048 }
2049 break;
2050 default:
2051 break;
2052 }
2053 }
2054 }
2055 #endif // XNN_MAX_UARCH_TYPES > 1
2056 #else // !XNN_ENABLE_ASSEMBLY
2057 if (cpuinfo_has_arm_neon_dot()) {
Frank Barchard22f9a9f2021-07-21 11:35:27 -07002058 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neondot);
2059 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
2060 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neondot);
2061 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
2062 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
Marat Dukhan31677ad2020-10-13 23:59:31 -07002063 xnn_params.qs8.gemm.mr = 4;
2064 xnn_params.qs8.gemm.nr = 16;
2065 xnn_params.qs8.gemm.log2_kr = 2;
2066 } else {
Frank Barcharde22685a2021-11-12 11:36:58 -08002067 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
2068 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
2069 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
2070 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
Frank Barchard22f9a9f2021-07-21 11:35:27 -07002071 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
Marat Dukhan31677ad2020-10-13 23:59:31 -07002072 xnn_params.qs8.gemm.mr = 2;
Frank Barcharda414daa2021-02-23 15:50:39 -08002073 xnn_params.qs8.gemm.nr = 8;
2074 xnn_params.qs8.gemm.log2_kr = 1;
Frank Barcharde7043ff2021-11-10 14:50:08 -08002075 xnn_params.qs8.gemm.log2_sr = 2;
Marat Dukhan31677ad2020-10-13 23:59:31 -07002076 }
2077 #endif // XNN_ENABLE_ASSEMBLY
Marat Dukhandfe47b92020-12-14 02:48:43 -08002078 #endif // XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
Marat Dukhanf28cddf2020-08-10 14:05:02 -07002079
Frank Barchard0d065732021-08-31 00:01:40 -07002080 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mla8_ld64;
Marat Dukhan4ba70b72021-07-19 11:20:16 -07002081 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
Frank Barchard0d065732021-08-31 00:01:40 -07002082 xnn_params.qs8.dwconv[0].channel_tile = 16;
Marat Dukhanf28cddf2020-08-10 14:05:02 -07002083 xnn_params.qs8.dwconv[0].primary_tile = 9;
Frank Barchard7da8b022021-08-31 09:49:10 -07002084 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x25__neon_mla8_ld64;
Marat Dukhan4ba70b72021-07-19 11:20:16 -07002085 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
Frank Barchard7da8b022021-08-31 09:49:10 -07002086 xnn_params.qs8.dwconv[1].channel_tile = 16;
Marat Dukhan4ed14882021-05-12 17:50:40 -07002087 xnn_params.qs8.dwconv[1].primary_tile = 25;
Marat Dukhanf28cddf2020-08-10 14:05:02 -07002088
2089 xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
Marat Dukhan85755042022-01-13 01:46:05 -08002090 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8,
2091 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8,
2092 .init.qs8 = xnn_init_qs8_avgpool_minmax_rndnu_neon_params,
2093 .update.qs8 = xnn_update_qs8_avgpool_minmax_rndnu_neon_params,
Marat Dukhanda382d12022-01-07 19:51:20 -08002094 .row_tile = 7,
2095 .channel_tile = 8,
Marat Dukhanf28cddf2020-08-10 14:05:02 -07002096 };
Marat Dukhanff209482020-09-03 14:26:53 -07002097
2098 xnn_params.qs8.vadd = (struct vbinary_parameters) {
Marat Dukhan01debd92021-07-29 18:14:21 -07002099 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__neon_ld64_x32,
2100 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x32,
2101 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x32,
Marat Dukhan64287252021-09-07 16:20:03 -07002102 .init.qs8_addsub = xnn_init_qs8_add_minmax_neon_params,
Marat Dukhan01debd92021-07-29 18:14:21 -07002103 .element_tile = 32,
Marat Dukhanff209482020-09-03 14:26:53 -07002104 };
Marat Dukhan0853b8a2021-08-03 01:01:53 -07002105 xnn_params.qs8.vmul = (struct vbinary_parameters) {
Marat Dukhan33a98fa2022-01-13 00:08:57 -08002106 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_rndnu_ukernel__neon_ld64_x16,
2107 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16,
2108 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16,
2109 .init.qs8_mul = xnn_init_qs8_mul_minmax_rndnu_neon_params,
Marat Dukhan0853b8a2021-08-03 01:01:53 -07002110 .element_tile = 16,
2111 };
Marat Dukhanf28cddf2020-08-10 14:05:02 -07002112 #endif // XNN_NO_QS8_OPERATORS
2113
Frank Barchardb40ee632021-12-30 11:10:02 -08002114 /**************************** QU8 AArch64 micro-kernels ****************************/
Marat Dukhan08b7a972020-07-14 18:17:29 -07002115 #ifndef XNN_NO_QU8_OPERATORS
2116 init_flags |= XNN_INIT_FLAG_QU8;
Frank Barchard20255152021-08-11 14:01:45 -07002117
Frank Barcharda962f1e2021-08-02 13:52:15 -07002118 #if XNN_ENABLE_ASSEMBLY
Frank Barchard20255152021-08-11 14:01:45 -07002119 if (cpuinfo_has_arm_neon_dot()) {
Frank Barchard8b698022021-08-26 11:17:32 -07002120 switch (cpuinfo_get_core(0)->uarch) {
2121 case cpuinfo_uarch_cortex_a55:
Frank Barcharda49e41f2021-08-31 20:30:24 -07002122 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55);
2123 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55);
2124 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
2125 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
Frank Barchard8b698022021-08-26 11:17:32 -07002126 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
2127 xnn_params.qu8.gemm.mr = 4;
Frank Barcharda49e41f2021-08-31 20:30:24 -07002128 xnn_params.qu8.gemm.nr = 16;
Frank Barchard8b698022021-08-26 11:17:32 -07002129 xnn_params.qu8.gemm.log2_kr = 2;
2130 break;
2131 default:
2132 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128);
2133 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128);
2134 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
2135 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
2136 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
2137 xnn_params.qu8.gemm.mr = 4;
2138 xnn_params.qu8.gemm.nr = 16;
2139 xnn_params.qu8.gemm.log2_kr = 2;
2140 break;
2141 }
Frank Barchard20255152021-08-11 14:01:45 -07002142 } else {
2143 switch (cpuinfo_get_core(0)->uarch) {
2144 case cpuinfo_uarch_cortex_a53:
2145 case cpuinfo_uarch_cortex_a55r0:
Frank Barchard20255152021-08-11 14:01:45 -07002146 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53);
2147 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53);
2148 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
2149 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
2150 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
2151 xnn_params.qu8.gemm.mr = 4;
2152 xnn_params.qu8.gemm.nr = 16;
2153 break;
Frank Barchardf479a1c2021-08-03 10:20:30 -07002154
Frank Barchard20255152021-08-11 14:01:45 -07002155 case cpuinfo_uarch_cortex_a57:
2156 case cpuinfo_uarch_cortex_a72:
2157 case cpuinfo_uarch_cortex_a73:
2158 case cpuinfo_uarch_cortex_a75:
2159 case cpuinfo_uarch_cortex_a76:
2160 case cpuinfo_uarch_exynos_m1:
2161 case cpuinfo_uarch_exynos_m2:
2162 case cpuinfo_uarch_exynos_m3:
2163 case cpuinfo_uarch_exynos_m4:
2164 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75);
2165 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75);
2166 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
2167 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
2168 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
2169 xnn_params.qu8.gemm.mr = 4;
2170 xnn_params.qu8.gemm.nr = 16;
2171 break;
Frank Barchardf479a1c2021-08-03 10:20:30 -07002172
Frank Barchard20255152021-08-11 14:01:45 -07002173 case cpuinfo_uarch_kryo:
2174 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane);
2175 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane);
2176 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
2177 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
2178 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
2179 xnn_params.qu8.gemm.mr = 4;
2180 xnn_params.qu8.gemm.nr = 16;
2181 break;
2182
2183 default:
2184 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75);
2185 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75);
2186 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
2187 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
2188 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
2189 xnn_params.qu8.gemm.mr = 4;
2190 xnn_params.qu8.gemm.nr = 16;
2191 break;
2192 }
Frank Barchardf479a1c2021-08-03 10:20:30 -07002193 }
Frank Barchardc37b8da2021-09-01 00:35:19 -07002194 #if XNN_MAX_UARCH_TYPES > 1
2195 {
2196 /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
2197 const uint32_t mr = xnn_params.qu8.gemm.mr;
2198 const uint32_t nr = xnn_params.qu8.gemm.nr;
2199 const uint32_t log2_kr = xnn_params.qu8.gemm.log2_kr;
2200 for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
2201 const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
2202 if (uarch_info == NULL) {
2203 /* No more microarchitectures in the system */
2204 break;
2205 }
2206
2207 switch (uarch_info->uarch) {
2208 case cpuinfo_uarch_cortex_a53:
2209 case cpuinfo_uarch_cortex_a55r0:
2210 if (mr == 4 && nr == 16 && log2_kr == 0) {
2211 xnn_params.qu8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53;
2212 xnn_params.qu8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53;
2213 }
2214 break;
2215
2216 case cpuinfo_uarch_cortex_a55:
2217 if (mr == 4 && nr == 16 && log2_kr == 2 && cpuinfo_has_arm_neon_dot()) {
2218 xnn_params.qu8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55;
2219 xnn_params.qu8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55;
2220 }
2221 break;
2222 default:
2223 break;
2224 }
2225 }
2226 }
2227 #endif // XNN_MAX_UARCH_TYPES > 1
Frank Barchard20255152021-08-11 14:01:45 -07002228 #else // !XNN_ENABLE_ASSEMBLY
2229 if (cpuinfo_has_arm_neon_dot()) {
2230 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__neondot);
2231 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16c4__neondot);
2232 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
2233 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
2234 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
2235 xnn_params.qu8.gemm.mr = 4;
2236 xnn_params.qu8.gemm.nr = 16;
2237 xnn_params.qu8.gemm.log2_kr = 2;
2238 } else {
2239 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane);
2240 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane);
2241 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
2242 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
2243 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
2244 xnn_params.qu8.gemm.mr = 4;
2245 xnn_params.qu8.gemm.nr = 16;
Marat Dukhan947805b2021-12-07 14:32:09 -08002246 }
Frank Barchard20255152021-08-11 14:01:45 -07002247 #endif // XNN_ENABLE_ASSEMBLY
Marat Dukhanaefaef32020-04-09 07:09:34 -07002248
Frank Barchard354cbc62021-09-27 21:42:41 -07002249 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mul8;
Marat Dukhan73a899a2021-07-27 00:10:38 -07002250 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
Frank Barchard354cbc62021-09-27 21:42:41 -07002251 xnn_params.qu8.dwconv[0].channel_tile = 16;
Marat Dukhan08b7a972020-07-14 18:17:29 -07002252 xnn_params.qu8.dwconv[0].primary_tile = 9;
Frank Barchard354cbc62021-09-27 21:42:41 -07002253 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_rndnu_ukernel_up8x25__neon_mul8;
Marat Dukhan73a899a2021-07-27 00:10:38 -07002254 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
Frank Barchard354cbc62021-09-27 21:42:41 -07002255 xnn_params.qu8.dwconv[1].channel_tile = 8;
Marat Dukhan81721352021-07-15 18:26:08 -07002256 xnn_params.qu8.dwconv[1].primary_tile = 25;
Marat Dukhanaefaef32020-04-09 07:09:34 -07002257
Marat Dukhan08b7a972020-07-14 18:17:29 -07002258 xnn_params.qu8.avgpool = (struct avgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08002259 .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__neon_c8,
2260 .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__neon_c8,
Marat Dukhan3c949a32022-01-09 20:12:33 -08002261 .init.qu8 = xnn_init_qu8_avgpool_minmax_neon_params,
Marat Dukhanda382d12022-01-07 19:51:20 -08002262 .primary_tile = 9,
2263 .incremental_tile = 8,
2264 .channel_tile = 8,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002265 };
Marat Dukhan08b7a972020-07-14 18:17:29 -07002266 xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
Marat Dukhan85755042022-01-13 01:46:05 -08002267 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8,
2268 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8,
2269 .init.qu8 = xnn_init_qu8_avgpool_minmax_rndnu_neon_params,
2270 .update.qu8 = xnn_update_qu8_avgpool_minmax_rndnu_neon_params,
Marat Dukhanda382d12022-01-07 19:51:20 -08002271 .row_tile = 7,
2272 .channel_tile = 8,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002273 };
Marat Dukhandb007cd2021-07-20 23:42:39 -07002274 xnn_params.qu8.vadd = (struct vbinary_parameters) {
Frank Barchard0a3093c2021-08-31 09:58:11 -07002275 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__neon_ld64_x32,
2276 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__neon_ld64_x32,
2277 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__neon_ld64_x32,
Marat Dukhan64287252021-09-07 16:20:03 -07002278 .init.qu8_addsub = xnn_init_qu8_add_minmax_neon_params,
Marat Dukhandb007cd2021-07-20 23:42:39 -07002279 .element_tile = 8,
2280 };
Marat Dukhan0853b8a2021-08-03 01:01:53 -07002281 xnn_params.qu8.vmul = (struct vbinary_parameters) {
Marat Dukhan33a98fa2022-01-13 00:08:57 -08002282 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_rndnu_ukernel__neon_ld64_x16,
2283 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16,
2284 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16,
2285 .init.qu8_mul = xnn_init_qu8_mul_minmax_rndnu_neon_params,
Marat Dukhan0853b8a2021-08-03 01:01:53 -07002286 .element_tile = 16,
2287 };
Marat Dukhan08b7a972020-07-14 18:17:29 -07002288 #endif // XNN_NO_QU8_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07002289
Frank Barchardb40ee632021-12-30 11:10:02 -08002290 /**************************** S8 AArch64 micro-kernels ****************************/
Marat Dukhan23147532021-08-16 07:26:56 -07002291 #ifndef XNN_NO_S8_OPERATORS
2292 init_flags |= XNN_INIT_FLAG_S8;
2293
Marat Dukhan61c0c9e2021-08-16 23:16:14 -07002294 xnn_params.s8.clamp = (struct vunary_parameters) {
2295 .ukernel = (xnn_univector_ukernel_function) xnn_s8_vclamp_ukernel__neon_x64,
2296 .init.s8_minmax = xnn_init_s8_minmax_neon_params,
2297 .element_tile = 64,
2298 };
Marat Dukhan24abe6b2021-11-24 15:28:57 -08002299 xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
2300 .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__neon_c16,
2301 .pixel_tile = 1,
2302 .channel_tile = 16,
2303 };
Marat Dukhan23147532021-08-16 07:26:56 -07002304 xnn_params.s8.maxpool = (struct maxpool_parameters) {
2305 .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__neon_c16,
Marat Dukhandc5c1482021-08-16 09:03:15 -07002306 .init.s8 = xnn_init_s8_minmax_neon_params,
Marat Dukhan23147532021-08-16 07:26:56 -07002307 .mr = 9,
2308 .qr = 8,
2309 };
2310 #endif // XNN_NO_S8_OPERATORS
2311
Frank Barchardb40ee632021-12-30 11:10:02 -08002312 /**************************** U8 AArch64 micro-kernels ****************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002313 #ifndef XNN_NO_U8_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07002314 init_flags |= XNN_INIT_FLAG_U8;
2315
Marat Dukhan94912792021-08-16 21:40:30 -07002316 xnn_params.u8.clamp = (struct vunary_parameters) {
2317 .ukernel = (xnn_univector_ukernel_function) xnn_u8_vclamp_ukernel__neon_x64,
2318 .init.u8_minmax = xnn_init_u8_minmax_neon_params,
2319 .element_tile = 64,
2320 };
Marat Dukhan24abe6b2021-11-24 15:28:57 -08002321 xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
2322 .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__neon_c16,
2323 .pixel_tile = 1,
2324 .channel_tile = 16,
2325 };
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002326 xnn_params.u8.maxpool = (struct maxpool_parameters) {
Marat Dukhan99936602020-04-11 16:47:01 -07002327 .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__neon_c16,
Marat Dukhan2ea50a02021-08-16 12:59:19 -07002328 .init.u8 = xnn_init_u8_minmax_neon_params,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002329 .mr = 9,
2330 .qr = 8,
2331 };
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002332 xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
2333 xnn_params.u8.rmax = xnn_u8_rmax_ukernel__neon;
2334 #endif // XNN_NO_U8_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07002335
Frank Barchardb40ee632021-12-30 11:10:02 -08002336 /**************************** X8 AArch64 micro-kernels ****************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002337 #ifndef XNN_NO_X8_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07002338 init_flags |= XNN_INIT_FLAG_X8;
2339
Marat Dukhan98e054b2021-09-13 09:43:50 -07002340 xnn_params.x8.lut = xnn_x8_lut_ukernel__neon_tbx128x4_x64;
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002341 xnn_params.x8.zip = (struct zip_parameters) {
2342 .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__neon,
2343 .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__neon,
2344 .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__neon,
2345 .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__neon,
2346 };
2347 #endif // XNN_NO_X8_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07002348
Frank Barchardb40ee632021-12-30 11:10:02 -08002349 /**************************** F16 AArch64 micro-kernels ****************************/
Frank Barchard7e2cbb02020-06-12 01:22:13 -07002350 #ifndef XNN_NO_F16_OPERATORS
Marat Dukhan8d5d2592020-06-19 12:48:57 -07002351 if (cpuinfo_has_arm_neon_fp16_arith()) {
2352 init_flags |= XNN_INIT_FLAG_F16;
Frank Barchard7c3826e2021-06-07 15:14:16 -07002353 xnn_params.f16.gemm.mr = 6;
2354 xnn_params.f16.gemm.nr = 16;
Marat Dukhan854fb6b2020-06-19 12:33:44 -07002355
Frank Barchard6b73c4f2020-06-26 18:40:40 -07002356 #if XNN_ENABLE_ASSEMBLY
Frank Barchard7c3826e2021-06-07 15:14:16 -07002357 switch (cpuinfo_get_core(0)->uarch) {
2358 case cpuinfo_uarch_cortex_a55:
2359 xnn_params.f16.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55);
2360 break;
2361
Frank Barchard07f4a892021-06-07 18:26:08 -07002362 case cpuinfo_uarch_cortex_a75:
Frank Barchard7b48ddc2021-06-11 13:00:49 -07002363 case cpuinfo_uarch_cortex_x1:
Frank Barchard07f4a892021-06-07 18:26:08 -07002364 xnn_params.f16.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75);
2365 break;
2366
Frank Barchard7c3826e2021-06-07 15:14:16 -07002367 default:
2368 xnn_params.f16.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32);
2369 break;
2370 }
Frank Barchard6b73c4f2020-06-26 18:40:40 -07002371 xnn_params.f16.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32);
Frank Barchard7c3826e2021-06-07 15:14:16 -07002372
2373 #if XNN_MAX_UARCH_TYPES > 1
2374 {
2375 /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
2376 const uint32_t mr = xnn_params.f16.gemm.mr;
2377 const uint32_t nr = xnn_params.f16.gemm.nr;
2378 for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
2379 const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
2380 if (uarch_info == NULL) {
2381 /* No more microarchitectures in the system */
2382 break;
2383 }
2384
2385 switch (uarch_info->uarch) {
2386 case cpuinfo_uarch_cortex_a55:
2387 if (mr == 6 && nr == 16) {
2388 xnn_params.f16.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55;
2389 }
2390 break;
Frank Barchard07f4a892021-06-07 18:26:08 -07002391
Frank Barchardd2f454e2021-06-08 10:47:16 -07002392 case cpuinfo_uarch_cortex_a55r0:
2393 if (mr == 6 && nr == 16) {
2394 xnn_params.f16.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64;
2395 }
2396 break;
2397
Frank Barchard07f4a892021-06-07 18:26:08 -07002398 /* Cortex A75 is the medium core of Exynos 9820 (M4) */
2399 case cpuinfo_uarch_cortex_a75:
2400 if (mr == 6 && nr == 16) {
2401 xnn_params.f16.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75;
2402 }
2403 break;
2404
Frank Barchard7c3826e2021-06-07 15:14:16 -07002405 default:
2406 break;
2407 }
2408 }
2409 }
2410 #endif // XNN_MAX_UARCH_TYPES > 1
2411 #else // !XNN_ENABLE_ASSEMBLY
Frank Barchard6b73c4f2020-06-26 18:40:40 -07002412 xnn_params.f16.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64);
2413 xnn_params.f16.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64);
Frank Barchard7c3826e2021-06-07 15:14:16 -07002414 #endif // XNN_ENABLE_ASSEMBLY
Frank Barchard49b4dcc2020-06-26 14:07:19 -07002415 xnn_params.f16.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64);
Frank Barchard49b4dcc2020-06-26 14:07:19 -07002416 xnn_params.f16.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64);
Marat Dukhanc4302c22022-01-06 19:27:03 -08002417 xnn_params.f16.gemm.init.f16 = xnn_init_f16_scaleminmax_neon_params;
Frank Barchard49b4dcc2020-06-26 14:07:19 -07002418
2419 xnn_params.f16.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith;
Marat Dukhan645af972022-01-09 22:50:27 -08002420 xnn_params.f16.dwconv[0].init.f16 = xnn_init_f16_minmax_neon_params;
Frank Barchard49b4dcc2020-06-26 14:07:19 -07002421 xnn_params.f16.dwconv[0].channel_tile = 16;
2422 xnn_params.f16.dwconv[0].primary_tile = 4;
2423
2424 xnn_params.f16.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith;
Marat Dukhan645af972022-01-09 22:50:27 -08002425 xnn_params.f16.dwconv[1].init.f16 = xnn_init_f16_minmax_neon_params;
Frank Barchard49b4dcc2020-06-26 14:07:19 -07002426 xnn_params.f16.dwconv[1].channel_tile = 16;
2427 xnn_params.f16.dwconv[1].primary_tile = 9;
2428
2429 xnn_params.f16.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2;
Marat Dukhan645af972022-01-09 22:50:27 -08002430 xnn_params.f16.dwconv[2].init.f16 = xnn_init_f16_minmax_neon_params;
Frank Barchard49b4dcc2020-06-26 14:07:19 -07002431 xnn_params.f16.dwconv[2].channel_tile = 8;
2432 xnn_params.f16.dwconv[2].primary_tile = 25;
2433
Marat Dukhan8d5d2592020-06-19 12:48:57 -07002434 xnn_params.f16.gavgpool = (struct gavgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08002435 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c8,
2436 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c8,
2437 .init.f16 = xnn_init_f16_scaleminmax_neon_params,
2438 .update.f16 = xnn_update_f16_scaleminmax_neon_params,
2439 .row_tile = 7,
2440 .channel_tile = 8,
Marat Dukhan8d5d2592020-06-19 12:48:57 -07002441 };
Frank Barchard01898c02020-06-23 21:49:50 -07002442 xnn_params.f16.vadd = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07002443 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vadd_minmax_ukernel__neonfp16arith_x16,
2444 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vaddc_minmax_ukernel__neonfp16arith_x16,
2445 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vaddc_minmax_ukernel__neonfp16arith_x16,
Marat Dukhan645af972022-01-09 22:50:27 -08002446 .init.f16_minmax = xnn_init_f16_minmax_neon_params,
Frank Barchard01898c02020-06-23 21:49:50 -07002447 .element_tile = 16,
2448 };
Frank Barchard0ea6a772020-09-09 15:26:31 -07002449 xnn_params.f16.vmul = (struct vbinary_parameters) {
2450 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmul_minmax_ukernel__neonfp16arith_x16,
2451 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmulc_minmax_ukernel__neonfp16arith_x16,
2452 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmulc_minmax_ukernel__neonfp16arith_x16,
Marat Dukhan645af972022-01-09 22:50:27 -08002453 .init.f16_minmax = xnn_init_f16_minmax_neon_params,
Frank Barchard0ea6a772020-09-09 15:26:31 -07002454 .element_tile = 16,
2455 };
Frank Barchard49b4dcc2020-06-26 14:07:19 -07002456 xnn_params.f16.vmulcaddc = (struct vmulcaddc_parameters) {
Marat Dukhan9531e9f2020-07-24 15:25:02 -07002457 .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f16_vmulcaddc_minmax_ukernel_c8__neonfp16arith_2x,
Marat Dukhan645af972022-01-09 22:50:27 -08002458 .init.f16 = xnn_init_f16_minmax_neon_params,
Frank Barchard49b4dcc2020-06-26 14:07:19 -07002459 .channel_tile = 8,
2460 .row_tile = 2,
2461 };
Marat Dukhan561d0682021-12-23 16:12:35 -08002462 xnn_params.f16.hswish = (struct vunary_parameters) {
2463 .ukernel = (xnn_univector_ukernel_function) xnn_f16_vhswish_ukernel__neonfp16arith_x16,
Marat Dukhan751f6222022-01-09 23:10:04 -08002464 .init.f16_hswish = xnn_init_f16_hswish_neon_params,
Marat Dukhan561d0682021-12-23 16:12:35 -08002465 .element_tile = 16,
2466 };
Marat Dukhan8d5d2592020-06-19 12:48:57 -07002467 }
Frank Barchard7e2cbb02020-06-12 01:22:13 -07002468 #endif // XNN_NO_F16_OPERATORS
2469
Frank Barchardb40ee632021-12-30 11:10:02 -08002470 /**************************** F32 AArch64 micro-kernels ****************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002471 #ifndef XNN_NO_F32_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07002472 init_flags |= XNN_INIT_FLAG_F32;
2473
Marat Dukhandfe47b92020-12-14 02:48:43 -08002474 #if XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
Frank Barchard0d1052c2020-03-23 17:28:13 -07002475 #if XNN_ENABLE_ASSEMBLY
Frank Barchard143a1102021-06-15 09:15:34 -07002476 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75);
2477 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75);
2478 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
2479 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07002480 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07002481 xnn_params.f32.gemm.mr = 6;
2482 xnn_params.f32.gemm.nr = 8;
Frank Barchard0d1052c2020-03-23 17:28:13 -07002483 #else // !XNN_ENABLE_ASSEMBLY
Marat Dukhanaefaef32020-04-09 07:09:34 -07002484 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld64);
2485 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__neonfma_lane_ld64);
2486 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neonfma_lane_ld64);
2487 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07002488 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07002489 xnn_params.f32.gemm.mr = 6;
2490 xnn_params.f32.gemm.nr = 8;
Frank Barchard0d1052c2020-03-23 17:28:13 -07002491 #endif // XNN_ENABLE_ASSEMBLY
Marat Dukhandfe47b92020-12-14 02:48:43 -08002492 #else // !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
Frank Barchard0d1052c2020-03-23 17:28:13 -07002493 #if XNN_ENABLE_ASSEMBLY
2494 switch (cpuinfo_get_core(0)->uarch) {
2495 case cpuinfo_uarch_cortex_a57:
Frank Barchard143a1102021-06-15 09:15:34 -07002496 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75);
2497 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75);
2498 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75);
2499 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07002500 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07002501 xnn_params.f32.gemm.mr = 6;
2502 xnn_params.f32.gemm.nr = 8;
Frank Barchard0d1052c2020-03-23 17:28:13 -07002503 break;
2504 case cpuinfo_uarch_cortex_a72:
Frank Barchard143a1102021-06-15 09:15:34 -07002505 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75);
2506 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75);
2507 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
2508 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07002509 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07002510 xnn_params.f32.gemm.mr = 4;
2511 xnn_params.f32.gemm.nr = 8;
Frank Barchard0d1052c2020-03-23 17:28:13 -07002512 break;
2513 case cpuinfo_uarch_cortex_a75:
2514 case cpuinfo_uarch_cortex_a76:
2515 case cpuinfo_uarch_exynos_m3:
2516 case cpuinfo_uarch_exynos_m4:
Frank Barchard143a1102021-06-15 09:15:34 -07002517 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75);
2518 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75);
2519 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
2520 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07002521 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07002522 xnn_params.f32.gemm.mr = 6;
2523 xnn_params.f32.gemm.nr = 8;
Zhi An Nga63651c2022-02-01 16:16:33 -08002524 #if XNN_ENABLE_JIT
2525 xnn_params.f32.gemm.generator.gemm = xnn_init_hmp_gemm_codegen(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75);
Zhi An Nga3bf3ea2022-02-03 15:28:19 -08002526 xnn_params.f32.gemm.generator.igemm = xnn_init_hmp_igemm_codegen(xnn_generate_f32_igemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75);
Zhi An Ngeb7256b2022-02-03 16:02:54 -08002527 xnn_params.f32.gemm.generator.gemm1 = xnn_init_hmp_gemm_codegen(xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
Zhi An Ngf30a8592022-02-03 16:49:19 -08002528 xnn_params.f32.gemm.generator.igemm1 = xnn_init_hmp_igemm_codegen(xnn_generate_f32_igemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
Zhi An Nga63651c2022-02-01 16:16:33 -08002529 #endif
Frank Barchard0d1052c2020-03-23 17:28:13 -07002530 break;
2531 case cpuinfo_uarch_exynos_m1:
2532 case cpuinfo_uarch_exynos_m2:
Marat Dukhanaefaef32020-04-09 07:09:34 -07002533 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma);
2534 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8s4__neonfma);
2535 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma);
2536 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8s4__neonfma);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07002537 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07002538 xnn_params.f32.gemm.mr = 6;
2539 xnn_params.f32.gemm.nr = 8;
2540 xnn_params.f32.gemm.log2_sr = 2;
Frank Barchard0d1052c2020-03-23 17:28:13 -07002541 break;
Frank Barchard0d1052c2020-03-23 17:28:13 -07002542 case cpuinfo_uarch_cortex_a53:
2543 case cpuinfo_uarch_cortex_a55r0:
Marat Dukhanaefaef32020-04-09 07:09:34 -07002544 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53);
2545 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53);
2546 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53);
2547 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07002548 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07002549 xnn_params.f32.gemm.mr = 6;
2550 xnn_params.f32.gemm.nr = 8;
Frank Barchard0d1052c2020-03-23 17:28:13 -07002551 break;
Frank Barchardf975ee02021-11-05 16:01:00 -07002552 case cpuinfo_uarch_cortex_a35:
Frank Barchard0d1052c2020-03-23 17:28:13 -07002553 case cpuinfo_uarch_cortex_a55:
Marat Dukhanaefaef32020-04-09 07:09:34 -07002554 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55);
2555 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55);
2556 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53);
2557 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07002558 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07002559 xnn_params.f32.gemm.mr = 6;
2560 xnn_params.f32.gemm.nr = 8;
Frank Barchard0d1052c2020-03-23 17:28:13 -07002561 break;
2562 case cpuinfo_uarch_cortex_a73:
Marat Dukhanaefaef32020-04-09 07:09:34 -07002563 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a73);
2564 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a73);
Frank Barchard143a1102021-06-15 09:15:34 -07002565 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
2566 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07002567 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07002568 xnn_params.f32.gemm.mr = 6;
2569 xnn_params.f32.gemm.nr = 8;
Frank Barchard0d1052c2020-03-23 17:28:13 -07002570 break;
Frank Barchard0d1052c2020-03-23 17:28:13 -07002571 case cpuinfo_uarch_cortex_a77:
2572 case cpuinfo_uarch_exynos_m5:
2573 case cpuinfo_uarch_kryo:
Frank Barchard143a1102021-06-15 09:15:34 -07002574 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75);
2575 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75);
2576 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75);
2577 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07002578 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07002579 xnn_params.f32.gemm.mr = 4;
2580 xnn_params.f32.gemm.nr = 8;
Frank Barchard0d1052c2020-03-23 17:28:13 -07002581 break;
Frank Barchard990b2af2021-06-14 11:49:15 -07002582 case cpuinfo_uarch_cortex_a78:
2583 case cpuinfo_uarch_cortex_x1:
2584 default:
2585 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld128);
Frank Barchard79cd5f92021-06-21 17:34:59 -07002586 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_ld128);
Frank Barchard990b2af2021-06-14 11:49:15 -07002587 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neonfma_lane_ld64);
2588 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64);
2589 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
2590 xnn_params.f32.gemm.mr = 6;
2591 xnn_params.f32.gemm.nr = 8;
2592 break;
Frank Barchard0d1052c2020-03-23 17:28:13 -07002593 }
Marat Dukhan05702cf2020-03-26 15:41:33 -07002594 #if XNN_MAX_UARCH_TYPES > 1
2595 {
2596 /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
2597 const uint32_t mr = xnn_params.f32.gemm.mr;
2598 const uint32_t nr = xnn_params.f32.gemm.nr;
2599 const uint32_t log2_sr = xnn_params.f32.gemm.log2_sr;
2600 for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
2601 const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
2602 if (uarch_info == NULL) {
2603 /* No more microarchitectures in the system */
2604 break;
2605 }
2606
2607 switch (uarch_info->uarch) {
2608 case cpuinfo_uarch_cortex_a53:
2609 case cpuinfo_uarch_cortex_a55r0:
2610 if (mr == 6 && nr == 8 && log2_sr == 0) {
Marat Dukhanaefaef32020-04-09 07:09:34 -07002611 xnn_params.f32.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53;
2612 xnn_params.f32.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53;
2613 xnn_params.f32.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
2614 xnn_params.f32.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
Marat Dukhan05702cf2020-03-26 15:41:33 -07002615 } else if (mr == 4 && nr == 8 && log2_sr == 0) {
Marat Dukhanaefaef32020-04-09 07:09:34 -07002616 xnn_params.f32.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53;
2617 xnn_params.f32.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53;
2618 xnn_params.f32.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
2619 xnn_params.f32.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
Marat Dukhan05702cf2020-03-26 15:41:33 -07002620 }
2621 break;
2622 case cpuinfo_uarch_cortex_a55:
2623 if (mr == 6 && nr == 8 && log2_sr == 0) {
Marat Dukhanaefaef32020-04-09 07:09:34 -07002624 xnn_params.f32.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55;
2625 xnn_params.f32.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55;
2626 xnn_params.f32.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
2627 xnn_params.f32.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
Marat Dukhan05702cf2020-03-26 15:41:33 -07002628 } else if (mr == 4 && nr == 8 && log2_sr == 0) {
Marat Dukhanaefaef32020-04-09 07:09:34 -07002629 xnn_params.f32.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55;
2630 xnn_params.f32.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55;
2631 xnn_params.f32.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
2632 xnn_params.f32.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
Marat Dukhan05702cf2020-03-26 15:41:33 -07002633 }
2634 break;
2635 default:
2636 break;
2637 }
2638 }
2639 }
2640 #endif // XNN_MAX_UARCH_TYPES > 1
Frank Barchard0d1052c2020-03-23 17:28:13 -07002641 #else // !XNN_ENABLE_ASSEMBLY
Marat Dukhanaefaef32020-04-09 07:09:34 -07002642 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld64);
2643 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__neonfma_lane_ld64);
2644 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neonfma_lane_ld64);
2645 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07002646 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07002647 xnn_params.f32.gemm.mr = 6;
2648 xnn_params.f32.gemm.nr = 8;
Marat Dukhan31677ad2020-10-13 23:59:31 -07002649 #endif // XNN_ENABLE_ASSEMBLY
Marat Dukhandfe47b92020-12-14 02:48:43 -08002650 #endif // XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
Marat Dukhanaefaef32020-04-09 07:09:34 -07002651 xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2__neonfma_lane_ld64);
2652 xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2__neonfma_lane_ld64);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07002653 xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07002654 xnn_params.f32.gemm2.mr = 4;
2655 xnn_params.f32.gemm2.nr = 2;
2656
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07002657 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x3__neonfma;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07002658 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanf5425ea2020-04-24 01:46:00 -07002659 xnn_params.f32.dwconv[0].channel_tile = 8;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07002660 xnn_params.f32.dwconv[0].primary_tile = 3;
2661
2662 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x4__neonfma;
2663 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_scalar_params;
2664 xnn_params.f32.dwconv[1].channel_tile = 8;
2665 xnn_params.f32.dwconv[1].primary_tile = 4;
Marat Dukhanaefaef32020-04-09 07:09:34 -07002666
Marat Dukhandfe47b92020-12-14 02:48:43 -08002667 #if XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07002668 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x9__neonfma;
2669 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
2670 xnn_params.f32.dwconv[2].channel_tile = 8;
2671 xnn_params.f32.dwconv[2].primary_tile = 9;
Marat Dukhandfe47b92020-12-14 02:48:43 -08002672 #else // !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
Frank Barchard0d1052c2020-03-23 17:28:13 -07002673 switch (cpuinfo_get_core(0)->uarch) {
2674 case cpuinfo_uarch_kryo:
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07002675 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x9__neonfma;
2676 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
2677 xnn_params.f32.dwconv[2].channel_tile = 8;
2678 xnn_params.f32.dwconv[2].primary_tile = 9;
Frank Barchard0d1052c2020-03-23 17:28:13 -07002679 break;
2680 #if XNN_ENABLE_ASSEMBLY
2681 case cpuinfo_uarch_cortex_a53:
2682 case cpuinfo_uarch_cortex_a55r0:
2683 case cpuinfo_uarch_cortex_a55:
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07002684 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up4x9__aarch64_neonfma_cortex_a55;
2685 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
2686 xnn_params.f32.dwconv[2].channel_tile = 4;
2687 xnn_params.f32.dwconv[2].primary_tile = 9;
Frank Barchard0d1052c2020-03-23 17:28:13 -07002688 break;
2689 #endif // XNN_ENABLE_ASSEMBLY
2690 default:
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07002691 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x9__neonfma;
2692 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
2693 xnn_params.f32.dwconv[2].channel_tile = 8;
2694 xnn_params.f32.dwconv[2].primary_tile = 9;
Frank Barchard0d1052c2020-03-23 17:28:13 -07002695 break;
2696 }
Marat Dukhandfe47b92020-12-14 02:48:43 -08002697 #endif // XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
Marat Dukhanaefaef32020-04-09 07:09:34 -07002698
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07002699 xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x25__neonfma_acc2;
2700 xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_scalar_params;
2701 xnn_params.f32.dwconv[3].channel_tile = 8;
2702 xnn_params.f32.dwconv[3].primary_tile = 25;
Marat Dukhanaefaef32020-04-09 07:09:34 -07002703
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002704 xnn_params.f32.avgpool = (struct avgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08002705 .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__neon_c4,
2706 .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__neon_c4,
2707 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
2708 .primary_tile = 9,
2709 .incremental_tile = 8,
2710 .channel_tile = 4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002711 };
2712 xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08002713 .unipass = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__neon_c4,
2714 .multipass = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__neon_c4,
2715 .primary_tile = 9,
2716 .incremental_tile = 8,
2717 .channel_tile = 4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002718 };
2719 xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08002720 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__neon_c4,
2721 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4,
2722 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
2723 .update.f32 = xnn_update_f32_scaleminmax_scalar_params,
2724 .row_tile = 7,
2725 .channel_tile = 4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002726 };
2727 xnn_params.f32.maxpool = (struct maxpool_parameters) {
Marat Dukhan99936602020-04-11 16:47:01 -07002728 .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__neon_c4,
Marat Dukhan91ae1652021-08-15 19:19:49 -07002729 .init.f32 = xnn_init_f32_minmax_scalar_params,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002730 .mr = 9,
2731 .qr = 8,
2732 };
2733 xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
Marat Dukhanef25c6d2020-07-24 00:59:40 -07002734 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__neon_c4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002735 .mr = 4,
2736 };
2737 xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
Marat Dukhanef25c6d2020-07-24 00:59:40 -07002738 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__neon_c4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002739 .mr = 9,
2740 };
2741 xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
Marat Dukhanef25c6d2020-07-24 00:59:40 -07002742 .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__neon_c4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002743 .mr = 9,
2744 .qr = 8,
2745 };
Marat Dukhan660fd192020-03-10 04:55:30 -07002746 xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
2747 .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__neonfma_c8,
Marat Dukhan69722492019-11-11 19:55:50 -08002748 .pixel_tile = 1,
2749 .channel_tile = 8,
2750 };
Marat Dukhane5efb162021-12-31 10:26:13 -08002751 xnn_params.f32.abs = (struct vunary_parameters) {
2752 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__neon_x8,
2753 .element_tile = 8,
2754 };
Marat Dukhan94912792021-08-16 21:40:30 -07002755 xnn_params.f32.clamp = (struct vunary_parameters) {
2756 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vclamp_ukernel__neon_x8,
2757 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
2758 .element_tile = 8,
2759 };
Marat Dukhan4a79ff22022-01-01 12:16:48 -08002760 xnn_params.f32.elu = (struct vunary_parameters) {
2761 .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__neonfma_rr1_lut16_p3_x16,
2762 .init.f32_elu = xnn_init_f32_elu_neonfma_rr1_lut16_p3_params,
2763 .element_tile = 16,
2764 };
Marat Dukhan561d0682021-12-23 16:12:35 -08002765 xnn_params.f32.hswish = (struct vunary_parameters) {
2766 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__neon_x16,
Marat Dukhan0d10cc72021-12-23 19:49:19 -08002767 .init.f32_hswish = xnn_init_f32_hswish_scalar_params,
Marat Dukhan561d0682021-12-23 16:12:35 -08002768 .element_tile = 16,
2769 };
Marat Dukhan2894e992021-12-30 08:29:48 -08002770 xnn_params.f32.lrelu = (struct vunary_parameters) {
2771 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__neon_x8,
2772 .init.f32_lrelu = xnn_init_f32_lrelu_scalar_params,
2773 .element_tile = 8,
2774 };
Marat Dukhane5efb162021-12-31 10:26:13 -08002775 xnn_params.f32.neg = (struct vunary_parameters) {
2776 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__neon_x8,
2777 .element_tile = 8,
2778 };
Marat Dukhan0e801372022-01-04 00:10:41 -08002779 xnn_params.f32.rndne = (struct vunary_parameters) {
2780 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__neonv8_x8,
2781 .element_tile = 8,
2782 };
2783 xnn_params.f32.rndz = (struct vunary_parameters) {
2784 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__neonv8_x8,
2785 .element_tile = 8,
2786 };
2787 xnn_params.f32.rndu = (struct vunary_parameters) {
2788 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__neonv8_x8,
2789 .element_tile = 8,
2790 };
2791 xnn_params.f32.rndd = (struct vunary_parameters) {
2792 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__neonv8_x8,
2793 .element_tile = 8,
2794 };
Marat Dukhance834ad2022-01-03 00:22:01 -08002795 xnn_params.f32.sigmoid = (struct vunary_parameters) {
2796 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut64_p2_nr2recps_x16,
2797 .init.f32_sigmoid = xnn_init_f32_sigmoid_neonfma_rr1_lut64_p2_params,
2798 .element_tile = 16,
2799 };
Marat Dukhane5efb162021-12-31 10:26:13 -08002800 xnn_params.f32.sqr = (struct vunary_parameters) {
2801 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__neon_x8,
2802 .element_tile = 8,
2803 };
Marat Dukhane72b2822021-12-30 14:46:58 -08002804 xnn_params.f32.sqrt = (struct vunary_parameters) {
2805 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__neon_sqrt_x4,
2806 .element_tile = 4,
2807 };
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002808 xnn_params.f32.prelu = (struct prelu_parameters) {
Marat Dukhan69c3f2c2019-11-06 12:30:01 -08002809 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__neon_2x8,
2810 .row_tile = 2,
2811 .channel_tile = 8,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002812 };
Marat Dukhan4a5c7712022-01-05 22:43:13 -08002813 xnn_params.f32.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
2814 .ukernel = xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_x16,
2815 .init = xnn_init_f32_expminus_neonfma_rr1_lut64_p2_params,
2816 .element_tile = 16,
2817 };
Marat Dukhan1edc4542020-01-27 12:40:13 -08002818 xnn_params.f32.rmax = xnn_f32_rmax_ukernel__neon;
Marat Dukhanb1a0fc32019-12-02 19:32:02 -08002819 xnn_params.f32.vadd = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07002820 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__neon_x8,
2821 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__neon_x8,
2822 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__neon_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -08002823 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhanb1a0fc32019-12-02 19:32:02 -08002824 .element_tile = 8,
2825 };
Marat Dukhan69180502019-12-06 15:00:31 -08002826 xnn_params.f32.vdiv = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07002827 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__neon_x8,
2828 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__neon_x8,
2829 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__neon_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -08002830 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhan69180502019-12-06 15:00:31 -08002831 .element_tile = 8,
2832 };
Marat Dukhan79e7f842019-12-05 14:35:50 -08002833 xnn_params.f32.vmax = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07002834 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__neon_x8,
2835 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__neon_x8,
2836 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__neon_x8,
Marat Dukhan79e7f842019-12-05 14:35:50 -08002837 .element_tile = 8,
2838 };
2839 xnn_params.f32.vmin = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07002840 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__neon_x8,
2841 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__neon_x8,
2842 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__neon_x8,
Marat Dukhan79e7f842019-12-05 14:35:50 -08002843 .element_tile = 8,
2844 };
Marat Dukhan1e782c42019-11-21 17:02:40 -08002845 xnn_params.f32.vmul = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07002846 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__neon_x8,
2847 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__neon_x8,
2848 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__neon_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -08002849 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhanca2733c2019-11-15 23:21:17 -08002850 .element_tile = 8,
2851 };
Marat Dukhan05f3f6d2019-12-03 15:13:53 -08002852 xnn_params.f32.vsub = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07002853 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__neon_x8,
2854 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__neon_x8,
2855 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__neon_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -08002856 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhan05f3f6d2019-12-03 15:13:53 -08002857 .element_tile = 8,
2858 };
Marat Dukhanf7399262020-06-05 10:58:44 -07002859 xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07002860 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__neon_x8,
2861 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__neon_x8,
2862 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__neon_x8,
Marat Dukhanf7399262020-06-05 10:58:44 -07002863 .element_tile = 8,
2864 };
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002865 xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
Marat Dukhan9531e9f2020-07-24 15:25:02 -07002866 .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c4__neonfma_2x,
Marat Dukhanc58453f2021-05-24 14:47:38 -07002867 .init.f32 = xnn_init_f32_minmax_scalar_params,
Marat Dukhan49e6ee92019-11-06 15:55:29 -08002868 .channel_tile = 4,
2869 .row_tile = 2,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002870 };
Marat Dukhanefc47b82019-11-18 09:25:38 -08002871 #ifndef XNN_NO_NCHW_OPERATORS
Marat Dukhancfbed0a2020-12-08 10:01:51 -08002872 init_flags |= XNN_INIT_FLAG_CHW_OPT;
2873
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002874 xnn_params.f32.spmm = (struct spmm_parameters) {
Marat Dukhan4baa2ac2021-02-11 10:04:09 -08002875 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x1__neonfma_pipelined,
2876 .mr = 32,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002877 .nr = 1,
XNNPACK Teamb455b122019-09-27 18:10:33 -07002878 };
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002879 xnn_params.f32.spmm2 = (struct spmm_parameters) {
Marat Dukhan4baa2ac2021-02-11 10:04:09 -08002880 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x2__neonfma,
2881 .mr = 32,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002882 .nr = 2,
2883 };
2884 xnn_params.f32.spmm4 = (struct spmm_parameters) {
Marat Dukhan4baa2ac2021-02-11 10:04:09 -08002885 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x4__neonfma,
2886 .mr = 32,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002887 .nr = 4,
2888 };
Marat Dukhan1f29b802020-05-15 23:46:39 -07002889 xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002890 .ukernel_with_symm_padding =
Marat Dukhan1f29b802020-05-15 23:46:39 -07002891 (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfma_2x2,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002892 .output_channel_tile = 4,
2893 .output_height_tile = 2,
2894 .output_width_tile = 2,
2895 };
Marat Dukhanbf715f92020-10-23 20:17:00 -07002896 xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
2897 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_3x4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002898 .output_width_tile = 4,
2899 .output_height_tile = 3,
2900 };
Marat Dukhanbf715f92020-10-23 20:17:00 -07002901 xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
Marat Dukhan82f0c322020-10-25 19:17:35 -07002902 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neonfma_2x4_acc2,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002903 .output_width_tile = 4,
Marat Dukhan82f0c322020-10-25 19:17:35 -07002904 .output_height_tile = 2,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002905 };
Marat Dukhanbf715f92020-10-23 20:17:00 -07002906 xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
Marat Dukhan149f0ea2020-10-26 12:50:33 -07002907 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4,
Marat Dukhana99918a2019-11-15 14:40:12 -08002908 .output_width_tile = 4,
Marat Dukhan149f0ea2020-10-26 12:50:33 -07002909 .output_height_tile = 4,
Marat Dukhana99918a2019-11-15 14:40:12 -08002910 };
Marat Dukhanbf715f92020-10-23 20:17:00 -07002911 xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
2912 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc2,
Marat Dukhana99918a2019-11-15 14:40:12 -08002913 .output_width_tile = 4,
2914 .output_height_tile = 1,
2915 };
Marat Dukhan1f29b802020-05-15 23:46:39 -07002916 xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
2917 .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__neon_x4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002918 .channel_tile = 4,
2919 };
Artsiom Ablavatski97918102020-10-27 15:52:59 -07002920 xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
Artsiom Ablavatski2202c812021-01-22 14:16:43 -08002921 .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__neonfma_p8,
Artsiom Ablavatski97918102020-10-27 15:52:59 -07002922 .channel_tile = 1,
Artsiom Ablavatskib3ffd582021-03-31 13:00:08 -07002923 .pixel_tile = 8,
Artsiom Ablavatski97918102020-10-27 15:52:59 -07002924 };
Marat Dukhanefc47b82019-11-18 09:25:38 -08002925 #endif // XNN_NO_NCHW_OPERATORS
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002926 #endif // XNN_NO_F32_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07002927
Frank Barchardb40ee632021-12-30 11:10:02 -08002928 /*************************** VCVT AArch64 micro-kernels ***************************/
Marat Dukhanaf2ba002021-10-24 14:21:41 -07002929 #ifndef XNN_NO_VCVT_OPERATORS
2930 init_flags |= XNN_INIT_FLAG_VCVT;
2931
Marat Dukhan134f9842021-12-29 19:57:31 -08002932 xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
2933 .ukernel = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__neonfp16_x16,
2934 .element_tile = 16,
2935 };
Marat Dukhanb7c1b712021-12-30 07:23:57 -08002936 xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
2937 .ukernel = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__neonfp16_x16,
2938 .element_tile = 16,
2939 };
Marat Dukhaned2d7762021-12-03 23:51:19 -08002940 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
2941 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__neonv8_x32,
2942 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_neonv8_params,
2943 .element_tile = 32,
2944 };
2945 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
2946 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__neonv8_x32,
2947 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_neonv8_params,
2948 .element_tile = 32,
2949 };
Marat Dukhanf92206b2021-12-10 17:02:07 -08002950 xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
2951 .ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__neon_x32,
2952 .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_neon_params,
2953 .element_tile = 32,
2954 };
2955 xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
2956 .ukernel = (xnn_univector_ukernel_function) xnn_qu8_f32_vcvt_ukernel__neon_x32,
2957 .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_neon_params,
2958 .element_tile = 32,
2959 };
Marat Dukhanaf2ba002021-10-24 14:21:41 -07002960 #endif // XNN_NO_VCVT_OPERATORS
2961
Frank Barchardb40ee632021-12-30 11:10:02 -08002962 /**************************** X32 AArch64 micro-kernels ****************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002963 #ifndef XNN_NO_X32_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07002964 init_flags |= XNN_INIT_FLAG_X32;
2965
Marat Dukhan57dccd82020-04-14 00:53:10 -07002966 xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__neon;
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002967 xnn_params.x32.zip = (struct zip_parameters) {
2968 .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__neon,
2969 .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__neon,
2970 .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__neon,
2971 .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__neon,
2972 };
Artsiom Ablavatskibbe85062020-11-05 14:07:37 -08002973 #ifndef XNN_NO_NCHW_OPERATORS
Marat Dukhanad71b9a2020-11-20 00:01:51 -08002974 xnn_params.x32.depthtospace2d_chw2hwc = (struct depthtospace2d_chw2hwc_parameters) {
2975 .ukernel = (xnn_depthtospace2d_chw2hwc_ukernel_function) xnn_x32_depthtospace2d_chw2hwc_ukernel__scalar,
Artsiom Ablavatskibbe85062020-11-05 14:07:37 -08002976 .channel_tile = 1,
2977 .pixel_tile = 1,
2978 };
2979 #endif // XNN_NO_NCHW_OPERATORS
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002980 #endif // XNN_NO_X32_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07002981
Frank Barchardb40ee632021-12-30 11:10:02 -08002982 /**************************** XX AArch64 micro-kernels ****************************/
Marat Dukhan048931b2020-11-24 20:53:54 -08002983 #ifndef XNN_NO_XX_OPERATORS
2984 init_flags |= XNN_INIT_FLAG_XX;
2985
2986 xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
Marat Dukhan933051b2021-08-07 16:26:15 -07002987 xnn_params.xx.fill = (struct fill_parameters) {
2988 .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__neon_x64,
2989 .row_tile = 1,
2990 };
Marat Dukhan0461f2d2021-08-08 12:36:29 -07002991 xnn_params.xx.pad = (struct pad_parameters) {
2992 .ukernel = (xnn_pad_ukernel_function) xnn_xx_pad_ukernel__neon,
2993 .row_tile = 1,
2994 };
Marat Dukhan048931b2020-11-24 20:53:54 -08002995 #endif
2996
Marat Dukhan933051b2021-08-07 16:26:15 -07002997#elif XNN_ARCH_X86 || XNN_ARCH_X86_64
2998 if (!cpuinfo_has_x86_sse2()) {
2999 xnn_log_error("XNNPACK initialization failed: SSE2 is not supported");
3000 return;
3001 }
3002
Frank Barchardb40ee632021-12-30 11:10:02 -08003003 /**************************** QC8 x86 micro-kernels ****************************/
Marat Dukhan5e353862021-06-15 09:03:25 -07003004 #ifndef XNN_NO_QC8_OPERATORS
3005 init_flags |= XNN_INIT_FLAG_QC8;
3006
Marat Dukhan039a3882022-01-21 14:53:11 -08003007 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
Marat Dukhan5e353862021-06-15 09:03:25 -07003008 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx);
3009 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c8__avx512skx);
3010 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx);
3011 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx);
3012 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_avx512_params;
3013 xnn_params.qc8.gemm.mr = 4;
3014 xnn_params.qc8.gemm.nr = 16;
3015 xnn_params.qc8.gemm.log2_kr = 3;
3016 } else if (cpuinfo_has_x86_xop()) {
3017 // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
3018 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64);
3019 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64);
3020 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64);
3021 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64);
3022 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_sse4_params;
3023 xnn_params.qc8.gemm.mr = 2;
3024 xnn_params.qc8.gemm.nr = 4;
3025 xnn_params.qc8.gemm.log2_kr = 3;
3026 } else if (cpuinfo_has_x86_avx2()) {
3027 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_3x8c8__avx2);
3028 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_3x8c8__avx2);
3029 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__avx2);
3030 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__avx2);
3031 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_avx2_params;
3032 xnn_params.qc8.gemm.mr = 3;
3033 xnn_params.qc8.gemm.nr = 8;
3034 xnn_params.qc8.gemm.log2_kr = 3;
3035 } else if (cpuinfo_has_x86_avx()) {
3036 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128);
3037 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128);
3038 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128);
3039 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128);
3040 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_sse4_params;
3041 xnn_params.qc8.gemm.mr = 2;
3042 xnn_params.qc8.gemm.nr = 4;
3043 xnn_params.qc8.gemm.log2_kr = 3;
3044 } else if (cpuinfo_has_x86_sse4_1()) {
3045 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64);
3046 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64);
3047 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64);
3048 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64);
3049 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_sse4_params;
3050 xnn_params.qc8.gemm.mr = 3;
3051 xnn_params.qc8.gemm.nr = 4;
3052 xnn_params.qc8.gemm.log2_kr = 3;
3053 } else {
3054 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64);
3055 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64);
3056 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64);
3057 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64);
3058 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_sse2_params;
3059 xnn_params.qc8.gemm.mr = 3;
3060 xnn_params.qc8.gemm.nr = 4;
3061 xnn_params.qc8.gemm.log2_kr = 3;
3062 }
3063
Marat Dukhan039a3882022-01-21 14:53:11 -08003064 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
Marat Dukhan5e353862021-06-15 09:03:25 -07003065 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up32x9__avx512skx_mul32;
3066 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_avx512_params;
3067 xnn_params.qc8.dwconv[0].channel_tile = 32;
3068 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up32x25__avx512skx_mul32;
3069 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_avx512_params;
3070 xnn_params.qc8.dwconv[1].channel_tile = 32;
3071 } else if (cpuinfo_has_x86_xop()) {
3072 // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
Marat Dukhan28480592021-07-27 23:52:27 -07003073 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__xop_mul16_add16;
Marat Dukhan5e353862021-06-15 09:03:25 -07003074 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_sse4_params;
3075 xnn_params.qc8.dwconv[0].channel_tile = 16;
Marat Dukhan28480592021-07-27 23:52:27 -07003076 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x25__xop_mul16_add16;
Marat Dukhan5e353862021-06-15 09:03:25 -07003077 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_sse4_params;
3078 xnn_params.qc8.dwconv[1].channel_tile = 16;
3079 } else if (cpuinfo_has_x86_avx2()) {
3080 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul32;
3081 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_avx2_params;
3082 xnn_params.qc8.dwconv[0].channel_tile = 16;
3083 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x25__avx2_mul32;
3084 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_avx2_params;
3085 xnn_params.qc8.dwconv[1].channel_tile = 16;
3086 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhan28480592021-07-27 23:52:27 -07003087 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul16_add16;
Marat Dukhan5e353862021-06-15 09:03:25 -07003088 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_sse4_params;
3089 xnn_params.qc8.dwconv[0].channel_tile = 16;
Marat Dukhan28480592021-07-27 23:52:27 -07003090 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x25__avx_mul16_add16;
Marat Dukhan5e353862021-06-15 09:03:25 -07003091 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_sse4_params;
3092 xnn_params.qc8.dwconv[1].channel_tile = 16;
3093 } else if (cpuinfo_has_x86_sse4_1()) {
3094 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul16;
3095 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_sse4_params;
3096 xnn_params.qc8.dwconv[0].channel_tile = 8;
3097 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up8x25__sse41_mul16;
3098 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_sse4_params;
3099 xnn_params.qc8.dwconv[1].channel_tile = 8;
3100 } else if (cpuinfo_has_x86_sse2()) {
3101 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up8x9__sse2_mul16;
3102 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_sse2_params;
3103 xnn_params.qc8.dwconv[0].channel_tile = 8;
3104 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up8x25__sse2_mul16;
3105 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_sse2_params;
3106 xnn_params.qc8.dwconv[1].channel_tile = 8;
3107 }
3108 xnn_params.qc8.dwconv[0].primary_tile = 9;
3109 xnn_params.qc8.dwconv[1].primary_tile = 25;
3110 #endif // XNN_NO_QC8_OPERATORS
3111
Frank Barchardb40ee632021-12-30 11:10:02 -08003112 /**************************** QS8 x86 micro-kernels ****************************/
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07003113 #ifndef XNN_NO_QS8_OPERATORS
3114 init_flags |= XNN_INIT_FLAG_QS8;
3115
Marat Dukhan039a3882022-01-21 14:53:11 -08003116 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
Marat Dukhan71855ee2021-05-25 19:05:06 -07003117 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx);
3118 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_4x16c8__avx512skx);
3119 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx);
3120 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx);
3121 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_avx512_params;
Marat Dukhanbb00b1d2020-08-10 11:37:23 -07003122 xnn_params.qs8.gemm.mr = 4;
3123 xnn_params.qs8.gemm.nr = 16;
3124 xnn_params.qs8.gemm.log2_kr = 3;
3125 } else if (cpuinfo_has_x86_xop()) {
Marat Dukhan75215d82020-08-07 23:08:03 -07003126 // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
Marat Dukhanc46e6712021-06-01 19:00:16 -07003127 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64);
3128 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64);
3129 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64);
3130 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64);
3131 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
Marat Dukhan75215d82020-08-07 23:08:03 -07003132 xnn_params.qs8.gemm.mr = 2;
3133 xnn_params.qs8.gemm.nr = 4;
3134 xnn_params.qs8.gemm.log2_kr = 3;
3135 } else if (cpuinfo_has_x86_avx2()) {
Marat Dukhan9b474cf2021-05-25 16:37:48 -07003136 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_3x8c8__avx2);
3137 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_3x8c8__avx2);
3138 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__avx2);
3139 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__avx2);
3140 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_avx2_params;
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07003141 xnn_params.qs8.gemm.mr = 3;
3142 xnn_params.qs8.gemm.nr = 8;
3143 xnn_params.qs8.gemm.log2_kr = 3;
Marat Dukhana3c16332021-04-02 15:03:27 -07003144 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhanc46e6712021-06-01 19:00:16 -07003145 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128);
3146 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128);
3147 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128);
3148 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128);
3149 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
Marat Dukhana3c16332021-04-02 15:03:27 -07003150 xnn_params.qs8.gemm.mr = 2;
3151 xnn_params.qs8.gemm.nr = 4;
3152 xnn_params.qs8.gemm.log2_kr = 3;
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07003153 } else if (cpuinfo_has_x86_sse4_1()) {
Marat Dukhanc46e6712021-06-01 19:00:16 -07003154 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64);
3155 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64);
3156 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64);
3157 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64);
3158 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07003159 xnn_params.qs8.gemm.mr = 3;
3160 xnn_params.qs8.gemm.nr = 4;
3161 xnn_params.qs8.gemm.log2_kr = 3;
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07003162 } else {
Marat Dukhanc46e6712021-06-01 19:00:16 -07003163 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64);
3164 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64);
3165 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64);
3166 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64);
3167 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse2_params;
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07003168 xnn_params.qs8.gemm.mr = 3;
3169 xnn_params.qs8.gemm.nr = 4;
3170 xnn_params.qs8.gemm.log2_kr = 3;
3171 }
3172
Marat Dukhan039a3882022-01-21 14:53:11 -08003173 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
Marat Dukhan71855ee2021-05-25 19:05:06 -07003174 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up32x9__avx512skx_mul32;
3175 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_avx512_params;
Marat Dukhan2ffc5e62020-09-06 22:33:38 -07003176 xnn_params.qs8.dwconv[0].channel_tile = 32;
Marat Dukhan71855ee2021-05-25 19:05:06 -07003177 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up32x25__avx512skx_mul32;
3178 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_avx512_params;
Marat Dukhan4ed14882021-05-12 17:50:40 -07003179 xnn_params.qs8.dwconv[1].channel_tile = 32;
Marat Dukhan3fd4e272021-04-10 11:16:42 -07003180 } else if (cpuinfo_has_x86_xop()) {
3181 // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
Marat Dukhan02f06e32021-07-27 14:33:47 -07003182 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__xop_mul16_add16;
Marat Dukhancaf48312021-06-01 20:20:58 -07003183 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
Marat Dukhan3fd4e272021-04-10 11:16:42 -07003184 xnn_params.qs8.dwconv[0].channel_tile = 16;
Marat Dukhan09668562021-07-26 16:52:20 -07003185 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x25__xop_mul16_add16;
Marat Dukhancaf48312021-06-01 20:20:58 -07003186 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
Marat Dukhan4ed14882021-05-12 17:50:40 -07003187 xnn_params.qs8.dwconv[1].channel_tile = 16;
Marat Dukhan2ffc5e62020-09-06 22:33:38 -07003188 } else if (cpuinfo_has_x86_avx2()) {
Marat Dukhan9b474cf2021-05-25 16:37:48 -07003189 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul32;
3190 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_avx2_params;
Marat Dukhand65a1522020-08-04 19:28:18 -07003191 xnn_params.qs8.dwconv[0].channel_tile = 16;
Marat Dukhan9b474cf2021-05-25 16:37:48 -07003192 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x25__avx2_mul32;
3193 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_avx2_params;
Marat Dukhan4ed14882021-05-12 17:50:40 -07003194 xnn_params.qs8.dwconv[1].channel_tile = 16;
Marat Dukhanfa0ab852021-04-02 17:30:49 -07003195 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhan09668562021-07-26 16:52:20 -07003196 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul16_add16;
Marat Dukhancaf48312021-06-01 20:20:58 -07003197 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
Marat Dukhanfa0ab852021-04-02 17:30:49 -07003198 xnn_params.qs8.dwconv[0].channel_tile = 16;
Marat Dukhan09668562021-07-26 16:52:20 -07003199 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x25__avx_mul16_add16;
Marat Dukhancaf48312021-06-01 20:20:58 -07003200 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
Marat Dukhan4ed14882021-05-12 17:50:40 -07003201 xnn_params.qs8.dwconv[1].channel_tile = 16;
Marat Dukhand65a1522020-08-04 19:28:18 -07003202 } else if (cpuinfo_has_x86_sse4_1()) {
Marat Dukhan09668562021-07-26 16:52:20 -07003203 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul16_add16;
Marat Dukhancaf48312021-06-01 20:20:58 -07003204 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
Marat Dukhand65a1522020-08-04 19:28:18 -07003205 xnn_params.qs8.dwconv[0].channel_tile = 8;
Marat Dukhan09668562021-07-26 16:52:20 -07003206 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up8x25__sse41_mul16_add16;
Marat Dukhancaf48312021-06-01 20:20:58 -07003207 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
Marat Dukhan4ed14882021-05-12 17:50:40 -07003208 xnn_params.qs8.dwconv[1].channel_tile = 8;
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07003209 } else if (cpuinfo_has_x86_sse2()) {
Marat Dukhan09668562021-07-26 16:52:20 -07003210 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__sse2_mul16_add16;
Marat Dukhancaf48312021-06-01 20:20:58 -07003211 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse2_params;
Marat Dukhand65a1522020-08-04 19:28:18 -07003212 xnn_params.qs8.dwconv[0].channel_tile = 8;
Marat Dukhan09668562021-07-26 16:52:20 -07003213 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up8x25__sse2_mul16_add16;
Marat Dukhancaf48312021-06-01 20:20:58 -07003214 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse2_params;
Marat Dukhan4ed14882021-05-12 17:50:40 -07003215 xnn_params.qs8.dwconv[1].channel_tile = 8;
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07003216 }
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07003217 xnn_params.qs8.dwconv[0].primary_tile = 9;
Marat Dukhan4ed14882021-05-12 17:50:40 -07003218 xnn_params.qs8.dwconv[1].primary_tile = 25;
Marat Dukhan9e0b5392020-08-07 02:29:34 -07003219
3220 if (cpuinfo_has_x86_sse4_1()) {
3221 xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
Marat Dukhan9e258d62022-01-12 10:50:51 -08003222 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8,
3223 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8,
Marat Dukhan53f41062022-01-11 19:44:57 -08003224 .init.qs8 = xnn_init_qs8_avgpool_minmax_fp32_sse4_params,
3225 .update.qs8 = xnn_update_qs8_avgpool_minmax_fp32_sse4_params,
Marat Dukhanda382d12022-01-07 19:51:20 -08003226 .row_tile = 7,
3227 .channel_tile = 8,
Marat Dukhan9e0b5392020-08-07 02:29:34 -07003228 };
Marat Dukhan53f41062022-01-11 19:44:57 -08003229 } else {
Marat Dukhan9e0b5392020-08-07 02:29:34 -07003230 xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
Marat Dukhan9e258d62022-01-12 10:50:51 -08003231 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c8,
3232 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8,
Marat Dukhan53f41062022-01-11 19:44:57 -08003233 .init.qs8 = xnn_init_qs8_avgpool_minmax_fp32_sse2_params,
3234 .update.qs8 = xnn_update_qs8_avgpool_minmax_fp32_sse2_params,
Marat Dukhanda382d12022-01-07 19:51:20 -08003235 .row_tile = 7,
3236 .channel_tile = 8,
Marat Dukhan9e0b5392020-08-07 02:29:34 -07003237 };
3238 }
Marat Dukhanff209482020-09-03 14:26:53 -07003239
Marat Dukhan039a3882022-01-21 14:53:11 -08003240 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
Marat Dukhane76049a2021-07-22 14:48:59 -07003241 xnn_params.qs8.vadd = (struct vbinary_parameters) {
3242 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__avx512skx_mul32_ld128_x16,
3243 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_x16,
3244 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_x16,
Marat Dukhan64287252021-09-07 16:20:03 -07003245 .init.qs8_addsub = xnn_init_qs8_add_minmax_avx512_params,
Marat Dukhane76049a2021-07-22 14:48:59 -07003246 .element_tile = 16,
3247 };
3248 } else if (cpuinfo_has_x86_xop()) {
Marat Dukhanbb9225e2020-09-06 22:40:56 -07003249 xnn_params.qs8.vadd = (struct vbinary_parameters) {
3250 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__xop_mul32_ld32_x8,
3251 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__xop_mul32_ld32_x8,
3252 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__xop_mul32_ld32_x8,
Marat Dukhan64287252021-09-07 16:20:03 -07003253 .init.qs8_addsub = xnn_init_qs8_add_minmax_sse4_mul32_params,
Marat Dukhanbb9225e2020-09-06 22:40:56 -07003254 .element_tile = 8,
3255 };
Marat Dukhan3eac69c2021-07-21 01:42:29 -07003256 } else if (cpuinfo_has_x86_avx2()) {
Marat Dukhan7679b1e2021-07-20 18:32:23 -07003257 xnn_params.qs8.vadd = (struct vbinary_parameters) {
3258 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__avx2_mul32_ld64_x16,
3259 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__avx2_mul32_ld64_x16,
3260 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__avx2_mul32_ld64_x16,
Marat Dukhan64287252021-09-07 16:20:03 -07003261 .init.qs8_addsub = xnn_init_qs8_add_minmax_avx2_params,
Marat Dukhan7679b1e2021-07-20 18:32:23 -07003262 .element_tile = 16,
3263 };
Marat Dukhane9c4b962021-04-02 16:56:55 -07003264 } else if (cpuinfo_has_x86_avx()) {
3265 xnn_params.qs8.vadd = (struct vbinary_parameters) {
3266 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__avx_mul32_ld32_x8,
3267 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__avx_mul32_ld32_x8,
3268 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__avx_mul32_ld32_x8,
Marat Dukhan64287252021-09-07 16:20:03 -07003269 .init.qs8_addsub = xnn_init_qs8_add_minmax_sse4_mul32_params,
Marat Dukhane9c4b962021-04-02 16:56:55 -07003270 .element_tile = 8,
3271 };
Marat Dukhanbb9225e2020-09-06 22:40:56 -07003272 } else if (cpuinfo_has_x86_sse4_1()) {
Marat Dukhanff209482020-09-03 14:26:53 -07003273 xnn_params.qs8.vadd = (struct vbinary_parameters) {
3274 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x8,
3275 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x8,
3276 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x8,
Marat Dukhan64287252021-09-07 16:20:03 -07003277 .init.qs8_addsub = xnn_init_qs8_add_minmax_sse4_mul16_params,
Marat Dukhanff209482020-09-03 14:26:53 -07003278 .element_tile = 8,
3279 };
3280 } else {
3281 xnn_params.qs8.vadd = (struct vbinary_parameters) {
3282 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x8,
3283 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8,
3284 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8,
Marat Dukhan64287252021-09-07 16:20:03 -07003285 .init.qs8_addsub = xnn_init_qs8_add_minmax_sse2_params,
Marat Dukhanff209482020-09-03 14:26:53 -07003286 .element_tile = 8,
3287 };
3288 }
Marat Dukhan0853b8a2021-08-03 01:01:53 -07003289 if (cpuinfo_has_x86_avx()) {
3290 xnn_params.qs8.vmul = (struct vbinary_parameters) {
3291 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_fp32_ukernel__avx_mul16_ld64_x16,
3292 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__avx_mul16_ld64_x16,
3293 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__avx_mul16_ld64_x16,
3294 .init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_sse4_params,
3295 .element_tile = 16,
3296 };
3297 } else if (cpuinfo_has_x86_sse4_1()) {
3298 xnn_params.qs8.vmul = (struct vbinary_parameters) {
3299 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_fp32_ukernel__sse41_mul16_ld64_x16,
3300 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__sse41_mul16_ld64_x16,
3301 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__sse41_mul16_ld64_x16,
3302 .init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_sse4_params,
3303 .element_tile = 16,
3304 };
3305 } else {
3306 xnn_params.qs8.vmul = (struct vbinary_parameters) {
3307 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_fp32_ukernel__sse2_mul16_ld64_x8,
3308 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__sse2_mul16_ld64_x8,
3309 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__sse2_mul16_ld64_x8,
3310 .init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_sse2_params,
3311 .element_tile = 8,
3312 };
3313 }
Marat Dukhan07e50402020-08-05 17:16:53 -07003314 #endif // XNN_NO_QS8_OPERATORS
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07003315
Frank Barchardb40ee632021-12-30 11:10:02 -08003316 /**************************** QU8 x86 micro-kernels ****************************/
Marat Dukhan08b7a972020-07-14 18:17:29 -07003317 #ifndef XNN_NO_QU8_OPERATORS
3318 init_flags |= XNN_INIT_FLAG_QU8;
Marat Dukhan854fb6b2020-06-19 12:33:44 -07003319
Marat Dukhan039a3882022-01-21 14:53:11 -08003320 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
Marat Dukhan3cf2e222021-07-08 11:38:45 -07003321 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx);
3322 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_4x16c8__avx512skx);
3323 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx);
3324 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx);
3325 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx512_params;
3326 xnn_params.qu8.gemm.mr = 4;
3327 xnn_params.qu8.gemm.nr = 16;
3328 xnn_params.qu8.gemm.log2_kr = 3;
3329 } else if (cpuinfo_has_x86_xop()) {
Marat Dukhan902ef7f2021-07-02 16:11:06 -07003330 // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
3331 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64);
3332 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64);
3333 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64);
3334 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64);
3335 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3336 xnn_params.qu8.gemm.mr = 2;
3337 xnn_params.qu8.gemm.nr = 4;
3338 xnn_params.qu8.gemm.log2_kr = 3;
3339 } else if (cpuinfo_has_x86_avx2()) {
3340 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_3x8c8__avx2);
3341 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_3x8c8__avx2);
3342 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2);
3343 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x8c8__avx2);
3344 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx2_params;
3345 xnn_params.qu8.gemm.mr = 3;
3346 xnn_params.qu8.gemm.nr = 8;
3347 xnn_params.qu8.gemm.log2_kr = 3;
3348 } else if (cpuinfo_has_x86_avx()) {
3349 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128);
3350 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128);
3351 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128);
3352 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128);
3353 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3354 xnn_params.qu8.gemm.mr = 2;
3355 xnn_params.qu8.gemm.nr = 4;
3356 xnn_params.qu8.gemm.log2_kr = 3;
3357 } else if (cpuinfo_has_x86_sse4_1()) {
3358 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64);
3359 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64);
3360 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64);
3361 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64);
3362 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3363 xnn_params.qu8.gemm.mr = 3;
3364 xnn_params.qu8.gemm.nr = 4;
3365 xnn_params.qu8.gemm.log2_kr = 3;
3366 } else {
3367 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64);
3368 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64);
3369 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64);
3370 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64);
3371 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3372 xnn_params.qu8.gemm.mr = 3;
3373 xnn_params.qu8.gemm.nr = 4;
3374 xnn_params.qu8.gemm.log2_kr = 3;
3375 }
Marat Dukhanaefaef32020-04-09 07:09:34 -07003376
Marat Dukhan039a3882022-01-21 14:53:11 -08003377 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
Marat Dukhanabee3a72021-07-09 09:04:52 -07003378 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up32x9__avx512skx_mul32;
3379 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx512_params;
3380 xnn_params.qu8.dwconv[0].channel_tile = 32;
3381 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up32x25__avx512skx_mul32;
3382 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx512_params;
3383 xnn_params.qu8.dwconv[1].channel_tile = 32;
3384 } else if (cpuinfo_has_x86_xop()) {
3385 // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
3386 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__xop_mul32;
3387 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3388 xnn_params.qu8.dwconv[0].channel_tile = 16;
3389 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__xop_mul32;
3390 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3391 xnn_params.qu8.dwconv[1].channel_tile = 16;
3392 } else if (cpuinfo_has_x86_avx2()) {
3393 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul32;
3394 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx2_params;
3395 xnn_params.qu8.dwconv[0].channel_tile = 16;
3396 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__avx2_mul32;
3397 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx2_params;
3398 xnn_params.qu8.dwconv[1].channel_tile = 16;
3399 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhancaa7fc72021-07-27 07:48:24 -07003400 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul16;
Marat Dukhanabee3a72021-07-09 09:04:52 -07003401 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3402 xnn_params.qu8.dwconv[0].channel_tile = 16;
Marat Dukhancaa7fc72021-07-27 07:48:24 -07003403 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__avx_mul16;
Marat Dukhanabee3a72021-07-09 09:04:52 -07003404 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3405 xnn_params.qu8.dwconv[1].channel_tile = 16;
3406 } else if (cpuinfo_has_x86_sse4_1()) {
3407 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul16;
3408 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3409 xnn_params.qu8.dwconv[0].channel_tile = 8;
3410 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__sse41_mul16;
3411 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3412 xnn_params.qu8.dwconv[1].channel_tile = 8;
3413 } else if (cpuinfo_has_x86_sse2()) {
3414 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__sse2_mul16;
3415 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3416 xnn_params.qu8.dwconv[0].channel_tile = 8;
3417 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__sse2_mul16;
3418 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3419 xnn_params.qu8.dwconv[1].channel_tile = 8;
3420 }
Marat Dukhan08b7a972020-07-14 18:17:29 -07003421 xnn_params.qu8.dwconv[0].primary_tile = 9;
Marat Dukhanabee3a72021-07-09 09:04:52 -07003422 xnn_params.qu8.dwconv[1].primary_tile = 25;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003423
Marat Dukhan08b7a972020-07-14 18:17:29 -07003424 xnn_params.qu8.avgpool = (struct avgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08003425 .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__sse2_c8,
3426 .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__sse2_c8,
Marat Dukhan3c949a32022-01-09 20:12:33 -08003427 .init.qu8 = xnn_init_qu8_avgpool_minmax_sse2_params,
Marat Dukhanda382d12022-01-07 19:51:20 -08003428 .primary_tile = 9,
3429 .incremental_tile = 8,
3430 .channel_tile = 8,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003431 };
Marat Dukhand1f53e42022-01-12 22:34:51 -08003432 if (cpuinfo_has_x86_sse4_1()) {
3433 xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
3434 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8,
3435 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8,
3436 .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_sse4_params,
3437 .update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_sse4_params,
3438 .row_tile = 7,
3439 .channel_tile = 8,
3440 };
3441 } else {
3442 xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
3443 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c8,
3444 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8,
3445 .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_sse2_params,
3446 .update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_sse2_params,
3447 .row_tile = 7,
3448 .channel_tile = 8,
3449 };
3450 }
Marat Dukhan3eac69c2021-07-21 01:42:29 -07003451
Marat Dukhan039a3882022-01-21 14:53:11 -08003452 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
Marat Dukhane76049a2021-07-22 14:48:59 -07003453 xnn_params.qu8.vadd = (struct vbinary_parameters) {
3454 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__avx512skx_mul32_ld128_x16,
3455 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_x16,
3456 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_x16,
Marat Dukhan64287252021-09-07 16:20:03 -07003457 .init.qu8_addsub = xnn_init_qu8_add_minmax_avx512_params,
Marat Dukhane76049a2021-07-22 14:48:59 -07003458 .element_tile = 16,
3459 };
3460 } else if (cpuinfo_has_x86_xop()) {
Marat Dukhan3eac69c2021-07-21 01:42:29 -07003461 xnn_params.qu8.vadd = (struct vbinary_parameters) {
3462 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__xop_mul32_ld32_x8,
3463 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__xop_mul32_ld32_x8,
3464 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__xop_mul32_ld32_x8,
Marat Dukhan64287252021-09-07 16:20:03 -07003465 .init.qu8_addsub = xnn_init_qu8_add_minmax_sse4_params,
Marat Dukhan3eac69c2021-07-21 01:42:29 -07003466 .element_tile = 8,
3467 };
3468 } else if (cpuinfo_has_x86_avx2()) {
3469 xnn_params.qu8.vadd = (struct vbinary_parameters) {
3470 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__avx2_mul32_ld64_x16,
3471 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__avx2_mul32_ld64_x16,
3472 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__avx2_mul32_ld64_x16,
Marat Dukhan64287252021-09-07 16:20:03 -07003473 .init.qu8_addsub = xnn_init_qu8_add_minmax_avx2_params,
Marat Dukhan3eac69c2021-07-21 01:42:29 -07003474 .element_tile = 16,
3475 };
3476 } else if (cpuinfo_has_x86_avx()) {
3477 xnn_params.qu8.vadd = (struct vbinary_parameters) {
3478 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__avx_mul32_ld32_x8,
3479 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__avx_mul32_ld32_x8,
3480 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__avx_mul32_ld32_x8,
Marat Dukhan64287252021-09-07 16:20:03 -07003481 .init.qu8_addsub = xnn_init_qu8_add_minmax_sse4_params,
Marat Dukhan3eac69c2021-07-21 01:42:29 -07003482 .element_tile = 8,
3483 };
3484 } else if (cpuinfo_has_x86_sse4_1()) {
3485 xnn_params.qu8.vadd = (struct vbinary_parameters) {
3486 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__sse41_mul16_ld64_x8,
3487 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__sse41_mul16_ld64_x8,
3488 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__sse41_mul16_ld64_x8,
Marat Dukhan64287252021-09-07 16:20:03 -07003489 .init.qu8_addsub = xnn_init_qu8_add_minmax_sse2_params,
Marat Dukhan3eac69c2021-07-21 01:42:29 -07003490 .element_tile = 8,
3491 };
3492 } else {
3493 xnn_params.qu8.vadd = (struct vbinary_parameters) {
3494 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__sse2_mul16_ld64_x8,
3495 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8,
3496 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8,
Marat Dukhan64287252021-09-07 16:20:03 -07003497 .init.qu8_addsub = xnn_init_qu8_add_minmax_sse2_params,
Marat Dukhan3eac69c2021-07-21 01:42:29 -07003498 .element_tile = 8,
3499 };
3500 }
Marat Dukhan0853b8a2021-08-03 01:01:53 -07003501 if (cpuinfo_has_x86_avx()) {
3502 xnn_params.qu8.vmul = (struct vbinary_parameters) {
3503 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_fp32_ukernel__avx_mul16_ld64_x16,
3504 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__avx_mul16_ld64_x16,
3505 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__avx_mul16_ld64_x16,
3506 .init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_sse2_params,
3507 .element_tile = 16,
3508 };
3509 } else if (cpuinfo_has_x86_sse4_1()) {
3510 xnn_params.qu8.vmul = (struct vbinary_parameters) {
3511 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_fp32_ukernel__sse41_mul16_ld64_x16,
3512 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__sse41_mul16_ld64_x16,
3513 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__sse41_mul16_ld64_x16,
3514 .init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_sse2_params,
3515 .element_tile = 16,
3516 };
3517 } else {
3518 xnn_params.qu8.vmul = (struct vbinary_parameters) {
3519 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_fp32_ukernel__sse2_mul16_ld64_x8,
3520 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__sse2_mul16_ld64_x8,
3521 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__sse2_mul16_ld64_x8,
3522 .init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_sse2_params,
3523 .element_tile = 8,
3524 };
3525 }
Marat Dukhan08b7a972020-07-14 18:17:29 -07003526 #endif // XNN_NO_QU8_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07003527
Frank Barchardb40ee632021-12-30 11:10:02 -08003528 /**************************** S8 x86 micro-kernels ****************************/
Marat Dukhan23147532021-08-16 07:26:56 -07003529 #ifndef XNN_NO_S8_OPERATORS
3530 init_flags |= XNN_INIT_FLAG_S8;
3531
3532 if (cpuinfo_has_x86_sse4_1()) {
Marat Dukhan61c0c9e2021-08-16 23:16:14 -07003533 xnn_params.s8.clamp = (struct vunary_parameters) {
3534 .ukernel = (xnn_univector_ukernel_function) xnn_s8_vclamp_ukernel__sse41_x64,
3535 .init.s8_minmax = xnn_init_s8_minmax_sse4_params,
3536 .element_tile = 64,
3537 };
Marat Dukhan24abe6b2021-11-24 15:28:57 -08003538 xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
3539 .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__sse41_c16,
3540 .pixel_tile = 1,
3541 .channel_tile = 16,
3542 };
Marat Dukhan23147532021-08-16 07:26:56 -07003543 xnn_params.s8.maxpool = (struct maxpool_parameters) {
3544 .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__sse41_c16,
3545 .init.s8 = xnn_init_s8_minmax_sse4_params,
3546 .mr = 9,
3547 .qr = 8,
3548 };
3549 } else {
Marat Dukhan61c0c9e2021-08-16 23:16:14 -07003550 xnn_params.s8.clamp = (struct vunary_parameters) {
3551 .ukernel = (xnn_univector_ukernel_function) xnn_s8_vclamp_ukernel__sse2_x64,
3552 .init.s8_minmax = xnn_init_s8_minmax_sse2_params,
3553 .element_tile = 64,
3554 };
Marat Dukhan24abe6b2021-11-24 15:28:57 -08003555 xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
3556 .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__sse2_c8,
3557 .pixel_tile = 1,
3558 .channel_tile = 8,
3559 };
Marat Dukhan23147532021-08-16 07:26:56 -07003560 xnn_params.s8.maxpool = (struct maxpool_parameters) {
3561 .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__sse2_c16,
3562 .init.s8 = xnn_init_s8_minmax_sse2_params,
3563 .mr = 9,
3564 .qr = 8,
3565 };
3566 }
Marat Dukhan94912792021-08-16 21:40:30 -07003567 #endif // XNN_NO_S8_OPERATORS
Marat Dukhan23147532021-08-16 07:26:56 -07003568
Frank Barchardb40ee632021-12-30 11:10:02 -08003569 /**************************** U8 x86 micro-kernels ****************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003570 #ifndef XNN_NO_U8_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07003571 init_flags |= XNN_INIT_FLAG_U8;
3572
Marat Dukhan94912792021-08-16 21:40:30 -07003573 xnn_params.u8.clamp = (struct vunary_parameters) {
3574 .ukernel = (xnn_univector_ukernel_function) xnn_u8_vclamp_ukernel__sse2_x64,
3575 .init.u8_minmax = xnn_init_u8_minmax_sse2_params,
3576 .element_tile = 64,
3577 };
Marat Dukhan24abe6b2021-11-24 15:28:57 -08003578 if (cpuinfo_has_x86_sse4_1()) {
3579 xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
3580 .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__sse41_c16,
3581 .pixel_tile = 1,
3582 .channel_tile = 16,
3583 };
3584 } else {
3585 xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
3586 .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__sse2_c8,
3587 .pixel_tile = 1,
3588 .channel_tile = 8,
3589 };
3590 }
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003591 xnn_params.u8.maxpool = (struct maxpool_parameters) {
Marat Dukhan99936602020-04-11 16:47:01 -07003592 .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__sse2_c16,
Marat Dukhan91ae1652021-08-15 19:19:49 -07003593 .init.u8 = xnn_init_u8_minmax_sse2_params,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003594 .mr = 9,
3595 .qr = 8,
3596 };
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003597 xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
3598 xnn_params.u8.rmax = xnn_u8_rmax_ukernel__sse2;
3599 #endif // XNN_NO_U8_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07003600
Frank Barchardb40ee632021-12-30 11:10:02 -08003601 /**************************** X8 x86 micro-kernels ****************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003602 #ifndef XNN_NO_X8_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07003603 init_flags |= XNN_INIT_FLAG_X8;
3604
Marat Dukhan039a3882022-01-21 14:53:11 -08003605 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
Marat Dukhan98e054b2021-09-13 09:43:50 -07003606 xnn_params.x8.lut = xnn_x8_lut_ukernel__avx512skx_vpshufb_x64;
3607 } else if (cpuinfo_has_x86_avx2()) {
3608 xnn_params.x8.lut = xnn_x8_lut_ukernel__avx2_x128;
3609 } else if (cpuinfo_has_x86_avx()) {
3610 xnn_params.x8.lut = xnn_x8_lut_ukernel__avx_x64;
3611 } else {
3612 // Note: SSSE3 version is usually slower than scalar
3613 xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar_x4;
3614 }
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003615 xnn_params.x8.zip = (struct zip_parameters) {
3616 .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__sse2,
3617 .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__sse2,
3618 .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__sse2,
3619 .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__sse2,
3620 };
3621 #endif // XNN_NO_X8_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07003622
Marat Dukhan8f920a62022-01-19 14:56:23 -08003623 /**************************** F16 x86 micro-kernels ****************************/
3624 #ifndef XNN_NO_F16_OPERATORS
3625 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx2()) {
3626 init_flags |= XNN_INIT_FLAG_F16;
3627
3628 xnn_params.f16.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_4x16__avx2_broadcast);
3629 xnn_params.f16.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_4x16__avx2_broadcast);
3630 xnn_params.f16.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_1x16__avx2_broadcast);
3631 xnn_params.f16.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast);
3632 xnn_params.f16.gemm.init.f16 = xnn_init_f16_scaleminmax_avx_params;
3633 xnn_params.f16.gemm.mr = 4;
3634 xnn_params.f16.gemm.nr = 16;
3635
3636 xnn_params.f16.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up16x4__fma3;
3637 xnn_params.f16.dwconv[0].init.f16 = xnn_init_f16_minmax_avx_params;
3638 xnn_params.f16.dwconv[0].channel_tile = 16;
3639 xnn_params.f16.dwconv[0].primary_tile = 4;
3640
3641 xnn_params.f16.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up16x9__fma3;
3642 xnn_params.f16.dwconv[1].init.f16 = xnn_init_f16_minmax_avx_params;
3643 xnn_params.f16.dwconv[1].channel_tile = 16;
3644 xnn_params.f16.dwconv[1].primary_tile = 9;
3645
3646 xnn_params.f16.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up8x25__fma3_acc2;
3647 xnn_params.f16.dwconv[2].init.f16 = xnn_init_f16_minmax_avx_params;
3648 xnn_params.f16.dwconv[2].channel_tile = 8;
3649 xnn_params.f16.dwconv[2].primary_tile = 25;
3650
3651 xnn_params.f16.gavgpool = (struct gavgpool_parameters) {
3652 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c8,
3653 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c8,
3654 .init.f16 = xnn_init_f16_scaleminmax_avx_params,
3655 .update.f16 = xnn_update_f16_scaleminmax_avx_params,
3656 .row_tile = 7,
3657 .channel_tile = 8,
3658 };
3659 xnn_params.f16.vadd = (struct vbinary_parameters) {
3660 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vadd_minmax_ukernel__f16c_x16,
3661 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vaddc_minmax_ukernel__f16c_x16,
3662 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vaddc_minmax_ukernel__f16c_x16,
3663 .init.f16_minmax = xnn_init_f16_minmax_avx_params,
3664 .element_tile = 16,
3665 };
3666 xnn_params.f16.vmul = (struct vbinary_parameters) {
3667 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmul_minmax_ukernel__f16c_x16,
3668 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmulc_minmax_ukernel__f16c_x16,
3669 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmulc_minmax_ukernel__f16c_x16,
3670 .init.f16_minmax = xnn_init_f16_minmax_avx_params,
3671 .element_tile = 16,
3672 };
3673 xnn_params.f16.vmulcaddc = (struct vmulcaddc_parameters) {
3674 .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f16_vmulcaddc_minmax_ukernel_c8__fma3_2x,
3675 .init.f16 = xnn_init_f16_minmax_avx_params,
3676 .channel_tile = 8,
3677 .row_tile = 2,
3678 };
3679 xnn_params.f16.hswish = (struct vunary_parameters) {
3680 .ukernel = (xnn_univector_ukernel_function) xnn_f16_vhswish_ukernel__f16c_x16,
3681 .init.f16_hswish = xnn_init_f16_hswish_avx_params,
3682 .element_tile = 16,
3683 };
3684 }
3685 #endif // XNN_NO_F16_OPERATORS
3686
Frank Barchardb40ee632021-12-30 11:10:02 -08003687 /**************************** F32 x86 micro-kernels ****************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003688 #ifndef XNN_NO_F32_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07003689 init_flags |= XNN_INIT_FLAG_F32;
3690
Marat Dukhan0f349c42019-11-27 11:58:54 -08003691 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
Marat Dukhanaefaef32020-04-09 07:09:34 -07003692 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_7x16__avx512f_broadcast);
3693 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_7x16__avx512f_broadcast);
3694 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast);
3695 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x16__avx512f_broadcast);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003696 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003697 xnn_params.f32.gemm.mr = 7;
3698 xnn_params.f32.gemm.nr = 16;
Marat Dukhan48976702022-01-10 18:18:04 -08003699 } else if (cpuinfo_has_x86_fma3()) {
Marat Dukhan27121322019-12-09 14:57:40 -08003700 switch (cpuinfo_get_core(0)->uarch) {
3701 case cpuinfo_uarch_zen:
Marat Dukhanb3801eb2020-03-12 13:41:11 -07003702 case cpuinfo_uarch_dhyana:
Marat Dukhanaefaef32020-04-09 07:09:34 -07003703 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast);
3704 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast);
3705 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast);
3706 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003707 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_avx_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003708 xnn_params.f32.gemm.mr = 4;
3709 xnn_params.f32.gemm.nr = 16;
3710 xnn_params.f32.gemm.log2_sr = 2;
Marat Dukhan27121322019-12-09 14:57:40 -08003711 break;
3712 default:
Marat Dukhanaefaef32020-04-09 07:09:34 -07003713 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_5x16__fma3_broadcast);
3714 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_5x16__fma3_broadcast);
3715 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast);
3716 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x16__fma3_broadcast);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003717 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_avx_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003718 xnn_params.f32.gemm.mr = 5;
3719 xnn_params.f32.gemm.nr = 16;
Marat Dukhan27121322019-12-09 14:57:40 -08003720 break;
3721 }
Marat Dukhan48976702022-01-10 18:18:04 -08003722 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhanaefaef32020-04-09 07:09:34 -07003723 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_5x16__avx_broadcast);
3724 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_5x16__avx_broadcast);
3725 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x16__avx_broadcast);
3726 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x16__avx_broadcast);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003727 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_avx_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003728 xnn_params.f32.gemm.mr = 5;
3729 xnn_params.f32.gemm.nr = 16;
Marat Dukhan1025ea32019-11-21 16:01:08 -08003730 } else {
Marat Dukhanaefaef32020-04-09 07:09:34 -07003731 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__sse_load1);
3732 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__sse_load1);
3733 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__sse_load1);
3734 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__sse_load1);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003735 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_sse_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003736 xnn_params.f32.gemm.mr = 4;
3737 xnn_params.f32.gemm.nr = 8;
Marat Dukhan1025ea32019-11-21 16:01:08 -08003738 }
Marat Dukhanaefaef32020-04-09 07:09:34 -07003739 xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2c4__sse);
3740 xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2c4__sse);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003741 xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_sse_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003742 xnn_params.f32.gemm2.mr = 4;
3743 xnn_params.f32.gemm2.nr = 2;
3744 xnn_params.f32.gemm2.log2_kr = 2;
3745
Marat Dukhan479f87e2019-11-27 15:17:06 -08003746 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003747 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x3__avx512f;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003748 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003749 xnn_params.f32.dwconv[0].channel_tile = 16;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003750 xnn_params.f32.dwconv[0].primary_tile = 3;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003751
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003752 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x4__avx512f;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003753 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003754 xnn_params.f32.dwconv[1].channel_tile = 16;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003755 xnn_params.f32.dwconv[1].primary_tile = 4;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003756
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003757 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x9__avx512f;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003758 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003759 xnn_params.f32.dwconv[2].channel_tile = 16;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003760 xnn_params.f32.dwconv[2].primary_tile = 9;
3761
3762 xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x25__avx512f;
3763 xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_scalar_params;
3764 xnn_params.f32.dwconv[3].channel_tile = 16;
3765 xnn_params.f32.dwconv[3].primary_tile = 25;
Marat Dukhan48976702022-01-10 18:18:04 -08003766 } else if (cpuinfo_has_x86_fma3()) {
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003767 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x3__fma3;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003768 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_avx_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003769 xnn_params.f32.dwconv[0].channel_tile = 16;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003770 xnn_params.f32.dwconv[0].primary_tile = 3;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003771
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003772 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x4__fma3;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003773 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_avx_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003774 xnn_params.f32.dwconv[1].channel_tile = 16;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003775 xnn_params.f32.dwconv[1].primary_tile = 4;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003776
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003777 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x9__fma3;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003778 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_avx_params;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003779 xnn_params.f32.dwconv[2].channel_tile = 16;
3780 xnn_params.f32.dwconv[2].primary_tile = 9;
3781
3782 xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x25__fma3;
3783 xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_avx_params;
3784 xnn_params.f32.dwconv[3].channel_tile = 8;
3785 xnn_params.f32.dwconv[3].primary_tile = 25;
Marat Dukhan48976702022-01-10 18:18:04 -08003786 } else if (cpuinfo_has_x86_avx()) {
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003787 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x3__avx;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003788 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_avx_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003789 xnn_params.f32.dwconv[0].channel_tile = 16;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003790 xnn_params.f32.dwconv[0].primary_tile = 3;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003791
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003792 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x4__avx;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003793 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_avx_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003794 xnn_params.f32.dwconv[1].channel_tile = 16;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003795 xnn_params.f32.dwconv[1].primary_tile = 4;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003796
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003797 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x9__avx;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003798 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_avx_params;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003799 xnn_params.f32.dwconv[2].channel_tile = 16;
3800 xnn_params.f32.dwconv[2].primary_tile = 9;
3801
3802 xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x25__avx;
3803 xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_avx_params;
3804 xnn_params.f32.dwconv[3].channel_tile = 8;
3805 xnn_params.f32.dwconv[3].primary_tile = 25;
Marat Dukhan17ec5f32019-11-22 13:34:16 -08003806 } else {
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003807 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x3__sse;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003808 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_sse_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003809 xnn_params.f32.dwconv[0].channel_tile = 8;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003810 xnn_params.f32.dwconv[0].primary_tile = 3;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003811
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003812 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x4__sse;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003813 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_sse_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003814 xnn_params.f32.dwconv[1].channel_tile = 8;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003815 xnn_params.f32.dwconv[1].primary_tile = 4;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003816
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003817 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x9__sse;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003818 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_sse_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003819 xnn_params.f32.dwconv[2].channel_tile = 8;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003820 xnn_params.f32.dwconv[2].primary_tile = 9;
3821
3822 xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x25__sse;
3823 xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_sse_params;
3824 xnn_params.f32.dwconv[3].channel_tile = 8;
3825 xnn_params.f32.dwconv[3].primary_tile = 25;
Marat Dukhan17ec5f32019-11-22 13:34:16 -08003826 }
// f32 average pooling parameters. No CPU-feature branch is visible around this
// assignment: the kernels named ...__sse_c4 are registered as-is.
// The tile values mirror the kernel names: 9x / 9p8x -> primary_tile 9 and
// incremental_tile 8; c4 -> channel_tile 4.
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003827 xnn_params.f32.avgpool = (struct avgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08003828 .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__sse_c4,
3829 .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__sse_c4,
Marat Dukhan84598222022-01-09 21:39:22 -08003830 .init.f32 = xnn_init_f32_scaleminmax_sse_params,
Marat Dukhanda382d12022-01-07 19:51:20 -08003831 .primary_tile = 9,
3832 .incremental_tile = 8,
3833 .channel_tile = 4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003834 };
// f32 pavgpool parameters: unipass and multipass kernels named ...__sse_c4,
// with tiles matching the 9x / 9p8x / c4 suffixes in the kernel names.
// NOTE(review): unlike the avgpool and gavgpool entries next to it, no .init
// member is named here, so any such member is zeroed by the compound literal.
// Presumably intentional; confirm against struct pavgpool_parameters.
3835 xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08003836 .unipass = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__sse_c4,
3837 .multipass = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__sse_c4,
3838 .primary_tile = 9,
3839 .incremental_tile = 8,
3840 .channel_tile = 4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003841 };
// f32 gavgpool parameters: unipass (7x) and multipass (7p7x) kernels named
// ...__sse_c4, so row_tile is 7 and channel_tile is 4.
// Both an init and an update function for the scaleminmax params are
// registered; this is the only entry in this span that sets .update.
3842 xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08003843 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__sse_c4,
3844 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4,
Marat Dukhan84598222022-01-09 21:39:22 -08003845 .init.f32 = xnn_init_f32_scaleminmax_sse_params,
3846 .update.f32 = xnn_update_f32_scaleminmax_sse_params,
Marat Dukhanda382d12022-01-07 19:51:20 -08003847 .row_tile = 7,
3848 .channel_tile = 4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003849 };
// f32 max pooling parameters: a single kernel named ...9p8x__sse_c4 with the
// SSE minmax params initializer. mr = 9 and qr = 8 match the 9p8x suffix.
3850 xnn_params.f32.maxpool = (struct maxpool_parameters) {
Marat Dukhan99936602020-04-11 16:47:01 -07003851 .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__sse_c4,
Marat Dukhan91ae1652021-08-15 19:19:49 -07003852 .init.f32 = xnn_init_f32_minmax_sse_params,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003853 .mr = 9,
3854 .qr = 8,
3855 };
// f32 argmax pooling: three table entries, all using kernels named ...__sse2_c4.
//   [0] unipass kernel (.up), mr = 4
//   [1] unipass kernel (.up), mr = 9
//   [2] multipass kernel (.mp), mr = 9, qr = 8
// Each entry names only one of .up / .mp; the other member is zeroed by the
// compound literal.
3856 xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
Marat Dukhan99936602020-04-11 16:47:01 -07003857 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__sse2_c4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003858 .mr = 4,
3859 };
3860 xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
Marat Dukhan99936602020-04-11 16:47:01 -07003861 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__sse2_c4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003862 .mr = 9,
3863 };
3864 xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
Marat Dukhan99936602020-04-11 16:47:01 -07003865 .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003866 .mr = 9,
3867 .qr = 8,
3868 };
// f32 ibilinear parameters: kernel named ...__sse_c8, registered with
// pixel_tile = 1 and channel_tile = 8 (matching the c8 suffix).
Marat Dukhan660fd192020-03-10 04:55:30 -07003869 xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
3870 .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__sse_c8,
Marat Dukhan69722492019-11-11 19:55:50 -08003871 .pixel_tile = 1,
3872 .channel_tile = 8,
3873 };
// f32 abs: pick the kernel by CPU feature, checked in this order:
//   1. AVX512F, only when XNN_PLATFORM_MOBILE is false (x16 kernel)
//   2. AVX (x16 kernel)
//   3. otherwise the kernel named ...__sse_x8
// Each branch pairs the kernel with the params initializer of the same ISA
// name, and element_tile matches the _xN suffix of the kernel.
Marat Dukhane2c3f292019-11-27 15:40:54 -08003874 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
Marat Dukhane5efb162021-12-31 10:26:13 -08003875 xnn_params.f32.abs = (struct vunary_parameters) {
3876 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__avx512f_x16,
3877 .init.f32_abs = xnn_init_f32_abs_avx512_params,
3878 .element_tile = 16,
3879 };
Marat Dukhan48976702022-01-10 18:18:04 -08003880 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhane5efb162021-12-31 10:26:13 -08003881 xnn_params.f32.abs = (struct vunary_parameters) {
3882 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__avx_x16,
3883 .init.f32_abs = xnn_init_f32_abs_avx_params,
3884 .element_tile = 16,
3885 };
Marat Dukhan5020b962020-06-08 13:30:10 -07003886 } else {
Marat Dukhane5efb162021-12-31 10:26:13 -08003887 xnn_params.f32.abs = (struct vunary_parameters) {
3888 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__sse_x8,
3889 .init.f32_abs = xnn_init_f32_abs_sse_params,
3890 .element_tile = 8,
3891 };
Marat Dukhan5020b962020-06-08 13:30:10 -07003892 }
// f32 clamp: pick the kernel by CPU feature, checked in this order:
//   1. AVX512F, only when XNN_PLATFORM_MOBILE is false (x16 kernel)
//   2. AVX (x16 kernel)
//   3. otherwise the kernel named ...__sse_x8
// NOTE(review): the AVX512F branch registers xnn_init_f32_minmax_scalar_params,
// whereas the AVX and SSE branches use ISA-named initializers. Presumably the
// AVX512F kernel reads the scalar params layout; confirm against the kernel.
3893 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
Marat Dukhan94912792021-08-16 21:40:30 -07003894 xnn_params.f32.clamp = (struct vunary_parameters) {
3895 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vclamp_ukernel__avx512f_x16,
3896 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
3897 .element_tile = 16,
3898 };
Marat Dukhan48976702022-01-10 18:18:04 -08003899 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhan94912792021-08-16 21:40:30 -07003900 xnn_params.f32.clamp = (struct vunary_parameters) {
3901 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vclamp_ukernel__avx_x16,
3902 .init.f32_minmax = xnn_init_f32_minmax_avx_params,
3903 .element_tile = 16,
3904 };
Marat Dukhane2c3f292019-11-27 15:40:54 -08003905 } else {
Marat Dukhan94912792021-08-16 21:40:30 -07003906 xnn_params.f32.clamp = (struct vunary_parameters) {
3907 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vclamp_ukernel__sse_x8,
3908 .init.f32_minmax = xnn_init_f32_minmax_sse_params,
3909 .element_tile = 8,
3910 };
Marat Dukhane2c3f292019-11-27 15:40:54 -08003911 }
// f32 ELU: pick the kernel by CPU feature, checked in this order:
//   1. AVX512F, only when XNN_PLATFORM_MOBILE is false (x64 kernel)
//   2. AVX2 (x56 kernel)
//   3. AVX (x32 kernel)
//   4. otherwise the kernel named ...__sse2_rr2_lut16_p3_x12
// Each branch registers the params initializer whose name matches the kernel
// variant (rr1/rr2, lut size, polynomial degree); element_tile matches the
// _xN suffix of the kernel.
Marat Dukhan662faa02019-12-09 22:48:16 -08003912 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
Marat Dukhan4a79ff22022-01-01 12:16:48 -08003913 xnn_params.f32.elu = (struct vunary_parameters) {
3914 .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__avx512f_rr1_lut16_p3_perm_x64,
3915 .init.f32_elu = xnn_init_f32_elu_avx512_rr1_lut16_p3_params,
3916 .element_tile = 64,
3917 };
Marat Dukhan48976702022-01-10 18:18:04 -08003918 } else if (cpuinfo_has_x86_avx2()) {
Marat Dukhan4a79ff22022-01-01 12:16:48 -08003919 xnn_params.f32.elu = (struct vunary_parameters) {
3920 .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x56,
3921 .init.f32_elu = xnn_init_f32_elu_avx2_rr1_lut4_p4_params,
3922 .element_tile = 56,
3923 };
Marat Dukhan48976702022-01-10 18:18:04 -08003924 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhan4a79ff22022-01-01 12:16:48 -08003925 xnn_params.f32.elu = (struct vunary_parameters) {
3926 .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_x32,
3927 .init.f32_elu = xnn_init_f32_elu_avx_rr2_lut4_p4_params,
3928 .element_tile = 32,
3929 };
Marat Dukhanb6bd4bc2020-12-01 17:01:40 -08003930 } else {
Marat Dukhan4a79ff22022-01-01 12:16:48 -08003931 xnn_params.f32.elu = (struct vunary_parameters) {
3932 .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__sse2_rr2_lut16_p3_x12,
3933 .init.f32_elu = xnn_init_f32_elu_sse2_rr2_lut16_p3_params,
3934 .element_tile = 12,
3935 };
Marat Dukhanb6bd4bc2020-12-01 17:01:40 -08003936 }
// f32 hswish: pick the kernel by CPU feature, checked in this order:
//   1. AVX512F, only when XNN_PLATFORM_MOBILE is false (x16 kernel)
//   2. FMA3 (x16 kernel)
//   3. AVX (x16 kernel)
//   4. otherwise the kernel named ...__sse_x8
// NOTE(review): the FMA3 branch reuses xnn_init_f32_hswish_avx_params, the
// same initializer as the AVX branch. Presumably both kernels share one
// params layout; confirm against the kernels.
3937 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
Marat Dukhan561d0682021-12-23 16:12:35 -08003938 xnn_params.f32.hswish = (struct vunary_parameters) {
3939 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__avx512f_x16,
Marat Dukhan0d10cc72021-12-23 19:49:19 -08003940 .init.f32_hswish = xnn_init_f32_hswish_avx512_params,
Marat Dukhan561d0682021-12-23 16:12:35 -08003941 .element_tile = 16,
3942 };
Marat Dukhan48976702022-01-10 18:18:04 -08003943 } else if (cpuinfo_has_x86_fma3()) {
Marat Dukhan561d0682021-12-23 16:12:35 -08003944 xnn_params.f32.hswish = (struct vunary_parameters) {
3945 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__fma3_x16,
Marat Dukhan0d10cc72021-12-23 19:49:19 -08003946 .init.f32_hswish = xnn_init_f32_hswish_avx_params,
Marat Dukhan561d0682021-12-23 16:12:35 -08003947 .element_tile = 16,
3948 };
Marat Dukhan48976702022-01-10 18:18:04 -08003949 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhan561d0682021-12-23 16:12:35 -08003950 xnn_params.f32.hswish = (struct vunary_parameters) {
3951 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__avx_x16,
Marat Dukhan0d10cc72021-12-23 19:49:19 -08003952 .init.f32_hswish = xnn_init_f32_hswish_avx_params,
Marat Dukhan561d0682021-12-23 16:12:35 -08003953 .element_tile = 16,
3954 };
Marat Dukhan662faa02019-12-09 22:48:16 -08003955 } else {
Marat Dukhan561d0682021-12-23 16:12:35 -08003956 xnn_params.f32.hswish = (struct vunary_parameters) {
3957 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__sse_x8,
Marat Dukhan0d10cc72021-12-23 19:49:19 -08003958 .init.f32_hswish = xnn_init_f32_hswish_sse_params,
Marat Dukhan561d0682021-12-23 16:12:35 -08003959 .element_tile = 8,
3960 };
Marat Dukhan662faa02019-12-09 22:48:16 -08003961 }
// f32 leaky ReLU: pick the kernel by CPU feature, checked in this order:
//   1. AVX512F, only when XNN_PLATFORM_MOBILE is false (x16 kernel)
//   2. AVX (x16 kernel)
//   3. SSE4.1 (x8 kernel)
//   4. otherwise the kernel named ...__sse_x8
// NOTE(review): the AVX512F branch registers xnn_init_f32_lrelu_scalar_params,
// and the SSE4.1 branch shares xnn_init_f32_lrelu_sse_params with the final
// fallback. Presumably intentional; confirm against the kernels.
Marat Dukhan5020b962020-06-08 13:30:10 -07003962 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
Marat Dukhan2894e992021-12-30 08:29:48 -08003963 xnn_params.f32.lrelu = (struct vunary_parameters) {
3964 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__avx512f_x16,
3965 .init.f32_lrelu = xnn_init_f32_lrelu_scalar_params,
3966 .element_tile = 16,
3967 };
Marat Dukhan48976702022-01-10 18:18:04 -08003968 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhan2894e992021-12-30 08:29:48 -08003969 xnn_params.f32.lrelu = (struct vunary_parameters) {
3970 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__avx_x16,
3971 .init.f32_lrelu = xnn_init_f32_lrelu_avx_params,
3972 .element_tile = 16,
3973 };
Marat Dukhan0d3f4672020-06-25 16:42:58 -07003974 } else if (cpuinfo_has_x86_sse4_1()) {
Marat Dukhan2894e992021-12-30 08:29:48 -08003975 xnn_params.f32.lrelu = (struct vunary_parameters) {
3976 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__sse41_x8,
3977 .init.f32_lrelu = xnn_init_f32_lrelu_sse_params,
3978 .element_tile = 8,
3979 };
Marat Dukhan28813332020-06-10 18:05:38 -07003980 } else {
Marat Dukhan2894e992021-12-30 08:29:48 -08003981 xnn_params.f32.lrelu = (struct vunary_parameters) {
3982 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__sse_x8,
3983 .init.f32_lrelu = xnn_init_f32_lrelu_sse_params,
3984 .element_tile = 8,
3985 };
Marat Dukhan28813332020-06-10 18:05:38 -07003986 }
// f32 negate: pick the kernel by CPU feature, checked in this order:
//   1. AVX512F, only when XNN_PLATFORM_MOBILE is false (x16 kernel)
//   2. AVX (x16 kernel)
//   3. otherwise the kernel named ...__sse_x8
// Each branch pairs the kernel with the params initializer of the same ISA
// name, and element_tile matches the _xN suffix of the kernel.
3987 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
Marat Dukhane5efb162021-12-31 10:26:13 -08003988 xnn_params.f32.neg = (struct vunary_parameters) {
3989 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__avx512f_x16,
3990 .init.f32_neg = xnn_init_f32_neg_avx512_params,
3991 .element_tile = 16,
3992 };
Marat Dukhan48976702022-01-10 18:18:04 -08003993 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhane5efb162021-12-31 10:26:13 -08003994 xnn_params.f32.neg = (struct vunary_parameters) {
3995 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__avx_x16,
3996 .init.f32_neg = xnn_init_f32_neg_avx_params,
3997 .element_tile = 16,
3998 };
Marat Dukhan5020b962020-06-08 13:30:10 -07003999 } else {
Marat Dukhane5efb162021-12-31 10:26:13 -08004000 xnn_params.f32.neg = (struct vunary_parameters) {
4001 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__sse_x8,
4002 .init.f32_neg = xnn_init_f32_neg_sse_params,
4003 .element_tile = 8,
4004 };
Marat Dukhan5020b962020-06-08 13:30:10 -07004005 }
// f32 rounding family: rndne, rndz, rndu and rndd are configured together,
// one group of four assignments per CPU-feature branch. Branch order:
//   1. AVX512F, only when XNN_PLATFORM_MOBILE is false (x16 kernels, no .init)
//   2. AVX (x16 kernels, .init.f32_rnd = xnn_init_f32_rnd_avx_params)
//   3. SSE4.1 (x8 kernels, no .init)
//   4. otherwise SSE2-named kernels (x8, .init.f32_rnd = xnn_init_f32_rnd_sse2_params)
// Where .init is not named, the compound literal leaves it zeroed.
// NOTE(review): presumably the AVX512F and SSE4.1 kernels need no params;
// confirm against the kernels.
Marat Dukhan64e52512020-06-09 13:41:16 -07004006 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
Marat Dukhan0e801372022-01-04 00:10:41 -08004007 xnn_params.f32.rndne = (struct vunary_parameters) {
4008 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__avx512f_x16,
4009 .element_tile = 16,
4010 };
4011 xnn_params.f32.rndz = (struct vunary_parameters) {
4012 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__avx512f_x16,
4013 .element_tile = 16,
4014 };
4015 xnn_params.f32.rndu = (struct vunary_parameters) {
4016 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__avx512f_x16,
4017 .element_tile = 16,
4018 };
4019 xnn_params.f32.rndd = (struct vunary_parameters) {
4020 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__avx512f_x16,
4021 .element_tile = 16,
4022 };
Marat Dukhan48976702022-01-10 18:18:04 -08004023 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhan0e801372022-01-04 00:10:41 -08004024 xnn_params.f32.rndne = (struct vunary_parameters) {
4025 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__avx_x16,
4026 .init.f32_rnd = xnn_init_f32_rnd_avx_params,
4027 .element_tile = 16,
4028 };
4029 xnn_params.f32.rndz = (struct vunary_parameters) {
4030 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__avx_x16,
4031 .init.f32_rnd = xnn_init_f32_rnd_avx_params,
4032 .element_tile = 16,
4033 };
4034 xnn_params.f32.rndu = (struct vunary_parameters) {
4035 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__avx_x16,
4036 .init.f32_rnd = xnn_init_f32_rnd_avx_params,
4037 .element_tile = 16,
4038 };
4039 xnn_params.f32.rndd = (struct vunary_parameters) {
4040 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__avx_x16,
4041 .init.f32_rnd = xnn_init_f32_rnd_avx_params,
4042 .element_tile = 16,
4043 };
Marat Dukhan64e52512020-06-09 13:41:16 -07004044 } else if (cpuinfo_has_x86_sse4_1()) {
Marat Dukhan0e801372022-01-04 00:10:41 -08004045 xnn_params.f32.rndne = (struct vunary_parameters) {
4046 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__sse41_x8,
4047 .element_tile = 8,
4048 };
4049 xnn_params.f32.rndz = (struct vunary_parameters) {
4050 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__sse41_x8,
4051 .element_tile = 8,
4052 };
4053 xnn_params.f32.rndu = (struct vunary_parameters) {
4054 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__sse41_x8,
4055 .element_tile = 8,
4056 };
4057 xnn_params.f32.rndd = (struct vunary_parameters) {
4058 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__sse41_x8,
4059 .element_tile = 8,
4060 };
Marat Dukhan64e52512020-06-09 13:41:16 -07004061 } else {
Marat Dukhan0e801372022-01-04 00:10:41 -08004062 xnn_params.f32.rndne = (struct vunary_parameters) {
4063 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__sse2_x8,
4064 .init.f32_rnd = xnn_init_f32_rnd_sse2_params,
4065 .element_tile = 8,
4066 };
4067 xnn_params.f32.rndz = (struct vunary_parameters) {
4068 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__sse2_x8,
4069 .init.f32_rnd = xnn_init_f32_rnd_sse2_params,
4070 .element_tile = 8,
4071 };
4072 xnn_params.f32.rndu = (struct vunary_parameters) {
4073 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__sse2_x8,
4074 .init.f32_rnd = xnn_init_f32_rnd_sse2_params,
4075 .element_tile = 8,
4076 };
4077 xnn_params.f32.rndd = (struct vunary_parameters) {
4078 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__sse2_x8,
4079 .init.f32_rnd = xnn_init_f32_rnd_sse2_params,
4080 .element_tile = 8,
4081 };
Marat Dukhan64e52512020-06-09 13:41:16 -07004082 }
// f32 sigmoid: pick the kernel by CPU feature, checked in this order:
//   1. AVX512F, only when XNN_PLATFORM_MOBILE is false (x64 kernel)
//   2. AVX2 (x40 kernel)
//   3. AVX (x40 kernel)
//   4. SSE4.1 (x8 kernel)
//   5. otherwise the kernel named ...__sse2_rr2_lut64_p2_div_x8
// NOTE(review): the SSE4.1 branch registers the sse2 params initializer
// (xnn_init_f32_sigmoid_sse2_rr2_lut64_p2_params), the same one as the final
// fallback. Presumably both kernels share one params layout; confirm.
Marat Dukhand9ca7e62020-09-23 23:45:29 -07004083 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
Marat Dukhance834ad2022-01-03 00:22:01 -08004084 xnn_params.f32.sigmoid = (struct vunary_parameters) {
4085 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__avx512f_rr2_lut32_p2_perm2_scalef_div_x64,
4086 .init.f32_sigmoid = xnn_init_f32_sigmoid_avx512_rr2_lut32_p2_params,
4087 .element_tile = 64,
4088 };
Marat Dukhan48976702022-01-10 18:18:04 -08004089 } else if (cpuinfo_has_x86_avx2()) {
Marat Dukhance834ad2022-01-03 00:22:01 -08004090 xnn_params.f32.sigmoid = (struct vunary_parameters) {
4091 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_div_x40,
4092 .init.f32_sigmoid = xnn_init_f32_sigmoid_avx2_rr1_p5_params,
4093 .element_tile = 40,
4094 };
Marat Dukhan48976702022-01-10 18:18:04 -08004095 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhance834ad2022-01-03 00:22:01 -08004096 xnn_params.f32.sigmoid = (struct vunary_parameters) {
4097 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__avx_rr2_p5_nr2_x40,
4098 .init.f32_sigmoid = xnn_init_f32_sigmoid_avx_rr2_p5_params,
4099 .element_tile = 40,
4100 };
Marat Dukhan6dd71362020-09-17 23:11:21 -07004101 } else if (cpuinfo_has_x86_sse4_1()) {
Marat Dukhance834ad2022-01-03 00:22:01 -08004102 xnn_params.f32.sigmoid = (struct vunary_parameters) {
4103 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__sse41_rr2_lut64_p2_div_x8,
4104 .init.f32_sigmoid = xnn_init_f32_sigmoid_sse2_rr2_lut64_p2_params,
4105 .element_tile = 8,
4106 };
Marat Dukhanfa0a4322020-01-06 16:14:29 -08004107 } else {
Marat Dukhance834ad2022-01-03 00:22:01 -08004108 xnn_params.f32.sigmoid = (struct vunary_parameters) {
4109 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__sse2_rr2_lut64_p2_div_x8,
4110 .init.f32_sigmoid = xnn_init_f32_sigmoid_sse2_rr2_lut64_p2_params,
4111 .element_tile = 8,
4112 };
Marat Dukhanfa0a4322020-01-06 16:14:29 -08004113 }
// f32 square: pick the kernel by CPU feature, checked in this order:
//   1. AVX512F, only when XNN_PLATFORM_MOBILE is false (x16 kernel)
//   2. AVX (x16 kernel)
//   3. otherwise the kernel named ...__sse_x8
// Only the AVX branch names a params initializer (.init.f32_default); in the
// other two branches .init is left zeroed by the compound literal.
// NOTE(review): presumably only the AVX kernel reads params; confirm.
Marat Dukhan90eca0a2020-03-11 00:52:23 -07004114 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
Marat Dukhane5efb162021-12-31 10:26:13 -08004115 xnn_params.f32.sqr = (struct vunary_parameters) {
4116 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__avx512f_x16,
4117 .element_tile = 16,
4118 };
Marat Dukhan48976702022-01-10 18:18:04 -08004119 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhane5efb162021-12-31 10:26:13 -08004120 xnn_params.f32.sqr = (struct vunary_parameters) {
4121 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__avx_x16,
4122 .init.f32_default = xnn_init_f32_default_avx_params,
4123 .element_tile = 16,
4124 };
Marat Dukhan5020b962020-06-08 13:30:10 -07004125 } else {
Marat Dukhane5efb162021-12-31 10:26:13 -08004126 xnn_params.f32.sqr = (struct vunary_parameters) {
4127 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__sse_x8,
4128 .element_tile = 8,
4129 };
Marat Dukhan5020b962020-06-08 13:30:10 -07004130 }
// f32 square root: two-way choice with no AVX512F branch, unlike the
// neighbouring unary operators.
//   1. AVX (x8 kernel, with .init.f32_sqrt = xnn_init_f32_sqrt_avx_params)
//   2. otherwise the kernel named ...__sse_sqrt_x4 (no .init named, left zeroed)
Marat Dukhan48976702022-01-10 18:18:04 -08004131 if (cpuinfo_has_x86_avx()) {
Marat Dukhane72b2822021-12-30 14:46:58 -08004132 xnn_params.f32.sqrt = (struct vunary_parameters) {
4133 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__avx_sqrt_x8,
4134 .init.f32_sqrt = xnn_init_f32_sqrt_avx_params,
4135 .element_tile = 8,
4136 };
Marat Dukhan6804bbd2020-06-30 19:26:11 -07004137 } else {
Marat Dukhane72b2822021-12-30 14:46:58 -08004138 xnn_params.f32.sqrt = (struct vunary_parameters) {
4139 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__sse_sqrt_x4,
4140 .element_tile = 4,
4141 };
Marat Dukhan6804bbd2020-06-30 19:26:11 -07004142 }
// f32 PReLU: pick the kernel by CPU feature, checked in this order:
//   1. AVX512F, only when XNN_PLATFORM_MOBILE is false (2x16 kernel)
//   2. AVX (2x16 kernel)
//   3. SSE4.1 (2x8 kernel)
//   4. otherwise the kernel named ...__sse2_2x8
// row_tile and channel_tile match the RxC suffix of the selected kernel.
// No params initializer is registered in any branch.
Marat Dukhan5020b962020-06-08 13:30:10 -07004143 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
Marat Dukhan90eca0a2020-03-11 00:52:23 -07004144 xnn_params.f32.prelu = (struct prelu_parameters) {
4145 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__avx512f_2x16,
4146 .row_tile = 2,
4147 .channel_tile = 16,
4148 };
Marat Dukhan48976702022-01-10 18:18:04 -08004149 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhan90eca0a2020-03-11 00:52:23 -07004150 xnn_params.f32.prelu = (struct prelu_parameters) {
4151 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__avx_2x16,
4152 .row_tile = 2,
4153 .channel_tile = 16,
4154 };
Marat Dukhan39b5e942020-06-24 15:03:48 -07004155 } else if (cpuinfo_has_x86_sse4_1()) {
4156 xnn_params.f32.prelu = (struct prelu_parameters) {
4157 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__sse41_2x8,
4158 .row_tile = 2,
4159 .channel_tile = 8,
4160 };
Marat Dukhan90eca0a2020-03-11 00:52:23 -07004161 } else {
4162 xnn_params.f32.prelu = (struct prelu_parameters) {
4163 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__sse2_2x8,
4164 .row_tile = 2,
4165 .channel_tile = 8,
4166 };
4167 }
// f32 raddstoreexpminusmax parameters: a single kernel named
// ...__sse2_rr2_p5_x20_acc2, with no CPU-feature branch visible around it.
// Unlike the neighbouring entries, .ukernel is assigned without a cast and the
// initializer goes in plain .init. element_tile = 20 matches the x20 suffix.
Marat Dukhan4a5c7712022-01-05 22:43:13 -08004168 xnn_params.f32.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
4169 .ukernel = xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_x20_acc2,
4170 .init = xnn_init_f32_expminus_sse2_rr2_p5_params,
4171 .element_tile = 20,
4172 };
// f32 rmax: the kernel named ...__sse is stored directly in the field, with no
// parameter struct, tile size or CPU-feature branch.
Marat Dukhan1edc4542020-01-27 12:40:13 -08004173 xnn_params.f32.rmax = xnn_f32_rmax_ukernel__sse;
Marat Dukhan9a88efe2019-12-10 15:54:24 -08004174 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
4175 xnn_params.f32.vadd = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07004176 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__avx512f_x32,
4177 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__avx512f_x32,
4178 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__avx512f_x32,
Marat Dukhanf6004972021-12-30 11:23:02 -08004179 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08004180 .element_tile = 32,
4181 };
4182 xnn_params.f32.vdiv = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07004183 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__avx512f_x32,
4184 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__avx512f_x32,
4185 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__avx512f_x32,
Marat Dukhanf6004972021-12-30 11:23:02 -08004186 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08004187 .element_tile = 32,
4188 };
4189 xnn_params.f32.vmax = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07004190 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__avx512f_x32,
4191 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__avx512f_x32,
4192 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__avx512f_x32,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08004193 .element_tile = 32,
4194 };
4195 xnn_params.f32.vmin = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07004196 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__avx512f_x32,
4197 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__avx512f_x32,
4198 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__avx512f_x32,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08004199 .element_tile = 32,
4200 };
4201 xnn_params.f32.vmul = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07004202 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__avx512f_x32,
4203 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__avx512f_x32,
4204 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__avx512f_x32,
Marat Dukhanf6004972021-12-30 11:23:02 -08004205 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08004206 .element_tile = 32,
4207 };
4208 xnn_params.f32.vsub = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07004209 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__avx512f_x32,
4210 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__avx512f_x32,
4211 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__avx512f_x32,
Marat Dukhanf6004972021-12-30 11:23:02 -08004212 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08004213 .element_tile = 32,
4214 };
Marat Dukhanf7399262020-06-05 10:58:44 -07004215 xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07004216 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__avx512f_x32,
4217 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__avx512f_x32,
4218 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__avx512f_x32,
Marat Dukhanf7399262020-06-05 10:58:44 -07004219 .element_tile = 32,
4220 };
Marat Dukhan48976702022-01-10 18:18:04 -08004221 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhan9a88efe2019-12-10 15:54:24 -08004222 xnn_params.f32.vadd = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07004223 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__avx_x16,
4224 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__avx_x16,
4225 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__avx_x16,
Marat Dukhanf6004972021-12-30 11:23:02 -08004226 .init.f32_minmax = xnn_init_f32_minmax_avx_params,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08004227 .element_tile = 16,
4228 };
4229 xnn_params.f32.vdiv = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07004230 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__avx_x16,
4231 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__avx_x16,
4232 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__avx_x16,
Marat Dukhanf6004972021-12-30 11:23:02 -08004233 .init.f32_minmax = xnn_init_f32_minmax_avx_params,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08004234 .element_tile = 16,
4235 };
4236 xnn_params.f32.vmax = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07004237 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__avx_x16,
4238 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__avx_x16,
4239 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__avx_x16,
Marat Dukhan98c52152021-12-30 13:31:00 -08004240 .init.f32_default = xnn_init_f32_default_avx_params,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08004241 .element_tile = 16,
4242 };
4243 xnn_params.f32.vmin = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07004244 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__avx_x16,
4245 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__avx_x16,
4246 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__avx_x16,
Marat Dukhan98c52152021-12-30 13:31:00 -08004247 .init.f32_default = xnn_init_f32_default_avx_params,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08004248 .element_tile = 16,
4249 };
4250 xnn_params.f32.vmul = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07004251 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__avx_x16,
4252 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__avx_x16,
4253 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__avx_x16,
Marat Dukhanf6004972021-12-30 11:23:02 -08004254 .init.f32_minmax = xnn_init_f32_minmax_avx_params,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08004255 .element_tile = 16,
4256 };
4257 xnn_params.f32.vsub = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07004258 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__avx_x16,
4259 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__avx_x16,
4260 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__avx_x16,
Marat Dukhanf6004972021-12-30 11:23:02 -08004261 .init.f32_minmax = xnn_init_f32_minmax_avx_params,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08004262 .element_tile = 16,
4263 };
Marat Dukhanf7399262020-06-05 10:58:44 -07004264 xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07004265 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__avx_x16,
4266 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__avx_x16,
4267 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__avx_x16,
Marat Dukhan98c52152021-12-30 13:31:00 -08004268 .init.f32_default = xnn_init_f32_default_avx_params,
Marat Dukhanf7399262020-06-05 10:58:44 -07004269 .element_tile = 16,
4270 };
Marat Dukhan9a88efe2019-12-10 15:54:24 -08004271 } else {
4272 xnn_params.f32.vadd = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07004273 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__sse_x8,
4274 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__sse_x8,
4275 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__sse_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -08004276 .init.f32_minmax = xnn_init_f32_minmax_sse_params,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08004277 .element_tile = 8,
4278 };
4279 xnn_params.f32.vdiv = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07004280 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__sse_x8,
4281 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__sse_x8,
4282 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__sse_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -08004283 .init.f32_minmax = xnn_init_f32_minmax_sse_params,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08004284 .element_tile = 8,
4285 };
4286 xnn_params.f32.vmax = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07004287 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__sse_x8,
4288 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__sse_x8,
4289 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__sse_x8,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08004290 .element_tile = 8,
4291 };
4292 xnn_params.f32.vmin = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07004293 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__sse_x8,
4294 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__sse_x8,
4295 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__sse_x8,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08004296 .element_tile = 8,
4297 };
4298 xnn_params.f32.vmul = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07004299 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__sse_x8,
4300 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__sse_x8,
4301 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__sse_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -08004302 .init.f32_minmax = xnn_init_f32_minmax_sse_params,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08004303 .element_tile = 8,
4304 };
4305 xnn_params.f32.vsub = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07004306 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__sse_x8,
4307 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__sse_x8,
4308 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__sse_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -08004309 .init.f32_minmax = xnn_init_f32_minmax_sse_params,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08004310 .element_tile = 8,
4311 };
Marat Dukhanf7399262020-06-05 10:58:44 -07004312 xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07004313 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__sse_x8,
4314 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__sse_x8,
4315 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__sse_x8,
Marat Dukhanf7399262020-06-05 10:58:44 -07004316 .element_tile = 8,
4317 };
Marat Dukhan9a88efe2019-12-10 15:54:24 -08004318 }
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004319 xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
Marat Dukhan9531e9f2020-07-24 15:25:02 -07004320 .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c4__sse_2x,
Marat Dukhanc58453f2021-05-24 14:47:38 -07004321 .init.f32 = xnn_init_f32_minmax_sse_params,
Marat Dukhan49e6ee92019-11-06 15:55:29 -08004322 .channel_tile = 4,
4323 .row_tile = 2,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004324 };
Marat Dukhanefc47b82019-11-18 09:25:38 -08004325 #ifndef XNN_NO_NCHW_OPERATORS
Marat Dukhancfbed0a2020-12-08 10:01:51 -08004326 // Sparse microkernels on x86 currently target only SSE, and on processors
4327 // with AVX ISA dense inference is expected to be faster than sparse.
4328 if (!cpuinfo_has_x86_avx()) {
4329 init_flags |= XNN_INIT_FLAG_CHW_OPT;
4330 }
4331
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004332 xnn_params.f32.spmm = (struct spmm_parameters) {
Frank Barchard4fd38b22020-10-30 17:10:11 -07004333 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x1__sse,
4334 .mr = 32,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004335 .nr = 1,
4336 };
Erich Elsen5b2e07a2020-06-09 03:27:59 -07004337 xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
4338 .ukernel_with_symm_padding =
4339 (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2,
4340 .output_channel_tile = 4,
4341 .output_height_tile = 2,
4342 .output_width_tile = 2,
4343 };
Marat Dukhan48976702022-01-10 18:18:04 -08004344 if (cpuinfo_has_x86_ssse3()) {
Frank Barchard0b18cb32020-11-23 10:50:44 -08004345 xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
4346 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_2x4_acc2,
Frank Barchard0b18cb32020-11-23 10:50:44 -08004347 .output_width_tile = 4,
4348 .output_height_tile = 2,
4349 };
4350 } else {
4351 xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
4352 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_2x4_acc2,
Frank Barchard0b18cb32020-11-23 10:50:44 -08004353 .output_width_tile = 4,
4354 .output_height_tile = 2,
4355 };
4356 }
Marat Dukhanbf715f92020-10-23 20:17:00 -07004357 xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
4358 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__sse_1x4_acc3,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004359 .output_width_tile = 4,
4360 .output_height_tile = 1,
4361 };
Marat Dukhand0503892020-10-30 08:22:04 -07004362 xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
4363 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4,
Marat Dukhand0503892020-10-30 08:22:04 -07004364 .output_width_tile = 4,
4365 .output_height_tile = 4,
4366 };
Marat Dukhanccca2142020-10-30 17:32:45 -07004367 xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
4368 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4,
Marat Dukhanccca2142020-10-30 17:32:45 -07004369 .output_width_tile = 4,
4370 .output_height_tile = 2,
4371 };
Marat Dukhan1f29b802020-05-15 23:46:39 -07004372 xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
4373 .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__sse_x4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004374 .channel_tile = 4,
4375 };
Artsiom Ablavatski97918102020-10-27 15:52:59 -07004376 xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
Artsiom Ablavatskib3ffd582021-03-31 13:00:08 -07004377 .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__sse_p8,
Artsiom Ablavatski97918102020-10-27 15:52:59 -07004378 .channel_tile = 1,
Artsiom Ablavatskib3ffd582021-03-31 13:00:08 -07004379 .pixel_tile = 8,
Artsiom Ablavatski97918102020-10-27 15:52:59 -07004380 };
Marat Dukhanefc47b82019-11-18 09:25:38 -08004381 #endif // XNN_NO_NCHW_OPERATORS
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004382 #endif // XNN_NO_F32_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07004383
Frank Barchardb40ee632021-12-30 11:10:02 -08004384 /*************************** VCVT x86 micro-kernels ***************************/
Marat Dukhanaf2ba002021-10-24 14:21:41 -07004385 #ifndef XNN_NO_VCVT_OPERATORS
// Set the VCVT bit in init_flags, then fill xnn_params.vcvt with one kernel per conversion.
4386 init_flags |= XNN_INIT_FLAG_VCVT;
4387
// f16 <-> f32: the first matching tier wins, tested in the order
// AVX512 (F+BW+DQ+VL), F16C, AVX, SSE4.1, and finally the SSE2 kernels.
4388 if (cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
// Neither entry in this tier registers an .init callback.
Marat Dukhan134f9842021-12-29 19:57:31 -08004389 xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
4390 .ukernel = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__avx512skx_x16,
4391 .element_tile = 16,
4392 };
Marat Dukhanb7c1b712021-12-30 07:23:57 -08004393 xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
4394 .ukernel = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__avx512skx_x16,
4395 .element_tile = 16,
4396 };
Marat Dukhanaf2ba002021-10-24 14:21:41 -07004397 } else if (cpuinfo_has_x86_f16c()) {
// Only the f32 -> f16 direction registers an .init callback in the F16C tier.
Marat Dukhan134f9842021-12-29 19:57:31 -08004398 xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
4399 .ukernel = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__f16c_x16,
4400 .element_tile = 16,
4401 };
Marat Dukhanb7c1b712021-12-30 07:23:57 -08004402 xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
4403 .ukernel = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__f16c_x16,
4404 .init.f32_f16_cvt = xnn_init_f32_f16_cvt_f16c_params,
4405 .element_tile = 16,
4406 };
Marat Dukhanaf2ba002021-10-24 14:21:41 -07004407 } else if (cpuinfo_has_x86_avx()) {
// The AVX kernels are paired with the sse_int16 / sse2 parameter initializers.
Marat Dukhan134f9842021-12-29 19:57:31 -08004408 xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
4409 .ukernel = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__avx_int16_x16,
4410 .init.f16_f32_cvt = xnn_init_f16_f32_cvt_sse_int16_params,
4411 .element_tile = 16,
4412 };
Marat Dukhanb7c1b712021-12-30 07:23:57 -08004413 xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
4414 .ukernel = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__avx_x24,
4415 .init.f32_f16_cvt = xnn_init_f32_f16_cvt_sse2_params,
4416 .element_tile = 24,
4417 };
Marat Dukhanaf2ba002021-10-24 14:21:41 -07004418 } else if (cpuinfo_has_x86_sse4_1()) {
Marat Dukhan134f9842021-12-29 19:57:31 -08004419 xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
4420 .ukernel = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__sse41_int16_x16,
4421 .init.f16_f32_cvt = xnn_init_f16_f32_cvt_sse_int16_params,
4422 .element_tile = 16,
4423 };
Marat Dukhanb7c1b712021-12-30 07:23:57 -08004424 xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
4425 .ukernel = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__sse41_x8,
4426 .init.f32_f16_cvt = xnn_init_f32_f16_cvt_sse2_params,
4427 .element_tile = 8,
4428 };
Marat Dukhanaf2ba002021-10-24 14:21:41 -07004429 } else {
// Fallback when none of the checks above succeed: SSE2 kernels.
Marat Dukhan134f9842021-12-29 19:57:31 -08004430 xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
4431 .ukernel = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__sse2_int16_x32,
4432 .init.f16_f32_cvt = xnn_init_f16_f32_cvt_sse_int16_params,
4433 .element_tile = 32,
4434 };
Marat Dukhanb7c1b712021-12-30 07:23:57 -08004435 xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
4436 .ukernel = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__sse2_x16,
4437 .init.f32_f16_cvt = xnn_init_f32_f16_cvt_sse2_params,
4438 .element_tile = 16,
4439 };
Marat Dukhanaf2ba002021-10-24 14:21:41 -07004440 }
// f32 -> qs8: tiers tested in the order AVX512 (F+BW+DQ+VL), AVX2, AVX, SSE4.1, else SSE2.
Marat Dukhan2edf8632021-12-14 23:17:14 -08004441 if (cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
4442 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
4443 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__avx512skx_x128,
4444 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_avx512_params,
4445 .element_tile = 128,
4446 };
4447 } else if (cpuinfo_has_x86_avx2()) {
Marat Dukhan0d399ca2021-12-14 19:25:50 -08004448 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
4449 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__avx2_x64,
4450 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_avx2_params,
4451 .element_tile = 64,
4452 };
4453 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhanb91432c2021-12-14 16:52:09 -08004454 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
4455 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__avx_x32,
4456 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_avx_params,
4457 .element_tile = 32,
4458 };
4459 } else if (cpuinfo_has_x86_sse4_1()) {
Marat Dukhaned2d7762021-12-03 23:51:19 -08004460 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
4461 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__sse41_x32,
4462 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_sse4_params,
4463 .element_tile = 32,
4464 };
4465 } else {
4466 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
4467 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__sse2_x32,
4468 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_sse2_params,
4469 .element_tile = 32,
4470 };
4471 }
// f32 -> qu8: same cascade as f32 -> qs8 except that there is no SSE4.1 branch;
// anything without AVX falls through to the SSE2 kernel.
Marat Dukhan2edf8632021-12-14 23:17:14 -08004472 if (cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
4473 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
4474 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__avx512skx_x128,
4475 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_avx512_params,
4476 .element_tile = 128,
4477 };
4478 } else if (cpuinfo_has_x86_avx2()) {
Marat Dukhan0d399ca2021-12-14 19:25:50 -08004479 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
4480 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__avx2_x64,
4481 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_avx2_params,
4482 .element_tile = 64,
4483 };
4484 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhanb91432c2021-12-14 16:52:09 -08004485 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
4486 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__avx_x32,
4487 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_avx_params,
4488 .element_tile = 32,
4489 };
4490 } else {
4491 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
4492 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__sse2_x32,
4493 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_sse2_params,
4494 .element_tile = 32,
4495 };
4496 }
// qs8 -> f32 and qu8 -> f32 are selected together, one pair per tier:
// AVX512 (F+BW+DQ+VL), AVX2, AVX, SSE4.1, else SSE2.
Marat Dukhan98393ad2021-12-15 11:07:40 -08004497 if (cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
4498 xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
4499 .ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__avx512skx_x32,
4500 .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_avx512_params,
4501 .element_tile = 32,
4502 };
4503 xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
4504 .ukernel = (xnn_univector_ukernel_function) xnn_qu8_f32_vcvt_ukernel__avx512skx_x32,
4505 .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_avx512_params,
4506 .element_tile = 32,
4507 };
4508 } else if (cpuinfo_has_x86_avx2()) {
// The AVX2 kernels are paired with the *_avx_params initializers.
Marat Dukhan7b5f7792021-12-15 00:29:39 -08004509 xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
4510 .ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__avx2_x16,
4511 .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_avx_params,
4512 .element_tile = 16,
4513 };
4514 xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
4515 .ukernel = (xnn_univector_ukernel_function) xnn_qu8_f32_vcvt_ukernel__avx2_x16,
4516 .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_avx_params,
4517 .element_tile = 16,
4518 };
4519 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhancd4089f2021-12-14 23:53:33 -08004520 xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
4521 .ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__avx_x32,
4522 .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_avx_params,
4523 .element_tile = 32,
4524 };
4525 xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
4526 .ukernel = (xnn_univector_ukernel_function) xnn_qu8_f32_vcvt_ukernel__avx_x32,
4527 .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_avx_params,
4528 .element_tile = 32,
4529 };
4530 } else if (cpuinfo_has_x86_sse4_1()) {
Marat Dukhanf92206b2021-12-10 17:02:07 -08004531 xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
4532 .ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__sse41_x16,
4533 .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_sse4_params,
4534 .element_tile = 16,
4535 };
4536 xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
4537 .ukernel = (xnn_univector_ukernel_function) xnn_qu8_f32_vcvt_ukernel__sse41_x16,
4538 .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_sse4_params,
4539 .element_tile = 16,
4540 };
4541 } else {
4542 xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
4543 .ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__sse2_x32,
4544 .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_sse2_params,
4545 .element_tile = 32,
4546 };
4547 xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
4548 .ukernel = (xnn_univector_ukernel_function) xnn_qu8_f32_vcvt_ukernel__sse2_x32,
4549 .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_sse2_params,
4550 .element_tile = 32,
4551 };
4552 }
Marat Dukhanaf2ba002021-10-24 14:21:41 -07004553 #endif // XNN_NO_VCVT_OPERATORS
4554
Frank Barchardb40ee632021-12-30 11:10:02 -08004555 /**************************** X32 x86 micro-kernels ****************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004556 #ifndef XNN_NO_X32_OPERATORS
// Set the X32 bit in init_flags and register the X32 kernels unconditionally
// (no cpuinfo checks are made in this section).
Marat Dukhan854fb6b2020-06-19 12:33:44 -07004557 init_flags |= XNN_INIT_FLAG_X32;
4558
Marat Dukhan57dccd82020-04-14 00:53:10 -07004559 xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__sse2;
// zip: dedicated SSE2 kernels for 2, 3 and 4 streams, plus the xm kernel
// (registered with the zipv function type rather than zipc).
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004560 xnn_params.x32.zip = (struct zip_parameters) {
4561 .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__sse2,
4562 .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__sse2,
4563 .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__sse2,
4564 .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__sse2,
4565 };
Artsiom Ablavatskibbe85062020-11-05 14:07:37 -08004566 #ifndef XNN_NO_NCHW_OPERATORS
// depthtospace2d_chw2hwc is served by the scalar kernel here, with 1x1 tiles.
Marat Dukhanad71b9a2020-11-20 00:01:51 -08004567 xnn_params.x32.depthtospace2d_chw2hwc = (struct depthtospace2d_chw2hwc_parameters) {
4568 .ukernel = (xnn_depthtospace2d_chw2hwc_ukernel_function) xnn_x32_depthtospace2d_chw2hwc_ukernel__scalar,
Artsiom Ablavatskibbe85062020-11-05 14:07:37 -08004569 .channel_tile = 1,
4570 .pixel_tile = 1,
4571 };
4572 #endif // XNN_NO_NCHW_OPERATORS
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004573 #endif // XNN_NO_X32_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07004574
Frank Barchardb40ee632021-12-30 11:10:02 -08004575 /**************************** XX x86 micro-kernels ****************************/
Marat Dukhan048931b2020-11-24 20:53:54 -08004576 #ifndef XNN_NO_XX_OPERATORS
// Set the XX bit in init_flags and register the copy, fill and pad kernels.
4577 init_flags |= XNN_INIT_FLAG_XX;
4578
// copy uses the memcpy-based kernel; fill and pad use SSE2 kernels with row_tile 1.
4579 xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
Marat Dukhan933051b2021-08-07 16:26:15 -07004580 xnn_params.xx.fill = (struct fill_parameters) {
4581 .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__sse2_x64,
4582 .row_tile = 1,
4583 };
Marat Dukhan0461f2d2021-08-08 12:36:29 -07004584 xnn_params.xx.pad = (struct pad_parameters) {
4585 .ukernel = (xnn_pad_ukernel_function) xnn_xx_pad_ukernel__sse2,
4586 .row_tile = 1,
4587 };
Marat Dukhan048931b2020-11-24 20:53:54 -08004588 #endif // XNN_NO_XX_OPERATORS
4589
Marat Dukhan4c617792021-12-21 15:47:58 -08004590#elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
Marat Dukhan933051b2021-08-07 16:26:15 -07004591
Frank Barchardb40ee632021-12-30 11:10:02 -08004592 /**************************** QC8 WAsm SIMD micro-kernels ****************************/
// This section sets XNN_INIT_FLAG_QC8 and fills only xnn_params.qc8, so it is
// guarded by XNN_NO_QC8_OPERATORS, the macro named on the closing #endif.
Marat Dukhan898d5852021-06-30 21:18:34 -07004593 #ifndef XNN_NO_QC8_OPERATORS
4594 init_flags |= XNN_INIT_FLAG_QC8;
4595
// XNN_WASMSIMD_VERSION >= 88 selects the 4x4c2s4 dot16x2 GEMM/IGEMM kernels
// (mr = 4, log2_kr = 1, log2_sr = 2); otherwise the 3x4c8 mul16 kernels are
// used (mr = 3, log2_kr = 3) and log2_sr is not assigned.
Marat Dukhan189c1d02021-09-03 15:39:54 -07004596 #if defined(XNN_WASMSIMD_VERSION) && (XNN_WASMSIMD_VERSION >= 88)
Marat Dukhan58cdcf22022-02-01 02:05:00 -08004597 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128);
4598 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128);
4599 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128);
4600 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128);
Marat Dukhan189c1d02021-09-03 15:39:54 -07004601 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_wasmsimd_params;
Marat Dukhan0f1ed942021-12-08 23:25:50 -08004602 xnn_params.qc8.gemm.mr = 4;
Marat Dukhan189c1d02021-09-03 15:39:54 -07004603 xnn_params.qc8.gemm.nr = 4;
4604 xnn_params.qc8.gemm.log2_kr = 1;
Marat Dukhan0f1ed942021-12-08 23:25:50 -08004605 xnn_params.qc8.gemm.log2_sr = 2;
Marat Dukhan189c1d02021-09-03 15:39:54 -07004606 #else // XNN_WASMSIMD_VERSION >= 88
4607 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64);
4608 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64);
4609 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64);
4610 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64);
4611 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_wasmsimd_params;
4612 xnn_params.qc8.gemm.mr = 3;
4613 xnn_params.qc8.gemm.nr = 4;
4614 xnn_params.qc8.gemm.log2_kr = 3;
4615 #endif // XNN_WASMSIMD_VERSION >= 88
Marat Dukhan898d5852021-06-30 21:18:34 -07004616
// Depthwise convolution: slot 0 holds the 9-tap kernel, slot 1 the 25-tap
// kernel; both are unipass with a channel tile of 16.
Marat Dukhan9cedb592021-08-17 17:25:24 -07004617 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__wasmsimd_mul16_add16;
Marat Dukhan898d5852021-06-30 21:18:34 -07004618 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_wasmsimd_params;
Marat Dukhan9cedb592021-08-17 17:25:24 -07004619 xnn_params.qc8.dwconv[0].channel_tile = 16;
Marat Dukhan898d5852021-06-30 21:18:34 -07004620 xnn_params.qc8.dwconv[0].primary_tile = 9;
Marat Dukhan9cedb592021-08-17 17:25:24 -07004621 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x25__wasmsimd_mul16_add16;
Marat Dukhan898d5852021-06-30 21:18:34 -07004622 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_wasmsimd_params;
Marat Dukhan9cedb592021-08-17 17:25:24 -07004623 xnn_params.qc8.dwconv[1].channel_tile = 16;
Marat Dukhan898d5852021-06-30 21:18:34 -07004624 xnn_params.qc8.dwconv[1].primary_tile = 25;
4625 #endif // XNN_NO_QC8_OPERATORS
4626
Frank Barchardb40ee632021-12-30 11:10:02 -08004627 /**************************** QS8 WAsm SIMD micro-kernels ****************************/
Marat Dukhan07e50402020-08-05 17:16:53 -07004628 #ifndef XNN_NO_QS8_OPERATORS
// Set the QS8 bit in init_flags, then register GEMM/IGEMM, depthwise
// convolution, global average pooling, and element-wise add/mul kernels.
4629 init_flags |= XNN_INIT_FLAG_QS8;
4630
// XNN_WASMSIMD_VERSION >= 88 selects the 4x4c2s4 dot16x2 GEMM/IGEMM kernels
// (mr = 4, log2_kr = 1, log2_sr = 2); otherwise the 3x4c8 mul16 kernels are
// used (mr = 3, log2_kr = 3) and log2_sr is not assigned.
Marat Dukhan189c1d02021-09-03 15:39:54 -07004631 #if defined(XNN_WASMSIMD_VERSION) && (XNN_WASMSIMD_VERSION >= 88)
Marat Dukhan58cdcf22022-02-01 02:05:00 -08004632 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128);
4633 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128);
4634 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128);
4635 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128);
Marat Dukhan189c1d02021-09-03 15:39:54 -07004636 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_wasmsimd_params;
Marat Dukhan0f1ed942021-12-08 23:25:50 -08004637 xnn_params.qs8.gemm.mr = 4;
Marat Dukhan189c1d02021-09-03 15:39:54 -07004638 xnn_params.qs8.gemm.nr = 4;
4639 xnn_params.qs8.gemm.log2_kr = 1;
Marat Dukhan0f1ed942021-12-08 23:25:50 -08004640 xnn_params.qs8.gemm.log2_sr = 2;
Marat Dukhan189c1d02021-09-03 15:39:54 -07004641 #else // XNN_WASMSIMD_VERSION >= 88
4642 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64);
4643 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64);
4644 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64);
4645 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64);
4646 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_wasmsimd_params;
4647 xnn_params.qs8.gemm.mr = 3;
4648 xnn_params.qs8.gemm.nr = 4;
4649 xnn_params.qs8.gemm.log2_kr = 3;
4650 #endif // XNN_WASMSIMD_VERSION >= 88
Marat Dukhan07e50402020-08-05 17:16:53 -07004651
// Depthwise convolution: slot 0 holds the 9-tap kernel, slot 1 the 25-tap
// kernel; both are unipass with a channel tile of 16.
Marat Dukhan9cedb592021-08-17 17:25:24 -07004652 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__wasmsimd_mul16_add16;
Marat Dukhan400e7cb2021-08-07 15:14:54 -07004653 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_wasmsimd_params;
Marat Dukhan9cedb592021-08-17 17:25:24 -07004654 xnn_params.qs8.dwconv[0].channel_tile = 16;
Marat Dukhan07e50402020-08-05 17:16:53 -07004655 xnn_params.qs8.dwconv[0].primary_tile = 9;
Marat Dukhan9cedb592021-08-17 17:25:24 -07004656 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x25__wasmsimd_mul16_add16;
Marat Dukhan400e7cb2021-08-07 15:14:54 -07004657 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_wasmsimd_params;
Marat Dukhan9cedb592021-08-17 17:25:24 -07004658 xnn_params.qs8.dwconv[1].channel_tile = 16;
Marat Dukhan4ed14882021-05-12 17:50:40 -07004659 xnn_params.qs8.dwconv[1].primary_tile = 25;
Marat Dukhan9e0b5392020-08-07 02:29:34 -07004660
// Global average pooling registers a unipass and a multipass kernel, plus
// both an init and an update callback for the parameters.
4661 xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
Marat Dukhan9e258d62022-01-12 10:50:51 -08004662 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c16,
4663 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16,
Marat Dukhan53f41062022-01-11 19:44:57 -08004664 .init.qs8 = xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params,
4665 .update.qs8 = xnn_update_qs8_avgpool_minmax_fp32_wasmsimd_params,
Marat Dukhanda382d12022-01-07 19:51:20 -08004666 .row_tile = 7,
Marat Dukhan9e258d62022-01-12 10:50:51 -08004667 .channel_tile = 16,
Marat Dukhan9e0b5392020-08-07 02:29:34 -07004668 };
Marat Dukhanff209482020-09-03 14:26:53 -07004669
// vadd and vmul register the same "c" kernel for both opc_ukernel and ropc_ukernel.
4670 xnn_params.qs8.vadd = (struct vbinary_parameters) {
Marat Dukhane20a8732021-12-07 17:11:37 -08004671 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__wasmsimd_x32,
4672 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__wasmsimd_x32,
4673 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__wasmsimd_x32,
Marat Dukhan64287252021-09-07 16:20:03 -07004674 .init.qs8_addsub = xnn_init_qs8_add_minmax_wasmsimd_params,
Marat Dukhane20a8732021-12-07 17:11:37 -08004675 .element_tile = 32,
Marat Dukhanff209482020-09-03 14:26:53 -07004676 };
Marat Dukhan0853b8a2021-08-03 01:01:53 -07004677 xnn_params.qs8.vmul = (struct vbinary_parameters) {
4678 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_fp32_ukernel__wasmsimd_mul32_ld64_x8,
4679 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__wasmsimd_mul32_ld64_x8,
4680 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__wasmsimd_mul32_ld64_x8,
4681 .init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_wasmsimd_params,
4682 .element_tile = 8,
4683 };
Marat Dukhan07e50402020-08-05 17:16:53 -07004684 #endif // XNN_NO_QS8_OPERATORS
4685
Frank Barchardb40ee632021-12-30 11:10:02 -08004686 /**************************** QU8 WAsm SIMD micro-kernels ****************************/
Marat Dukhan08b7a972020-07-14 18:17:29 -07004687 #ifndef XNN_NO_QU8_OPERATORS
// Set the QU8 bit in init_flags, then register GEMM/IGEMM, depthwise
// convolution, average pooling, global average pooling, and add/mul kernels.
4688 init_flags |= XNN_INIT_FLAG_QU8;
Marat Dukhan854fb6b2020-06-19 12:33:44 -07004689
// XNN_WASMSIMD_VERSION >= 88 selects the 4x4c2s4 dot16x2 GEMM/IGEMM kernels
// (mr = 4, log2_kr = 1, log2_sr = 2); otherwise the 3x4c8 mul32 kernels are
// used (mr = 3, log2_kr = 3) and log2_sr is not assigned.
Marat Dukhan189c1d02021-09-03 15:39:54 -07004690 #if defined(XNN_WASMSIMD_VERSION) && (XNN_WASMSIMD_VERSION >= 88)
Marat Dukhan58cdcf22022-02-01 02:05:00 -08004691 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128);
4692 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128);
4693 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128);
4694 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128);
Marat Dukhan189c1d02021-09-03 15:39:54 -07004695 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_wasmsimd_params;
Marat Dukhan58cdcf22022-02-01 02:05:00 -08004696 xnn_params.qu8.gemm.mr = 4;
Marat Dukhan189c1d02021-09-03 15:39:54 -07004697 xnn_params.qu8.gemm.nr = 4;
4698 xnn_params.qu8.gemm.log2_kr = 1;
Marat Dukhan58cdcf22022-02-01 02:05:00 -08004699 xnn_params.qu8.gemm.log2_sr = 2;
Marat Dukhan189c1d02021-09-03 15:39:54 -07004700 #else // XNN_WASMSIMD_VERSION >= 88
4701 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64);
4702 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64);
4703 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64);
4704 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64);
4705 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_wasmsimd_params;
4706 xnn_params.qu8.gemm.mr = 3;
4707 xnn_params.qu8.gemm.nr = 4;
4708 xnn_params.qu8.gemm.log2_kr = 3;
4709 #endif // XNN_WASMSIMD_VERSION >= 88
Marat Dukhanaefaef32020-04-09 07:09:34 -07004710
// Depthwise convolution: slot 0 holds the 9-tap kernel, slot 1 the 25-tap
// kernel; both are unipass with a channel tile of 8.
Marat Dukhana97e9752021-07-15 16:30:41 -07004711 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__wasmsimd_mul16;
4712 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_wasmsimd_params;
4713 xnn_params.qu8.dwconv[0].channel_tile = 8;
Marat Dukhan08b7a972020-07-14 18:17:29 -07004714 xnn_params.qu8.dwconv[0].primary_tile = 9;
Marat Dukhana97e9752021-07-15 16:30:41 -07004715 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__wasmsimd_mul16;
4716 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_wasmsimd_params;
4717 xnn_params.qu8.dwconv[1].channel_tile = 8;
4718 xnn_params.qu8.dwconv[1].primary_tile = 25;
Marat Dukhanaefaef32020-04-09 07:09:34 -07004719
// Average pooling is registered with the scalar kernels and the scalar
// parameter initializer, unlike the other entries in this section.
Marat Dukhan08b7a972020-07-14 18:17:29 -07004720 xnn_params.qu8.avgpool = (struct avgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08004721 .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__scalar_c1,
4722 .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__scalar_c1,
4723 .init.qu8 = xnn_init_qu8_avgpool_minmax_scalar_params,
4724 .primary_tile = 9,
4725 .incremental_tile = 8,
4726 .channel_tile = 1,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004727 };
// Global average pooling registers a unipass and a multipass kernel, plus
// both an init and an update callback for the parameters.
Marat Dukhan08b7a972020-07-14 18:17:29 -07004728 xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
Marat Dukhand1f53e42022-01-12 22:34:51 -08004729 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c16,
4730 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16,
4731 .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params,
4732 .update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_wasmsimd_params,
Marat Dukhanda382d12022-01-07 19:51:20 -08004733 .row_tile = 7,
Marat Dukhand1f53e42022-01-12 22:34:51 -08004734 .channel_tile = 16,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004735 };
Marat Dukhandb007cd2021-07-20 23:42:39 -07004736
// vadd and vmul register the same "c" kernel for both opc_ukernel and ropc_ukernel.
4737 xnn_params.qu8.vadd = (struct vbinary_parameters) {
Marat Dukhane20a8732021-12-07 17:11:37 -08004738 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__wasmsimd_x32,
4739 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__wasmsimd_x32,
4740 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__wasmsimd_x32,
Marat Dukhan64287252021-09-07 16:20:03 -07004741 .init.qu8_addsub = xnn_init_qu8_add_minmax_wasmsimd_params,
Marat Dukhane20a8732021-12-07 17:11:37 -08004742 .element_tile = 32,
Marat Dukhandb007cd2021-07-20 23:42:39 -07004743 };
Marat Dukhan0853b8a2021-08-03 01:01:53 -07004744 xnn_params.qu8.vmul = (struct vbinary_parameters) {
4745 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_fp32_ukernel__wasmsimd_mul32_ld64_x8,
4746 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__wasmsimd_mul32_ld64_x8,
4747 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__wasmsimd_mul32_ld64_x8,
4748 .init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_wasmsimd_params,
4749 .element_tile = 8,
4750 };
Marat Dukhan08b7a972020-07-14 18:17:29 -07004751 #endif // XNN_NO_QU8_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07004752
Frank Barchardb40ee632021-12-30 11:10:02 -08004753 /**************************** S8 WAsm SIMD micro-kernels ****************************/
Marat Dukhandc5c1482021-08-16 09:03:15 -07004754 #ifndef XNN_NO_S8_OPERATORS
4755 init_flags |= XNN_INIT_FLAG_S8;
4756
Marat Dukhan61c0c9e2021-08-16 23:16:14 -07004757 xnn_params.s8.clamp = (struct vunary_parameters) {
4758 .ukernel = (xnn_univector_ukernel_function) xnn_s8_vclamp_ukernel__wasmsimd_x64,
4759 .init.s8_minmax = xnn_init_s8_minmax_wasmsimd_params,
4760 .element_tile = 64,
4761 };
Marat Dukhan24abe6b2021-11-24 15:28:57 -08004762 #if defined(XNN_WASMSIMD_VERSION) && (XNN_WASMSIMD_VERSION >= 88)
4763 xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
4764 .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__wasmsimd_dot16x2_c8,
4765 .pixel_tile = 1,
4766 .channel_tile = 8,
4767 };
4768 #else // XNN_WASMSIMD_VERSION >= 88
4769 xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
4770 .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__wasmsimd_mul32_c8,
4771 .pixel_tile = 1,
4772 .channel_tile = 8,
4773 };
4774 #endif // XNN_WASMSIMD_VERSION >= 88
Marat Dukhandc5c1482021-08-16 09:03:15 -07004775 xnn_params.s8.maxpool = (struct maxpool_parameters) {
4776 .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__wasmsimd_c16,
4777 .init.s8 = xnn_init_s8_minmax_wasmsimd_params,
4778 .mr = 9,
4779 .qr = 8,
4780 };
4781 #endif // XNN_NO_S8_OPERATORS
4782
Frank Barchardb40ee632021-12-30 11:10:02 -08004783 /**************************** U8 WAsm SIMD micro-kernels ****************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004784 #ifndef XNN_NO_U8_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07004785 init_flags |= XNN_INIT_FLAG_U8;
4786
Marat Dukhan94912792021-08-16 21:40:30 -07004787 xnn_params.u8.clamp = (struct vunary_parameters) {
Marat Dukhan61c0c9e2021-08-16 23:16:14 -07004788 .ukernel = (xnn_univector_ukernel_function) xnn_u8_vclamp_ukernel__wasmsimd_x64,
4789 .init.u8_minmax = xnn_init_u8_minmax_wasmsimd_params,
4790 .element_tile = 64,
Marat Dukhan94912792021-08-16 21:40:30 -07004791 };
Marat Dukhan24abe6b2021-11-24 15:28:57 -08004792 #if defined(XNN_WASMSIMD_VERSION) && (XNN_WASMSIMD_VERSION >= 88)
4793 xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
4794 .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__wasmsimd_dot16x2_c8,
4795 .pixel_tile = 1,
4796 .channel_tile = 8,
4797 };
4798 #else // XNN_WASMSIMD_VERSION >= 88
4799 xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
4800 .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__wasmsimd_mul32_c8,
4801 .pixel_tile = 1,
4802 .channel_tile = 8,
4803 };
4804 #endif // XNN_WASMSIMD_VERSION >= 88
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004805 xnn_params.u8.maxpool = (struct maxpool_parameters) {
Marat Dukhanf1589422021-08-15 20:37:06 -07004806 .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__wasmsimd_c16,
4807 .init.u8 = xnn_init_u8_minmax_wasmsimd_params,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004808 .mr = 9,
4809 .qr = 8,
4810 };
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004811 xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
4812 xnn_params.u8.rmax = xnn_u8_rmax_ukernel__scalar;
4813 #endif // XNN_NO_U8_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07004814
Frank Barchardb40ee632021-12-30 11:10:02 -08004815 /**************************** X8 WAsm SIMD micro-kernels ****************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004816 #ifndef XNN_NO_X8_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07004817 init_flags |= XNN_INIT_FLAG_X8;
4818
Marat Dukhand67539d2021-09-08 23:06:03 -07004819 xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar_x4;
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004820 xnn_params.x8.zip = (struct zip_parameters) {
4821 .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__scalar,
4822 .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__scalar,
4823 .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__scalar,
4824 .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__scalar,
4825 };
4826 #endif // XNN_NO_X8_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07004827
Frank Barchardb40ee632021-12-30 11:10:02 -08004828 /**************************** F32 WAsm SIMD micro-kernels ****************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004829 #ifndef XNN_NO_F32_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07004830 init_flags |= XNN_INIT_FLAG_F32;
4831
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004832 if (is_wasm_x86) {
Frank Barchard0725b8d2020-12-07 11:07:35 -08004833 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_splat);
4834 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_x86_splat);
4835 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat);
4836 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_x86_splat);
Marat Dukhan688f6d82020-07-14 17:02:11 -07004837 xnn_params.f32.gemm.relu.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_4x8__wasmsimd_splat);
4838 xnn_params.f32.gemm.relu.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_4x8__wasmsimd_splat);
4839 xnn_params.f32.gemm.relu.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat);
4840 xnn_params.f32.gemm.relu.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat);
Marat Dukhan802808c2020-06-16 11:01:17 -07004841 xnn_params.f32.gemm.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x8__wasmsimd_splat);
4842 xnn_params.f32.gemm.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x8__wasmsimd_splat);
4843 xnn_params.f32.gemm.linear.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__wasmsimd_splat);
4844 xnn_params.f32.gemm.linear.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__wasmsimd_splat);
Marat Dukhanc83ef3b2021-12-30 09:47:07 -08004845 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_wasmsimd_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07004846 xnn_params.f32.gemm.mr = 4;
4847 xnn_params.f32.gemm.nr = 8;
Marat Dukhane39e6462020-07-09 01:33:36 -07004848
4849 xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86);
4850 xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_x86);
4851 xnn_params.f32.gemm2.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x2c4__wasmsimd);
4852 xnn_params.f32.gemm2.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2c4__wasmsimd);
Marat Dukhanc83ef3b2021-12-30 09:47:07 -08004853 xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_wasmsimd_params;
Marat Dukhane39e6462020-07-09 01:33:36 -07004854 xnn_params.f32.gemm2.mr = 4;
4855 xnn_params.f32.gemm2.nr = 2;
4856 xnn_params.f32.gemm2.log2_kr = 2;
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004857 } else {
Frank Barchard0725b8d2020-12-07 11:07:35 -08004858 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_splat);
4859 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_arm_splat);
4860 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat);
4861 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_arm_splat);
Marat Dukhan688f6d82020-07-14 17:02:11 -07004862 xnn_params.f32.gemm.relu.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_5x8__wasmsimd_splat);
4863 xnn_params.f32.gemm.relu.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_5x8__wasmsimd_splat);
4864 xnn_params.f32.gemm.relu.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat);
4865 xnn_params.f32.gemm.relu.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat);
Marat Dukhan802808c2020-06-16 11:01:17 -07004866 xnn_params.f32.gemm.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_5x8__wasmsimd_splat);
4867 xnn_params.f32.gemm.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_5x8__wasmsimd_splat);
4868 xnn_params.f32.gemm.linear.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__wasmsimd_splat);
4869 xnn_params.f32.gemm.linear.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__wasmsimd_splat);
Marat Dukhanc83ef3b2021-12-30 09:47:07 -08004870 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_wasmsimd_params;
Marat Dukhan1bbf96b2020-06-15 23:01:20 -07004871 xnn_params.f32.gemm.mr = 5;
Marat Dukhanaefaef32020-04-09 07:09:34 -07004872 xnn_params.f32.gemm.nr = 8;
Marat Dukhane39e6462020-07-09 01:33:36 -07004873
4874 xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm);
4875 xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_arm);
4876 xnn_params.f32.gemm2.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x2c4__wasmsimd);
4877 xnn_params.f32.gemm2.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2c4__wasmsimd);
Marat Dukhanc83ef3b2021-12-30 09:47:07 -08004878 xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_wasmsimd_params;
Marat Dukhane39e6462020-07-09 01:33:36 -07004879 xnn_params.f32.gemm2.mr = 4;
4880 xnn_params.f32.gemm2.nr = 2;
4881 xnn_params.f32.gemm2.log2_kr = 2;
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004882 }
Marat Dukhanaefaef32020-04-09 07:09:34 -07004883
Marat Dukhanac014d72020-06-16 08:36:47 -07004884 if (is_wasm_x86) {
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07004885 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x3__wasmsimd_x86;
4886 xnn_params.f32.dwconv[0].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up8x3__wasmsimd;
Marat Dukhanc83ef3b2021-12-30 09:47:07 -08004887 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
Marat Dukhanac014d72020-06-16 08:36:47 -07004888 xnn_params.f32.dwconv[0].channel_tile = 8;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07004889 xnn_params.f32.dwconv[0].primary_tile = 3;
Marat Dukhanaefaef32020-04-09 07:09:34 -07004890
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07004891 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x4__wasmsimd_x86;
4892 xnn_params.f32.dwconv[1].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up8x4__wasmsimd;
Marat Dukhanc83ef3b2021-12-30 09:47:07 -08004893 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
Marat Dukhanac014d72020-06-16 08:36:47 -07004894 xnn_params.f32.dwconv[1].channel_tile = 8;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07004895 xnn_params.f32.dwconv[1].primary_tile = 4;
4896
4897 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x9__wasmsimd_x86;
4898 xnn_params.f32.dwconv[2].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up8x9__wasmsimd;
Marat Dukhanc83ef3b2021-12-30 09:47:07 -08004899 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07004900 xnn_params.f32.dwconv[2].channel_tile = 8;
4901 xnn_params.f32.dwconv[2].primary_tile = 9;
Marat Dukhanac014d72020-06-16 08:36:47 -07004902 } else {
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07004903 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up4x3__wasmsimd_arm;
4904 xnn_params.f32.dwconv[0].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up4x3__wasmsimd;
Marat Dukhanc83ef3b2021-12-30 09:47:07 -08004905 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
Marat Dukhanac014d72020-06-16 08:36:47 -07004906 xnn_params.f32.dwconv[0].channel_tile = 4;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07004907 xnn_params.f32.dwconv[0].primary_tile = 3;
Marat Dukhanaefaef32020-04-09 07:09:34 -07004908
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07004909 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up4x4__wasmsimd_arm;
4910 xnn_params.f32.dwconv[1].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up4x4__wasmsimd;
Marat Dukhanc83ef3b2021-12-30 09:47:07 -08004911 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
Marat Dukhanac014d72020-06-16 08:36:47 -07004912 xnn_params.f32.dwconv[1].channel_tile = 4;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07004913 xnn_params.f32.dwconv[1].primary_tile = 4;
4914
4915 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up4x9__wasmsimd_arm;
4916 xnn_params.f32.dwconv[2].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up4x9__wasmsimd;
Marat Dukhanc83ef3b2021-12-30 09:47:07 -08004917 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07004918 xnn_params.f32.dwconv[2].channel_tile = 4;
4919 xnn_params.f32.dwconv[2].primary_tile = 9;
Marat Dukhanac014d72020-06-16 08:36:47 -07004920 }
4921
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07004922 xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up4x25__wasmsimd_arm;
4923 xnn_params.f32.dwconv[3].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up4x25__wasmsimd;
Marat Dukhanc83ef3b2021-12-30 09:47:07 -08004924 xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07004925 xnn_params.f32.dwconv[3].channel_tile = 4;
4926 xnn_params.f32.dwconv[3].primary_tile = 25;
Marat Dukhanaefaef32020-04-09 07:09:34 -07004927
Marat Dukhan3b7432d2020-07-16 17:46:32 -07004928 if (is_wasm_x86) {
4929 xnn_params.f32.avgpool = (struct avgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08004930 .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__wasmsimd_x86_c4,
4931 .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__wasmsimd_x86_c4,
Marat Dukhan84598222022-01-09 21:39:22 -08004932 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
Marat Dukhanda382d12022-01-07 19:51:20 -08004933 .primary_tile = 9,
4934 .incremental_tile = 8,
4935 .channel_tile = 4,
Marat Dukhan3b7432d2020-07-16 17:46:32 -07004936 };
Marat Dukhan1483c532020-07-16 18:08:19 -07004937 xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08004938 .unipass = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__wasmsimd_x86_c4,
4939 .multipass = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__wasmsimd_x86_c4,
4940 .primary_tile = 9,
4941 .incremental_tile = 8,
4942 .channel_tile = 4,
Marat Dukhan1483c532020-07-16 18:08:19 -07004943 };
Marat Dukhanc6016802020-07-16 18:51:28 -07004944 xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08004945 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_x86_c4,
4946 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_x86_c4,
Marat Dukhan84598222022-01-09 21:39:22 -08004947 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
4948 .update.f32 = xnn_update_f32_scaleminmax_scalar_params,
Marat Dukhanda382d12022-01-07 19:51:20 -08004949 .row_tile = 7,
4950 .channel_tile = 4,
Marat Dukhanc6016802020-07-16 18:51:28 -07004951 };
Marat Dukhan3b7432d2020-07-16 17:46:32 -07004952 } else {
4953 xnn_params.f32.avgpool = (struct avgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08004954 .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__wasmsimd_arm_c4,
4955 .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__wasmsimd_arm_c4,
Marat Dukhan84598222022-01-09 21:39:22 -08004956 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
Marat Dukhanda382d12022-01-07 19:51:20 -08004957 .primary_tile = 9,
4958 .incremental_tile = 8,
4959 .channel_tile = 4,
Marat Dukhan3b7432d2020-07-16 17:46:32 -07004960 };
Marat Dukhan1483c532020-07-16 18:08:19 -07004961 xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08004962 .unipass = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__wasmsimd_arm_c4,
4963 .multipass = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__wasmsimd_arm_c4,
4964 .primary_tile = 9,
4965 .incremental_tile = 8,
4966 .channel_tile = 4,
Marat Dukhan1483c532020-07-16 18:08:19 -07004967 };
Marat Dukhanc6016802020-07-16 18:51:28 -07004968 xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08004969 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_arm_c4,
4970 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_arm_c4,
Marat Dukhan84598222022-01-09 21:39:22 -08004971 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
4972 .update.f32 = xnn_update_f32_scaleminmax_scalar_params,
Marat Dukhanda382d12022-01-07 19:51:20 -08004973 .row_tile = 7,
4974 .channel_tile = 4,
Marat Dukhanc6016802020-07-16 18:51:28 -07004975 };
Marat Dukhan3b7432d2020-07-16 17:46:32 -07004976 }
Marat Dukhanf6e24802020-07-08 22:20:40 -07004977 if (is_wasm_x86) {
4978 xnn_params.f32.maxpool = (struct maxpool_parameters) {
4979 .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__wasmsimd_x86_c4,
Marat Dukhanc83ef3b2021-12-30 09:47:07 -08004980 .init.f32 = xnn_init_f32_minmax_wasmsimd_params,
Marat Dukhanf6e24802020-07-08 22:20:40 -07004981 .mr = 9,
4982 .qr = 8,
4983 };
4984 } else {
4985 xnn_params.f32.maxpool = (struct maxpool_parameters) {
4986 .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__wasmsimd_arm_c4,
Marat Dukhanc83ef3b2021-12-30 09:47:07 -08004987 .init.f32 = xnn_init_f32_minmax_wasmsimd_params,
Marat Dukhanf6e24802020-07-08 22:20:40 -07004988 .mr = 9,
4989 .qr = 8,
4990 };
4991 }
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004992 xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
Marat Dukhan40f05522020-07-16 22:33:12 -07004993 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__wasmsimd_c4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004994 .mr = 4,
4995 };
4996 xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
Marat Dukhan40f05522020-07-16 22:33:12 -07004997 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__wasmsimd_c4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004998 .mr = 9,
4999 };
5000 xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
Marat Dukhan40f05522020-07-16 22:33:12 -07005001 .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__wasmsimd_c4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005002 .mr = 9,
5003 .qr = 8,
5004 };
Marat Dukhan660fd192020-03-10 04:55:30 -07005005 xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
Marat Dukhan00d1d6e2020-07-09 01:37:27 -07005006 .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__wasmsimd_c8,
Marat Dukhan69722492019-11-11 19:55:50 -08005007 .pixel_tile = 1,
5008 .channel_tile = 8,
5009 };
Marat Dukhane5efb162021-12-31 10:26:13 -08005010 xnn_params.f32.abs = (struct vunary_parameters) {
5011 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__wasmsimd_x8,
5012 .init.f32_abs = xnn_init_f32_abs_wasmsimd_params,
5013 .element_tile = 16,
5014 };
Marat Dukhanc303fe62020-06-26 10:09:25 -07005015 if (is_wasm_x86) {
Marat Dukhan94912792021-08-16 21:40:30 -07005016 xnn_params.f32.clamp = (struct vunary_parameters) {
5017 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vclamp_ukernel__wasmsimd_x86_x8,
Marat Dukhanc83ef3b2021-12-30 09:47:07 -08005018 .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
Marat Dukhan94912792021-08-16 21:40:30 -07005019 .element_tile = 8,
5020 };
Marat Dukhanc303fe62020-06-26 10:09:25 -07005021 } else {
Marat Dukhan94912792021-08-16 21:40:30 -07005022 xnn_params.f32.clamp = (struct vunary_parameters) {
5023 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vclamp_ukernel__wasmsimd_arm_x8,
Marat Dukhanc83ef3b2021-12-30 09:47:07 -08005024 .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
Marat Dukhan94912792021-08-16 21:40:30 -07005025 .element_tile = 8,
5026 };
Marat Dukhanc303fe62020-06-26 10:09:25 -07005027 }
Marat Dukhanb6bd4bc2020-12-01 17:01:40 -08005028 if (is_wasm_x86) {
Marat Dukhan4a79ff22022-01-01 12:16:48 -08005029 xnn_params.f32.elu = (struct vunary_parameters) {
5030 .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__wasmsimd_x86_rr2_p6_x20,
5031 .init.f32_elu = xnn_init_f32_elu_wasmsimd_rr2_p6_params,
5032 .element_tile = 20,
5033 };
Marat Dukhanb6bd4bc2020-12-01 17:01:40 -08005034 } else {
Marat Dukhan4a79ff22022-01-01 12:16:48 -08005035 xnn_params.f32.elu = (struct vunary_parameters) {
5036 .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__wasmsimd_arm_rr2_p6_x20,
5037 .init.f32_elu = xnn_init_f32_elu_wasmsimd_rr2_p6_params,
5038 .element_tile = 20,
5039 };
Marat Dukhanb6bd4bc2020-12-01 17:01:40 -08005040 }
Marat Dukhan561d0682021-12-23 16:12:35 -08005041 xnn_params.f32.hswish = (struct vunary_parameters) {
5042 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__wasmsimd_x16,
Marat Dukhan0d10cc72021-12-23 19:49:19 -08005043 .init.f32_hswish = xnn_init_f32_hswish_wasmsimd_params,
Marat Dukhan561d0682021-12-23 16:12:35 -08005044 .element_tile = 16,
5045 };
Marat Dukhanf4935a22020-07-16 15:59:10 -07005046 if (is_wasm_x86) {
Marat Dukhan2894e992021-12-30 08:29:48 -08005047 xnn_params.f32.lrelu = (struct vunary_parameters) {
5048 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__wasmsimd_minmax_x8,
5049 .init.f32_lrelu = xnn_init_f32_lrelu_wasmsimd_params,
5050 .element_tile = 8,
5051 };
Marat Dukhanf4935a22020-07-16 15:59:10 -07005052 } else {
Marat Dukhan2894e992021-12-30 08:29:48 -08005053 xnn_params.f32.lrelu = (struct vunary_parameters) {
5054 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__wasmsimd_bitselect_x8,
5055 .init.f32_lrelu = xnn_init_f32_lrelu_wasmsimd_params,
5056 .element_tile = 8,
5057 };
Marat Dukhanf4935a22020-07-16 15:59:10 -07005058 }
Marat Dukhane5efb162021-12-31 10:26:13 -08005059 xnn_params.f32.neg = (struct vunary_parameters) {
5060 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__wasmsimd_x8,
5061 .init.f32_neg = xnn_init_f32_neg_wasmsimd_params,
5062 .element_tile = 16,
5063 };
Marat Dukhan6674d692021-05-05 22:27:00 -07005064 xnn_params.f32.relu = (xnn_univector_ukernel_function) xnn_f32_vrelu_ukernel__wasmsimd_x16;
Marat Dukhan189c1d02021-09-03 15:39:54 -07005065 #if defined(XNN_WASMSIMD_VERSION) && (XNN_WASMSIMD_VERSION >= 91)
Marat Dukhan0e801372022-01-04 00:10:41 -08005066 xnn_params.f32.rndne = (struct vunary_parameters) {
5067 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__wasmsimd_native_x8,
5068 .element_tile = 8,
5069 };
5070 xnn_params.f32.rndz = (struct vunary_parameters) {
5071 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__wasmsimd_native_x8,
5072 .element_tile = 8,
5073 };
5074 xnn_params.f32.rndu = (struct vunary_parameters) {
5075 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__wasmsimd_native_x8,
5076 .element_tile = 8,
5077 };
5078 xnn_params.f32.rndd = (struct vunary_parameters) {
5079 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__wasmsimd_native_x8,
5080 .element_tile = 8,
5081 };
Marat Dukhan189c1d02021-09-03 15:39:54 -07005082 #else // XNN_WASMSIMD_VERSION >= 91
Marat Dukhan0e801372022-01-04 00:10:41 -08005083 xnn_params.f32.rndne = (struct vunary_parameters) {
5084 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__wasmsimd_addsub_x8,
5085 .init.f32_rnd = xnn_init_f32_rnd_wasmsimd_params,
5086 .element_tile = 8,
5087 };
Marat Dukhan189c1d02021-09-03 15:39:54 -07005088 if (is_wasm_x86) {
Marat Dukhan0e801372022-01-04 00:10:41 -08005089 xnn_params.f32.rndz = (struct vunary_parameters) {
5090 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__wasmsimd_addsub_x8,
5091 .init.f32_rnd = xnn_init_f32_rnd_wasmsimd_params,
5092 .element_tile = 8,
5093 };
Marat Dukhan189c1d02021-09-03 15:39:54 -07005094 } else {
Marat Dukhan0e801372022-01-04 00:10:41 -08005095 xnn_params.f32.rndz = (struct vunary_parameters) {
5096 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__wasmsimd_cvt_x8,
5097 .init.f32_rnd = xnn_init_f32_rnd_wasmsimd_params,
5098 .element_tile = 8,
5099 };
Marat Dukhan189c1d02021-09-03 15:39:54 -07005100 }
Marat Dukhan0e801372022-01-04 00:10:41 -08005101 xnn_params.f32.rndu = (struct vunary_parameters) {
5102 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__wasmsimd_addsub_x8,
5103 .init.f32_rnd = xnn_init_f32_rnd_wasmsimd_params,
5104 .element_tile = 8,
5105 };
5106 xnn_params.f32.rndd = (struct vunary_parameters) {
5107 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__wasmsimd_addsub_x8,
5108 .init.f32_rnd = xnn_init_f32_rnd_wasmsimd_params,
5109 .element_tile = 8,
5110 };
Marat Dukhan189c1d02021-09-03 15:39:54 -07005111 #endif // XNN_WASMSIMD_VERSION >= 91
Marat Dukhance834ad2022-01-03 00:22:01 -08005112 xnn_params.f32.sigmoid = (struct vunary_parameters) {
5113 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__wasmsimd_rr2_p5_div_x16,
5114 .init.f32_sigmoid = xnn_init_f32_sigmoid_wasmsimd_rr2_p5_params,
5115 .element_tile = 16,
5116 };
Marat Dukhane5efb162021-12-31 10:26:13 -08005117 xnn_params.f32.sqr = (struct vunary_parameters) {
5118 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__wasmsimd_x8,
5119 .element_tile = 16,
5120 };
Marat Dukhane72b2822021-12-30 14:46:58 -08005121 xnn_params.f32.sqrt = (struct vunary_parameters) {
5122 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__wasmsimd_sqrt_x8,
5123 .element_tile = 8,
5124 };
Marat Dukhan195f8eb2020-06-25 12:50:57 -07005125 if (is_wasm_x86) {
5126 xnn_params.f32.prelu = (struct prelu_parameters) {
Marat Dukhan78299282020-07-15 17:38:06 -07005127 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__wasmsimd_minmax_2x8,
Marat Dukhan195f8eb2020-06-25 12:50:57 -07005128 .row_tile = 2,
5129 .channel_tile = 8,
5130 };
5131 } else {
5132 xnn_params.f32.prelu = (struct prelu_parameters) {
Marat Dukhan78299282020-07-15 17:38:06 -07005133 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__wasmsimd_bitselect_2x8,
Marat Dukhan195f8eb2020-06-25 12:50:57 -07005134 .row_tile = 2,
5135 .channel_tile = 8,
5136 };
5137 }
Marat Dukhan4a5c7712022-01-05 22:43:13 -08005138 xnn_params.f32.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
5139 .ukernel = xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_x16_acc2,
5140 .init = xnn_init_f32_expminus_wasmsimd_rr2_p5_params,
5141 .element_tile = 16,
5142 };
Marat Dukhancdc56552020-06-26 19:49:41 -07005143 if (is_wasm_x86) {
Marat Dukhan0bf8afa2021-09-20 10:02:18 -07005144 xnn_params.f32.rmax = xnn_f32_rmax_ukernel__wasmsimd_x86;
Marat Dukhancdc56552020-06-26 19:49:41 -07005145 xnn_params.f32.vadd = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07005146 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__wasmsimd_x86_x16,
5147 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasmsimd_x86_x16,
5148 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasmsimd_x86_x16,
5149 .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_ukernel__wasmsimd_x16,
5150 .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__wasmsimd_x16,
5151 .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__wasmsimd_x16,
Marat Dukhanf6004972021-12-30 11:23:02 -08005152 .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
Frank Barchard9c7308f2020-08-31 17:03:01 -07005153 .element_tile = 16,
Marat Dukhancdc56552020-06-26 19:49:41 -07005154 };
5155 xnn_params.f32.vdiv = (struct vbinary_parameters) {
Frank Barchardb392f8e2020-10-27 10:46:44 -07005156 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__wasmsimd_x86_x16,
5157 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__wasmsimd_x86_x16,
5158 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__wasmsimd_x86_x16,
5159 .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_ukernel__wasmsimd_x16,
5160 .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_ukernel__wasmsimd_x16,
5161 .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_ukernel__wasmsimd_x16,
Marat Dukhanf6004972021-12-30 11:23:02 -08005162 .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
Frank Barchardb392f8e2020-10-27 10:46:44 -07005163 .element_tile = 16,
Marat Dukhancdc56552020-06-26 19:49:41 -07005164 };
5165 xnn_params.f32.vmax = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07005166 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__wasmsimd_x86_x16,
5167 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasmsimd_x86_x16,
5168 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasmsimd_x86_x16,
5169 .element_tile = 16,
Marat Dukhancdc56552020-06-26 19:49:41 -07005170 };
5171 xnn_params.f32.vmin = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07005172 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__wasmsimd_x86_x16,
5173 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasmsimd_x86_x16,
5174 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasmsimd_x86_x16,
Frank Barchard9c7308f2020-08-31 17:03:01 -07005175 .element_tile = 16,
Marat Dukhancdc56552020-06-26 19:49:41 -07005176 };
5177 xnn_params.f32.vmul = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07005178 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__wasmsimd_x86_x16,
5179 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasmsimd_x86_x16,
5180 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasmsimd_x86_x16,
5181 .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_ukernel__wasmsimd_x16,
5182 .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__wasmsimd_x16,
5183 .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__wasmsimd_x16,
Marat Dukhanf6004972021-12-30 11:23:02 -08005184 .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
Frank Barchard9c7308f2020-08-31 17:03:01 -07005185 .element_tile = 16,
Marat Dukhancdc56552020-06-26 19:49:41 -07005186 };
5187 xnn_params.f32.vsub = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07005188 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__wasmsimd_x86_x16,
5189 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__wasmsimd_x86_x16,
5190 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__wasmsimd_x86_x16,
5191 .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_ukernel__wasmsimd_x16,
5192 .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_ukernel__wasmsimd_x16,
5193 .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_ukernel__wasmsimd_x16,
Marat Dukhanf6004972021-12-30 11:23:02 -08005194 .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
Frank Barchard9c7308f2020-08-31 17:03:01 -07005195 .element_tile = 16,
Marat Dukhancdc56552020-06-26 19:49:41 -07005196 };
5197 } else {
Marat Dukhan0bf8afa2021-09-20 10:02:18 -07005198 xnn_params.f32.rmax = xnn_f32_rmax_ukernel__wasmsimd_arm;
Marat Dukhancdc56552020-06-26 19:49:41 -07005199 xnn_params.f32.vadd = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07005200 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__wasmsimd_arm_x16,
5201 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasmsimd_arm_x16,
5202 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasmsimd_arm_x16,
5203 .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_ukernel__wasmsimd_x16,
5204 .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__wasmsimd_x16,
5205 .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__wasmsimd_x16,
Marat Dukhanf6004972021-12-30 11:23:02 -08005206 .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
Frank Barchard9c7308f2020-08-31 17:03:01 -07005207 .element_tile = 16,
Marat Dukhancdc56552020-06-26 19:49:41 -07005208 };
5209 xnn_params.f32.vdiv = (struct vbinary_parameters) {
Frank Barchardb392f8e2020-10-27 10:46:44 -07005210 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__wasmsimd_arm_x16,
5211 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__wasmsimd_arm_x16,
5212 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__wasmsimd_arm_x16,
5213 .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_ukernel__wasmsimd_x16,
5214 .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_ukernel__wasmsimd_x16,
5215 .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_ukernel__wasmsimd_x16,
Marat Dukhanf6004972021-12-30 11:23:02 -08005216 .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
Frank Barchardb392f8e2020-10-27 10:46:44 -07005217 .element_tile = 16,
Marat Dukhancdc56552020-06-26 19:49:41 -07005218 };
5219 xnn_params.f32.vmax = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07005220 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__wasmsimd_arm_x16,
5221 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasmsimd_arm_x16,
5222 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasmsimd_arm_x16,
5223 .element_tile = 16,
Marat Dukhancdc56552020-06-26 19:49:41 -07005224 };
5225 xnn_params.f32.vmin = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07005226 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__wasmsimd_arm_x16,
5227 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasmsimd_arm_x16,
5228 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasmsimd_arm_x16,
5229 .element_tile = 16,
Marat Dukhancdc56552020-06-26 19:49:41 -07005230 };
5231 xnn_params.f32.vmul = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07005232 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__wasmsimd_arm_x16,
5233 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasmsimd_arm_x16,
5234 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasmsimd_arm_x16,
5235 .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_ukernel__wasmsimd_x16,
5236 .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__wasmsimd_x16,
5237 .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__wasmsimd_x16,
Marat Dukhanf6004972021-12-30 11:23:02 -08005238 .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
Frank Barchard9c7308f2020-08-31 17:03:01 -07005239 .element_tile = 16,
Marat Dukhancdc56552020-06-26 19:49:41 -07005240 };
5241 xnn_params.f32.vsub = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07005242 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__wasmsimd_arm_x16,
5243 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__wasmsimd_arm_x16,
5244 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__wasmsimd_arm_x16,
5245 .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_ukernel__wasmsimd_x16,
5246 .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_ukernel__wasmsimd_x16,
5247 .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_ukernel__wasmsimd_x16,
Marat Dukhanf6004972021-12-30 11:23:02 -08005248 .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
Frank Barchard9c7308f2020-08-31 17:03:01 -07005249 .element_tile = 16,
Marat Dukhancdc56552020-06-26 19:49:41 -07005250 };
5251 }
Marat Dukhanf7399262020-06-05 10:58:44 -07005252 xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07005253 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__wasmsimd_x16,
5254 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__wasmsimd_x16,
5255 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__wasmsimd_x16,
5256 .element_tile = 16,
Marat Dukhanf7399262020-06-05 10:58:44 -07005257 };
Marat Dukhand816f622020-07-15 10:14:39 -07005258 if (is_wasm_x86) {
5259 xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
Marat Dukhan9531e9f2020-07-24 15:25:02 -07005260 .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c4__wasmsimd_x86_2x,
Marat Dukhand57186a2021-12-30 11:37:24 -08005261 .init.f32 = xnn_init_f32_minmax_wasmsimd_params,
Marat Dukhand816f622020-07-15 10:14:39 -07005262 .channel_tile = 4,
5263 .row_tile = 2,
5264 };
5265 } else {
5266 xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
Marat Dukhan9531e9f2020-07-24 15:25:02 -07005267 .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c4__wasmsimd_arm_2x,
Marat Dukhand57186a2021-12-30 11:37:24 -08005268 .init.f32 = xnn_init_f32_minmax_wasmsimd_params,
Marat Dukhand816f622020-07-15 10:14:39 -07005269 .channel_tile = 4,
5270 .row_tile = 2,
5271 };
5272 }
Erich Elsen6e80fdc2020-06-09 15:35:37 -07005273 #ifndef XNN_NO_NCHW_OPERATORS
Marat Dukhancfbed0a2020-12-08 10:01:51 -08005274 init_flags |= XNN_INIT_FLAG_CHW_OPT;
5275
Frank Barchard498cb502020-11-16 23:50:04 -08005276 if (is_wasm_x86) {
5277 xnn_params.f32.spmm = (struct spmm_parameters) {
5278 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x1__wasmsimd_x86,
5279 .mr = 32,
5280 .nr = 1,
5281 };
5282 } else {
5283 xnn_params.f32.spmm = (struct spmm_parameters) {
5284 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x1__wasmsimd_arm,
5285 .mr = 32,
5286 .nr = 1,
5287 };
5288 }
Erich Elsen0a1970e2020-06-10 09:24:59 -07005289 xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
5290 .ukernel_with_symm_padding =
Frank Barchard22136062020-11-24 18:44:46 -08005291 (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2,
Erich Elsen0a1970e2020-06-10 09:24:59 -07005292 .output_channel_tile = 4,
5293 .output_height_tile = 2,
5294 .output_width_tile = 2,
5295 };
Frank Barcharddb5c32d2020-11-16 23:58:42 -08005296 if (is_wasm_x86) {
5297 xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
Frank Barchard412e2f42020-12-11 11:40:50 -08005298 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_2x4,
Frank Barcharddb5c32d2020-11-16 23:58:42 -08005299 .output_width_tile = 4,
Frank Barchard97883b82020-11-23 13:01:03 -08005300 .output_height_tile = 2,
Frank Barcharddb5c32d2020-11-16 23:58:42 -08005301 };
5302 xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
Frank Barchard412e2f42020-12-11 11:40:50 -08005303 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_1x4_acc2,
Frank Barcharddb5c32d2020-11-16 23:58:42 -08005304 .output_width_tile = 4,
5305 .output_height_tile = 1,
5306 };
5307 xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
Frank Barchard412e2f42020-12-11 11:40:50 -08005308 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4,
Frank Barcharddb5c32d2020-11-16 23:58:42 -08005309 .output_width_tile = 4,
5310 .output_height_tile = 3,
5311 };
5312 xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
Frank Barchard412e2f42020-12-11 11:40:50 -08005313 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4_acc2,
Frank Barcharddb5c32d2020-11-16 23:58:42 -08005314 .output_width_tile = 4,
5315 .output_height_tile = 1,
5316 };
5317 } else {
5318 xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
Frank Barchard412e2f42020-12-11 11:40:50 -08005319 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_2x4,
Frank Barcharddb5c32d2020-11-16 23:58:42 -08005320 .output_width_tile = 4,
Frank Barchard97883b82020-11-23 13:01:03 -08005321 .output_height_tile = 2,
Frank Barcharddb5c32d2020-11-16 23:58:42 -08005322 };
5323 xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
Frank Barchard412e2f42020-12-11 11:40:50 -08005324 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_1x4_acc4,
Frank Barcharddb5c32d2020-11-16 23:58:42 -08005325 .output_width_tile = 4,
5326 .output_height_tile = 1,
5327 };
5328 xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
Frank Barchard412e2f42020-12-11 11:40:50 -08005329 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4,
Frank Barcharddb5c32d2020-11-16 23:58:42 -08005330 .output_width_tile = 4,
5331 .output_height_tile = 3,
5332 };
5333 xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
Frank Barchard412e2f42020-12-11 11:40:50 -08005334 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4_acc2,
Frank Barcharddb5c32d2020-11-16 23:58:42 -08005335 .output_width_tile = 4,
5336 .output_height_tile = 1,
5337 };
5338 }
Marat Dukhanc5045bf2020-07-27 18:16:35 -07005339 if (is_wasm_x86) {
5340 xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
5341 .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__wasmsimd_x86_x4,
5342 .channel_tile = 4,
5343 };
5344 } else {
5345 xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
5346 .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__wasmsimd_arm_x4,
5347 .channel_tile = 4,
5348 };
5349 }
Artsiom Ablavatski97918102020-10-27 15:52:59 -07005350 xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
5351 .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__wasmsimd_p8,
5352 .channel_tile = 1,
Artsiom Ablavatskib3ffd582021-03-31 13:00:08 -07005353 .pixel_tile = 8,
Artsiom Ablavatski97918102020-10-27 15:52:59 -07005354 };
Erich Elsen6e80fdc2020-06-09 15:35:37 -07005355 #endif // XNN_NO_NCHW_OPERATORS
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005356 #endif // XNN_NO_F32_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07005357
Frank Barchardb40ee632021-12-30 11:10:02 -08005358 /*************************** VCVT WAsm SIMD micro-kernels ***************************/
Marat Dukhanaf2ba002021-10-24 14:21:41 -07005359 #ifndef XNN_NO_VCVT_OPERATORS
5360 init_flags |= XNN_INIT_FLAG_VCVT;
5361
Marat Dukhan134f9842021-12-29 19:57:31 -08005362 xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
5363 .ukernel = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_x16,
5364 .init.f16_f32_cvt = xnn_init_f16_f32_cvt_wasmsimd_int16_params,
5365 .element_tile = 16,
5366 };
Marat Dukhanb7c1b712021-12-30 07:23:57 -08005367 xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
5368 .ukernel = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__wasmsimd_x24,
5369 .init.f32_f16_cvt = xnn_init_f32_f16_cvt_wasmsimd_params,
5370 .element_tile = 24,
5371 };
Marat Dukhand52d20b2021-12-05 09:50:25 -08005372 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
5373 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__wasmsimd_magic_x32,
5374 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_wasmsimd_magic_params,
5375 .element_tile = 32,
5376 };
5377 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
5378 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__wasmsimd_magic_x32,
5379 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_wasmsimd_magic_params,
5380 .element_tile = 32,
5381 };
Marat Dukhanf92206b2021-12-10 17:02:07 -08005382 xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
5383 .ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__wasmsimd_x32,
5384 .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_wasmsimd_params,
5385 .element_tile = 32,
5386 };
5387 xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
5388 .ukernel = (xnn_univector_ukernel_function) xnn_qu8_f32_vcvt_ukernel__wasmsimd_x32,
5389 .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_wasmsimd_params,
5390 .element_tile = 32,
5391 };
Marat Dukhanaf2ba002021-10-24 14:21:41 -07005392 #endif // XNN_NO_VCVT_OPERATORS
5393
Frank Barchardb40ee632021-12-30 11:10:02 -08005394 /**************************** X32 WAsm SIMD micro-kernels ****************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005395 #ifndef XNN_NO_X32_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07005396 init_flags |= XNN_INIT_FLAG_X32;
5397
Marat Dukhan9d4bfa22020-07-16 19:07:04 -07005398 xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__wasmsimd;
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005399 xnn_params.x32.zip = (struct zip_parameters) {
Marat Dukhane3b78762020-07-16 20:02:58 -07005400 .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__wasmsimd,
5401 .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__wasmsimd,
5402 .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__wasmsimd,
5403 .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__wasmsimd,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005404 };
Artsiom Ablavatskibbe85062020-11-05 14:07:37 -08005405 #ifndef XNN_NO_NCHW_OPERATORS
Marat Dukhanad71b9a2020-11-20 00:01:51 -08005406 xnn_params.x32.depthtospace2d_chw2hwc = (struct depthtospace2d_chw2hwc_parameters) {
5407 .ukernel = (xnn_depthtospace2d_chw2hwc_ukernel_function) xnn_x32_depthtospace2d_chw2hwc_ukernel__scalar,
Artsiom Ablavatskibbe85062020-11-05 14:07:37 -08005408 .channel_tile = 1,
5409 .pixel_tile = 1,
5410 };
5411 #endif // XNN_NO_NCHW_OPERATORS
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005412 #endif // XNN_NO_X32_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07005413
Frank Barchardb40ee632021-12-30 11:10:02 -08005414 /**************************** XX WAsm SIMD micro-kernels ****************************/
Marat Dukhan048931b2020-11-24 20:53:54 -08005415 #ifndef XNN_NO_XX_OPERATORS
5416 init_flags |= XNN_INIT_FLAG_XX;
5417
5418 xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
Marat Dukhan933051b2021-08-07 16:26:15 -07005419 xnn_params.xx.fill = (struct fill_parameters) {
5420 .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__wasmsimd_x64,
5421 .row_tile = 1,
5422 };
Marat Dukhan0461f2d2021-08-08 12:36:29 -07005423 xnn_params.xx.pad = (struct pad_parameters) {
5424 .ukernel = (xnn_pad_ukernel_function) xnn_xx_pad_ukernel__wasmsimd,
5425 .row_tile = 1,
5426 };
Marat Dukhan048931b2020-11-24 20:53:54 -08005427 #endif // XNN_NO_XX_OPERATORS
5428
Marat Dukhan933051b2021-08-07 16:26:15 -07005429#elif XNN_ARCH_WASM
5430
Frank Barchardb40ee632021-12-30 11:10:02 -08005431 /**************************** QC8 WAsm micro-kernels ****************************/
Marat Dukhan898d5852021-06-30 21:18:34 -07005432 #ifndef XNN_NO_QC8_OPERATORS
5433 init_flags |= XNN_INIT_FLAG_QC8;
5434
5435 if (is_wasm_x86) {
Marat Dukhan7c1115f2022-01-04 17:18:41 -08005436 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic);
5437 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x2__scalar_imagic);
5438 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic);
5439 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x2__scalar_imagic);
5440 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_scalar_imagic_params;
Marat Dukhan898d5852021-06-30 21:18:34 -07005441 xnn_params.qc8.gemm.mr = 2;
5442 xnn_params.qc8.gemm.nr = 2;
5443 } else {
Marat Dukhan7c1115f2022-01-04 17:18:41 -08005444 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic);
5445 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x4__wasm_fmagic);
5446 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic);
5447 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4__wasm_fmagic);
Marat Dukhan2ac722e2022-01-04 01:54:20 -08005448 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_scalar_fmagic_params;
Marat Dukhan898d5852021-06-30 21:18:34 -07005449 xnn_params.qc8.gemm.mr = 4;
5450 xnn_params.qc8.gemm.nr = 4;
5451 }
5452
Marat Dukhan7c1115f2022-01-04 17:18:41 -08005453 if (is_wasm_x86) {
5454 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up2x9__scalar_imagic;
5455 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_scalar_imagic_params;
5456 xnn_params.qc8.dwconv[0].channel_tile = 2;
5457 xnn_params.qc8.dwconv[0].primary_tile = 9;
5458 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up1x25__scalar_imagic;
5459 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_scalar_imagic_params;
5460 xnn_params.qc8.dwconv[1].channel_tile = 1;
5461 xnn_params.qc8.dwconv[1].primary_tile = 25;
5462 } else {
5463 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up2x9__wasm_fmagic;
5464 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_scalar_fmagic_params;
5465 xnn_params.qc8.dwconv[0].channel_tile = 2;
5466 xnn_params.qc8.dwconv[0].primary_tile = 9;
5467 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up2x25__wasm_fmagic;
5468 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_scalar_fmagic_params;
5469 xnn_params.qc8.dwconv[1].channel_tile = 2;
5470 xnn_params.qc8.dwconv[1].primary_tile = 25;
5471 }
Marat Dukhan898d5852021-06-30 21:18:34 -07005472 #endif // XNN_NO_QC8_OPERATORS
5473
Frank Barchardb40ee632021-12-30 11:10:02 -08005474 /**************************** QS8 WAsm micro-kernels ****************************/
Marat Dukhan803c1f82021-05-12 00:13:37 -07005475 #ifndef XNN_NO_QS8_OPERATORS
5476 init_flags |= XNN_INIT_FLAG_QS8;
5477
5478 if (is_wasm_x86) {
Marat Dukhan7c1115f2022-01-04 17:18:41 -08005479 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic);
5480 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_2x2__scalar_imagic);
5481 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic);
5482 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x2__scalar_imagic);
5483 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params;
Marat Dukhan803c1f82021-05-12 00:13:37 -07005484 xnn_params.qs8.gemm.mr = 2;
5485 xnn_params.qs8.gemm.nr = 2;
5486 } else {
Marat Dukhan7c1115f2022-01-04 17:18:41 -08005487 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic);
5488 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_4x4__wasm_fmagic);
5489 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic);
5490 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4__wasm_fmagic);
Marat Dukhan2ac722e2022-01-04 01:54:20 -08005491 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params;
Marat Dukhan803c1f82021-05-12 00:13:37 -07005492 xnn_params.qs8.gemm.mr = 4;
5493 xnn_params.qs8.gemm.nr = 4;
5494 }
5495
Marat Dukhan7c1115f2022-01-04 17:18:41 -08005496 if (is_wasm_x86) {
5497 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up2x9__scalar_imagic;
5498 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params;
5499 xnn_params.qs8.dwconv[0].channel_tile = 2;
5500 xnn_params.qs8.dwconv[0].primary_tile = 9;
5501 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up1x25__scalar_imagic;
5502 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params;
5503 xnn_params.qs8.dwconv[1].channel_tile = 1;
5504 xnn_params.qs8.dwconv[1].primary_tile = 25;
5505 } else {
5506 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up2x9__wasm_fmagic;
5507 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params;
5508 xnn_params.qs8.dwconv[0].channel_tile = 2;
5509 xnn_params.qs8.dwconv[0].primary_tile = 9;
5510 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up2x25__wasm_fmagic;
5511 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params;
5512 xnn_params.qs8.dwconv[1].channel_tile = 2;
5513 xnn_params.qs8.dwconv[1].primary_tile = 25;
5514 }
Marat Dukhan803c1f82021-05-12 00:13:37 -07005515
5516 xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
Marat Dukhan847ff5e2022-01-11 20:31:06 -08005517 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c4,
5518 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4,
Marat Dukhan53f41062022-01-11 19:44:57 -08005519 .init.qs8 = xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params,
5520 .update.qs8 = xnn_update_qs8_avgpool_minmax_fp32_scalar_imagic_params,
Marat Dukhanda382d12022-01-07 19:51:20 -08005521 .row_tile = 7,
5522 .channel_tile = 4,
Marat Dukhan803c1f82021-05-12 00:13:37 -07005523 };
5524
5525 xnn_params.qs8.vadd = (struct vbinary_parameters) {
5526 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__scalar_x4,
5527 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__scalar_x4,
5528 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__scalar_x4,
Marat Dukhan64287252021-09-07 16:20:03 -07005529 .init.qs8_addsub = xnn_init_qs8_add_minmax_scalar_params,
Marat Dukhan803c1f82021-05-12 00:13:37 -07005530 .element_tile = 4,
5531 };
Marat Dukhan0853b8a2021-08-03 01:01:53 -07005532 xnn_params.qs8.vmul = (struct vbinary_parameters) {
5533 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_fp32_ukernel__scalar_x4,
5534 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_x4,
5535 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_x4,
5536 .init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_scalar_params,
5537 .element_tile = 4,
5538 };
Marat Dukhan803c1f82021-05-12 00:13:37 -07005539 #endif // XNN_NO_QS8_OPERATORS
5540
Frank Barchardb40ee632021-12-30 11:10:02 -08005541 /**************************** QU8 WAsm micro-kernels ****************************/
Marat Dukhan08b7a972020-07-14 18:17:29 -07005542 #ifndef XNN_NO_QU8_OPERATORS
5543 init_flags |= XNN_INIT_FLAG_QU8;
Marat Dukhan854fb6b2020-06-19 12:33:44 -07005544
Marat Dukhan3d76e552021-07-15 18:54:01 -07005545 if (is_wasm_x86) {
Marat Dukhan7c1115f2022-01-04 17:18:41 -08005546 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic);
5547 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_2x2__scalar_imagic);
5548 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic);
5549 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_imagic);
5550 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params;
Marat Dukhan3d76e552021-07-15 18:54:01 -07005551 xnn_params.qu8.gemm.mr = 2;
5552 xnn_params.qu8.gemm.nr = 2;
5553 } else {
Marat Dukhan7c1115f2022-01-04 17:18:41 -08005554 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic);
5555 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_4x4__wasm_fmagic);
5556 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic);
5557 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4__wasm_fmagic);
Marat Dukhan2ac722e2022-01-04 01:54:20 -08005558 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params;
Marat Dukhan3d76e552021-07-15 18:54:01 -07005559 xnn_params.qu8.gemm.mr = 4;
5560 xnn_params.qu8.gemm.nr = 4;
5561 }
Marat Dukhanaefaef32020-04-09 07:09:34 -07005562
Marat Dukhan7c1115f2022-01-04 17:18:41 -08005563 if (is_wasm_x86) {
5564 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up2x9__scalar_imagic;
5565 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params;
5566 xnn_params.qu8.dwconv[0].channel_tile = 2;
5567 xnn_params.qu8.dwconv[0].primary_tile = 9;
5568 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up1x25__scalar_imagic;
5569 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params;
5570 xnn_params.qu8.dwconv[1].channel_tile = 1;
5571 xnn_params.qu8.dwconv[1].primary_tile = 25;
5572 } else {
5573 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up2x9__wasm_fmagic;
5574 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params;
5575 xnn_params.qu8.dwconv[0].channel_tile = 2;
5576 xnn_params.qu8.dwconv[0].primary_tile = 9;
5577 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up2x25__wasm_fmagic;
5578 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params;
5579 xnn_params.qu8.dwconv[1].channel_tile = 2;
5580 xnn_params.qu8.dwconv[1].primary_tile = 25;
5581 }
Marat Dukhanaefaef32020-04-09 07:09:34 -07005582
Marat Dukhan08b7a972020-07-14 18:17:29 -07005583 xnn_params.qu8.avgpool = (struct avgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08005584 .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__scalar_c1,
5585 .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__scalar_c1,
5586 .init.qu8 = xnn_init_qu8_avgpool_minmax_scalar_params,
5587 .primary_tile = 9,
5588 .incremental_tile = 8,
5589 .channel_tile = 1,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005590 };
Marat Dukhan08b7a972020-07-14 18:17:29 -07005591 xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
Marat Dukhand1f53e42022-01-12 22:34:51 -08005592 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c4,
5593 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4,
5594 .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params,
5595 .update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_scalar_imagic_params,
Marat Dukhanda382d12022-01-07 19:51:20 -08005596 .row_tile = 7,
Marat Dukhand1f53e42022-01-12 22:34:51 -08005597 .channel_tile = 4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005598 };
Marat Dukhandb007cd2021-07-20 23:42:39 -07005599
5600 xnn_params.qu8.vadd = (struct vbinary_parameters) {
5601 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__scalar_x4,
5602 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__scalar_x4,
5603 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__scalar_x4,
Marat Dukhan64287252021-09-07 16:20:03 -07005604 .init.qu8_addsub = xnn_init_qu8_add_minmax_scalar_params,
Marat Dukhandb007cd2021-07-20 23:42:39 -07005605 .element_tile = 4,
5606 };
Marat Dukhan0853b8a2021-08-03 01:01:53 -07005607 xnn_params.qu8.vmul = (struct vbinary_parameters) {
5608 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_fp32_ukernel__scalar_x4,
5609 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_x4,
5610 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_x4,
5611 .init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_scalar_params,
5612 .element_tile = 4,
5613 };
Marat Dukhan08b7a972020-07-14 18:17:29 -07005614 #endif // XNN_NO_QU8_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07005615
Frank Barchardb40ee632021-12-30 11:10:02 -08005616 /**************************** S8 WAsm micro-kernels****************************/
Marat Dukhandc5c1482021-08-16 09:03:15 -07005617 #ifndef XNN_NO_S8_OPERATORS
5618 init_flags |= XNN_INIT_FLAG_S8;
5619
Marat Dukhan61c0c9e2021-08-16 23:16:14 -07005620 xnn_params.s8.clamp = (struct vunary_parameters) {
5621 .ukernel = (xnn_univector_ukernel_function) xnn_s8_vclamp_ukernel__scalar_x4,
5622 .init.s8_minmax = xnn_init_s8_minmax_scalar_params,
5623 .element_tile = 4,
5624 };
Marat Dukhan24abe6b2021-11-24 15:28:57 -08005625 xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
5626 .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__scalar_c1,
5627 .pixel_tile = 1,
5628 .channel_tile = 1,
5629 };
Marat Dukhandc5c1482021-08-16 09:03:15 -07005630 xnn_params.s8.maxpool = (struct maxpool_parameters) {
5631 .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__scalar_c1,
5632 .init.s8 = xnn_init_s8_minmax_scalar_params,
5633 .mr = 9,
5634 .qr = 8,
5635 };
5636 #endif // XNN_NO_S8_OPERATORS
5637
Frank Barchardb40ee632021-12-30 11:10:02 -08005638 /**************************** U8 WAsm micro-kernels****************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005639 #ifndef XNN_NO_U8_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07005640 init_flags |= XNN_INIT_FLAG_U8;
5641
Marat Dukhan94912792021-08-16 21:40:30 -07005642 xnn_params.u8.clamp = (struct vunary_parameters) {
5643 .ukernel = (xnn_univector_ukernel_function) xnn_u8_vclamp_ukernel__scalar_x4,
5644 .init.u8_minmax = xnn_init_u8_minmax_scalar_params,
5645 .element_tile = 4,
5646 };
Marat Dukhan24abe6b2021-11-24 15:28:57 -08005647 xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
5648 .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__scalar_c1,
5649 .pixel_tile = 1,
5650 .channel_tile = 1,
5651 };
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005652 xnn_params.u8.maxpool = (struct maxpool_parameters) {
Marat Dukhan99936602020-04-11 16:47:01 -07005653 .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__scalar_c1,
Marat Dukhan91ae1652021-08-15 19:19:49 -07005654 .init.u8 = xnn_init_u8_minmax_scalar_params,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005655 .mr = 9,
5656 .qr = 8,
5657 };
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005658 xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
5659 xnn_params.u8.rmax = xnn_u8_rmax_ukernel__scalar;
5660 #endif // XNN_NO_U8_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07005661
Frank Barchardb40ee632021-12-30 11:10:02 -08005662 /**************************** X8 WAsm micro-kernels****************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005663 #ifndef XNN_NO_X8_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07005664 init_flags |= XNN_INIT_FLAG_X8;
5665
Marat Dukhand67539d2021-09-08 23:06:03 -07005666 xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar_x4;
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005667 xnn_params.x8.zip = (struct zip_parameters) {
5668 .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__scalar,
5669 .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__scalar,
5670 .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__scalar,
5671 .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__scalar,
5672 };
5673 #endif // XNN_NO_X8_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07005674
Frank Barchardb40ee632021-12-30 11:10:02 -08005675 /**************************** F32 WAsm micro-kernels****************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005676 #ifndef XNN_NO_F32_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07005677 init_flags |= XNN_INIT_FLAG_F32;
5678
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005679 if (is_wasm_x86) {
Marat Dukhanaefaef32020-04-09 07:09:34 -07005680 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_2x4__scalar);
5681 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_2x4__scalar);
5682 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x4__wasm);
5683 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x4__wasm);
Marat Dukhan467f6362020-05-22 23:21:55 -07005684 xnn_params.f32.gemm.relu.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_2x4__scalar);
5685 xnn_params.f32.gemm.relu.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_2x4__scalar);
5686 xnn_params.f32.gemm.relu.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x4__wasm);
5687 xnn_params.f32.gemm.relu.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x4__wasm);
Marat Dukhan869c62d2020-04-09 17:17:55 -07005688 xnn_params.f32.gemm.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_2x4__scalar);
5689 xnn_params.f32.gemm.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_2x4__scalar);
5690 xnn_params.f32.gemm.linear.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x4__wasm);
5691 xnn_params.f32.gemm.linear.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x4__wasm);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07005692 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07005693 xnn_params.f32.gemm.mr = 2;
5694 xnn_params.f32.gemm.nr = 4;
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005695 } else {
Marat Dukhanaefaef32020-04-09 07:09:34 -07005696 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x4__wasm);
5697 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x4__wasm);
5698 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x4__wasm);
5699 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x4__wasm);
Marat Dukhan467f6362020-05-22 23:21:55 -07005700 xnn_params.f32.gemm.relu.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_4x4__wasm);
5701 xnn_params.f32.gemm.relu.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_4x4__wasm);
5702 xnn_params.f32.gemm.relu.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x4__wasm);
5703 xnn_params.f32.gemm.relu.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x4__wasm);
Marat Dukhan869c62d2020-04-09 17:17:55 -07005704 xnn_params.f32.gemm.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x4__wasm);
5705 xnn_params.f32.gemm.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x4__wasm);
5706 xnn_params.f32.gemm.linear.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x4__wasm);
5707 xnn_params.f32.gemm.linear.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x4__wasm);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07005708 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07005709 xnn_params.f32.gemm.mr = 4;
5710 xnn_params.f32.gemm.nr = 4;
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005711 }
Marat Dukhanaefaef32020-04-09 07:09:34 -07005712 xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2__wasm);
5713 xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2__wasm),
Marat Dukhan869c62d2020-04-09 17:17:55 -07005714 xnn_params.f32.gemm2.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x2__wasm);
5715 xnn_params.f32.gemm2.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2__wasm),
Marat Dukhan104ae5e2021-05-24 13:41:57 -07005716 xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07005717 xnn_params.f32.gemm2.mr = 4;
5718 xnn_params.f32.gemm2.nr = 2;
5719
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07005720 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x3__wasm_acc2;
5721 xnn_params.f32.dwconv[0].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x3__wasm_acc2;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07005722 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07005723 xnn_params.f32.dwconv[0].channel_tile = 1;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07005724 xnn_params.f32.dwconv[0].primary_tile = 3;
Marat Dukhanaefaef32020-04-09 07:09:34 -07005725
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07005726 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x4__wasm_acc2;
5727 xnn_params.f32.dwconv[1].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x4__wasm_acc2;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07005728 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07005729 xnn_params.f32.dwconv[1].channel_tile = 1;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07005730 xnn_params.f32.dwconv[1].primary_tile = 4;
Marat Dukhanaefaef32020-04-09 07:09:34 -07005731
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07005732 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x9__wasm_acc2;
5733 xnn_params.f32.dwconv[2].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x9__wasm_acc2;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07005734 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07005735 xnn_params.f32.dwconv[2].channel_tile = 1;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07005736 xnn_params.f32.dwconv[2].primary_tile = 9;
5737
5738 xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x25__wasm_acc2;
5739 xnn_params.f32.dwconv[3].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x25__wasm_acc2;
5740 xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_scalar_params;
5741 xnn_params.f32.dwconv[3].channel_tile = 1;
5742 xnn_params.f32.dwconv[3].primary_tile = 25;
Marat Dukhanaefaef32020-04-09 07:09:34 -07005743
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005744 xnn_params.f32.avgpool = (struct avgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08005745 .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__wasm_c1,
5746 .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__wasm_c1,
5747 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
5748 .primary_tile = 9,
5749 .incremental_tile = 8,
5750 .channel_tile = 1,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005751 };
5752 xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08005753 .unipass = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__wasm_c1,
5754 .multipass = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__wasm_c1,
5755 .primary_tile = 9,
5756 .incremental_tile = 8,
5757 .channel_tile = 1,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005758 };
5759 xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08005760 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__wasm_c1,
5761 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__wasm_c1,
5762 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
5763 .update.f32 = xnn_update_f32_scaleminmax_scalar_params,
5764 .row_tile = 7,
5765 .channel_tile = 1,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005766 };
5767 xnn_params.f32.maxpool = (struct maxpool_parameters) {
Marat Dukhan99936602020-04-11 16:47:01 -07005768 .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__wasm_c1,
Marat Dukhan91ae1652021-08-15 19:19:49 -07005769 .init.f32 = xnn_init_f32_minmax_scalar_params,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005770 .mr = 9,
5771 .qr = 8,
5772 };
5773 xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
Marat Dukhan99936602020-04-11 16:47:01 -07005774 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__scalar_c1,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005775 .mr = 4,
5776 };
5777 xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
Marat Dukhan99936602020-04-11 16:47:01 -07005778 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__scalar_c1,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005779 .mr = 9,
5780 };
5781 xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
Marat Dukhan99936602020-04-11 16:47:01 -07005782 .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005783 .mr = 9,
5784 .qr = 8,
5785 };
Marat Dukhan660fd192020-03-10 04:55:30 -07005786 xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
5787 .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__scalar_c2,
Marat Dukhan69722492019-11-11 19:55:50 -08005788 .pixel_tile = 1,
5789 .channel_tile = 2,
5790 };
Marat Dukhane5efb162021-12-31 10:26:13 -08005791 xnn_params.f32.abs = (struct vunary_parameters) {
5792 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__scalar_x4,
5793 .element_tile = 4,
5794 };
Marat Dukhan94912792021-08-16 21:40:30 -07005795 xnn_params.f32.clamp = (struct vunary_parameters) {
5796 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vclamp_ukernel__wasm_x4,
5797 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
5798 .element_tile = 4,
5799 };
Marat Dukhanc303fe62020-06-26 10:09:25 -07005800 if (is_wasm_x86) {
Marat Dukhan561d0682021-12-23 16:12:35 -08005801 xnn_params.f32.hswish = (struct vunary_parameters) {
5802 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__scalar_x4,
Marat Dukhan0d10cc72021-12-23 19:49:19 -08005803 .init.f32_hswish = xnn_init_f32_hswish_scalar_params,
Marat Dukhan561d0682021-12-23 16:12:35 -08005804 .element_tile = 4,
5805 };
Marat Dukhanc303fe62020-06-26 10:09:25 -07005806 } else {
Marat Dukhan561d0682021-12-23 16:12:35 -08005807 xnn_params.f32.hswish = (struct vunary_parameters) {
5808 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__wasm_x4,
Marat Dukhan0d10cc72021-12-23 19:49:19 -08005809 .init.f32_hswish = xnn_init_f32_hswish_scalar_params,
Marat Dukhan561d0682021-12-23 16:12:35 -08005810 .element_tile = 4,
5811 };
Marat Dukhanc303fe62020-06-26 10:09:25 -07005812 }
Marat Dukhanb6bd4bc2020-12-01 17:01:40 -08005813 if (is_wasm_x86) {
Marat Dukhan4a79ff22022-01-01 12:16:48 -08005814 xnn_params.f32.elu = (struct vunary_parameters) {
5815 .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x2,
5816 .init.f32_elu = xnn_init_f32_elu_scalar_rr2_lut16_p3_params,
5817 .element_tile = 2,
5818 };
Marat Dukhanb6bd4bc2020-12-01 17:01:40 -08005819 } else {
Marat Dukhan4a79ff22022-01-01 12:16:48 -08005820 xnn_params.f32.elu = (struct vunary_parameters) {
5821 .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__wasm_rr2_p6_x6,
5822 .init.f32_elu = xnn_init_f32_elu_scalar_rr2_p6_params,
5823 .element_tile = 6,
5824 };
Marat Dukhanb6bd4bc2020-12-01 17:01:40 -08005825 }
Marat Dukhan2894e992021-12-30 08:29:48 -08005826 xnn_params.f32.lrelu = (struct vunary_parameters) {
5827 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__scalar_x4,
5828 .init.f32_lrelu = xnn_init_f32_lrelu_scalar_params,
5829 .element_tile = 4,
5830 };
Marat Dukhane5efb162021-12-31 10:26:13 -08005831 xnn_params.f32.neg = (struct vunary_parameters) {
5832 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__scalar_x4,
5833 .element_tile = 4,
5834 };
Frank Barchard62c5e232020-07-21 17:42:19 -07005835 if (is_wasm_x86) {
Marat Dukhan6674d692021-05-05 22:27:00 -07005836 xnn_params.f32.relu = (xnn_univector_ukernel_function) xnn_f32_vrelu_ukernel__scalar_x8;
Frank Barchard62c5e232020-07-21 17:42:19 -07005837 } else {
Marat Dukhan6674d692021-05-05 22:27:00 -07005838 xnn_params.f32.relu = (xnn_univector_ukernel_function) xnn_f32_vrelu_ukernel__wasm_x8;
Frank Barchard62c5e232020-07-21 17:42:19 -07005839 }
Marat Dukhan0e801372022-01-04 00:10:41 -08005840 xnn_params.f32.rndne = (struct vunary_parameters) {
5841 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__scalar_libm_x4,
5842 .element_tile = 4,
5843 };
5844 xnn_params.f32.rndz = (struct vunary_parameters) {
5845 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__scalar_libm_x4,
5846 .element_tile = 4,
5847 };
5848 xnn_params.f32.rndu = (struct vunary_parameters) {
5849 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__scalar_libm_x4,
5850 .element_tile = 4,
5851 };
5852 xnn_params.f32.rndd = (struct vunary_parameters) {
5853 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__scalar_libm_x4,
5854 .element_tile = 4,
5855 };
Marat Dukhance834ad2022-01-03 00:22:01 -08005856 xnn_params.f32.sigmoid = (struct vunary_parameters) {
5857 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__scalar_rr2_lut64_p2_div_x2,
5858 .init.f32_sigmoid = xnn_init_f32_sigmoid_scalar_rr2_lut64_p2_params,
5859 .element_tile = 2,
5860 };
Marat Dukhane5efb162021-12-31 10:26:13 -08005861 xnn_params.f32.sqr = (struct vunary_parameters) {
5862 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__scalar_x4,
5863 .element_tile = 4,
5864 };
Marat Dukhane72b2822021-12-30 14:46:58 -08005865 xnn_params.f32.sqrt = (struct vunary_parameters) {
5866 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__scalar_sqrt_x1,
5867 .element_tile = 1,
5868 };
Marat Dukhan7c1f8082020-06-25 13:26:20 -07005869 if (is_wasm_x86) {
5870 xnn_params.f32.prelu = (struct prelu_parameters) {
5871 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__scalar_2x4,
5872 .row_tile = 2,
5873 .channel_tile = 4,
5874 };
5875 } else {
5876 xnn_params.f32.prelu = (struct prelu_parameters) {
5877 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__wasm_2x4,
5878 .row_tile = 2,
5879 .channel_tile = 4,
5880 };
5881 }
Marat Dukhan4a5c7712022-01-05 22:43:13 -08005882 xnn_params.f32.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
5883 .ukernel = xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_p5_x4_acc2,
5884 .init = xnn_init_f32_expminus_scalar_rr2_p5_params,
5885 .element_tile = 4,
5886 };
Marat Dukhan1edc4542020-01-27 12:40:13 -08005887 xnn_params.f32.rmax = xnn_f32_rmax_ukernel__scalar;
Marat Dukhanb1a0fc32019-12-02 19:32:02 -08005888 xnn_params.f32.vadd = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07005889 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__wasm_x8,
5890 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasm_x8,
5891 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasm_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -08005892 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhanb1a0fc32019-12-02 19:32:02 -08005893 .element_tile = 8,
5894 };
Marat Dukhan69180502019-12-06 15:00:31 -08005895 xnn_params.f32.vdiv = (struct vbinary_parameters) {
Frank Barchardb392f8e2020-10-27 10:46:44 -07005896 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__wasm_x8,
5897 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__wasm_x8,
5898 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__wasm_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -08005899 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Frank Barchardb392f8e2020-10-27 10:46:44 -07005900 .element_tile = 8,
Marat Dukhan69180502019-12-06 15:00:31 -08005901 };
Marat Dukhan79e7f842019-12-05 14:35:50 -08005902 xnn_params.f32.vmax = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07005903 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__wasm_x8,
5904 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasm_x8,
5905 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasm_x8,
Marat Dukhan79e7f842019-12-05 14:35:50 -08005906 .element_tile = 8,
5907 };
5908 xnn_params.f32.vmin = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07005909 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__wasm_x8,
5910 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasm_x8,
5911 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasm_x8,
Marat Dukhan79e7f842019-12-05 14:35:50 -08005912 .element_tile = 8,
5913 };
Marat Dukhan1e782c42019-11-21 17:02:40 -08005914 xnn_params.f32.vmul = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07005915 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__wasm_x8,
5916 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasm_x8,
5917 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasm_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -08005918 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhanca2733c2019-11-15 23:21:17 -08005919 .element_tile = 8,
5920 };
Marat Dukhan05f3f6d2019-12-03 15:13:53 -08005921 xnn_params.f32.vsub = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07005922 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__wasm_x8,
5923 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__wasm_x8,
5924 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__wasm_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -08005925 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhan05f3f6d2019-12-03 15:13:53 -08005926 .element_tile = 8,
5927 };
Marat Dukhanf7399262020-06-05 10:58:44 -07005928 xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07005929 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__scalar_x8,
5930 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x8,
5931 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x8,
Marat Dukhanf7399262020-06-05 10:58:44 -07005932 .element_tile = 8,
5933 };
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005934 xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
Marat Dukhan9531e9f2020-07-24 15:25:02 -07005935 .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c1__wasm_2x,
Marat Dukhanc58453f2021-05-24 14:47:38 -07005936 .init.f32 = xnn_init_f32_minmax_scalar_params,
Marat Dukhan49e6ee92019-11-06 15:55:29 -08005937 .channel_tile = 1,
5938 .row_tile = 2,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005939 };
Marat Dukhanefc47b82019-11-18 09:25:38 -08005940 #ifndef XNN_NO_NCHW_OPERATORS
Marat Dukhancfbed0a2020-12-08 10:01:51 -08005941 init_flags |= XNN_INIT_FLAG_CHW_OPT;
5942
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005943 xnn_params.f32.spmm = (struct spmm_parameters) {
Marat Dukhan355ab432020-04-09 19:01:52 -07005944 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x1__scalar,
Marat Dukhanbff791e2019-10-24 11:05:37 -07005945 .mr = 8,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005946 .nr = 1,
5947 };
Erich Elsenc6afd9b2019-10-24 16:10:53 -07005948 xnn_params.f32.spmm2 = (struct spmm_parameters) {
Marat Dukhan355ab432020-04-09 19:01:52 -07005949 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x2__scalar,
Erich Elsenc6afd9b2019-10-24 16:10:53 -07005950 .mr = 8,
5951 .nr = 2,
5952 };
5953 xnn_params.f32.spmm4 = (struct spmm_parameters) {
Marat Dukhan355ab432020-04-09 19:01:52 -07005954 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x4__scalar,
Erich Elsenc6afd9b2019-10-24 16:10:53 -07005955 .mr = 8,
5956 .nr = 4,
5957 };
Marat Dukhan1f29b802020-05-15 23:46:39 -07005958 xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
Marat Dukhan14fe0b22019-10-23 21:20:07 -07005959 .ukernel_with_symm_padding =
Marat Dukhan1f29b802020-05-15 23:46:39 -07005960 (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__scalar_1x1,
Marat Dukhan14fe0b22019-10-23 21:20:07 -07005961 .output_channel_tile = 4,
5962 .output_height_tile = 1,
5963 .output_width_tile = 1,
5964 };
Marat Dukhanbf715f92020-10-23 20:17:00 -07005965 xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
Marat Dukhan91249d22020-10-24 12:02:51 -07005966 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1_acc2,
Marat Dukhan14fe0b22019-10-23 21:20:07 -07005967 .output_width_tile = 1,
Marat Dukhan91249d22020-10-24 12:02:51 -07005968 .output_height_tile = 2,
Marat Dukhan14fe0b22019-10-23 21:20:07 -07005969 };
Marat Dukhanbf715f92020-10-23 20:17:00 -07005970 xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
Marat Dukhancf5b3c32020-10-25 19:21:10 -07005971 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_1x1_acc2,
Marat Dukhan14fe0b22019-10-23 21:20:07 -07005972 .output_width_tile = 1,
5973 .output_height_tile = 1,
5974 };
Marat Dukhanbf715f92020-10-23 20:17:00 -07005975 xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
5976 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc5,
Marat Dukhana99918a2019-11-15 14:40:12 -08005977 .output_width_tile = 1,
5978 .output_height_tile = 1,
5979 };
Marat Dukhanbf715f92020-10-23 20:17:00 -07005980 xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
5981 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc5,
Marat Dukhana99918a2019-11-15 14:40:12 -08005982 .output_width_tile = 1,
5983 .output_height_tile = 1,
5984 };
Marat Dukhan1f29b802020-05-15 23:46:39 -07005985 xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
5986 .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__scalar_x1,
Marat Dukhan14fe0b22019-10-23 21:20:07 -07005987 .channel_tile = 1,
5988 };
Artsiom Ablavatski97918102020-10-27 15:52:59 -07005989 xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
5990 .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__scalar_p4,
5991 .channel_tile = 1,
5992 .pixel_tile = 4,
5993 };
Marat Dukhanefc47b82019-11-18 09:25:38 -08005994 #endif // XNN_NO_NCHW_OPERATORS
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005995 #endif // XNN_NO_F32_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07005996
Frank Barchardb40ee632021-12-30 11:10:02 -08005997 /*************************** VCVT WAsm micro-kernels***************************/
Marat Dukhanaf2ba002021-10-24 14:21:41 -07005998 #ifndef XNN_NO_VCVT_OPERATORS
5999 init_flags |= XNN_INIT_FLAG_VCVT;
6000
Marat Dukhan134f9842021-12-29 19:57:31 -08006001 xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
6002 .ukernel = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__scalar_x1,
6003 .init.f16_f32_cvt = xnn_init_f16_f32_cvt_scalar_params,
6004 .element_tile = 1,
6005 };
Marat Dukhanb7c1b712021-12-30 07:23:57 -08006006 xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
6007 .ukernel = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__scalar_bitcast_x4,
6008 .init.f32_f16_cvt = xnn_init_f32_f16_cvt_scalar_bitcast_params,
6009 .element_tile = 4,
6010 };
Marat Dukhan430b1732021-12-04 02:53:12 -08006011 if (is_wasm_x86) {
6012 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
Marat Dukhanbdf10992022-01-04 09:20:14 -08006013 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__scalar_imagic_x1,
6014 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_scalar_imagic_params,
Marat Dukhan430b1732021-12-04 02:53:12 -08006015 .element_tile = 1,
6016 };
6017 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
Marat Dukhanbdf10992022-01-04 09:20:14 -08006018 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__scalar_imagic_x1,
6019 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_scalar_imagic_params,
Marat Dukhan430b1732021-12-04 02:53:12 -08006020 .element_tile = 1,
6021 };
6022 } else {
6023 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
Marat Dukhanbdf10992022-01-04 09:20:14 -08006024 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__wasm_fmagic_x4,
6025 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_scalar_fmagic_params,
Marat Dukhan430b1732021-12-04 02:53:12 -08006026 .element_tile = 4,
6027 };
6028 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
Marat Dukhanbdf10992022-01-04 09:20:14 -08006029 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__wasm_fmagic_x4,
6030 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_scalar_fmagic_params,
Marat Dukhan430b1732021-12-04 02:53:12 -08006031 .element_tile = 4,
6032 };
6033 }
Marat Dukhanf92206b2021-12-10 17:02:07 -08006034 xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
6035 .ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__scalar_x1,
6036 .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_scalar_params,
6037 .element_tile = 1,
6038 };
6039 xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
6040 .ukernel = (xnn_univector_ukernel_function) xnn_qu8_f32_vcvt_ukernel__scalar_x1,
6041 .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_scalar_params,
6042 .element_tile = 1,
6043 };
Marat Dukhanaf2ba002021-10-24 14:21:41 -07006044 #endif // XNN_NO_VCVT_OPERATORS
6045
/**************************** X32 WAsm micro-kernels ***************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07006047 #ifndef XNN_NO_X32_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07006048 init_flags |= XNN_INIT_FLAG_X32;
6049
Marat Dukhan8fe54e42019-10-10 14:12:59 -07006050 xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__scalar;
6051 xnn_params.x32.zip = (struct zip_parameters) {
6052 .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__scalar,
6053 .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__scalar,
6054 .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__scalar,
6055 .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__scalar,
6056 };
Artsiom Ablavatskibbe85062020-11-05 14:07:37 -08006057 #ifndef XNN_NO_NCHW_OPERATORS
Marat Dukhanad71b9a2020-11-20 00:01:51 -08006058 xnn_params.x32.depthtospace2d_chw2hwc = (struct depthtospace2d_chw2hwc_parameters) {
6059 .ukernel = (xnn_depthtospace2d_chw2hwc_ukernel_function) xnn_x32_depthtospace2d_chw2hwc_ukernel__scalar,
Artsiom Ablavatskibbe85062020-11-05 14:07:37 -08006060 .channel_tile = 1,
6061 .pixel_tile = 1,
6062 };
6063 #endif // XNN_NO_NCHW_OPERATORS
Marat Dukhan8fe54e42019-10-10 14:12:59 -07006064 #endif // XNN_NO_X32_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07006065
/**************************** XX WAsm micro-kernels ***************************/
Marat Dukhan933051b2021-08-07 16:26:15 -07006067 #ifndef XNN_NO_XX_OPERATORS
6068 init_flags |= XNN_INIT_FLAG_XX;
6069
6070 xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
6071 xnn_params.xx.fill = (struct fill_parameters) {
6072 .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__scalar_x16,
6073 .row_tile = 1,
6074 };
Marat Dukhan0461f2d2021-08-08 12:36:29 -07006075 xnn_params.xx.pad = (struct pad_parameters) {
6076 .ukernel = (xnn_pad_ukernel_function) xnn_xx_pad_ukernel__scalar,
6077 .row_tile = 1,
6078 };
Marat Dukhan933051b2021-08-07 16:26:15 -07006079 #endif
6080
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006081#elif XNN_ARCH_RISCV
6082
Marat Dukhana198f002022-01-04 18:45:11 -08006083 /************************** QC8 RISC-V micro-kernels **************************/
6084 #ifndef XNN_NO_QC8_OPERATORS
6085 init_flags |= XNN_INIT_FLAG_QC8;
6086
6087 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf);
6088 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_3x4__scalar_lrintf);
6089 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf);
6090 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4__scalar_lrintf);
6091 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_scalar_lrintf_params;
6092 xnn_params.qc8.gemm.mr = 3;
6093 xnn_params.qc8.gemm.nr = 4;
6094
6095 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up2x9__scalar_lrintf;
6096 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_scalar_lrintf_params;
6097 xnn_params.qc8.dwconv[0].channel_tile = 2;
6098 xnn_params.qc8.dwconv[0].primary_tile = 9;
6099 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up2x25__scalar_lrintf;
6100 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_scalar_lrintf_params;
6101 xnn_params.qc8.dwconv[1].channel_tile = 2;
6102 xnn_params.qc8.dwconv[1].primary_tile = 25;
6103 #endif // XNN_NO_QS8_OPERATORS
6104
6105 /************************** QS8 RISC-V micro-kernels **************************/
Marat Dukhan803c1f82021-05-12 00:13:37 -07006106 #ifndef XNN_NO_QS8_OPERATORS
6107 init_flags |= XNN_INIT_FLAG_QS8;
6108
Marat Dukhana198f002022-01-04 18:45:11 -08006109 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf);
6110 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_3x4__scalar_lrintf);
6111 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf);
6112 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4__scalar_lrintf);
6113 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params;
Marat Dukhan15a35c02021-05-12 11:40:03 -07006114 xnn_params.qs8.gemm.mr = 3;
Marat Dukhan803c1f82021-05-12 00:13:37 -07006115 xnn_params.qs8.gemm.nr = 4;
6116
Marat Dukhana198f002022-01-04 18:45:11 -08006117 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up2x9__scalar_lrintf;
6118 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params;
Marat Dukhan803c1f82021-05-12 00:13:37 -07006119 xnn_params.qs8.dwconv[0].channel_tile = 2;
6120 xnn_params.qs8.dwconv[0].primary_tile = 9;
Marat Dukhana198f002022-01-04 18:45:11 -08006121 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up2x25__scalar_lrintf;
6122 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params;
Marat Dukhan4ed14882021-05-12 17:50:40 -07006123 xnn_params.qs8.dwconv[1].channel_tile = 2;
6124 xnn_params.qs8.dwconv[1].primary_tile = 25;
Marat Dukhan803c1f82021-05-12 00:13:37 -07006125
6126 xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
Marat Dukhan847ff5e2022-01-11 20:31:06 -08006127 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1,
6128 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1,
Marat Dukhan53f41062022-01-11 19:44:57 -08006129 .init.qs8 = xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params,
6130 .update.qs8 = xnn_update_qs8_avgpool_minmax_fp32_scalar_imagic_params,
Marat Dukhanda382d12022-01-07 19:51:20 -08006131 .row_tile = 7,
6132 .channel_tile = 1,
Marat Dukhan803c1f82021-05-12 00:13:37 -07006133 };
6134
6135 xnn_params.qs8.vadd = (struct vbinary_parameters) {
6136 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__scalar_x4,
6137 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__scalar_x4,
6138 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__scalar_x4,
Marat Dukhan64287252021-09-07 16:20:03 -07006139 .init.qs8_addsub = xnn_init_qs8_add_minmax_scalar_params,
Marat Dukhan803c1f82021-05-12 00:13:37 -07006140 .element_tile = 4,
6141 };
Marat Dukhana198f002022-01-04 18:45:11 -08006142 xnn_params.qs8.vmul = (struct vbinary_parameters) {
6143 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_fp32_ukernel__scalar_x4,
6144 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_x4,
6145 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_x4,
6146 .init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_scalar_params,
6147 .element_tile = 4,
6148 };
Marat Dukhan803c1f82021-05-12 00:13:37 -07006149 #endif // XNN_NO_QS8_OPERATORS
6150
Marat Dukhana198f002022-01-04 18:45:11 -08006151 /************************** QU8 RISC-V micro-kernels **************************/
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006152 #ifndef XNN_NO_QU8_OPERATORS
6153 init_flags |= XNN_INIT_FLAG_QU8;
6154
Marat Dukhana198f002022-01-04 18:45:11 -08006155 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf);
6156 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_3x4__scalar_lrintf);
6157 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf);
6158 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4__scalar_lrintf);
6159 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params;
6160 xnn_params.qu8.gemm.mr = 3;
6161 xnn_params.qu8.gemm.nr = 4;
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006162
Marat Dukhana198f002022-01-04 18:45:11 -08006163 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up2x9__scalar_lrintf;
6164 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params;
6165 xnn_params.qu8.dwconv[0].channel_tile = 2;
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006166 xnn_params.qu8.dwconv[0].primary_tile = 9;
Marat Dukhana198f002022-01-04 18:45:11 -08006167 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up2x25__scalar_lrintf;
6168 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params;
6169 xnn_params.qu8.dwconv[1].channel_tile = 2;
6170 xnn_params.qu8.dwconv[1].primary_tile = 25;
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006171
6172 xnn_params.qu8.avgpool = (struct avgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08006173 .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__scalar_c1,
6174 .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__scalar_c1,
6175 .init.qu8 = xnn_init_qu8_avgpool_minmax_scalar_params,
6176 .primary_tile = 9,
6177 .incremental_tile = 8,
6178 .channel_tile = 1,
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006179 };
6180 xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
Marat Dukhand1f53e42022-01-12 22:34:51 -08006181 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1,
6182 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1,
6183 .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params,
6184 .update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_scalar_imagic_params,
Marat Dukhanda382d12022-01-07 19:51:20 -08006185 .row_tile = 7,
6186 .channel_tile = 1,
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006187 };
Marat Dukhandb007cd2021-07-20 23:42:39 -07006188
6189 xnn_params.qu8.vadd = (struct vbinary_parameters) {
6190 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__scalar_x4,
6191 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__scalar_x4,
6192 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__scalar_x4,
Marat Dukhan64287252021-09-07 16:20:03 -07006193 .init.qu8_addsub = xnn_init_qu8_add_minmax_scalar_params,
Marat Dukhandb007cd2021-07-20 23:42:39 -07006194 .element_tile = 4,
6195 };
Marat Dukhana198f002022-01-04 18:45:11 -08006196 xnn_params.qu8.vmul = (struct vbinary_parameters) {
6197 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_fp32_ukernel__scalar_x4,
6198 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_x4,
6199 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_x4,
6200 .init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_scalar_params,
6201 .element_tile = 4,
6202 };
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006203 #endif // XNN_NO_QU8_OPERATORS
6204
Marat Dukhana198f002022-01-04 18:45:11 -08006205 /************************** S8 RISC-V micro-kernels ***************************/
6206 #ifndef XNN_NO_S8_OPERATORS
6207 init_flags |= XNN_INIT_FLAG_S8;
6208
6209 xnn_params.s8.clamp = (struct vunary_parameters) {
6210 .ukernel = (xnn_univector_ukernel_function) xnn_s8_vclamp_ukernel__scalar_x4,
6211 .init.s8_minmax = xnn_init_s8_minmax_scalar_params,
6212 .element_tile = 4,
6213 };
6214 xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
6215 .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__scalar_c1,
6216 .pixel_tile = 1,
6217 .channel_tile = 1,
6218 };
6219 xnn_params.s8.maxpool = (struct maxpool_parameters) {
6220 .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__scalar_c1,
6221 .init.s8 = xnn_init_s8_minmax_scalar_params,
6222 .mr = 9,
6223 .qr = 8,
6224 };
6225 #endif // XNN_NO_S8_OPERATORS
6226
6227 /************************** U8 RISC-V micro-kernels ***************************/
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006228 #ifndef XNN_NO_U8_OPERATORS
6229 init_flags |= XNN_INIT_FLAG_U8;
6230
Marat Dukhan94912792021-08-16 21:40:30 -07006231 xnn_params.u8.clamp = (struct vunary_parameters) {
6232 .ukernel = (xnn_univector_ukernel_function) xnn_u8_vclamp_ukernel__scalar_x4,
6233 .init.u8_minmax = xnn_init_u8_minmax_scalar_params,
6234 .element_tile = 4,
6235 };
Marat Dukhana198f002022-01-04 18:45:11 -08006236 xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
6237 .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__scalar_c1,
6238 .pixel_tile = 1,
6239 .channel_tile = 1,
6240 };
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006241 xnn_params.u8.maxpool = (struct maxpool_parameters) {
6242 .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__scalar_c1,
Marat Dukhan91ae1652021-08-15 19:19:49 -07006243 .init.u8 = xnn_init_u8_minmax_scalar_params,
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006244 .mr = 9,
6245 .qr = 8,
6246 };
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006247 xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
6248 xnn_params.u8.rmax = xnn_u8_rmax_ukernel__scalar;
6249 #endif // XNN_NO_U8_OPERATORS
6250
Marat Dukhana198f002022-01-04 18:45:11 -08006251 /************************** X8 RISC-V micro-kernels ***************************/
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006252 #ifndef XNN_NO_X8_OPERATORS
6253 init_flags |= XNN_INIT_FLAG_X8;
6254
Marat Dukhand67539d2021-09-08 23:06:03 -07006255 xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar_x4;
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006256 xnn_params.x8.zip = (struct zip_parameters) {
6257 .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__scalar,
6258 .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__scalar,
6259 .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__scalar,
6260 .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__scalar,
6261 };
6262 #endif // XNN_NO_X8_OPERATORS
6263
Marat Dukhana198f002022-01-04 18:45:11 -08006264 /************************** F32 RISC-V micro-kernels **************************/
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006265 #ifndef XNN_NO_F32_OPERATORS
6266 init_flags |= XNN_INIT_FLAG_F32;
6267
6268 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x4__scalar);
6269 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x4__scalar);
6270 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x4__scalar);
6271 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x4__scalar);
6272 xnn_params.f32.gemm.relu.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_4x4__scalar);
6273 xnn_params.f32.gemm.relu.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_4x4__scalar);
6274 xnn_params.f32.gemm.relu.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x4__scalar);
6275 xnn_params.f32.gemm.relu.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x4__scalar);
6276 xnn_params.f32.gemm.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x4__scalar);
6277 xnn_params.f32.gemm.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x4__scalar);
6278 xnn_params.f32.gemm.linear.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x4__scalar);
6279 xnn_params.f32.gemm.linear.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x4__scalar);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07006280 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006281 xnn_params.f32.gemm.mr = 4;
6282 xnn_params.f32.gemm.nr = 4;
6283
6284 xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2__scalar);
6285 xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2__scalar),
6286 xnn_params.f32.gemm2.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x2__scalar);
6287 xnn_params.f32.gemm2.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2__scalar),
Marat Dukhan104ae5e2021-05-24 13:41:57 -07006288 xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006289 xnn_params.f32.gemm2.mr = 4;
6290 xnn_params.f32.gemm2.nr = 2;
6291
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07006292 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x3__scalar_acc2;
6293 xnn_params.f32.dwconv[0].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x3__scalar_acc2;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07006294 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006295 xnn_params.f32.dwconv[0].channel_tile = 1;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07006296 xnn_params.f32.dwconv[0].primary_tile = 3;
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006297
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07006298 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x4__scalar_acc2;
6299 xnn_params.f32.dwconv[1].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x4__scalar_acc2;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07006300 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006301 xnn_params.f32.dwconv[1].channel_tile = 1;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07006302 xnn_params.f32.dwconv[1].primary_tile = 4;
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006303
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07006304 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x9__scalar_acc2;
6305 xnn_params.f32.dwconv[2].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x9__scalar_acc2;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07006306 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006307 xnn_params.f32.dwconv[2].channel_tile = 1;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07006308 xnn_params.f32.dwconv[2].primary_tile = 9;
6309
6310 xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x25__scalar_acc2;
6311 xnn_params.f32.dwconv[3].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x25__scalar_acc2;
6312 xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_scalar_params;
6313 xnn_params.f32.dwconv[3].channel_tile = 1;
6314 xnn_params.f32.dwconv[3].primary_tile = 25;
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006315
6316 xnn_params.f32.avgpool = (struct avgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08006317 .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__scalar_c1,
6318 .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__scalar_c1,
6319 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
6320 .primary_tile = 9,
6321 .incremental_tile = 8,
6322 .channel_tile = 1,
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006323 };
6324 xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08006325 .unipass = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__scalar_c1,
6326 .multipass = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__scalar_c1,
6327 .primary_tile = 9,
6328 .incremental_tile = 8,
6329 .channel_tile = 1,
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006330 };
6331 xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08006332 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__scalar_c1,
6333 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__scalar_c1,
6334 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
6335 .update.f32 = xnn_update_f32_scaleminmax_scalar_params,
6336 .row_tile = 7,
6337 .channel_tile = 1,
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006338 };
6339 xnn_params.f32.maxpool = (struct maxpool_parameters) {
6340 .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__scalar_c1,
Marat Dukhan91ae1652021-08-15 19:19:49 -07006341 .init.f32 = xnn_init_f32_minmax_scalar_params,
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006342 .mr = 9,
6343 .qr = 8,
6344 };
6345 xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
6346 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__scalar_c1,
6347 .mr = 4,
6348 };
6349 xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
6350 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__scalar_c1,
6351 .mr = 9,
6352 };
6353 xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
6354 .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1,
6355 .mr = 9,
6356 .qr = 8,
6357 };
6358 xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
6359 .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__scalar_c2,
6360 .pixel_tile = 1,
6361 .channel_tile = 2,
6362 };
Marat Dukhane5efb162021-12-31 10:26:13 -08006363 xnn_params.f32.abs = (struct vunary_parameters) {
6364 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__scalar_x4,
6365 .element_tile = 4,
6366 };
Marat Dukhana198f002022-01-04 18:45:11 -08006367 xnn_params.f32.clamp = (struct vunary_parameters) {
6368 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vclamp_ukernel__scalar_x4,
6369 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
6370 .element_tile = 4,
6371 };
6372 xnn_params.f32.elu = (struct vunary_parameters) {
6373 .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x4,
6374 .init.f32_elu = xnn_init_f32_elu_scalar_rr2_lut16_p3_params,
6375 .element_tile = 4,
6376 };
Marat Dukhan561d0682021-12-23 16:12:35 -08006377 xnn_params.f32.hswish = (struct vunary_parameters) {
6378 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__scalar_x4,
Marat Dukhan0d10cc72021-12-23 19:49:19 -08006379 .init.f32_hswish = xnn_init_f32_hswish_scalar_params,
Marat Dukhan561d0682021-12-23 16:12:35 -08006380 .element_tile = 4,
6381 };
Marat Dukhana198f002022-01-04 18:45:11 -08006382 xnn_params.f32.lrelu = (struct vunary_parameters) {
6383 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__scalar_x4,
6384 .init.f32_lrelu = xnn_init_f32_lrelu_scalar_params,
6385 .element_tile = 4,
Marat Dukhan4a79ff22022-01-01 12:16:48 -08006386 };
Marat Dukhane5efb162021-12-31 10:26:13 -08006387 xnn_params.f32.neg = (struct vunary_parameters) {
6388 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__scalar_x4,
6389 .element_tile = 4,
6390 };
Marat Dukhan0e801372022-01-04 00:10:41 -08006391 xnn_params.f32.rndne = (struct vunary_parameters) {
Marat Dukhana198f002022-01-04 18:45:11 -08006392 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__scalar_libm_x1,
6393 .element_tile = 1,
Marat Dukhan0e801372022-01-04 00:10:41 -08006394 };
6395 xnn_params.f32.rndz = (struct vunary_parameters) {
Marat Dukhana198f002022-01-04 18:45:11 -08006396 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__scalar_libm_x1,
6397 .element_tile = 1,
Marat Dukhan0e801372022-01-04 00:10:41 -08006398 };
6399 xnn_params.f32.rndu = (struct vunary_parameters) {
Marat Dukhana198f002022-01-04 18:45:11 -08006400 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__scalar_libm_x1,
6401 .element_tile = 1,
Marat Dukhan0e801372022-01-04 00:10:41 -08006402 };
6403 xnn_params.f32.rndd = (struct vunary_parameters) {
Marat Dukhana198f002022-01-04 18:45:11 -08006404 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__scalar_libm_x1,
6405 .element_tile = 1,
Marat Dukhan0e801372022-01-04 00:10:41 -08006406 };
Marat Dukhance834ad2022-01-03 00:22:01 -08006407 xnn_params.f32.sigmoid = (struct vunary_parameters) {
6408 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__scalar_rr2_lut64_p2_div_x2,
6409 .init.f32_sigmoid = xnn_init_f32_sigmoid_scalar_rr2_lut64_p2_params,
6410 .element_tile = 2,
6411 };
Marat Dukhane5efb162021-12-31 10:26:13 -08006412 xnn_params.f32.sqr = (struct vunary_parameters) {
6413 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__scalar_x4,
6414 .element_tile = 4,
6415 };
Marat Dukhane72b2822021-12-30 14:46:58 -08006416 xnn_params.f32.sqrt = (struct vunary_parameters) {
6417 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__scalar_sqrt_x1,
6418 .element_tile = 1,
6419 };
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006420 xnn_params.f32.prelu = (struct prelu_parameters) {
6421 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__scalar_2x4,
Marat Dukhana198f002022-01-04 18:45:11 -08006422 .row_tile = 4,
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006423 .channel_tile = 4,
6424 };
Marat Dukhan4a5c7712022-01-05 22:43:13 -08006425 xnn_params.f32.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
6426 .ukernel = xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_p5_x4_acc2,
6427 .init = xnn_init_f32_expminus_scalar_rr2_p5_params,
6428 .element_tile = 4,
6429 };
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006430 xnn_params.f32.rmax = xnn_f32_rmax_ukernel__scalar;
6431 xnn_params.f32.vadd = (struct vbinary_parameters) {
6432 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__scalar_x8,
6433 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__scalar_x8,
6434 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__scalar_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -08006435 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006436 .element_tile = 8,
6437 };
6438 xnn_params.f32.vdiv = (struct vbinary_parameters) {
Marat Dukhana198f002022-01-04 18:45:11 -08006439 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__scalar_x2,
6440 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__scalar_x2,
6441 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__scalar_x2,
Marat Dukhanf6004972021-12-30 11:23:02 -08006442 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhana198f002022-01-04 18:45:11 -08006443 .element_tile = 2,
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006444 };
6445 xnn_params.f32.vmax = (struct vbinary_parameters) {
6446 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__scalar_x8,
6447 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__scalar_x8,
6448 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__scalar_x8,
6449 .element_tile = 8,
6450 };
6451 xnn_params.f32.vmin = (struct vbinary_parameters) {
6452 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__scalar_x8,
6453 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__scalar_x8,
6454 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__scalar_x8,
6455 .element_tile = 8,
6456 };
6457 xnn_params.f32.vmul = (struct vbinary_parameters) {
6458 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__scalar_x8,
6459 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__scalar_x8,
6460 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__scalar_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -08006461 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006462 .element_tile = 8,
6463 };
6464 xnn_params.f32.vsub = (struct vbinary_parameters) {
6465 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__scalar_x8,
6466 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__scalar_x8,
6467 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__scalar_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -08006468 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006469 .element_tile = 8,
6470 };
6471 xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
6472 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__scalar_x8,
6473 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x8,
6474 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x8,
6475 .element_tile = 8,
6476 };
6477 xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
6478 .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c1__scalar_2x,
Marat Dukhanc58453f2021-05-24 14:47:38 -07006479 .init.f32 = xnn_init_f32_minmax_scalar_params,
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006480 .channel_tile = 1,
6481 .row_tile = 2,
6482 };
6483 #ifndef XNN_NO_NCHW_OPERATORS
6484 init_flags |= XNN_INIT_FLAG_CHW_OPT;
6485
6486 xnn_params.f32.spmm = (struct spmm_parameters) {
6487 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x1__scalar,
6488 .mr = 8,
6489 .nr = 1,
6490 };
6491 xnn_params.f32.spmm2 = (struct spmm_parameters) {
6492 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x2__scalar,
6493 .mr = 8,
6494 .nr = 2,
6495 };
6496 xnn_params.f32.spmm4 = (struct spmm_parameters) {
6497 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x4__scalar,
6498 .mr = 8,
6499 .nr = 4,
6500 };
6501 xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
6502 .ukernel_with_symm_padding =
6503 (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__scalar_1x1,
6504 .output_channel_tile = 4,
6505 .output_height_tile = 1,
6506 .output_width_tile = 1,
6507 };
6508 xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
6509 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1_acc2,
6510 .output_width_tile = 1,
6511 .output_height_tile = 2,
6512 };
6513 xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
6514 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_1x1_acc2,
6515 .output_width_tile = 1,
6516 .output_height_tile = 1,
6517 };
6518 xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
6519 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc5,
6520 .output_width_tile = 1,
6521 .output_height_tile = 1,
6522 };
6523 xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
6524 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc5,
6525 .output_width_tile = 1,
6526 .output_height_tile = 1,
6527 };
6528 xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
6529 .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__scalar_x1,
6530 .channel_tile = 1,
6531 };
6532 xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
6533 .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__scalar_p4,
6534 .channel_tile = 1,
6535 .pixel_tile = 4,
6536 };
6537 #endif // XNN_NO_NCHW_OPERATORS
6538 #endif // XNN_NO_F32_OPERATORS
6539
Marat Dukhana198f002022-01-04 18:45:11 -08006540 /************************** VCVT RISC-V micro-kernels *************************/
6541 #ifndef XNN_NO_VCVT_OPERATORS
6542 init_flags |= XNN_INIT_FLAG_VCVT;
6543
6544 xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
6545 .ukernel = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__scalar_x4,
6546 .init.f16_f32_cvt = xnn_init_f16_f32_cvt_scalar_params,
6547 .element_tile = 4,
6548 };
6549 xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
6550 .ukernel = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__scalar_fabsf_x2,
6551 .init.f32_f16_cvt = xnn_init_f32_f16_cvt_scalar_fabsf_params,
6552 .element_tile = 2,
6553 };
6554 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
6555 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__scalar_lrintf_x4,
6556 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_scalar_lrintf_params,
6557 .element_tile = 4,
6558 };
6559 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
6560 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__scalar_lrintf_x4,
6561 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_scalar_lrintf_params,
6562 .element_tile = 4,
6563 };
6564 xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
6565 .ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__scalar_x4,
6566 .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_scalar_params,
6567 .element_tile = 4,
6568 };
6569 xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
6570 .ukernel = (xnn_univector_ukernel_function) xnn_qu8_f32_vcvt_ukernel__scalar_x4,
6571 .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_scalar_params,
6572 .element_tile = 4,
6573 };
6574 #endif // XNN_NO_VCVT_OPERATORS
6575
6576 /************************** X32 RISC-V micro-kernels **************************/
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006577 #ifndef XNN_NO_X32_OPERATORS
6578 init_flags |= XNN_INIT_FLAG_X32;
6579
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006580 xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__scalar;
6581 xnn_params.x32.zip = (struct zip_parameters) {
6582 .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__scalar,
6583 .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__scalar,
6584 .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__scalar,
6585 .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__scalar,
6586 };
6587 #ifndef XNN_NO_NCHW_OPERATORS
6588 xnn_params.x32.depthtospace2d_chw2hwc = (struct depthtospace2d_chw2hwc_parameters) {
6589 .ukernel = (xnn_depthtospace2d_chw2hwc_ukernel_function) xnn_x32_depthtospace2d_chw2hwc_ukernel__scalar,
6590 .channel_tile = 1,
6591 .pixel_tile = 1,
6592 };
6593 #endif // XNN_NO_NCHW_OPERATORS
6594 #endif // XNN_NO_X32_OPERATORS
6595
Marat Dukhana198f002022-01-04 18:45:11 -08006596 /************************** XX RISC-V micro-kernels ***************************/
Marat Dukhan0461f2d2021-08-08 12:36:29 -07006597 #ifndef XNN_NO_XX_OPERATORS
6598 init_flags |= XNN_INIT_FLAG_XX;
6599
6600 xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
6601 xnn_params.xx.fill = (struct fill_parameters) {
6602 .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__scalar_x16,
6603 .row_tile = 1,
6604 };
6605 xnn_params.xx.pad = (struct pad_parameters) {
6606 .ukernel = (xnn_pad_ukernel_function) xnn_xx_pad_ukernel__scalar,
6607 .row_tile = 1,
6608 };
Marat Dukhana198f002022-01-04 18:45:11 -08006609 #endif // XNN_NO_XX_OPERATORS
Marat Dukhan0461f2d2021-08-08 12:36:29 -07006610
XNNPACK Teamb455b122019-09-27 18:10:33 -07006611#else
6612 #error "Unsupported architecture"
6613#endif
Marat Dukhan496389f2021-04-07 15:47:12 -07006614
6615 memcpy(&xnn_params.allocator, init_allocator, sizeof(struct xnn_allocator));
Marat Dukhan854fb6b2020-06-19 12:33:44 -07006616 xnn_params.init_flags = init_flags;
XNNPACK Teamb455b122019-09-27 18:10:33 -07006617}
6618
#if XNN_PLATFORM_WINDOWS
  // Callback handed to InitOnceExecuteOnce() by xnn_initialize(): performs the
  // one-time library setup by calling init() and unconditionally reports
  // success (TRUE) to the caller. None of the callback arguments are consulted.
  static BOOL CALLBACK init_windows(PINIT_ONCE init_once, PVOID parameter, PVOID* context) {
    (void) init_once;
    (void) parameter;
    (void) context;

    init();
    return TRUE;
  }
#endif  // XNN_PLATFORM_WINDOWS
6625
Marat Dukhan04f03be2019-11-19 12:36:47 -08006626enum xnn_status xnn_initialize(const struct xnn_allocator* allocator) {
Marat Dukhana198f002022-01-04 18:45:11 -08006627 #if !XNN_PLATFORM_WEB && !XNN_ARCH_RISCV
Marat Dukhand343c222019-10-07 09:22:14 -07006628 if (!cpuinfo_initialize()) {
6629 return xnn_status_out_of_memory;
6630 }
Marat Dukhana198f002022-01-04 18:45:11 -08006631 #endif // !XNN_PLATFORM_WEB && !XNN_ARCH_RISCV
Marat Dukhan496389f2021-04-07 15:47:12 -07006632 if (allocator == NULL) {
6633 allocator = &xnn_default_allocator;
6634 }
6635 #ifdef _MSC_VER
Marat Dukhandf94d982021-06-01 12:21:33 -07006636 _InterlockedCompareExchangePointer((PVOID volatile*) &init_allocator, (PVOID) allocator, NULL);
Marat Dukhan496389f2021-04-07 15:47:12 -07006637 #else
6638 __sync_bool_compare_and_swap(&init_allocator, NULL, allocator);
6639 #endif
Zhi An Ng0db15d32021-12-10 16:45:06 -08006640 #if XNN_PLATFORM_WINDOWS
Marat Dukhan57133c02020-04-13 00:54:59 -07006641 InitOnceExecuteOnce(&init_guard, &init_windows, NULL, NULL);
6642 #else
6643 pthread_once(&init_guard, &init);
6644 #endif
Marat Dukhan854fb6b2020-06-19 12:33:44 -07006645 if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) != 0) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07006646 return xnn_status_success;
6647 } else {
6648 return xnn_status_unsupported_hardware;
6649 }
6650}
6651
6652enum xnn_status xnn_deinitialize(void) {
Marat Dukhana198f002022-01-04 18:45:11 -08006653 #if !XNN_PLATFORM_WEB && !XNN_ARCH_RISCV
Marat Dukhand343c222019-10-07 09:22:14 -07006654 cpuinfo_deinitialize();
Marat Dukhana198f002022-01-04 18:45:11 -08006655 #endif // !XNN_PLATFORM_WEB && !XNN_ARCH_RISCV
XNNPACK Teamb455b122019-09-27 18:10:33 -07006656 return xnn_status_success;
Marat Dukhan3b9b4bc2021-08-24 00:23:07 -07006657}